Repository: easystats/datawizard
Branch: main
Commit: 89daeba7d094
Files: 321
Total size: 1.8 MB

Directory structure:
gitextract_nyyro61y/

├── .Rbuildignore
├── .dev/
│   ├── _BENCHMARK_RESHAPE.R
│   ├── html5.R
│   ├── revdepcheck.R
│   ├── test-value_at.R
│   └── value_at.R
├── .git-blame-ignore-revs
├── .github/
│   ├── .gitignore
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── FUNDING.yml
│   ├── SUPPORT.md
│   ├── dependabot.yaml
│   └── workflows/
│       ├── R-CMD-check-hard.yaml
│       ├── R-CMD-check.yaml
│       ├── check-all-examples.yaml
│       ├── check-link-rot.yaml
│       ├── check-random-test-order.yaml
│       ├── check-readme.yaml
│       ├── check-spelling.yaml
│       ├── check-styling.yaml
│       ├── check-test-warnings.yaml
│       ├── check-vignette-warnings.yaml
│       ├── html-5-check.yaml
│       ├── lint-changed-files.yaml
│       ├── lint.yaml
│       ├── pkgdown-no-suggests.yaml
│       ├── pkgdown.yaml
│       ├── test-coverage-examples.yaml
│       ├── test-coverage.yaml
│       └── update-to-latest-easystats.yaml
├── .gitignore
├── .lintr
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│   ├── adjust.R
│   ├── assign_labels.R
│   ├── categorize.R
│   ├── center.R
│   ├── contrs.R
│   ├── convert_na_to.R
│   ├── convert_to_na.R
│   ├── data.R
│   ├── data_addprefix.R
│   ├── data_arrange.R
│   ├── data_codebook.R
│   ├── data_duplicated.R
│   ├── data_extract.R
│   ├── data_group.R
│   ├── data_match.R
│   ├── data_merge.R
│   ├── data_modify.R
│   ├── data_partition.R
│   ├── data_peek.R
│   ├── data_read.R
│   ├── data_relocate.R
│   ├── data_remove.R
│   ├── data_rename.R
│   ├── data_replicate.R
│   ├── data_rescale.R
│   ├── data_restoretype.R
│   ├── data_reverse.R
│   ├── data_rotate.R
│   ├── data_seek.R
│   ├── data_select.R
│   ├── data_separate.R
│   ├── data_summary.R
│   ├── data_tabulate.R
│   ├── data_to_long.R
│   ├── data_to_wide.R
│   ├── data_unique.R
│   ├── data_unite.R
│   ├── data_write.R
│   ├── data_xtabulate.R
│   ├── datawizard-package.R
│   ├── demean.R
│   ├── describe_distribution.R
│   ├── descriptives.R
│   ├── extract_column_names.R
│   ├── format.R
│   ├── labels_to_levels.R
│   ├── makepredictcall.R
│   ├── mean_sd.R
│   ├── means_by_group.R
│   ├── normalize.R
│   ├── ranktransform.R
│   ├── recode_into.R
│   ├── recode_values.R
│   ├── remove_empty.R
│   ├── replace_nan_inf.R
│   ├── rescale_weights.R
│   ├── reshape_ci.R
│   ├── row_count.R
│   ├── row_means.R
│   ├── select_nse.R
│   ├── skewness_kurtosis.R
│   ├── slide.R
│   ├── smoothness.R
│   ├── standardize.R
│   ├── standardize.models.R
│   ├── text_format.R
│   ├── to_factor.R
│   ├── to_numeric.R
│   ├── unnormalize.R
│   ├── unstandardize.R
│   ├── utils-cols.R
│   ├── utils-rows.R
│   ├── utils.R
│   ├── utils_labels.R
│   ├── utils_standardize_center.R
│   ├── visualisation_recipe.R
│   ├── weighted_mean_median_sd_mad.R
│   └── winsorize.R
├── README.Rmd
├── README.md
├── air.toml
├── cran-comments.md
├── data/
│   ├── efc.RData
│   └── nhanes_sample.RData
├── datawizard.Rproj
├── datawizard.code-workspace
├── inst/
│   ├── CITATION
│   └── WORDLIST
├── man/
│   ├── adjust.Rd
│   ├── as.prop.table.Rd
│   ├── assign_labels.Rd
│   ├── categorize.Rd
│   ├── center.Rd
│   ├── coef_var.Rd
│   ├── coerce_to_numeric.Rd
│   ├── colnames.Rd
│   ├── contr.deviation.Rd
│   ├── convert_na_to.Rd
│   ├── convert_to_na.Rd
│   ├── data_arrange.Rd
│   ├── data_codebook.Rd
│   ├── data_duplicated.Rd
│   ├── data_extract.Rd
│   ├── data_group.Rd
│   ├── data_match.Rd
│   ├── data_merge.Rd
│   ├── data_modify.Rd
│   ├── data_partition.Rd
│   ├── data_peek.Rd
│   ├── data_prefix_suffix.Rd
│   ├── data_read.Rd
│   ├── data_relocate.Rd
│   ├── data_rename.Rd
│   ├── data_replicate.Rd
│   ├── data_restoretype.Rd
│   ├── data_rotate.Rd
│   ├── data_seek.Rd
│   ├── data_separate.Rd
│   ├── data_summary.Rd
│   ├── data_tabulate.Rd
│   ├── data_to_long.Rd
│   ├── data_to_wide.Rd
│   ├── data_unique.Rd
│   ├── data_unite.Rd
│   ├── datawizard-package.Rd
│   ├── demean.Rd
│   ├── describe_distribution.Rd
│   ├── distribution_mode.Rd
│   ├── efc.Rd
│   ├── extract_column_names.Rd
│   ├── labels_to_levels.Rd
│   ├── makepredictcall.dw_transformer.Rd
│   ├── mean_sd.Rd
│   ├── means_by_group.Rd
│   ├── nhanes_sample.Rd
│   ├── normalize.Rd
│   ├── ranktransform.Rd
│   ├── recode_into.Rd
│   ├── recode_values.Rd
│   ├── reexports.Rd
│   ├── remove_empty.Rd
│   ├── replace_nan_inf.Rd
│   ├── rescale.Rd
│   ├── rescale_weights.Rd
│   ├── reshape_ci.Rd
│   ├── reverse.Rd
│   ├── row_count.Rd
│   ├── row_means.Rd
│   ├── rownames.Rd
│   ├── skewness.Rd
│   ├── slide.Rd
│   ├── smoothness.Rd
│   ├── standardize.Rd
│   ├── standardize.default.Rd
│   ├── text_format.Rd
│   ├── to_factor.Rd
│   ├── to_numeric.Rd
│   ├── visualisation_recipe.Rd
│   ├── weighted_mean.Rd
│   └── winsorize.Rd
├── paper/
│   └── JOSS_files/
│       ├── apa.csl
│       ├── paper.Rmd
│       ├── paper.bib
│       ├── paper.log
│       └── paper.md
├── pkgdown/
│   └── _pkgdown.yaml
├── tests/
│   ├── testthat/
│   │   ├── _snaps/
│   │   │   ├── categorize.md
│   │   │   ├── contr.deviation.md
│   │   │   ├── data_codebook.md
│   │   │   ├── data_modify.md
│   │   │   ├── data_partition.md
│   │   │   ├── data_peek.md
│   │   │   ├── data_read.md
│   │   │   ├── data_rescale.md
│   │   │   ├── data_seek.md
│   │   │   ├── data_separate.md
│   │   │   ├── data_summary.md
│   │   │   ├── data_tabulate.md
│   │   │   ├── data_to_factor.md
│   │   │   ├── data_to_long.md
│   │   │   ├── data_to_numeric.md
│   │   │   ├── demean.md
│   │   │   ├── describe_distribution.md
│   │   │   ├── empty-dataframe.md
│   │   │   ├── means_by_group.md
│   │   │   ├── normalize.md
│   │   │   ├── print.dw_transformer.md
│   │   │   ├── ranktransform.md
│   │   │   ├── rescale_weights.md
│   │   │   ├── reshape_ci.md
│   │   │   ├── skewness-kurtosis.md
│   │   │   ├── smoothness.md
│   │   │   ├── text_format.md
│   │   │   ├── windows/
│   │   │   │   └── means_by_group.md
│   │   │   └── winsorization.md
│   │   ├── helper-state.R
│   │   ├── helper.R
│   │   ├── test-adjust.R
│   │   ├── test-assign_labels.R
│   │   ├── test-attributes-grouped-df.R
│   │   ├── test-attributes.R
│   │   ├── test-categorize.R
│   │   ├── test-center.R
│   │   ├── test-coef_var.R
│   │   ├── test-contr.deviation.R
│   │   ├── test-convert_na_to.R
│   │   ├── test-convert_to_na.R
│   │   ├── test-data_addprefix.R
│   │   ├── test-data_arrange.R
│   │   ├── test-data_codebook.R
│   │   ├── test-data_duplicated.R
│   │   ├── test-data_extract.R
│   │   ├── test-data_group.R
│   │   ├── test-data_match.R
│   │   ├── test-data_merge.R
│   │   ├── test-data_modify.R
│   │   ├── test-data_partition.R
│   │   ├── test-data_peek.R
│   │   ├── test-data_read.R
│   │   ├── test-data_recode.R
│   │   ├── test-data_relocate.R
│   │   ├── test-data_remove.R
│   │   ├── test-data_rename.R
│   │   ├── test-data_reorder.R
│   │   ├── test-data_replicate.R
│   │   ├── test-data_rescale.R
│   │   ├── test-data_restoretype.R
│   │   ├── test-data_reverse.R
│   │   ├── test-data_rotate.R
│   │   ├── test-data_seek.R
│   │   ├── test-data_select.R
│   │   ├── test-data_separate.R
│   │   ├── test-data_shift.R
│   │   ├── test-data_summary.R
│   │   ├── test-data_tabulate.R
│   │   ├── test-data_to_factor.R
│   │   ├── test-data_to_long.R
│   │   ├── test-data_to_numeric.R
│   │   ├── test-data_to_wide.R
│   │   ├── test-data_unique.R
│   │   ├── test-data_unite.R
│   │   ├── test-data_write.R
│   │   ├── test-demean.R
│   │   ├── test-describe_distribution.R
│   │   ├── test-distributions.R
│   │   ├── test-empty-dataframe.R
│   │   ├── test-extract_column_names.R
│   │   ├── test-labelled_data.R
│   │   ├── test-labels_to_levels.R
│   │   ├── test-makepredictcall.R
│   │   ├── test-mean_sd.R
│   │   ├── test-means_by_group.R
│   │   ├── test-normalize.R
│   │   ├── test-print.dw_transformer.R
│   │   ├── test-ranktransform.R
│   │   ├── test-recode_into.R
│   │   ├── test-replace_nan_inf.R
│   │   ├── test-rescale_weights.R
│   │   ├── test-reshape_ci.R
│   │   ├── test-row_count.R
│   │   ├── test-row_means.R
│   │   ├── test-select_nse.R
│   │   ├── test-skewness-kurtosis.R
│   │   ├── test-smoothness.R
│   │   ├── test-standardize-data.R
│   │   ├── test-standardize_datagrid.R
│   │   ├── test-standardize_models.R
│   │   ├── test-std_center.R
│   │   ├── test-std_center_scale_args.R
│   │   ├── test-text_format.R
│   │   ├── test-unnormalize.R
│   │   ├── test-utils.R
│   │   ├── test-utils_cols.R
│   │   ├── test-utils_rows.R
│   │   ├── test-weighted-stats.R
│   │   └── test-winsorization.R
│   └── testthat.R
└── vignettes/
    ├── .gitignore
    ├── bibliography.bib
    ├── overview_of_vignettes.Rmd
    ├── selection_syntax.Rmd
    ├── standardize_data.Rmd
    └── tidyverse_translation.Rmd

================================================
FILE CONTENTS
================================================

================================================
FILE: .Rbuildignore
================================================
^\cache$
^codemeta\.json$
^Meta$
^doc$
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
^Rplots.pdf$
^README-.*\.png$
^CONDUCT\.md$
^SECURITY\.md$
^cran-comments\.md$
^CODE_OF_CONDUCT\.md$
^SUPPORT\.md$
^\.github$
^NEWS$
^docs$
^revdep$
publication/*
^codecov\.yml$
^\.coveralls\.yml$
^\.travis\.yml$
^_pkgdown\.yml$
^_pkgdown\.yaml$
^appveyor\.yml$
^.gitlab-ci\.yml$
^data-raw$
^pkgdown$
^\.httr-oauth$
^CRAN-RELEASE$
tests\^spelling
^LICENSE\.md$
^\.lintr$
\.code-workspace$
^\.circleci$
^tests/manual$
^revdep$
^\.covrignore$
^\.github/ISSUE_TEMPLATE$
^paper.*$
references.bib
^API$
^\.pre-commit-config\.yaml$
^\.github/workflows/R\.yaml$
^\.github/workflows/pr-commands\.yaml$
^hextools/.
^WIP/.
^CRAN-SUBMISSION$
docs
^.dev$
^vignettes/s.
^vignettes/t.
^[\.]?air\.toml$
^\.vscode$
^\.git-blame-ignore-revs$


================================================
FILE: .dev/_BENCHMARK_RESHAPE.R
================================================
library(tidyr)
library(dplyr)
library(datawizard)

### DATA_TO_LONG ==========================================


# SLOW (5M rows)

wide_data <- data.frame(replicate(5, rnorm(10)))
tmp <- list()
for (i in 1:500000) {
  tmp[[i]] <- wide_data
}

tmp <- data.table::rbindlist(tmp) |>
  as_tibble()

ex1_l <- bench::mark(
  old = old_data_to_long(tmp),
  new = data_to_long(tmp),
  tidyr = pivot_longer(tmp, cols = everything()),
  iterations = 10
)


ex2_l <- bench::mark(
  old = relig_income %>%
    old_data_to_long(-"religion", names_to = "income", values_to = "count"),
  new = relig_income %>%
    data_to_long(-"religion", names_to = "income", values_to = "count"),
  tidyr = relig_income %>%
    pivot_longer(!religion, names_to = "income", values_to = "count"),
  iterations = 100
)

ex3_l <- bench::mark(
  old = billboard %>%
    old_data_to_long(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    ),
  new = billboard %>%
    data_to_long(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    ),
  tidyr = billboard %>%
    pivot_longer(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    ),
  iterations = 50
)

ex4_l <- bench::mark(
  old = who |>
    old_data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_sep = "_",
      values_to = "count"
    ),
  new = who |>
    data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_sep = "_",
      values_to = "count"
    ),
  tidyr = who |>
    pivot_longer(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_sep = "_",
      values_to = "count"
    ),
  iterations = 10
)

ex5_l <- bench::mark(
  old = who |>
    old_data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    ),
  new = who |>
    data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    ),
  tidyr = who |>
    pivot_longer(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    ),
  iterations = 10
)


### DATA_TO_WIDE ==========================================

ex1_w <- bench::mark(
  old = fish_encounters %>%
    old_data_to_wide(
      names_from = "station",
      values_from = "seen",
      values_fill = 0
    ),
  new = fish_encounters %>%
     data_to_wide(
      names_from = "station",
      values_from = "seen",
      values_fill = 0
    ),
  tidyr = fish_encounters %>%
    pivot_wider(
      names_from = "station",
      values_from = "seen",
      values_fill = 0
    ),
  iterations = 100
)


production <- expand_grid(
  product = letters,
  country = paste0(letters, "I"),
  year = 2000:2025
) %>%
  mutate(production = rnorm(nrow(.)))


ex2_w <- bench::mark(
  old = production %>%
    old_data_to_wide(
      names_from = c("product", "country"),
      values_from = "production"
    ),
  new = production %>%
     data_to_wide(
      names_from = c("product", "country"),
      values_from = "production"
    ),
  tidyr = production %>%
    pivot_wider(
      names_from = c(product, country),
      values_from = production
    ),
  iterations = 10
)


ex3_w <- bench::mark(
  old = production %>%
    old_data_to_wide(
      names_from = c("product", "country"),
      values_from = "production",
      names_glue = "prod_{product}_{country}"
    ),
  new = production %>%
     data_to_wide(
      names_from = c("product", "country"),
      values_from = "production",
      names_glue = "prod_{product}_{country}"
    ),
  tidyr = production %>%
    pivot_wider(
      names_from = c(product, country),
      values_from = production,
      names_glue = "prod_{product}_{country}"
    ),
  iterations = 10
)


tmp <- list()
for (i in 1:1000) {
  tmp[[i]] <- us_rent_income
}

tmp <- data.table::rbindlist(tmp) |>
  as_tibble()
tmp$GEOID <- rep(1:52000, each = 2)
tmp$NAME <- as.character(rep(1:52000, each = 2))

ex4_w <- bench::mark(
  old = tmp %>%
    old_data_to_wide(
      names_from = "variable",
      values_from = c("estimate", "moe")
    ),
  new = tmp %>%
     data_to_wide(
      names_from = "variable",
      values_from = c("estimate", "moe")
    ),
  tidyr = tmp %>%
    pivot_wider(
      names_from = "variable",
      values_from = c("estimate", "moe")
    ),
  iterations = 10
)

# SLOW (1M rows) ============

set.seed(123)
contacts <- tibble(
  id = rep(1:500000, each = 2),
  field = rep(c("a", "b"), 500000),
  value = sample(letters, 1000000, replace = TRUE)
)

ex5_w <- bench::mark(
  old = contacts %>%
    old_data_to_wide(names_from = "field", values_from = "value"),
  new = contacts %>%
     data_to_wide(names_from = "field", values_from = "value"),
  tidyr = contacts %>%
    tidyr::pivot_wider(names_from = field, values_from = value),
  iterations = 1
)


# SLOWER (10M rows) ============

set.seed(123)
contacts <- tibble(
  id = rep(1:5000000, each = 2),
  field = rep(c("a", "b"), 5000000),
  value = sample(letters, 10000000, replace = TRUE)
)

ex6_w <- bench::mark(
  old = contacts %>%
    old_data_to_wide(names_from = "field", values_from = "value"),
  new = contacts %>%
     data_to_wide(names_from = "field", values_from = "value"),
  tidyr = contacts %>%
    tidyr::pivot_wider(names_from = field, values_from = value),
  iterations = 1
)


reprex:::prex({
  ### DATA_TO_LONG ==========================================

  ex1_l
  ex2_l
  ex3_l
  ex4_l
  ex5_l

  ### DATA_TO_WIDE ==========================================

  ex1_w
  ex2_w
  ex3_w
  ex4_w
  ex5_w
  ex6_w
})


================================================
FILE: .dev/html5.R
================================================
Sys.setenv("_R_CHECK_RD_VALIDATE_RD2HTML_" = "true")
Sys.setenv("_R_CHECK_CRAN_INCOMING_REMOTE_" = "false")
Sys.setenv("_R_CHECK_CRAN_INCOMING_" = "false")

rcmdcheck::rcmdcheck(
  args = c("--as-cran", "--no-codoc", "--no-examples", "--no-tests", "--no-vignettes", "--no-build-vignettes", "--ignore-vignettes", "--no-install"),
  build_args = c("--no-build-vignettes"),
  error_on = "note"
)


================================================
FILE: .dev/revdepcheck.R
================================================
library(revdepcheck)

revdep_check(num_workers = 4)
revdep_report()
revdep_reset()


================================================
FILE: .dev/test-value_at.R
================================================
test_that("value_at", {
  data(efc, package = "datawizard")
  expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRUE)
  expect_equal(value_at(efc$c12hour, 4), NA_real_, ignore_attr = TRUE)
  expect_equal(value_at(efc$c12hour, 4, remove_na = TRUE), 168, ignore_attr = TRUE)
  expect_equal(value_at(efc$c12hour, 5:7), efc$c12hour[5:7], ignore_attr = TRUE)
  expect_equal(value_at(efc$e42dep, 123456, default = 55), 55, ignore_attr = TRUE)
  expect_null(value_at(efc$e42dep, 123456))
  expect_null(value_at(efc$e42dep, NULL))
  expect_error(value_at(efc$e42dep, NA), regex = "`position` can't")
  expect_error(value_at(efc$e42dep, c(3, NA)), regex = "`position` can't")
})


================================================
FILE: .dev/value_at.R
================================================
#' @title Find the value(s) at a specific position in a variable
#' @name value_at
#'
#' @description This function can be used to extract one or more values at a
#' specific position in a variable.
#'
#' @param x A vector or factor.
#' @param position An integer or a vector of integers, indicating the position(s)
#' of the value(s) to be returned. Negative values are counted from the end of
#' the vector. If `NA`, an error is thrown.
#' @param remove_na Logical, if `TRUE`, missing values are removed before
#' computing the position. If `FALSE`, missing values are included in the
#' computation.
#' @param default The value to be returned if the position is out of range.
#'
#' @seealso `data_summary()` to use `value_at()` inside a `data_summary()` call.
#'
#' @return A vector with the value(s) at the specified position(s).
#'
#' @examples
#' data(mtcars)
#' # 5th value
#' value_at(mtcars$mpg, 5)
#' # last value
#' value_at(mtcars$mpg, -1)
#' # out of range, return default
#' value_at(mtcars$mpg, 150)
#' # return 2nd and fifth value
#' value_at(mtcars$mpg, c(2, 5))
#' @export
value_at <- function(x, position = 1, default = NULL, remove_na = FALSE) {
  if (remove_na) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  unlist(lapply(position, .values_at, x = x, n = n, default = default), use.names = FALSE)
}

# helper ----

.values_at <- function(x, position, n, default) {
  if (is.na(position)) {
    insight::format_error("`position` can't be `NA`.")
  }
  if (position < 0L) {
    position <- position + n + 1
  }
  if (position <= 0 || position > n) {
    return(default)
  }
  x[position]
}


================================================
FILE: .git-blame-ignore-revs
================================================
# Air formatting
5bd245e0bc12d2eecbcfa480a231b6df3ab7d684

================================================
FILE: .github/.gitignore
================================================
*.html


================================================
FILE: .github/CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at patilindrajeet.science@gmail.com. 
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
<https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>.

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][https://github.com/mozilla/inclusion].

For answers to common questions about this code of conduct, see the FAQ at
<https://www.contributor-covenant.org/faq>. Translations are available at <https://www.contributor-covenant.org/translations>.

[homepage]: https://www.contributor-covenant.org


================================================
FILE: .github/CONTRIBUTING.md
================================================
# Contributing to datawizard

This outlines how to propose a change to **datawizard**. 

## Fixing typos

Small typos or grammatical errors in documentation may be edited directly using the GitHub web interface, so long as the changes are made in the _source_ file. If you want to fix typos in the documentation, please edit the related `.R` file in the `R/` folder. Do _not_ edit an `.Rd` file in `man/`.

## Filing an issue

The easiest way to propose a change or new feature is to file an issue. If you've found a
bug, you may also create an associated issue. If possible, try to illustrate your proposal or the bug with a minimal [reproducible example](https://www.tidyverse.org/help/#reprex).

## Pull requests

*  Please create a Git branch for each pull request (PR).
*  Your contributed code should roughly follow the [R style guide](http://style.tidyverse.org), but in particular our [**easystats convention of code-style**](https://github.com/easystats/easystats#convention-of-code-style).
*  datawizard uses [roxygen2](https://cran.r-project.org/package=roxygen2), with
[Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html),
for documentation.
*  datawizard uses [testthat](https://cran.r-project.org/package=testthat). Adding tests to the PR makes it easier for me to merge your PR into the code base.
*  If your PR is a user-visible change, you may add a bullet to the top of `NEWS.md` describing the changes made. You may optionally add your GitHub username, and links to relevant issue(s)/PR(s).

## Code of Conduct

Please note that this project is released with a [Contributor Code of Conduct](https://easystats.github.io/datawizard/CODE_OF_CONDUCT.html). By participating in this project you agree to
abide by its terms.


================================================
FILE: .github/FUNDING.yml
================================================
# These are supported funding model platforms

github: easystats


================================================
FILE: .github/SUPPORT.md
================================================
# Getting help with `{datawizard}`

Thanks for using `{datawizard}`. Before filing an issue, there are a few places
to explore and pieces to put together to make the process as smooth as possible.

Start by making a minimal **repr**oducible **ex**ample using the 
[reprex](http://reprex.tidyverse.org/) package. If you haven't heard of or used 
reprex before, you're in for a treat! Seriously, reprex will make all of your 
R-question-asking endeavors easier (which is a pretty insane ROI for the five to 
ten minutes it'll take you to learn what it's all about). For additional reprex
pointers, check out the [Get help!](https://www.tidyverse.org/help/) resource
used by the tidyverse team.

Armed with your reprex, the next step is to figure out where to ask: 

  * If it's a question: start with StackOverflow. There are more people there to answer questions.
  * If it's a bug: you're in the right place, file an issue.
  * If you're not sure: let the community help you figure it out! If your 
    problem _is_ a bug or a feature request, you can easily return here and 
    report it. 

Before opening a new issue, be sure to [search issues and pull requests](https://github.com/easystats/datawizard/issues) to make sure the 
bug hasn't been reported and/or already fixed in the development version. By 
default, the search will be pre-populated with `is:issue is:open`. You can 
[edit the qualifiers](https://help.github.com/articles/searching-issues-and-pull-requests/) 
(e.g. `is:pr`, `is:closed`) as needed. For example, you'd simply
remove `is:open` to search _all_ issues in the repo, open or closed.

Thanks for your help!


================================================
FILE: .github/dependabot.yaml
================================================
version: 2

updates:
  # Keep dependencies for GitHub Actions up-to-date
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"


================================================
FILE: .github/workflows/R-CMD-check-hard.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow only directly installs "hard" dependencies, i.e. Depends,
# Imports, and LinkingTo dependencies. Notably, Suggests dependencies are never
# installed, with the exception of testthat, knitr, and rmarkdown. The cache is
# never used to avoid accidentally restoring a cache containing a suggested
# dependency.
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: R-CMD-check-hard

jobs:
  R-CMD-check-hard:
    uses: easystats/workflows/.github/workflows/R-CMD-check-hard.yaml@main


================================================
FILE: .github/workflows/R-CMD-check.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow is overkill for most R packages and
# check-standard.yaml is likely a better choice.
# usethis::use_github_action("check-standard") will install it.
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: R-CMD-check

jobs:
  R-CMD-check:
    uses: easystats/workflows/.github/workflows/R-CMD-check.yaml@main


================================================
FILE: .github/workflows/check-all-examples.yaml
================================================
# Make sure all examples run successfully, even the ones that are not supposed
# to be run or tested on CRAN machines by default.
#
# The examples that fail should use
#  - `if (FALSE) { ... }` (if example is included only for illustrative purposes)
#  - `try({ ... })` (if the intent is to show the error)
#
# This workflow helps find such failing examples that need to be modified.
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-all-examples

jobs:
  check-all-examples:
    uses: easystats/workflows/.github/workflows/check-all-examples.yaml@main


================================================
FILE: .github/workflows/check-link-rot.yaml
================================================
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-link-rot

jobs:
  check-link-rot:
    uses: easystats/workflows/.github/workflows/check-link-rot.yaml@main


================================================
FILE: .github/workflows/check-random-test-order.yaml
================================================
# Run tests in random order
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-random-test-order

jobs:
  check-random-test-order:
    uses: easystats/workflows/.github/workflows/check-random-test-order.yaml@main


================================================
FILE: .github/workflows/check-readme.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help

on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-readme

jobs:
  check-readme:
    uses: easystats/workflows/.github/workflows/check-readme.yaml@main


================================================
FILE: .github/workflows/check-spelling.yaml
================================================
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-spelling

jobs:
  check-spelling:
    uses: easystats/workflows/.github/workflows/check-spelling.yaml@main


================================================
FILE: .github/workflows/check-styling.yaml
================================================
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-styling

jobs:
  check-styling:
    uses: easystats/workflows/.github/workflows/check-styling.yaml@main


================================================
FILE: .github/workflows/check-test-warnings.yaml
================================================
# Running tests with options(warn = 2) to fail on test warnings
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-test-warnings

jobs:
  check-test-warnings:
    uses: easystats/workflows/.github/workflows/check-test-warnings.yaml@main


================================================
FILE: .github/workflows/check-vignette-warnings.yaml
================================================
# Running tests with options(warn = 2) to fail on test warnings
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: check-vignette-warnings

jobs:
  check-vignette-warnings:
    uses: easystats/workflows/.github/workflows/check-vignette-warnings.yaml@main


================================================
FILE: .github/workflows/html-5-check.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: html-5-check

jobs:
  html-5-check:
    uses: easystats/workflows/.github/workflows/html-5-check.yaml@main


================================================
FILE: .github/workflows/lint-changed-files.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  pull_request:
    branches: [main, master]

name: lint-changed-files

jobs:
  lint-changed-files:
    uses: easystats/workflows/.github/workflows/lint-changed-files.yaml@main


================================================
FILE: .github/workflows/lint.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: lint

jobs:
  lint:
    uses: easystats/workflows/.github/workflows/lint.yaml@main


================================================
FILE: .github/workflows/pkgdown-no-suggests.yaml
================================================
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: pkgdown-no-suggests

jobs:
  pkgdown-no-suggests:
    uses: easystats/workflows/.github/workflows/pkgdown-no-suggests.yaml@main


================================================
FILE: .github/workflows/pkgdown.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]
  release:
    types: [published]
  workflow_dispatch:

name: pkgdown

jobs:
  pkgdown:
    uses: easystats/workflows/.github/workflows/pkgdown.yaml@main


================================================
FILE: .github/workflows/test-coverage-examples.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: test-coverage-examples

jobs:
  test-coverage-examples:
    uses: easystats/workflows/.github/workflows/test-coverage-examples.yaml@main


================================================
FILE: .github/workflows/test-coverage.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: test-coverage

jobs:
  test-coverage:
    uses: easystats/workflows/.github/workflows/test-coverage.yaml@main


================================================
FILE: .github/workflows/update-to-latest-easystats.yaml
================================================
on:
  schedule:
    # Check for dependency updates once a month
    - cron: "0 0 1 * *"

name: update-to-latest-easystats

jobs:
  update-to-latest-easystats:
    uses: easystats/workflows/.github/workflows/update-to-latest-easystats.yaml@main


================================================
FILE: .gitignore
================================================
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/

=========================
# Operating System Files
# OSX
.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
docs
inst/doc
CRAN-SUBMISSION


================================================
FILE: .lintr
================================================
linters: all_linters(
    absolute_path_linter = NULL,
    assignment_linter = NULL,
    commented_code_linter = NULL,
    cyclocomp_linter(25L),
    if_not_else_linter(exceptions = character(0L)),
    implicit_integer_linter = NULL,
    library_call_linter = NULL,
    line_length_linter(120L),
    namespace_linter = NULL,
    nonportable_path_linter = NULL,
    object_name_linter = NULL,
    object_length_linter(50L),
    object_usage_linter = NULL,
    todo_comment_linter = NULL,
    string_boundary_linter = NULL,
    strings_as_factors_linter = NULL,
    undesirable_function_linter = NULL,
    undesirable_operator_linter = NULL,
    unnecessary_concatenation_linter(allow_single_expression = FALSE),
    unused_import_linter = NULL
  )


================================================
FILE: DESCRIPTION
================================================
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 1.3.1
Authors@R: c(
    person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
           comment = c(ORCID = "0000-0003-1995-6531")),
    person("Etienne", "Bacher", , "etienne.bacher@protonmail.com", role = c("aut", "cre"),
           comment = c(ORCID = "0000-0002-9271-5075")),
    person("Dominique", "Makowski", , "dom.makowski@gmail.com", role = "aut",
           comment = c(ORCID = "0000-0001-5375-9967")),
    person("Daniel", "Lüdecke", , "d.luedecke@uke.de", role = "aut",
           comment = c(ORCID = "0000-0002-8895-3206")),
    person("Mattan S.", "Ben-Shachar", , "matanshm@post.bgu.ac.il", role = "aut",
           comment = c(ORCID = "0000-0002-4287-4801")),
    person("Brenton M.", "Wiernik", , "brenton@wiernik.org", role = "aut",
           comment = c(ORCID = "0000-0001-9560-6336")),
    person("Rémi", "Thériault", , "remi.theriault@mail.mcgill.ca", role = "ctb",
           comment = c(ORCID = "0000-0003-4315-6788")),
    person("Thomas J.", "Faulkenberry", , "faulkenberry@tarleton.edu", role = "rev"),
    person("Robert", "Garrett", , "rcg4@illinois.edu", role = "rev")
  )
Maintainer: Etienne Bacher <etienne.bacher@protonmail.com>
Description: A lightweight package to assist in key steps involved in any data
    analysis workflow: (1) wrangling the raw data to get it in the needed form,
    (2) applying preprocessing steps and statistical transformations, and
    (3) compute statistical summaries of data properties and distributions.
    It is also the data wrangling backend for packages in 'easystats' ecosystem.
    References: Patil et al. (2022) <doi:10.21105/joss.04684>.
License: MIT + file LICENSE
URL: https://easystats.github.io/datawizard/
BugReports: https://github.com/easystats/datawizard/issues
Depends:
    R (>= 4.0)
Imports:
    insight (>= 1.4.6),
    stats,
    utils
Suggests:
    bayestestR,
    boot,
    BH,
    brms,
    curl,
    data.table,
    dplyr (>= 1.1),
    effectsize,
    emmeans,
    fixest,
    gamm4,
    ggplot2 (>= 3.5.0),
    gt,
    haven,
    httr,
    knitr,
    lme4,
    mediation,
    modelbased,
    nanoparquet,
    openssl,
    parameters (>= 0.21.7),
    performance (>= 0.14.0),
    poorman (>= 0.2.7),
    psych,
    readxl,
    readr,
    rio,
    rmarkdown,
    rstanarm,
    see,
    testthat (>= 3.2.1),
    tibble,
    tidyr,
    tinytable (>= 0.13.0),
    withr
VignetteBuilder:
    knitr
Encoding: UTF-8
Language: en-US
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.3
Config/testthat/edition: 3
Config/testthat/parallel: true
Config/Needs/website: easystats/easystatstemplate


================================================
FILE: LICENSE
================================================
YEAR: 2023
COPYRIGHT HOLDER: datawizard authors


================================================
FILE: LICENSE.md
================================================
# MIT License

Copyright (c) 2023 datawizard authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: NAMESPACE
================================================
# Generated by roxygen2: do not edit by hand

S3method(as.data.frame,datawizard_crosstabs)
S3method(as.data.frame,datawizard_tables)
S3method(as.double,parameters_kurtosis)
S3method(as.double,parameters_skewness)
S3method(as.double,parameters_smoothness)
S3method(as.numeric,parameters_kurtosis)
S3method(as.numeric,parameters_skewness)
S3method(as.numeric,parameters_smoothness)
S3method(as.prop.table,datawizard_crosstab)
S3method(as.prop.table,datawizard_crosstabs)
S3method(as.table,datawizard_crosstab)
S3method(as.table,datawizard_crosstabs)
S3method(as.table,datawizard_table)
S3method(as.table,datawizard_tables)
S3method(assign_labels,character)
S3method(assign_labels,data.frame)
S3method(assign_labels,default)
S3method(assign_labels,factor)
S3method(assign_labels,numeric)
S3method(categorize,data.frame)
S3method(categorize,default)
S3method(categorize,factor)
S3method(categorize,grouped_df)
S3method(categorize,numeric)
S3method(center,AsIs)
S3method(center,Date)
S3method(center,character)
S3method(center,data.frame)
S3method(center,default)
S3method(center,factor)
S3method(center,grouped_df)
S3method(center,logical)
S3method(center,numeric)
S3method(coef_var,default)
S3method(coef_var,numeric)
S3method(convert_na_to,character)
S3method(convert_na_to,data.frame)
S3method(convert_na_to,default)
S3method(convert_na_to,factor)
S3method(convert_na_to,numeric)
S3method(convert_to_na,Date)
S3method(convert_to_na,character)
S3method(convert_to_na,data.frame)
S3method(convert_to_na,default)
S3method(convert_to_na,factor)
S3method(convert_to_na,logical)
S3method(convert_to_na,numeric)
S3method(data_arrange,default)
S3method(data_arrange,grouped_df)
S3method(data_duplicated,data.frame)
S3method(data_duplicated,grouped_df)
S3method(data_extract,data.frame)
S3method(data_filter,data.frame)
S3method(data_filter,grouped_df)
S3method(data_merge,data.frame)
S3method(data_merge,list)
S3method(data_modify,data.frame)
S3method(data_modify,default)
S3method(data_modify,grouped_df)
S3method(data_peek,data.frame)
S3method(data_summary,data.frame)
S3method(data_summary,default)
S3method(data_summary,grouped_df)
S3method(data_summary,matrix)
S3method(data_tabulate,data.frame)
S3method(data_tabulate,default)
S3method(data_tabulate,grouped_df)
S3method(data_unique,data.frame)
S3method(data_unique,grouped_df)
S3method(describe_distribution,character)
S3method(describe_distribution,data.frame)
S3method(describe_distribution,default)
S3method(describe_distribution,factor)
S3method(describe_distribution,grouped_df)
S3method(describe_distribution,list)
S3method(describe_distribution,numeric)
S3method(display,data_codebook)
S3method(display,datawizard_crosstab)
S3method(display,datawizard_crosstabs)
S3method(display,datawizard_table)
S3method(display,datawizard_tables)
S3method(display,parameters_distribution)
S3method(format,data_codebook)
S3method(format,datawizard_crosstab)
S3method(format,datawizard_table)
S3method(format,dw_data_peek)
S3method(format,dw_groupmeans)
S3method(format,parameters_distribution)
S3method(kurtosis,data.frame)
S3method(kurtosis,default)
S3method(kurtosis,matrix)
S3method(kurtosis,numeric)
S3method(labels_to_levels,data.frame)
S3method(labels_to_levels,default)
S3method(labels_to_levels,factor)
S3method(makepredictcall,dw_transformer)
S3method(means_by_group,data.frame)
S3method(means_by_group,default)
S3method(means_by_group,numeric)
S3method(normalize,data.frame)
S3method(normalize,factor)
S3method(normalize,grouped_df)
S3method(normalize,matrix)
S3method(normalize,numeric)
S3method(plot,parameters_distribution)
S3method(plot,visualisation_recipe)
S3method(print,data_codebook)
S3method(print,data_seek)
S3method(print,datawizard_crosstab)
S3method(print,datawizard_crosstabs)
S3method(print,datawizard_table)
S3method(print,datawizard_tables)
S3method(print,dw_data_peek)
S3method(print,dw_data_summary)
S3method(print,dw_groupmeans)
S3method(print,dw_groupmeans_list)
S3method(print,dw_transformer)
S3method(print,parameters_distribution)
S3method(print,parameters_kurtosis)
S3method(print,parameters_skewness)
S3method(print,visualisation_recipe)
S3method(print_html,data_codebook)
S3method(print_html,datawizard_crosstab)
S3method(print_html,datawizard_crosstabs)
S3method(print_html,datawizard_table)
S3method(print_html,datawizard_tables)
S3method(print_html,dw_data_peek)
S3method(print_html,parameters_distribution)
S3method(print_md,data_codebook)
S3method(print_md,datawizard_crosstab)
S3method(print_md,datawizard_crosstabs)
S3method(print_md,datawizard_table)
S3method(print_md,datawizard_tables)
S3method(print_md,dw_data_peek)
S3method(print_md,parameters_distribution)
S3method(ranktransform,data.frame)
S3method(ranktransform,factor)
S3method(ranktransform,grouped_df)
S3method(ranktransform,numeric)
S3method(recode_values,character)
S3method(recode_values,data.frame)
S3method(recode_values,default)
S3method(recode_values,factor)
S3method(recode_values,numeric)
S3method(replace_nan_inf,data.frame)
S3method(replace_nan_inf,default)
S3method(rescale,data.frame)
S3method(rescale,default)
S3method(rescale,grouped_df)
S3method(rescale,numeric)
S3method(reverse,data.frame)
S3method(reverse,default)
S3method(reverse,factor)
S3method(reverse,grouped_df)
S3method(reverse,numeric)
S3method(rowid_as_column,default)
S3method(rowid_as_column,grouped_df)
S3method(skewness,data.frame)
S3method(skewness,default)
S3method(skewness,matrix)
S3method(skewness,numeric)
S3method(slide,data.frame)
S3method(slide,default)
S3method(slide,numeric)
S3method(smoothness,data.frame)
S3method(smoothness,default)
S3method(smoothness,numeric)
S3method(standardize,AsIs)
S3method(standardize,Date)
S3method(standardize,Surv)
S3method(standardize,bcplm)
S3method(standardize,biglm)
S3method(standardize,brmsfit)
S3method(standardize,character)
S3method(standardize,clm2)
S3method(standardize,data.frame)
S3method(standardize,datagrid)
S3method(standardize,default)
S3method(standardize,double)
S3method(standardize,factor)
S3method(standardize,fixest)
S3method(standardize,grouped_df)
S3method(standardize,integer)
S3method(standardize,logical)
S3method(standardize,matrix)
S3method(standardize,mediate)
S3method(standardize,mixor)
S3method(standardize,numeric)
S3method(standardize,visualisation_matrix)
S3method(standardize,wbgee)
S3method(standardize,wbm)
S3method(summary,parameters_kurtosis)
S3method(summary,parameters_skewness)
S3method(to_factor,Date)
S3method(to_factor,character)
S3method(to_factor,data.frame)
S3method(to_factor,default)
S3method(to_factor,double)
S3method(to_factor,factor)
S3method(to_factor,haven_labelled)
S3method(to_factor,logical)
S3method(to_factor,numeric)
S3method(to_numeric,Date)
S3method(to_numeric,POSIXct)
S3method(to_numeric,POSIXlt)
S3method(to_numeric,POSIXt)
S3method(to_numeric,character)
S3method(to_numeric,data.frame)
S3method(to_numeric,default)
S3method(to_numeric,double)
S3method(to_numeric,factor)
S3method(to_numeric,haven_labelled)
S3method(to_numeric,logical)
S3method(to_numeric,numeric)
S3method(unnormalize,data.frame)
S3method(unnormalize,default)
S3method(unnormalize,grouped_df)
S3method(unnormalize,numeric)
S3method(unstandardize,array)
S3method(unstandardize,character)
S3method(unstandardize,data.frame)
S3method(unstandardize,datagrid)
S3method(unstandardize,factor)
S3method(unstandardize,grouped_df)
S3method(unstandardize,matrix)
S3method(unstandardize,numeric)
S3method(unstandardize,visualisation_matrix)
S3method(winsorize,character)
S3method(winsorize,data.frame)
S3method(winsorize,factor)
S3method(winsorize,logical)
S3method(winsorize,numeric)
export(adjust)
export(as.prop.table)
export(assign_labels)
export(categorize)
export(center)
export(centre)
export(change_scale)
export(coef_var)
export(coerce_to_numeric)
export(colnames_to_row)
export(column_as_rownames)
export(contr.deviation)
export(convert_na_to)
export(convert_to_na)
export(data_addprefix)
export(data_addsuffix)
export(data_adjust)
export(data_arrange)
export(data_codebook)
export(data_duplicated)
export(data_extract)
export(data_filter)
export(data_group)
export(data_join)
export(data_match)
export(data_merge)
export(data_modify)
export(data_partition)
export(data_peek)
export(data_read)
export(data_relocate)
export(data_remove)
export(data_rename)
export(data_rename_rows)
export(data_reorder)
export(data_replicate)
export(data_restoretype)
export(data_rotate)
export(data_seek)
export(data_select)
export(data_separate)
export(data_summary)
export(data_tabulate)
export(data_to_long)
export(data_to_wide)
export(data_transpose)
export(data_ungroup)
export(data_unique)
export(data_unite)
export(data_write)
export(degroup)
export(demean)
export(describe_distribution)
export(detrend)
export(display)
export(distribution_coef_var)
export(distribution_mode)
export(empty_columns)
export(empty_rows)
export(extract_column_names)
export(find_columns)
export(kurtosis)
export(labels_to_levels)
export(mean_sd)
export(means_by_group)
export(median_mad)
export(normalize)
export(print_html)
export(print_md)
export(ranktransform)
export(recode_into)
export(recode_values)
export(remove_empty)
export(remove_empty_columns)
export(remove_empty_rows)
export(replace_nan_inf)
export(rescale)
export(rescale_weights)
export(reshape_ci)
export(reshape_longer)
export(reshape_wider)
export(reverse)
export(reverse_scale)
export(row_count)
export(row_means)
export(row_sums)
export(row_to_colnames)
export(rowid_as_column)
export(rownames_as_column)
export(skewness)
export(slide)
export(smoothness)
export(standardise)
export(standardize)
export(text_concatenate)
export(text_format)
export(text_fullstop)
export(text_lastchar)
export(text_paste)
export(text_remove)
export(text_wrap)
export(to_factor)
export(to_numeric)
export(unnormalize)
export(unstandardise)
export(unstandardize)
export(visualisation_recipe)
export(weighted_mad)
export(weighted_mean)
export(weighted_median)
export(weighted_sd)
export(winsorize)
importFrom(insight,display)
importFrom(insight,print_html)
importFrom(insight,print_md)
importFrom(stats,makepredictcall)


================================================
FILE: NEWS.md
================================================
# datawizard 1.3.1

CHANGES

* `data_summary()` now allows expressions to return more than one summary
  value. For each value, a new column is created. Additionally, the optional
  `suffix` argument controls the naming of these columns; if `suffix = NULL`,
  column names are auto-generated (e.g., with numeric suffixes).

* `standardize()` now works on `fixest` estimations (#665).

* `data_read()` and `data_write()` gain a `password` argument, to encrypt and
  decrypt data files. This currently only works for R file formats (`.rda`,
  `.rds`, and `.rdata`). Data encryption is based on the AES-GCM algorithm using
  the `openssl::aes_gcm_encrypt()` function (#675).

FIXES

* Fix a test due to R-devel change (#677).

# datawizard 1.3.0

BREAKING CHANGES

* Argument `values_fill` in `data_to_wide()` is now defunct, because it did not
  work as intended (#645).

* `data_to_wide()` no longer removes empty columns that were created after
  widening data frames, to behave similarly to `tidyr::pivot_wider()` (#645).

CHANGES

* `data_tabulate()` now saves the table of proportions for crosstables as
  attribute, accessible via the new `as.prop.table()` method (#656).

* Due to changes in the package `insight`, `data_tabulate()` no longer prints
  decimals when all values in a column are integers (#641).

* Argument `values_from` in `data_to_wide()` now supports select-helpers like
  the `select` argument in other `{datawizard}` functions (#645).

* Added a `display()` method for `data_codebook()` (#646).

* `display()` methods now support the `{tinytable}` package. Use `format = "tt"`
  to export tables as `tinytable` objects (#646).

* Improved performance for several functions that process grouped data frames
  when the input is a grouped `tibble` (#651).

BUG FIXES

* Fixed an issue when `demean()`ing nested structures with more than 2 grouping
  variables (#635).

* Fixed an issue when `demean()`ing crossed structures with more than 2 grouping
  variables (#638).

* Fixed issue in `data_to_wide()` with multiple variables assigned in
  `values_from` when IDs were not balanced (equally spread across observations)
  (#644).

* Fixed issue in `data_replicate()` when data frame had only one column to
  replicate (#654).

# datawizard 1.2.0

BREAKING CHANGES

* The following deprecated arguments have been removed (#603):
  - `drop_na` in `data_match()`
  - `safe`, `pattern`, and `verbose` in `data_rename()`

CHANGES

* `data_read()` and `data_write()` now support the `.parquet` file format, via
  the *nanoparquet* package (#625).

* `data_tabulate()` gets a `display()` method (#627).

* `data_tabulate()` gets an `as.table()` method to coerce the frequency or
  contingency table into a (list of) `table()` object(s). This can be useful for
  further statistical analysis, e.g. in combination with `chisq.test()` (#629).

* The `print()` method for `data_tabulate()` now appears in the documentation,
  making the `big_mark` argument visible (#627).

BUG FIXES

* Fixed an issue when printing cross tables using `data_tabulate(by = ...)`,
  which was caused by the recent changes in `insight::export_table()`.

* Fixed another issue when printing cross tables using `data_tabulate(by = ...)`,
  when more than one variable was selected for `select` (#630).

* Fixed typo in the documentation of `data_match()`.

# datawizard 1.1.0

BREAKING CHANGES

* `data_read()` now also returns Bayesian models from packages *brms* and
  *rstanarm* as original model objects, and no longer coerces them into data
  frames (#606).

* The output format of `describe_distribution()` on grouped data has changed.
  Before, it printed one table per group combination. Now, it prints a single
  table with group columns at the start (#610).

* The output format of `describe_distribution()` when confidence intervals are
  requested has changed. Now, for each centrality measure a confidence interval
  is calculated (#617).

* `data_modify()` now always uses values of a vector for a modified or newly
  created variable, and no longer tries to detect whether a character value
  possibly contains an expression. To allow expression provided as string (or
  character vectors), use the helper-function `as_expr()`. Only literal
  expressions or strings wrapped in `as_expr()` will be evaluated as
  expressions, everything else will be treated as vector with values for new
  variables (#605).

CHANGES

* `display()` is now re-exported from package *insight*.

* `data_read()` and `data_write()` now rely on base-R functions for files of
  type `.rds`, `.rda` or `.rdata`. Thus, package *rio*  is no longer required
  to be installed for these file types (#607).

* `data_codebook()` gives an informative warning when no column names matched
  the selection pattern (#601).

* `data_to_long()` now errors when columns selected to reshape do not exist in
  the data, to avoid nonsensical results that could be missed (#602).

* New argument `by` in `describe_distribution()` (#604).

* `describe_distribution()` now gives informative errors when column names
  in the input data frame conflict with column from the output table (#612).

* The methods for `parameters_distribution` objects are now defined in
  `datawizard` (they were previously in `parameters`) (#613).

BUG FIXES

* Fixed bug in `data_to_wide()`, where new column names in `names_from` were
  ignored when that column only contained one unique value.

* Fixed bug in `describe_distribution()` when some group combinations
  didn't appear in the data (#609).

* Fixed bug in `describe_distribution()` when more than one value for the
  `centrality` argument were specified (#617).

* Fixed bug in `describe_distribution()` where setting `verbose = FALSE`
  didn't hide some warnings (#617).

* Fixed warning in `data_summary()` when a variable had the same name as
  another object in the global environment (#585).

# datawizard 1.0.2

BUG FIXES

* Fixed failing R CMD check on ATLAS, noLD, and OpenBLAS due to small numerical
  differences (#592).

# datawizard 1.0.1

BUG FIXES

* Fixed issue in `data_arrange()` for data frames that only had one column.
  Formerly, the data frame was coerced into a vector, now the data frame class
  is preserved.

* Fixed issue in R-devel (4.5.0) due to a change in how `grep()` handles logical
  arguments with missing values (#588).

# datawizard 1.0.0

BREAKING CHANGES AND DEPRECATIONS

* *datawizard* now requires R >= 4.0 (#515).

* Argument `drop_na` in `data_match()` is deprecated now. Please use
  `remove_na` instead (#556).

* In `data_rename()` (#567):
  - argument `pattern` is deprecated. Use `select` instead.
  - argument `safe` is deprecated. The function now errors when `select`
    contains unknown column names.
  - when `replacement` is `NULL`, an error is now thrown (previously, column
    indices were used as new names).
  - if `select` (previously `pattern`) is a named vector, then all elements
    must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors.

* Order of arguments `by` and `probability_weights` in `rescale_weights()` has
  changed, because for `method = "kish"`, the `by` argument is optional (#575).

* The name of the rescaled weights variables in `rescale_weights()` have been
  renamed. `pweights_a` and `pweights_b` are now named `rescaled_weights_a`
  and `rescaled_weights_b` (#575).

* `print()` methods for `data_tabulate()` with multiple sub-tables (i.e. when
  length of `by` was > 1) were revised. Now, an integrated table instead of
  multiple tables is returned. Furthermore, `print_html()` did not work, which
  was also fixed now (#577).

* `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`,
  to append the centered variables to the original data frame, instead of
  returning the de- and group-meaned variables only. Use `append = FALSE` to
  for the previous default behaviour (i.e. only returning the newly created
  variables) (#579).

CHANGES

* `rescale_weights()` gets a `method` argument, to choose method to rescale
  weights. Options are `"carle"` (the default) and `"kish"` (#575).

* The `select` argument, which is available in different functions to select
  variables, can now also be a character vector with quoted variable names,
  including a colon to indicate a range of several variables (e.g. `"cyl:gear"`)
  (#551).

* New function `row_sums()`, to calculate row sums (optionally with minimum
  amount of valid values), as complement to `row_means()` (#552).

* New function `row_count()`, to count specific values row-wise (#553).

* `data_read()` no longer shows warning about forthcoming breaking changes
  in upstream packages when reading `.RData` files (#557).

* `data_modify()` now recognizes `n()`, for example to create an index for data
  groups with `1:n()` (#535).

* The `replacement` argument in `data_rename()` now supports glue-styled
  tokens  (#563).

* `data_summary()` also accepts the results of `bayestestR::ci()` as summary
  function (#483).

* `ranktransform()` has a new argument `zeros` to determine how zeros should be
  handled when `sign = TRUE` (#573).

BUG FIXES

* `describe_distribution()` no longer errors if the sample was too sparse to compute
  CIs. Instead, it warns the user and returns `NA` (#550).

* `data_read()` preserves variable types when importing files from `rds` or
  `rdata` format (#558).

# datawizard 0.13.0

BREAKING CHANGES

* `data_rename()` now errors when the `replacement` argument contains `NA` values
  or empty strings (#539).

* Removed deprecated functions `get_columns()`, `data_find()`, `format_text()` (#546).

* Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546).

* The default value for the argument `dummy_factors` in `to_numeric()` has
  changed from `TRUE` to `FALSE` (#544).

CHANGES

* The `pattern` argument in `data_rename()` can also be a named vector. In this
  case, names are used as values for the `replacement` argument (i.e. `pattern`
  can be a character vector using `<new name> = "<old name>"`).

* `categorize()` gains a new `breaks` argument, to decide whether breaks are
  inclusive or exclusive (#548).

* The `labels` argument in `categorize()` gets two new options, `"range"` and
  `"observed"`, to use the range of categorized values as labels (i.e. factor
  levels) (#548).

* Minor additions to `reshape_ci()` to work with forthcoming changes in the
  `{bayestestR}` package.

# datawizard 0.12.3

CHANGES

* `demean()` (and `degroup()`) now also work for nested designs, if argument
  `nested = TRUE` and  `by` specifies more than one variable (#533).

* Vignettes are no longer provided in the package, they are now only available
  on the website. There is only one "Overview" vignette available in the package,
  it contains links to the other vignettes on the website. This is because there
  are CRAN errors occurring when building vignettes on macOS and we couldn't
  determine the cause after multiple patch releases (#534).

# datawizard 0.12.2

* Remove `htmltools` from `Suggests` in an attempt of fixing an error in CRAN
  checks due to failures to build a vignette (#528).

# datawizard 0.12.1

This is a patch release to fix one error on CRAN checks occurring because of a
missing package namespace in one of the vignettes.

# datawizard 0.12.0

BREAKING CHANGES

* The argument `include_na` in `data_tabulate()` and `data_summary()` has been
  renamed into `remove_na`. Consequently, to mimic former behaviour, `FALSE` and
  `TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former
  `include_na = FALSE`).

* Class names for objects returned by `data_tabulate()` have been changed to
  `datawizard_table` and `datawizard_crosstable` (resp. the plural forms,
  `*_tables`), to provide a clearer and more consistent naming scheme.

CHANGES

* `data_select()` can directly rename selected variables when a named vector
  is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`.

* `data_tabulate()` gains an `as.data.frame()` method, to return the frequency
  table as a data frame. The structure of the returned object is a nested data
  frame, where the first column contains name of the variable for which
  frequencies were calculated, and the second column contains the frequency table.

* `demean()` (and `degroup()`) now also work for cross-classified designs, or
  more generally, for data with multiple grouping or cluster variables (i.e.
  `by` can now specify more than one variable).

# datawizard 0.11.0

BREAKING CHANGES

* Arguments named `group` or `group_by` are deprecated and will be removed
  in a future release. Please use `by` instead. This affects the following
  functions in *datawizard* (#502).

  * `data_partition()`
  * `demean()` and `degroup()`
  * `means_by_group()`
  * `rescale_weights()`

* Following aliases are deprecated and will be removed in a future release (#504):

  * `get_columns()`, use `data_select()` instead.
  * `data_find()` and `find_columns()`, use `extract_column_names()` instead.
  * `format_text()`, use `text_format()` instead.

CHANGES

* `recode_into()` is more relaxed regarding checking the type of `NA` values.
  If you recode into a numeric variable, and one of the recode values is `NA`,
  you no longer need to use `NA_real_` for numeric `NA` values.

* Improved documentation for some functions.

BUG FIXES

* `data_to_long()` did not work for data frame where columns had attributes
  (like labelled data).

# datawizard 0.10.0

BREAKING CHANGES

* The following arguments were deprecated in 0.5.0 and are now removed:

  * in `data_to_wide()`: `colnames_from`, `rows_from`, `sep`
  * in `data_to_long()`: `colnames_to`
  * in `data_partition()`: `training_proportion`

NEW FUNCTIONS

* `data_summary()`, to compute summary statistics of (grouped) data frames.

* `data_replicate()`, to expand a data frame by replicating rows based on another
  variable that contains the counts of replications per row.

CHANGES

* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
  variables at specific positions or based on logical conditions.

* `data_tabulate()` was revised and gets several new arguments: a `weights`
  argument, to compute weighted frequency tables. `include_na` allows to include
  or omit missing values from the table. Furthermore, a `by` argument was added,
  to compute crosstables (#479, #481).

# datawizard 0.9.1

CHANGES

* `rescale()` gains `multiply` and `add` arguments, to expand ranges by a given
  factor or value.

* `to_factor()` and `to_numeric()` now support class `haven_labelled`.

BUG FIXES

* `to_numeric()` now correctly deals with inversed factor levels when
  `preserve_levels = TRUE`.

* `to_numeric()` inversed order of value labels when `dummy_factors = FALSE`.

* `convert_to_na()` now preserves attributes for factors when `drop_levels = TRUE`.

# datawizard 0.9.0

NEW FUNCTIONS

* `row_means()`, to compute row means, optionally only for the rows with at
  least `min_valid` non-missing values.

* `contr.deviation()` for sum-deviation contrast coding of factors.

* `means_by_group()`, to compute mean values of variables, grouped by levels
  of specified factors.

* `data_seek()`, to seek for variables in a data frame, based on their
  column names, variables labels, value labels or factor levels. Searching for
  labels only works for "labelled" data, i.e. when variables have a `label` or
  `labels` attribute.

CHANGES

* `recode_into()` gains an `overwrite` argument to skip overwriting already
  recoded cases when multiple recode patterns apply to the same case.

* `recode_into()` gains an `preserve_na` argument to preserve `NA` values
  when recoding.

* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
  This allows to read files with non-ASCII characters.

* `datawizard` moves from the GPL-3 license to the MIT license.

* `unnormalize()` and `unstandardize()` now work with grouped data (#415).

* `unnormalize()` now errors instead of emitting a warning if it doesn't have the
  necessary info (#415).

BUG FIXES

* Fixed issue in `labels_to_levels()` when values of labels were not in sorted
  order and values were not sequentially numbered.

* Fixed issues in `data_write()` when writing labelled data into SPSS format
  and vectors were of different type as value labels.

* Fixed issues in `data_write()` when writing labelled data into SPSS format
  for character vectors with missing value labels, but existing variable
  labels.

* Fixed issue in `recode_into()` with probably wrong case number printed in the
  warning when several recode patterns match to one case.

* Fixed issue in `recode_into()` when original data contained `NA` values and
  `NA` was not included in the recode pattern.

* Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
  naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
  faulty syntax.

* Fixed issue in `empty_column()` for strings with invalid multibyte strings.
  For such data frames or files, `empty_column()` or `data_read()` no longer
  fails.

# datawizard 0.8.0

BREAKING CHANGES

* The following re-exported functions from `{insight}` have now been removed:
  `object_has_names()`, `object_has_rownames()`, `is_empty_object()`,
  `compact_list()`, `compact_character()`.

* Argument `na.rm` was renamed to `remove_na` throughout `{datawizard}` functions.
  `na.rm` is kept for backward compatibility, but will be deprecated and later
  removed in future updates.

* The way expressions are defined in `data_filter()` was revised. The `filter`
  argument was replaced by `...`, allowing to separate multiple expression with
  a comma (which are then combined with `&`). Furthermore, expressions can now also be
  defined as strings, or be provided as character vectors, to allow string-friendly
  programming.

CHANGES

* Weighted-functions (`weighted_sd()`, `weighted_mean()`, ...) gain a `remove_na`
  argument, to remove or keep missing and infinite values. By default,
  `remove_na = TRUE`, i.e. missing and infinite values are removed by default.

* `reverse_scale()`, `normalize()` and `rescale()` gain an `append` argument
  (similar to other data frame methods of transformation functions), to append
  recoded variables to the input data frame instead of overwriting existing
  variables.

NEW FUNCTIONS

* `rowid_as_column()` to complement `rownames_as_column()` (and to mimic
  `tibble::rowid_to_column()`). Note that its behavior is different from
  `tibble::rowid_to_column()` for grouped data. See the Details section in the
  docs.

* `data_unite()`, to merge values of multiple variables into one new variable.

* `data_separate()`, as counterpart to `data_unite()`, to separate a single
  variable into multiple new variables.

* `data_modify()`, to create new variables, or modify or remove existing
  variables in a data frame.

MINOR CHANGES

* `to_numeric()` for variables of type `Date`, `POSIXct` and `POSIXlt` now
  includes the class name in the warning message.

* Added a `print()` method for `center()`, `standardize()`, `normalize()` and
  `rescale()`.

BUG FIXES

* `standardize_parameters()` now works when the package namespace is in the model
  formula (#401).

* `data_merge()` no longer yields a warning for `tibbles` when `join = "bind"`.

* `center()` and `standardize()` did not work for grouped data frames (of class
  `grouped_df`) when `force = TRUE`.

* The `data.frame` method of `describe_distribution()` returns `NULL` instead of
  an error if no valid variable were passed (for example a factor variable with
  `include_factors = FALSE`) (#421).

# datawizard 0.7.1

BREAKING CHANGES

* `add_labs()` was renamed into `assign_labels()`. Since `add_labs()` existed
  only for a few days, there will be no alias for backwards compatibility.

NEW FUNCTIONS

* `labels_to_levels()`, to use value labels of factors as their levels.

MINOR CHANGES

* `data_read()` now checks if the imported object actually is a data frame (or
  coercible to a data frame), and if not, no longer errors, but gives an
  informative warning of the type of object that was imported.

BUG FIXES

* Fix test for CRAN check on Mac OS arm64

# datawizard 0.7.0

BREAKING CHANGES

* In selection patterns, expressions like `-var1:var3` to exclude all variables
  between `var1` and `var3` are no longer accepted. The correct expression is
  `-(var1:var3)`. This is for 2 reasons:

  * to be consistent with the behavior for numerics (`-1:2` is not accepted but
    `-(1:2)` is);
  * to be consistent with `dplyr::select()`, which throws a warning and only
    uses the first variable in the first expression.

NEW FUNCTIONS

* `recode_into()`, similar to `dplyr::case_when()`, to recode values from one
  or more variables into a new variable.

* `mean_sd()` and `median_mad()` for summarizing vectors to their mean (or
  median) and a range of one SD (or MAD) above and below.

* `data_write()` as counterpart to `data_read()`, to write data frames into
  CSV, SPSS, SAS, Stata files and many other file types. One advantage over
  existing functions to write data in other packages is that labelled (numeric)
  data can be converted into factors (with values labels used as factor levels)
  even for text formats like CSV and similar. This allows exporting "labelled"
  data into those file formats, too.

* `add_labs()`, to manually add value and variable labels as attributes to
  variables. These attributes are stored as `"label"` and `"labels"` attributes,
  similar to the `labelled` class from the _haven_ package.

MINOR CHANGES

* `data_rename()` gets a `verbose` argument.
* `winsorize()` now errors if the threshold is incorrect (previously, it provided
  a warning and returned the unchanged data). The argument `verbose` is now
  useless but is kept for backward compatibility. The documentation now contains
  details about the valid values for `threshold` (#357).
* In all functions that have arguments `select` and/or `exclude`, there is now
  one warning per misspelled variable. The previous behavior was to have only one
  warning.
* Fixed inconsistent behaviour in `standardize()` when only one of the arguments
  `center` or `scale` were provided (#365).
* `unstandardize()` and `replace_nan_inf()` now work with select helpers (#376).
* Added informative warning and error messages to `reverse()`. Furthermore, the
  docs now describe the `range` argument more clearly (#380).
* `unnormalize()` errors with unexpected inputs (#383).

BUG FIXES

* `empty_columns()` (and therefore `remove_empty_columns()`) now correctly detects
  columns containing only `NA_character_` (#349).
* Select helpers now work in custom functions when argument is called `select`
  (#356).
* Fix unexpected warning in `convert_na_to()` when `select` is a list (#352).
* Fixed issue with correct labelling of numeric variables with more than nine
  unique values and associated value labels.


# datawizard 0.6.5

MAJOR CHANGES

* Etienne Bacher is the new maintainer.

MINOR CHANGES

* `standardize()`, `center()`, `normalize()` and `rescale()` can be used in
  model formulas, similar to `base::scale()`.

* `data_codebook()` now includes the proportion for each category/value, in
  addition to the counts. Furthermore, if data contains tagged `NA` values,
  these are included in the frequency table.

BUG FIXES

* `center(x)` now works correctly when `x` is a single value and either
  `reference` or `center` is specified (#324).

* Fixed issue in `data_codebook()`, which failed for labelled vectors when
  values of labels were not in sorted order.

# datawizard 0.6.4

NEW FUNCTIONS

* `data_codebook()`: to generate codebooks of data frames.

* New functions to deal with duplicates: `data_duplicated()` (keep all duplicates,
  including the first occurrence) and `data_unique()` (returns the data, excluding
  all duplicates except one instance of each, based on the selected method).

MINOR CHANGES

* `.data.frame` methods should now preserve custom attributes.

* The `include_bounds` argument in `normalize()` can now also be a numeric
  value, defining the limit to the upper and lower bound (i.e. the distance
  to 1 and 0).

* `data_filter()` now works with grouped data.

BUG FIXES

* `data_read()` no longer prints message for empty columns when the data
  actually had no empty columns.

 * `data_to_wide()` now drops columns that are not in `id_cols` (if specified),
  `names_from`, or `values_from`. This is the behaviour observed in `tidyr::pivot_wider()`.

# datawizard 0.6.3

MAJOR CHANGES

* There is a new publication about the `{datawizard}` package:
  <https://joss.theoj.org/papers/10.21105/joss.04684>

* Fixes failing tests due to changes in `R-devel`.

* `data_to_long()` and `data_to_wide()` have had significant performance
  improvements, sometimes as high as a ten-fold speedup.

MINOR CHANGES

* When column names are misspelled, most functions now suggest which existing
  columns possibly could be meant.

* Miscellaneous performance gains.

* `convert_to_na()` now requires argument `na` to be of class 'Date' to convert
  specific dates to `NA`. For example, `convert_to_na(x, na = "2022-10-17")`
  must be changed to `convert_to_na(x, na = as.Date("2022-10-17"))`.

BUG FIXES

* `data_to_long()` and `data_to_wide()` now correctly keep the `date` format.

# datawizard 0.6.2

BREAKING CHANGES

* Methods for grouped data frames (`.grouped_df`) no longer support
  `dplyr::group_by()` for `{dplyr}` before version `0.8.0`.

* `empty_columns()` and `remove_empty_columns()` now also remove columns that
  contain only empty characters. Likewise, `empty_rows()` and
  `remove_empty_rows()` remove observations that completely have missing or
  empty character values.

MINOR CHANGES

* `data_read()` gains a `convert_factors` argument, to turn off automatic
  conversion from numeric variables into factors.

BUG FIXES

* `data_arrange()` now works with data frames that were grouped using
  `data_group()` (#274).

# datawizard 0.6.1

* Updates tests for upcoming changes in the `{tidyselect}` package (#267).

# datawizard 0.6.0

BREAKING CHANGES

* The minimum needed R version has been bumped to `3.6`.

* Following deprecated functions have been removed:

`data_cut()`, `data_recode()`, `data_shift()`, `data_reverse()`,
`data_rescale()`, `data_to_factor()`, `data_to_numeric()`

* New `text_format()` alias is introduced for `format_text()`, latter of which
  will be removed in the next release.

* New `recode_values()` alias is introduced for `change_code()`, latter of which
  will be removed in the next release.

* `data_merge()` now errors if columns specified in `by` are not in both
  datasets.

* Using negative values in arguments `select` and `exclude` now removes the
  columns from the selection/exclusion. The previous behavior was to start the
  selection/exclusion from the end of the dataset, which was inconsistent with
  the use of "-" with other selecting possibilities.

NEW FUNCTIONS

* `data_peek()`: to peek at values and type of variables in a data frame.

* `coef_var()`: to compute the coefficient of variation.

CHANGES

* `data_filter()` will give more informative messages on malformed syntax of the
  `filter` argument.

* It is now possible to use curly brackets to pass variable names to
  `data_filter()`, like the following example. See examples section in the
  documentation of `data_filter()`.

* The `regex` argument was added to functions that use select-helpers and did
  not already have this argument.

* Select helpers `starts_with()`, `ends_with()`, and `contains()` now accept
  several patterns, e.g `starts_with("Sep", "Petal")`.

* Arguments `select` and `exclude` that are present in most functions have been
  improved to work in loops and in custom functions. For example, the following
  code now works:

```r
foo <- function(data) {
  i <- "Sep"
  find_columns(data, select = starts_with(i))
}
foo(iris)

for (i in c("Sepal", "Sp")) {
  head(iris) |>
    find_columns(select = starts_with(i)) |>
    print()
}
```

* There is now a vignette summarizing the various ways to select or exclude
  variables in most `{datawizard}` functions.

# datawizard 0.5.1

* Fixes failing tests due to `{poorman}` update.

# datawizard 0.5.0

MAJOR CHANGES

* Following statistical transformation functions have been renamed to not have
  `data_*()` prefix, since they do not work exclusively with data frames, but
  are typically first of all used with vectors, and therefore had misleading
  names:

  - `data_cut()` -> `categorize()`

  - `data_recode()` -> `change_code()`

  - `data_shift()` -> `slide()`

  - `data_reverse()` -> `reverse()`

  - `data_rescale()` -> `rescale()`

  - `data_to_factor()` -> `to_factor()`

  - `data_to_numeric()` -> `to_numeric()`

Note that these functions also have `.data.frame()` methods and still work for
data frames as well. Former function names are still available as aliases, but
will be deprecated and removed in a future release.

* Bumps the needed minimum R version to `3.5`.

* Removed deprecated function `data_findcols()`. Please use its replacement,
  `data_find()`.

* Removed alias `extract()` for `data_extract()` function since it collided with
  `tidyr::extract()`.

* Argument `training_proportion` in `data_partition()` is deprecated. Please use
  `proportion` now.

* Given his continued and significant contributions to the package, Etienne
  Bacher (@etiennebacher) is now included as an author.

* `unstandardise()` now works for `center(x)`

* `unnormalize()` now works for `change_scale(x)`

* `reshape_wider()` now follows more consistently `tidyr::pivot_wider()` syntax.
  Arguments `colnames_from`, `sep`, and `rows_from` are deprecated and should be
  replaced by `names_from`, `names_sep`, and `id_cols` respectively.
  `reshape_wider()` also gains an argument `names_glue` (#182, #198).

* Similarly, `reshape_longer()` now follows more consistently
  `tidyr::pivot_longer()` syntax. Argument `colnames_to` is deprecated and
  should be replaced by `names_to`. `reshape_longer()` also gains new arguments:
  `names_prefix`, `names_sep`, `names_pattern`, and `values_drop_na` (#189).

CHANGES

* Some of the text formatting helpers (like `text_concatenate()`) gain an
  `enclose` argument, to wrap text elements with surrounding characters.

* `winsorize` now accepts "raw" and "zscore" methods (in addition to
  "percentile"). Additionally, when `robust` is set to `TRUE` together with
  `method = "zscore"`, winsorizes via the median and median absolute deviation
  (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47).

* `convert_na_to` now accepts numeric replacements on character vectors and
  single replacement for multiple vector classes. (@rempsyc, #214).

* `data_partition()` now allows to create multiple partitions from the data,
  returning multiple training and a remaining test set.

* Functions like `center()`, `normalize()` or `standardize()` no longer fail
  when data contains infinite values (`Inf`).

NEW FUNCTIONS

* `row_to_colnames()` and `colnames_to_row()` to move a row to column names, and
  column names to row (@etiennebacher, #169).

* `data_arrange()` to sort the rows of a dataframe according to the values of
  the selected columns.

BUG FIXES

* Fixed wrong column names in `data_to_wide()` (#173).

# datawizard 0.4.1

BREAKING

* Added the `standardize.default()` method (moved from package **effectsize**),
  to be consistent in that the default-method now is in the same package as the
  generic. `standardize.default()` behaves exactly like in **effectsize** and
  particularly works for regression model objects. **effectsize** now re-exports
  `standardize()` from **datawizard**.

NEW FUNCTIONS

* `data_shift()` to shift the value range of numeric variables.

* `data_recode()` to recode old into new values.

* `data_to_factor()` as counterpart to `data_to_numeric()`.

* `data_tabulate()` to create frequency tables of variables.

* `data_read()` to read (import) data files (from text, or foreign statistical
  packages).

* `unnormalize()` as counterpart to `normalize()`. This function only works for
  variables that have been normalized with `normalize()`.

* `data_group()` and `data_ungroup()` to create grouped data frames, or to
  remove the grouping information from grouped data frames.

CHANGES

* `data_find()` was added as alias to `find_colums()`, to have consistent name
  patterns for the **datawizard** functions. `data_findcols()` will be removed
  in a future update and usage is discouraged.

* The `select` argument (and thus, also the `exclude` argument) now also accepts
  functions testing for logical conditions, e.g. `is.numeric()` (or
  `is.numeric`), or any user-defined function that selects the variables for
  which the function returns `TRUE` (like: `foo <- function(x) mean(x) > 3`).

* Arguments `select` and `exclude` now allow the negation of select-helpers,
  like `-ends_with("")`, `-is.numeric` or `-Sepal.Width:Petal.Length`.

* Many functions now get a `.default` method, to capture unsupported classes.
  This now yields a message and returns the original input, and hence, the
  `.data.frame` methods won't stop due to an error.

* The `filter` argument in `data_filter()` can also be a numeric vector, to
  indicate row indices of those rows that should be returned.

* `convert_to_na()` gets methods for variables of class `logical` and `Date`.

* `convert_to_na()` for factors (and data frames) gains a `drop_levels`
  argument, to drop unused levels that have been replaced by `NA`.

* `data_to_numeric()` gains two more arguments, `preserve_levels` and `lowest`,
  to give better control of conversion of factors.

BUG FIXES

* When logicals were passed to `center()` or `standardize()` and `force = TRUE`,
  these were not properly converted to numeric variables.

# datawizard 0.4.0

MAJOR CHANGES

* `data_match()` now returns filtered data by default. Old behavior (returning
  rows indices) can be set by setting `return_indices = TRUE`.

* The following functions are now re-exported from `{insight}` package:
  `object_has_names()`, `object_has_rownames()`, `is_empty_object()`,
  `compact_list()`, `compact_character()`

* `data_findcols()` will become deprecated in future updates. Please use the new
  replacements `find_columns()` and `get_columns()`.

* The vignette *Analysing Longitudinal or Panel Data* has now moved to
  [parameters
  package](https://easystats.github.io/parameters/articles/demean.html).

NEW FUNCTIONS

* To convert rownames to a column, and *vice versa*: `rownames_as_column()` and
  `column_as_rownames()` (@etiennebacher, #80).

* `find_columns()` and `get_columns()` to find column names or retrieve subsets
  of data frames, based on various select-methods (including select-helpers).
  These function will supersede `data_findcols()` in the future.

* `data_filter()` as complement for `data_match()`, which works with logical
  expressions for filtering rows of data frames.

* For computing weighted centrality measures and dispersion: `weighted_mean()`,
  `weighted_median()`, `weighted_sd()` and `weighted_mad()`.

* To replace `NA` in vectors and dataframes: `convert_na_to()` (@etiennebacher,
  #111).

MINOR CHANGES

* The `select` argument in several functions (like `data_remove()`,
  `reshape_longer()`, or `data_extract()`) now allows the use of select-helpers
  for selecting variables based on specific patterns.

* `data_extract()` gains new arguments to allow type-safe return values,

i.e. *always* return a vector *or* a data frame. Thus, `data_extract()` can now
be used to select multiple variables or pull a single variable from data
frames.

* `data_match()` gains a `match` argument, to indicate with which logical
  operation matching results should be combined.

* Improved support for *labelled data* for many functions, i.e. returned data
  frame will preserve value and variable label attributes, where possible and
  applicable.

* `describe_distribution()` now works with lists (@etiennebacher, #105).

* `data_rename()` doesn't use `pattern` anymore to rename the columns if
  `replacement` is not provided (@etiennebacher, #103).

* `data_rename()` now adds a suffix to duplicated names in `replacement`
  (@etiennebacher, #103).

BUG FIXES

* `data_to_numeric()` produced wrong results for factors when `dummy_factors =
  TRUE` and factor contained missing values.

* `data_match()` produced wrong results when data contained missing values.

* Fixed CRAN check issues in `data_extract()` when more than one variable was
  extracted from a data frame.

# datawizard 0.3.0

NEW FUNCTIONS

  * To find or remove empty rows and columns in a data frame: `empty_rows()`,
    `empty_columns()`, `remove_empty_rows()`, `remove_empty_columns()`, and
    `remove_empty`.

  * To check for names: `object_has_names()` and `object_has_rownames()`.

  * To rotate data frames: `data_rotate()`.

  * To reverse score variables: `data_reverse()`.

  * To merge/join multiple data frames: `data_merge()` (or its alias
    `data_join()`).

  * To cut (recode) data into groups: `data_cut()`.

  * To replace specific values with `NA`s: `convert_to_na()`.

  * To replace `Inf` and `NaN` values with `NA`s: `replace_nan_inf()`.

- Arguments `cols`, `before` and `after` in `data_relocate()` can now also be
  numeric values, indicating the position of the destination column.

# datawizard 0.2.3

- New functions:

  * to work with lists: `is_empty_object()` and `compact_list()`

  * to work with strings: `compact_character()`

# datawizard 0.2.2

- New function `data_extract()` (or its alias `extract()`) to pull single
  variables from a data frame, possibly naming each value by the row names of
  that data frame.

- `reshape_ci()` gains a `ci_type` argument, to reshape data frames where
  CI-columns have prefixes other than `"CI"`.

- `standardize()` and `center()` gain arguments `center` and `scale`, to define
  references for centrality and deviation that are used when centering or
  standardizing variables.

- `center()` gains the arguments `force` and `reference`, similar to
  `standardize()`.

- The functionality of the `append` argument in `center()` and `standardize()`
  was revised. This made the `suffix` argument redundant, and thus it was
  removed.

- Fixed issue in `standardize()`.

- Fixed issue in `data_findcols()`.

# datawizard 0.2.1

- Exports `plot` method for `visualisation_recipe()` objects from `{see}`
  package.

- `centre()`, `standardise()`, `unstandardise()` are exported as aliases for
  `center()`, `standardize()`, `unstandardize()`, respectively.

# datawizard 0.2.0.1

- This is mainly a maintenance release that addresses some issues with
  conflicting namespaces.

# datawizard 0.2.0

- New function: `visualisation_recipe()`.

- The following function has now moved to *performance* package:
  `check_multimodal()`.

- Minor updates to documentation, including a new vignette about `demean()`.

# datawizard 0.1.0

* First release.


================================================
FILE: R/adjust.R
================================================
#' Adjust data for the effect of other variable(s)
#'
#' This function can be used to adjust the data for the effect of other
#' variables present in the dataset. It is based on an underlying fitting of
#' regressions models, allowing for quite some flexibility, such as including
#' factors as random effects in mixed models (multilevel partialization),
#' continuous variables as smooth terms in general additive models (non-linear
#' partialization) and/or fitting these models under a Bayesian framework. The
#' values returned by this function are the residuals of the regression models.
#' Note that a regular correlation between two "adjusted" variables is
#' equivalent to the partial correlation between them.
#'
#' @param data A data frame.
#' @param effect Character vector of column names to be adjusted for (regressed
#'   out). If `NULL` (the default), all variables will be selected.
#' @param multilevel If `TRUE`, the factors are included as random factors.
#'   Else, if `FALSE` (default), they are included as fixed effects in the
#'   simple regression model.
#' @param additive If `TRUE`, continuous variables as included as smooth terms
#'   in additive models. The goal is to regress-out potential non-linear
#'   effects.
#' @param bayesian If `TRUE`, the models are fitted under the Bayesian framework
#'   using `rstanarm`.
#' @param keep_intercept If `FALSE` (default), the intercept of the model is
#'   re-added. This avoids the centering around 0 that happens by default
#'   when regressing out another variable (see the examples below for a
#'   visual representation of this).
#' @inheritParams extract_column_names
#' @inheritParams standardize
#'
#' @return A data frame comparable to `data`, with adjusted variables.
#'
#' @examplesIf all(insight::check_if_installed(c("bayestestR", "rstanarm", "gamm4"), quietly = TRUE))
#' adjusted_all <- adjust(attitude)
#' head(adjusted_all)
#' adjusted_one <- adjust(attitude, effect = "complaints", select = "rating")
#' head(adjusted_one)
#' \donttest{
#' adjust(attitude, effect = "complaints", select = "rating", bayesian = TRUE)
#' adjust(attitude, effect = "complaints", select = "rating", additive = TRUE)
#' attitude$complaints_LMH <- cut(attitude$complaints, 3)
#' adjust(attitude, effect = "complaints_LMH", select = "rating", multilevel = TRUE)
#' }
#'
#' # Generate data
#' data <- bayestestR::simulate_correlation(n = 100, r = 0.7)
#' data$V2 <- (5 * data$V2) + 20 # Add intercept
#'
#' # Adjust
#' adjusted <- adjust(data, effect = "V1", select = "V2")
#' adjusted_icpt <- adjust(data, effect = "V1", select = "V2", keep_intercept = TRUE)
#'
#' # Visualize
#' plot(
#'   data$V1, data$V2,
#'   pch = 19, col = "blue",
#'   ylim = c(min(adjusted$V2), max(data$V2)),
#'   main = "Original (blue), adjusted (green), and adjusted - intercept kept (red) data"
#' )
#' abline(lm(V2 ~ V1, data = data), col = "blue")
#' points(adjusted$V1, adjusted$V2, pch = 19, col = "green")
#' abline(lm(V2 ~ V1, data = adjusted), col = "green")
#' points(adjusted_icpt$V1, adjusted_icpt$V2, pch = 19, col = "red")
#' abline(lm(V2 ~ V1, data = adjusted_icpt), col = "red")
#'
#' @export
adjust <- function(
  data,
  effect = NULL,
  select = is.numeric,
  exclude = NULL,
  multilevel = FALSE,
  additive = FALSE,
  bayesian = FALSE,
  keep_intercept = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE
) {
  # make sure column names are syntactically valid
  .check_dataframe_names(data, action = "error")

  # check for formula notation, convert to character vector
  if (inherits(effect, "formula")) {
    effect <- all.vars(effect)
  }

  # Find predictors
  if (is.null(effect)) {
    effect <- names(data)
  }

  if (is.null(select)) {
    select <- is.numeric
  }

  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # Factors
  formula_random <- NULL
  facs <- names(data[effect][!vapply(data[effect], is.numeric, logical(1L))])
  if (length(facs) >= 1 && multilevel) {
    if (additive) {
      formula_random <- stats::as.formula(paste(
        "~",
        paste(paste0("(1|", facs, ")"), collapse = " + ")
      ))
    } else {
      formula_random <- paste(
        "+",
        paste(paste0("(1|", facs, ")"), collapse = " + ")
      )
    }
    effect <- effect[!effect %in% facs]
  }

  # Fit models
  out <- data.frame(.ID = seq_len(nrow(data)))
  for (var in select) {
    predictors <- effect[effect != var]
    if (additive) {
      predictors_num <- names(data[predictors][vapply(
        data[predictors],
        is.numeric,
        logical(1L)
      )])
      predictors[predictors == predictors_num] <- paste0(
        "s(",
        predictors_num,
        ")"
      )
    }
    formula_predictors <- paste(c("1", predictors), collapse = " + ")
    model_formula <- paste(var, "~", formula_predictors)

    x <- .model_adjust_for(
      data = data[unique(c(var, effect, facs))],
      model_formula = model_formula,
      multilevel = multilevel,
      additive = additive,
      bayesian = bayesian,
      formula_random = formula_random,
      keep_intercept = keep_intercept
    )
    out[var] <- x
  }
  out[names(data)[!names(data) %in% names(out)]] <- data[names(data)[
    !names(data) %in% names(out)
  ]]
  out[names(data)]
}

#' @rdname adjust
#' @export
data_adjust <- adjust


#' @keywords internal
.model_adjust_for <- function(
  data,
  model_formula,
  multilevel = FALSE,
  additive = FALSE,
  bayesian = FALSE,
  formula_random = NULL,
  keep_intercept = FALSE
) {
  # Additive -----------------------
  if (additive) {
    # Bayesian
    if (bayesian) {
      insight::check_if_installed("rstanarm")
      model <- rstanarm::stan_gamm4(
        stats::as.formula(model_formula),
        random = formula_random,
        data = data,
        refresh = 0
      )
      # Frequentist
    } else {
      insight::check_if_installed("gamm4")
      model <- gamm4::gamm4(
        stats::as.formula(model_formula),
        random = formula_random,
        data = data
      )
    }

    # Linear -------------------------
  } else if (bayesian) {
    # Bayesian
    insight::check_if_installed("rstanarm")
    if (multilevel) {
      model <- rstanarm::stan_lmer(
        paste(model_formula, formula_random),
        data = data,
        refresh = 0
      )
    } else {
      model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0)
    }
  } else if (multilevel) {
    # Frequentist
    insight::check_if_installed("lme4")
    model <- lme4::lmer(paste(model_formula, formula_random), data = data)
  } else {
    model <- stats::lm(model_formula, data = data)
  }

  adjusted <- insight::get_residuals(model)

  # Re-add intercept if need be
  if (keep_intercept) {
    intercept <- insight::get_intercept(model)
    if (length(intercept) > 1) {
      intercept <- stats::median(intercept)
    } # For bayesian model
    if (is.na(intercept)) {
      intercept <- 0
    }
    adjusted <- adjusted + intercept
  }

  # Deal with missing data
  out <- rep(NA, nrow(data))
  out[stats::complete.cases(data)] <- as.vector(adjusted)

  out
}


================================================
FILE: R/assign_labels.R
================================================
#' @title Assign variable and value labels
#' @name assign_labels
#'
#' @description
#' Assign variable and values labels to a variable or variables in a data frame.
#' Labels are stored as attributes (`"label"` for variable labels and `"labels"`)
#' for value labels.
#'
#' @param x A data frame, factor or vector.
#' @param variable The variable label as string.
#' @param values The value labels as (named) character vector. If `values` is
#' *not* a named vector, the length of labels must be equal to the length of
#' unique values. For a named vector, the left-hand side (LHS) is the value in
#' `x`, the right-hand side (RHS) the associated value label. Non-matching
#' labels are omitted.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return A labelled variable, or a data frame of labelled variables.
#'
#' @examples
#' x <- 1:3
#' # labelling by providing required number of labels
#' assign_labels(
#'   x,
#'   variable = "My x",
#'   values = c("one", "two", "three")
#' )
#'
#' # labelling using named vectors
#' data(iris)
#' out <- assign_labels(
#'   iris$Species,
#'   variable = "Labelled Species",
#'   values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3")
#' )
#' str(out)
#'
#' # data frame example
#' out <- assign_labels(
#'   iris,
#'   select = "Species",
#'   variable = "Labelled Species",
#'   values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3")
#' )
#' str(out$Species)
#'
#' # Partial labelling
#' x <- 1:5
#' assign_labels(
#'   x,
#'   variable = "My x",
#'   values = c(`1` = "lowest", `5` = "highest")
#' )
#' @export
assign_labels <- function(x, ...) {
  UseMethod("assign_labels")
}


#' @export
assign_labels.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      sprintf(
        "Adding labels currently not possible for variables of class `%s`.",
        class(x)[1]
      )
    )
  }
  x
}

#' @rdname assign_labels
#' @export
assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {
  # add variable label
  if (!is.null(variable)) {
    if (is.character(variable) && length(variable) == 1L) {
      attr(x, "label") <- variable
    } else {
      insight::format_error(
        "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." # nolint
      )
    }
  }

  # if user just wants to add a variable label, skip next steps
  if (!is.null(values)) {
    # extract unique values
    unique_values <- as.vector(sort(stats::na.omit(unique(x))))
    value_labels <- NULL

    # do we have a names vector for "values"?
    # else check if number of labels and values match
    if (is.null(names(values))) {
      if (length(values) == length(unique_values)) {
        value_labels <- stats::setNames(unique_values, values)
      } else {
        insight::format_error(
          "Cannot add labels. Number of unique values and number of value labels are not equal.",
          sprintf(
            "There are %i unique values and %i provided labels.",
            length(unique_values),
            length(values)
          )
        )
      }
    } else {
      # check whether we have matches of labels and values
      matching_labels <- names(values) %in% unique_values
      if (!all(matching_labels)) {
        insight::format_error(
          "Following labels were associated with values that don't exist:",
          text_concatenate(
            paste0(
              values[!matching_labels],
              " (",
              names(values)[!matching_labels],
              ")"
            ),
            enclose = "`"
          )
        )
      }
      values <- values[names(values) %in% unique_values]

      if (length(values)) {
        # we need to switch names and values
        value_labels <- stats::setNames(
          coerce_to_numeric(names(values)),
          values
        )
      }
    }

    attr(x, "labels") <- value_labels
  }

  x
}

#' @export
assign_labels.factor <- assign_labels.numeric

#' @export
assign_labels.character <- assign_labels.numeric

#' @rdname assign_labels
#' @export
assign_labels.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  values = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  x[select] <- lapply(
    x[select],
    assign_labels,
    values = values,
    verbose = verbose,
    ...
  )
  x
}


================================================
FILE: R/categorize.R
================================================
#' @title Recode (or "cut" / "bin") data into groups of values.
#' @name categorize
#'
#' @description
#' This functions divides the range of variables into intervals and recodes
#' the values inside these intervals according to their related interval.
#' It is basically a wrapper around base R's `cut()`, providing a simplified
#' and more accessible way to define the interval breaks (cut-off values).
#'
#' @param x A (grouped) data frame, numeric vector or factor.
#' @param split Character vector, indicating at which breaks to split variables,
#'   or numeric values with values indicating breaks. If character, may be one
#'   of `"median"`, `"mean"`, `"quantile"`, `"equal_length"`, or `"equal_range"`.
#'   `"median"` or `"mean"` will return dichotomous variables, split at their
#'   mean or median, respectively. `"quantile"` and `"equal_length"` will split
#'   the variable into `n_groups` groups, where each group refers to an interval
#'   of a specific range of values. Thus, the length of each interval will be
#'   based on the number of groups. `"equal_range"` also splits the variable
#'   into multiple groups, however, the length of the interval is given, and
#'   the number of resulting groups (and hence, the number of breaks) will be
#'   determined by how many intervals can be generated, based on the full range
#'   of the variable.
#' @param n_groups If `split` is `"quantile"` or `"equal_length"`, this defines
#'   the number of requested groups (i.e. resulting number of levels or values)
#'   for the recoded variable(s). `"quantile"` will define intervals based
#'   on the distribution of the variable, while `"equal_length"` tries to
#'   divide the range of the variable into pieces of equal length.
#' @param range If `split = "equal_range"`, this defines the range of values
#'   that are recoded into a new value.
#' @param lowest Minimum value of the recoded variable(s). If `NULL` (the default),
#'   for numeric variables, the minimum of the original input is preserved. For
#'   factors, the default minimum is `1`. For `split = "equal_range"`, the
#'   default minimum is always `1`, unless specified otherwise in `lowest`.
#' @param breaks Character, indicating whether breaks for categorizing data are
#'   `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
#'   interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
#'   group or interval to begin). Use `labels = "range"` to make this behaviour
#'   easier to see.
#' @param labels Character vector of value labels. If not `NULL`, `categorize()`
#'   will returns factors instead of numeric variables, with `labels` used
#'   for labelling the factor levels. Can also be `"mean"`, `"median"`,
#'   `"range"` or `"observed"` for a factor with labels as the mean/median,
#'   the requested range (even if not all values of that range are present in
#'   the data) or observed range (range of the actual recoded values) of each
#'   group. See 'Examples'.
#' @param append Logical or string. If `TRUE`, recoded or converted variables
#'   get new column names and are appended (column bind) to `x`, thus returning
#'   both the original and the recoded variables. The new columns get a suffix,
#'   based on the calling function: `"_r"` for recode functions, `"_n"` for
#'   `to_numeric()`, `"_f"` for `to_factor()`, or `"_s"` for
#'   `slide()`. If `append=FALSE`, original variables in `x` will be
#'   overwritten by their recoded versions. If a character value, recoded
#'   variables are appended with new column names (using the defined suffix) to
#'   the original data frame.
#' @param ... not used.
#' @inheritParams extract_column_names
#'
#' @inherit data_rename seealso
#'
#' @details
#'
#' # Splits and breaks (cut-off values)
#'
#' Breaks are by default _exclusive_, this means that these values indicate
#' the lower bound of the next group or interval to begin. Take a simple
#' example, a numeric variable with values from 1 to 9. The median would be 5,
#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
#' would turn into 2 (compare `cbind(1:9, categorize(1:9))`). The same variable,
#' using `split = "quantile"` and `n_groups = 3` would define breaks at 3.67
#' and 6.33 (see `quantile(1:9, probs = c(1/3, 2/3))`), which means that values
#' from 1 to 3 belong to the first interval and are recoded into 1 (because
#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
#'
#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
#' case
#'
#' # Recoding into groups with equal size or range
#'
#' `split = "equal_length"` and `split = "equal_range"` try to divide the
#' range of `x` into intervals of similar (or same) length. The difference is
#' that `split = "equal_length"` will divide the range of `x` into `n_groups`
#' pieces and thereby defining the intervals used as breaks (hence, it is
#' equivalent to `cut(x, breaks = n_groups)`), while  `split = "equal_range"`
#' will cut `x` into intervals that all have the length of `range`, where the
#' first interval by defaults starts at `1`. The lowest (or starting) value
#' of that interval can be defined using the `lowest` argument.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return `x`, recoded into groups. By default `x` is numeric, unless `labels`
#'   is specified. In this case, a factor is returned, where the factor levels
#'   (i.e. recoded groups are labelled accordingly.
#'
#' @examples
#' set.seed(123)
#' x <- sample(1:10, size = 50, replace = TRUE)
#'
#' table(x)
#'
#' # by default, at median
#' table(categorize(x))
#'
#' # into 3 groups, based on distribution (quantiles)
#' table(categorize(x, split = "quantile", n_groups = 3))
#'
#' # into 3 groups, user-defined break
#' table(categorize(x, split = c(3, 5)))
#'
#' set.seed(123)
#' x <- sample(1:100, size = 500, replace = TRUE)
#'
#' # into 5 groups, try to recode into intervals of similar length,
#' # i.e. the range within groups is the same for all groups
#' table(categorize(x, split = "equal_length", n_groups = 5))
#'
#' # into 5 groups, try to return same range within groups
#' # i.e. 1-20, 21-40, 41-60, etc. Since the range of "x" is
#' # 1-100, and we have a range of 20, this results into 5
#' # groups, and thus is for this particular case identical
#' # to the previous result.
#' table(categorize(x, split = "equal_range", range = 20))
#'
#' # return factor with value labels instead of numeric value
#' set.seed(123)
#' x <- sample(1:10, size = 30, replace = TRUE)
#' categorize(x, "equal_length", n_groups = 3)
#' categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high"))
#'
#' # cut numeric into groups with the mean or median as a label name
#' x <- sample(1:10, size = 30, replace = TRUE)
#' categorize(x, "equal_length", n_groups = 3, labels = "mean")
#' categorize(x, "equal_length", n_groups = 3, labels = "median")
#'
#' # cut numeric into groups with the requested range as a label name
#' # each category has the same range, and labels indicate this range
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
#' # in this example, each category has the same range, but labels only refer
#' # to the ranges of the actual values (present in the data) inside each group
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
#' @export
categorize <- function(x, ...) {
  UseMethod("categorize")
}

#' @export
categorize.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      paste0(
        "Variables of class `",
        class(x)[1],
        "` can't be recoded and remain unchanged."
      )
    )
  }
  x
}


#' @rdname categorize
#' @export
categorize.numeric <- function(
  x,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  verbose = TRUE,
  ...
) {
  # sanity check
  split <- .sanitize_split_arg(split, n_groups, range)

  # handle aliases
  if (identical(split, "equal_length")) {
    split <- "length"
  }
  if (identical(split, "equal_range")) {
    split <- "range"
  }

  # check for valid values
  breaks <- match.arg(breaks, c("exclusive", "inclusive"))

  # save
  original_x <- x

  # no missings
  x <- stats::na.omit(x)

  # stop if all NA
  if (!length(x)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Variable contains only missing values. No recoding carried out."
      )
    }
    return(original_x)
  }

  if (is.numeric(split)) {
    category_splits <- split
  } else {
    category_splits <- switch(
      split,
      median = stats::median(x),
      mean = mean(x),
      length = n_groups,
      quantile = stats::quantile(x, probs = seq_len(n_groups) / n_groups),
      range = .equal_range(x, range, n_groups, lowest),
      NULL
    )
  }

  # complete ranges, including minimum and maximum
  if (!identical(split, "length")) {
    category_splits <- unique(c(min(x), category_splits, max(x)))
  }

  # recode into groups
  out <- droplevels(cut(
    x,
    breaks = category_splits,
    include.lowest = TRUE,
    right = identical(breaks, "inclusive")
  ))
  cut_result <- out
  levels(out) <- 1:nlevels(out)

  # fix lowest value, add back into original vector
  out <- as.numeric(out)
  if (!is.null(lowest)) {
    out <- out - (min(out) - lowest)
  }
  original_x[!is.na(original_x)] <- out

  # turn into factor?
  .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...)
}


#' @export
categorize.factor <- function(x, ...) {
  original_x <- x
  levels(x) <- 1:nlevels(x)
  out <- as.factor(categorize(as.numeric(x), ...))
  .set_back_labels(out, original_x, include_values = FALSE)
}


#' @rdname categorize
#' @export
categorize.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r"
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x[select] <- lapply(
    x[select],
    categorize,
    split = split,
    n_groups = n_groups,
    range = range,
    lowest = lowest,
    breaks = breaks,
    labels = labels,
    verbose = verbose,
    ...
  )
  x
}


#' @export
categorize.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  attr_data <- attributes(x)

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    remove_group_var = TRUE,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r"
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x <- as.data.frame(x)
  for (rows in grps) {
    x[rows, ] <- categorize(
      x[rows, , drop = FALSE],
      split = split,
      n_groups = n_groups,
      range = range,
      lowest = lowest,
      breaks = breaks,
      labels = labels,
      select = select,
      exclude = exclude,
      append = FALSE, # need to set to FALSE here, else variable will be doubled
      ignore_case = ignore_case,
      verbose = verbose,
      ...
    )
  }
  # set back class, so data frame still works with dplyr
  x <- .replace_attrs(x, attr_data)
  x
}


# tools --------------------

.equal_range <- function(x, range, n_groups, lowest = NULL) {
  if (is.null(lowest)) {
    lowest <- 1
  }
  if (is.null(range)) {
    size <- ceiling((max(x) - min(x)) / n_groups)
    range <- as.numeric(size)
  }
  seq(lowest, max(x), by = range)
}


.sanitize_split_arg <- function(split, n_groups, range) {
  # check arguments
  if (is.character(split)) {
    split <- match.arg(
      split,
      choices = c(
        "median",
        "mean",
        "quantile",
        "equal_length",
        "equal_range",
        "equal",
        "equal_distance",
        "range",
        "distance"
      )
    )
  }

  if (
    is.character(split) &&
      split %in% c("quantile", "equal_length") &&
      is.null(n_groups)
  ) {
    insight::format_error(
      "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified."
    )
  }

  if (
    is.character(split) &&
      split == "equal_range" &&
      is.null(n_groups) &&
      is.null(range)
  ) {
    insight::format_error(
      "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified."
    )
  }

  split
}


.original_x_to_factor <- function(
  original_x,
  x,
  cut_result,
  labels,
  out,
  verbose,
  ...
) {
  if (!is.null(labels)) {
    if (length(labels) == length(unique(out))) {
      original_x <- as.factor(original_x)
      levels(original_x) <- labels
    } else if (
      length(labels) == 1 &&
        labels %in% c("mean", "median", "range", "observed")
    ) {
      original_x <- as.factor(original_x)
      no_na_x <- original_x[!is.na(original_x)]
      out <- switch(
        labels,
        mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x,
        median = stats::aggregate(
          x,
          list(no_na_x),
          FUN = stats::median,
          na.rm = TRUE
        )$x,
        # labels basically like what "cut()" returns
        range = levels(cut_result),
        # range based on the values that are actually present in the data
        {
          temp <- stats::aggregate(
            x,
            list(no_na_x),
            FUN = range,
            na.rm = TRUE
          )$x
          apply(temp, 1, function(i) {
            paste0("(", paste(as.vector(i), collapse = "-"), ")")
          })
        }
      )
      levels(original_x) <- insight::format_value(out, ...)
    } else if (isTRUE(verbose)) {
      insight::format_warning(
        "Argument `labels` and levels of the recoded variable are not of the same length.",
        "Variable will not be converted to factor."
      )
    }
  }
  original_x
}


================================================
FILE: R/center.R
================================================
#' Centering (Grand-Mean Centering)
#'
#' Performs a grand-mean centering of data.
#'
#' @param x A (grouped) data frame, a (numeric or character) vector or a factor.
#' @param force Logical, if `TRUE`, forces centering of factors as
#'   well. Factors are converted to numerical values, with the lowest level
#'   being the value `1` (unless the factor has numeric levels, which are
#'   converted to the corresponding numeric value).
#' @param robust Logical, if `TRUE`, centering is done by subtracting the
#'   median from the variables. If `FALSE`, variables are centered by
#'   subtracting the mean.
#' @param append Logical or string. If `TRUE`, centered variables get new
#'   column names (with the suffix `"_c"`) and are appended (column bind) to `x`,
#'   thus returning both the original and the centered variables. If `FALSE`,
#'   original variables in `x` will be overwritten by their centered versions.
#'   If a character value, centered variables are appended with new column
#'   names (using the defined suffix) to the original data frame.
#' @param verbose Toggle warnings and messages.
#' @param weights Can be `NULL` (for no weighting), or:
#'   - For data frames: a numeric vector of weights, or a character of the
#'   name of a column in the `data.frame` that contains the weights.
#'   - For numeric vectors: a numeric vector of weights.
#' @param center Numeric value, which can be used as alternative to
#'   `reference` to define a reference centrality. If `center` is of length 1,
#'   it will be recycled to match the length of selected variables for centering.
#'   Else, `center` must be of same length as the number of selected variables.
#'   Values in `center` will be matched to selected variables in the provided
#'   order, unless a named vector is given. In this case, names are matched
#'   against the names of the selected variables.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#' @inheritParams standardize
#'
#' @section Selection of variables - the `select` argument:
#' For most functions that have a `select` argument (including this function),
#' the complete input data frame is returned, even when `select` only selects
#' a range of variables. That is, the function is only applied to those variables
#' that have a match in `select`, while all other variables remain unchanged.
#' In other words: for this function, `select` will not omit any non-included
#' variables, so that the returned data frame will include all variables
#' from the input data frame.
#'
#' @note
#' **Difference between centering and standardizing**: Standardized variables
#' are computed by subtracting the mean of the variable and then dividing it by
#' the standard deviation, while centering variables involves only the
#' subtraction.
#'
#' @seealso If centering within-clusters (instead of grand-mean centering)
#'   is required, see [demean()]. For standardizing, see [standardize()], and
#'   [makepredictcall.dw_transformer()] for use in model formulas.
#'
#' @return The centered variables.
#'
#' @examples
#' data(iris)
#'
#' # entire data frame or a vector
#' head(iris$Sepal.Width)
#' head(center(iris$Sepal.Width))
#' head(center(iris))
#' head(center(iris, force = TRUE))
#'
#' # only the selected columns from a data frame
#' center(anscombe, select = c("x1", "x3"))
#' center(anscombe, exclude = c("x1", "x3"))
#'
#' # centering with reference center and scale
#' d <- data.frame(
#'   a = c(-2, -1, 0, 1, 2),
#'   b = c(3, 4, 5, 6, 7)
#' )
#'
#' # default centering at mean
#' center(d)
#'
#' # centering, using 0 as mean
#' center(d, center = 0)
#'
#' # centering, using -5 as mean
#' center(d, center = -5)
#' @export
center <- function(x, ...) {
  UseMethod("center")
}

#' @rdname center
#' @export
centre <- center


#' @export
center.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      sprintf(
        "Centering currently not possible for variables of class `%s`.",
        class(x)[1]
      ),
      "You may open an issue at https://github.com/easystats/datawizard/issues."
    )
  }
  x
}


#' @rdname center
#' @export
center.numeric <- function(
  x,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  verbose = TRUE,
  ...
) {
  # set default. Furthermore, data.frame methods cannot return a vector
  # of NULLs for each variable - instead they return NA. Thus, we have to
  # treat NA like NULL
  if (is.null(center) || is.na(center)) {
    center <- TRUE
  }

  my_args <- .process_std_center(
    x,
    weights,
    robust,
    verbose,
    reference,
    center,
    scale = NULL
  )
  dot_args <- list(...)

  if (is.null(my_args)) {
    # all NA?
    return(x)
  } else if (is.null(my_args$check)) {
    vals <- rep(0, length(my_args$vals)) # If only unique value
  } else {
    vals <- as.vector(my_args$vals - my_args$center)
  }

  centered_x <- rep(NA, length(my_args$valid_x))
  centered_x[my_args$valid_x] <- vals
  attr(centered_x, "center") <- my_args$center
  attr(centered_x, "scale") <- 1
  attr(centered_x, "robust") <- robust
  # labels
  z <- .set_back_labels(centered_x, x, include_values = FALSE)
  # don't add attribute when we call data frame methods
  if (!isFALSE(dot_args$add_transform_class)) {
    class(z) <- c("dw_transformer", class(z))
  }
  z
}


#' @export
center.factor <- function(
  x,
  robust = FALSE,
  weights = NULL,
  force = FALSE,
  verbose = TRUE,
  ...
) {
  if (!force) {
    return(x)
  }
  center(
    .factor_to_numeric(x),
    weights = weights,
    robust = robust,
    verbose = verbose,
    ...
  )
}

#' @export
center.logical <- center.factor

#' @export
center.character <- center.factor

#' @export
center.Date <- center.factor

#' @export
center.AsIs <- center.numeric

#' @rdname center
#' @inheritParams standardize.data.frame
#' @export
center.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  force = FALSE,
  remove_na = c("none", "selected", "all"),
  append = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # process arguments
  my_args <- .process_std_args(
    x,
    select,
    exclude,
    weights,
    append,
    append_suffix = "_c",
    keep_factors = force,
    remove_na,
    reference,
    .center = center,
    .scale = NULL
  )

  # set new values
  x <- my_args$x

  for (var in my_args$select) {
    x[[var]] <- center(
      x[[var]],
      robust = robust,
      weights = my_args$weights,
      verbose = FALSE,
      reference = reference[[var]],
      center = my_args$center[var],
      force = force,
      add_transform_class = FALSE
    )
  }

  attr(x, "center") <- vapply(
    x[my_args$select],
    function(z) attributes(z)$center,
    numeric(1)
  )
  attr(x, "scale") <- vapply(
    x[my_args$select],
    function(z) attributes(z)$scale,
    numeric(1)
  )
  attr(x, "robust") <- robust
  x
}


#' @export
center.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  force = FALSE,
  remove_na = c("none", "selected", "all"),
  append = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  my_args <- .process_grouped_df(
    x,
    select,
    exclude,
    append,
    append_suffix = "_c",
    reference,
    weights,
    keep_factors = force
  )

  for (rows in my_args$grps) {
    my_args$x[rows, ] <- center(
      my_args$x[rows, , drop = FALSE],
      select = my_args$select,
      exclude = NULL,
      robust = robust,
      weights = my_args$weights,
      remove_na = remove_na,
      verbose = verbose,
      force = force,
      append = FALSE,
      center = center,
      add_transform_class = FALSE,
      ...
    )
  }
  # set back class, so data frame still works with dplyr
  attributes(my_args$x) <- my_args$info
  my_args$x
}


# methods -------------------------

#' @export
print.dw_transformer <- function(x, ...) {
  print(as.vector(x), ...)
  vector_info <- NULL
  if (!is.null(attributes(x)$scale)) {
    # attributes for center() / standardize()
    vector_info <- sprintf(
      "(center: %.2g, scale = %.2g)\n",
      attributes(x)$center,
      attributes(x)$scale
    )
  } else if (!is.null(attributes(x)$range_difference)) {
    # attributes for normalize() / rescale()
    vector_info <- sprintf(
      "(original range = %.2g to %.2g)\n",
      attributes(x)$min_value,
      attributes(x)$min_value + attributes(x)$range_difference
    )
  }
  if (!is.null(vector_info)) {
    insight::print_color(vector_info, color = "grey")
  }
  invisible(x)
}


================================================
FILE: R/contrs.R
================================================
#' Deviation Contrast Matrix
#'
#' Build a deviation contrast matrix, a type of _effects contrast_ matrix.
#'
#' @inheritParams stats::contr.sum
#'
#' @details
#' In effects coding, unlike treatment/dummy coding
#' ([stats::contr.treatment()]), each contrast sums to 0. In regressions models,
#' this results in an intercept that represents the (unweighted) average of the
#' group means. In ANOVA settings, this also guarantees that lower order effects
#' represent _main_ effects (and not _simple_ or _conditional_ effects, as is
#' the case when using R's default [stats::contr.treatment()]).
#' \cr\cr
#' Deviation coding (`contr.deviation`) is a type of effects coding. With
#' deviation coding, the coefficients for factor variables are interpreted as
#' the difference of each factor level from the base level (this is the same
#' interpretation as with treatment/dummy coding). For example, for a factor
#' `group` with levels "A", "B", and "C", with `contr.devation`, the intercept
#' represents the overall mean (average of the group means for the 3 groups),
#' and the coefficients `groupB` and `groupC` represent the differences between
#' the A group mean and the B and C group means, respectively.
#' \cr\cr
#' Sum coding ([stats::contr.sum()]) is another type of effects coding. With sum
#' coding, the coefficients for factor variables are interpreted as the
#' difference of each factor level from **the grand (across-groups) mean**. For
#' example, for a factor `group` with levels "A", "B", and "C", with
#' `contr.sum`, the intercept represents the overall mean (average of the group
#' means for the 3 groups), and the coefficients `group1` and `group2` represent
#' the differences the
#' **A** and **B** group means from the overall mean, respectively.
#'
#' @seealso [stats::contr.sum()]
#'
#' @examplesIf !identical(Sys.getenv("IN_PKGDOWN"), "true")
#' \donttest{
#' data("mtcars")
#'
#' mtcars <- data_modify(mtcars, cyl = factor(cyl))
#'
#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.treatment)
#' #>            4 6 8
#' #> Intercept  1 0 0  # mean of the 1st level
#' #> 6         -1 1 0  # 2nd level - 1st level
#' #> 8         -1 0 1  # 3rd level - 1st level
#'
#' contrasts(mtcars$cyl) <- contr.sum
#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.sum)
#' #>                4      6      8
#' #> Intercept  0.333  0.333  0.333   # overall mean
#' #>            0.667 -0.333 -0.333   # deviation of 1st from overall mean
#' #>           -0.333  0.667 -0.333   # deviation of 2nd from overall mean
#'
#'
#' contrasts(mtcars$cyl) <- contr.deviation
#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.deviation)
#' #>                4     6     8
#' #> Intercept  0.333 0.333 0.333   # overall mean
#' #> 6         -1.000 1.000 0.000   # 2nd level - 1st level
#' #> 8         -1.000 0.000 1.000   # 3rd level - 1st level
#'
#' ## With Interactions -----------------------------------------
#' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
#' mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
#'
#' mm <- unique(model.matrix(~ cyl * am, data = mtcars))
#' rownames(mm) <- c(
#'   "cyl4.am0", "cyl4.am1", "cyl6.am0",
#'   "cyl6.am1", "cyl8.am0", "cyl8.am1"
#' )
#'
#' solve(mm)
#' #>             cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
#' #> (Intercept)    0.167    0.167    0.167    0.167    0.167    0.167  # overall mean
#' #> cyl6          -0.500   -0.500    0.500    0.500    0.000    0.000  # cyl MAIN eff: 2nd - 1st
#' #> cyl8          -0.500   -0.500    0.000    0.000    0.500    0.500  # cyl MAIN eff: 2nd - 1st
#' #> am1           -0.333    0.333   -0.333    0.333   -0.333    0.333  # am MAIN eff
#' #> cyl6:am1       1.000   -1.000   -1.000    1.000    0.000    0.000
#' #> cyl8:am1       1.000   -1.000    0.000    0.000   -1.000    1.000
#' }
#'
#' @export
contr.deviation <- function(n, base = 1, contrasts = TRUE, sparse = FALSE) {
  cont <- stats::contr.treatment(
    n,
    base = base,
    contrasts = contrasts,
    sparse = sparse
  )
  if (contrasts) {
    n <- nrow(cont)
    cont <- cont - 1 / n
  }
  cont
}


================================================
FILE: R/convert_na_to.R
================================================
#' @title Replace missing values in a variable or a data frame.
#' @name convert_na_to
#'
#' @description
#' Replace missing values in a variable or a data frame.
#'
#' @param x A numeric, factor, or character vector, or a data frame.
#' @param replacement Numeric or character value that will be used to
#' replace `NA`.
#' @param verbose Toggle warnings.
#' @param ... Not used.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return
#' `x`, where `NA` values are replaced by `replacement`.
#'
#' @examples
#' # Convert NA to 0 in a numeric vector
#' convert_na_to(
#'   c(9, 3, NA, 2, 3, 1, NA, 8),
#'   replacement = 0
#' )
#'
#' # Convert NA to "missing" in a character vector
#' convert_na_to(
#'   c("a", NA, "d", "z", NA, "t"),
#'   replacement = "missing"
#' )
#'
#' ### For data frames
#'
#' test_df <- data.frame(
#'   x = c(1, 2, NA),
#'   x2 = c(4, 5, NA),
#'   y = c("a", "b", NA)
#' )
#'
#' # Convert all NA to 0 in numeric variables, and all NA to "missing" in
#' # character variables
#' convert_na_to(
#'   test_df,
#'   replace_num = 0,
#'   replace_char = "missing"
#' )
#'
#' # Convert a specific variable in the data frame
#' convert_na_to(
#'   test_df,
#'   replace_num = 0,
#'   replace_char = "missing",
#'   select = "x"
#' )
#'
#' # Convert all variables starting with "x"
#' convert_na_to(
#'   test_df,
#'   replace_num = 0,
#'   replace_char = "missing",
#'   select = starts_with("x")
#' )
#'
#' # Convert NA to 1 in variable 'x2' and to 0 in all other numeric
#' # variables
#' convert_na_to(
#'   test_df,
#'   replace_num = 0,
#'   select = list(x2 = 1)
#' )
#'
#' @export

convert_na_to <- function(x, ...) {
  UseMethod("convert_na_to")
}


#' @export
convert_na_to.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      sprintf(
        "Converting missing values (`NA`) into regular values currently not possible for variables of class `%s`.",
        class(x)[1]
      )
    )
  }
  x
}


#' @rdname convert_na_to
#' @export
convert_na_to.numeric <- function(x, replacement = NULL, verbose = TRUE, ...) {
  if (insight::is_empty_object(replacement) || !is.numeric(replacement)) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be a numeric vector.")
    }
  } else if (length(replacement) > 1) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be of length one.")
    }
  } else {
    x[is.na(x)] <- replacement
  }
  x
}


#' @export
convert_na_to.factor <- function(x, replacement = NULL, verbose = TRUE, ...) {
  if (insight::is_empty_object(replacement) || length(replacement) > 1) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be of length one.")
    }
  } else {
    x <- addNA(x)
    levels(x) <- c(levels(x), replacement)
    x[is.na(x)] <- replacement
  }
  x
}


#' @rdname convert_na_to
#' @export
convert_na_to.character <- function(
  x,
  replacement = NULL,
  verbose = TRUE,
  ...
) {
  if (
    insight::is_empty_object(replacement) ||
      !is.character(replacement) && !is.numeric(replacement)
  ) {
    if (isTRUE(verbose)) {
      insight::format_warning(
        "`replacement` needs to be a character or numeric vector."
      )
    }
  } else if (length(replacement) > 1) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be of length one.")
    }
  } else {
    x[is.na(x)] <- as.character(replacement)
  }
  x
}


#' @param replace_num Value to replace `NA` when variable is of type numeric.
#' @param replace_char Value to replace `NA` when variable is of type character.
#' @param replace_fac Value to replace `NA` when variable is of type factor.
#' @inheritParams extract_column_names
#'
#' @rdname convert_na_to
#' @export
convert_na_to.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  replacement = NULL,
  replace_num = replacement,
  replace_char = replacement,
  replace_fac = replacement,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  my_data <- x
  select_nse <- .select_nse(
    select,
    data = my_data,
    exclude = exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # default
  lookup <- lapply(x, function(y) {
    if (is.numeric(y)) {
      replace_num
    } else if (is.character(y)) {
      replace_char
    } else if (is.factor(y)) {
      replace_fac
    }
  })

  # override for specific vars
  try_eval <- try(eval(select), silent = TRUE)
  select_is_list <- !inherits(try_eval, "try-error") && is.list(select)

  if (select_is_list) {
    for (i in select_nse) {
      lookup[[i]] <- select[[i]]
    }
  } else {
    lookup <- lookup[names(lookup) %in% select_nse]
  }

  lookup <- Filter(Negate(is.null), lookup)

  for (i in names(lookup)) {
    x[[i]] <- convert_na_to(
      x[[i]],
      replacement = lookup[[i]],
      verbose = verbose
    )
  }

  x
}


================================================
FILE: R/convert_to_na.R
================================================
#' @title Convert non-missing values in a variable into missing values.
#' @name convert_to_na
#'
#' @description
#' Convert non-missing values in a variable into missing values.
#'
#' @param x A vector, factor or a data frame.
#' @param na Numeric, character vector or logical (or a list of numeric, character
#'   vectors or logicals) with values that should be converted to `NA`. Numeric
#'   values applied to numeric vectors, character values are used for factors,
#'   character vectors or date variables, and logical values for logical vectors.
#' @param drop_levels Logical, for factors, when specific levels are replaced
#'   by `NA`, should unused levels be dropped?
#' @param ... Not used.
#' @inheritParams extract_column_names
#'
#' @return
#' `x`, where all values in `na` are converted to `NA`.
#'
#' @examples
#' x <- sample(1:6, size = 30, replace = TRUE)
#' x
#' # values 4 and 5 to NA
#' convert_to_na(x, na = 4:5)
#'
#' # data frames
#' set.seed(123)
#' x <- data.frame(
#'   a = sample(1:6, size = 20, replace = TRUE),
#'   b = sample(letters[1:6], size = 20, replace = TRUE),
#'   c = sample(c(30:33, 99), size = 20, replace = TRUE)
#' )
#' # for all numerics, convert 5 to NA. Character/factor will be ignored.
#' convert_to_na(x, na = 5)
#'
#' # for numerics, 5 to NA, for character/factor, "f" to NA
#' convert_to_na(x, na = list(6, "f"))
#'
#' # select specific variables
#' convert_to_na(x, select = c("a", "b"), na = list(6, "f"))
#' @export
convert_to_na <- function(x, ...) {
  UseMethod("convert_to_na")
}


#' @export
convert_to_na.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      sprintf(
        "Converting values into missing values (`NA`) currently not possible for variables of class `%s`.",
        class(x)[1]
      )
    )
  }
  x
}


#' @rdname convert_to_na
#' @export
convert_to_na.numeric <- function(x, na = NULL, verbose = TRUE, ...) {
  # if we have a list, use first valid element
  if (is.list(na)) {
    na <- unlist(
      na[vapply(na, is.numeric, FUN.VALUE = TRUE)],
      use.names = FALSE
    )
  }

  if (insight::is_empty_object(na) || !is.numeric(na)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a numeric variable.",
        "To do this, `na` needs to be a numeric vector, or a list that contains numeric vector elements."
      )
    }
  } else {
    matches <- which(x %in% na)
    x[matches] <- NA
    # drop unused labels
    value_labels <- attr(x, "labels", exact = TRUE)
    attr(x, "labels") <- value_labels[!value_labels %in% na]
  }
  x
}


#' @rdname convert_to_na
#' @export
convert_to_na.factor <- function(
  x,
  na = NULL,
  drop_levels = FALSE,
  verbose = TRUE,
  ...
) {
  # if we have a list, use first valid element
  if (is.list(na)) {
    na <- unlist(
      na[vapply(na, is.character, FUN.VALUE = TRUE)],
      use.names = FALSE
    )
  }

  if (insight::is_empty_object(na) || (!is.factor(na) && !is.character(na))) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a factor or character variable.",
        "To do this, `na` needs to be a character vector, or a list that contains character vector elements."
      )
    }
  } else {
    matches <- which(x %in% na)
    x[matches] <- NA
    # drop unused labels
    value_labels <- attr(x, "labels", exact = TRUE)
    if (is.factor(x) && isTRUE(drop_levels)) {
      # save label attribute
      variable_label <- attr(x, "label", exact = TRUE)
      x <- droplevels(x)
      # droplevels() discards attributes, so we need to re-assign them
      attr(x, "label") <- variable_label
    }
    attr(x, "labels") <- value_labels[!value_labels %in% na]
  }
  x
}


#' @export
convert_to_na.character <- convert_to_na.factor


#' @export
convert_to_na.Date <- function(x, na = NULL, verbose = TRUE, ...) {
  # if we have a list, use first valid element
  if (is.list(na)) {
    na <- na[vapply(na, .is_date, FUN.VALUE = logical(1L))]
    if (length(na) > 1) {
      na <- na[[1]]
    }
  }

  if (insight::is_empty_object(na) || !.is_date(na)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a date/time variable.",
        "To do this, `na` must be of class 'Date'."
      )
    }
  } else {
    matches <- which(x == na)
    x[matches] <- NA
  }
  x
}


#' @export
convert_to_na.logical <- function(x, na = NULL, verbose = TRUE, ...) {
  # if we have a list, use first valid element
  if (is.list(na)) {
    na <- unlist(
      na[vapply(na, is.logical, FUN.VALUE = TRUE)],
      use.names = FALSE
    )
  }

  if (insight::is_empty_object(na) || !is.logical(na)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a logical variable.",
        "To do this, `na` needs to be a logical vector, or a list that contains logical vector elements."
      )
    }
  } else {
    matches <- which(x == na)
    x[matches] <- NA
  }
  x
}


#' @rdname convert_to_na
#' @export
convert_to_na.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  na = NULL,
  drop_levels = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  x[select] <- lapply(
    x[select],
    convert_to_na,
    na = na,
    drop_levels = drop_levels,
    verbose = verbose,
    ...
  )

  x
}


================================================
FILE: R/data.R
================================================
#' @docType data
#' @title Sample dataset from the National Health and Nutrition Examination Survey
#' @name nhanes_sample
#' @keywords data
#'
#' @description Selected variables from the National Health and Nutrition Examination
#'              Survey that are used in the example from Lumley (2010), Appendix E.
#'
#' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
NULL


#' @docType data
#' @title Sample dataset from the EFC Survey
#' @name efc
#' @keywords data
#'
#' @description Selected variables from the EUROFAMCARE survey. Useful when
#' testing on "real-life" data sets, including random missing values. This
#' data set also has value and variable label attributes.
NULL


================================================
FILE: R/data_addprefix.R
================================================
#' Add a prefix or suffix to column names
#'
#' @rdname data_prefix_suffix
#' @inheritParams extract_column_names
#' @param pattern A character string, which will be added as prefix or suffix
#' to the column names.
#' @param ... Other arguments passed to or from other functions.
#'
#' @seealso
#' [data_rename()] for more fine-grained column renaming.
#' @examples
#' # Add prefix / suffix to all columns
#' head(data_addprefix(iris, "NEW_"))
#' head(data_addsuffix(iris, "_OLD"))
#'
#' @export
data_addprefix <- function(
  data,
  pattern,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  selected_columns <- colnames(data) %in% select
  colnames(data)[selected_columns] <- paste0(
    pattern,
    colnames(data)[selected_columns]
  )
  data
}


#' @rdname data_prefix_suffix
#' @export
data_addsuffix <- function(
  data,
  pattern,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  selected_columns <- colnames(data) %in% select
  colnames(data)[selected_columns] <- paste0(
    colnames(data)[selected_columns],
    pattern
  )
  data
}


================================================
FILE: R/data_arrange.R
================================================
#' Arrange rows by column values
#'
#' `data_arrange()` orders the rows of a data frame by the values of selected
#' columns.
#'
#' @param data A data frame, or an object that can be coerced to a data frame.
#' @param select Character vector of column names. Use a dash just before column
#'   name to arrange in decreasing order, for example `"-x1"`.
#' @param safe Do not throw an error if one of the variables specified doesn't
#'   exist.
#'
#' @return A data frame.
#'
#' @examples
#'
#' # Arrange using several variables
#' data_arrange(head(mtcars), c("gear", "carb"))
#'
#' # Arrange in decreasing order
#' data_arrange(head(mtcars), "-carb")
#'
#' # Throw an error if one of the variables specified doesn't exist
#' try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE))
#' @export
data_arrange <- function(data, select = NULL, safe = TRUE) {
  UseMethod("data_arrange")
}


#' @export
data_arrange.default <- function(data, select = NULL, safe = TRUE) {
  if (is.null(select) || length(select) == 0) {
    return(data)
  }

  original_x <- data

  # Input validation check
  data <- .coerce_to_dataframe(data)

  # Remove tidyverse attributes, will add them back at the end
  if (inherits(original_x, "tbl_df")) {
    tbl_input <- TRUE
    data <- as.data.frame(data, stringsAsFactors = FALSE)
  } else {
    tbl_input <- FALSE
  }

  # find which vars should be decreasing
  desc <- select[startsWith(select, "-")]
  desc <- gsub("^-", "", desc)
  select <- gsub("^-", "", select)

  # check for variables that are not in data
  dont_exist <- setdiff(select, colnames(data))

  if (length(dont_exist) > 0) {
    if (safe) {
      insight::format_warning(
        paste0(
          "The following column(s) don't exist in the dataset: ",
          text_concatenate(dont_exist),
          "."
        ),
        .misspelled_string(names(data), dont_exist, "Possibly misspelled?")
      )
    } else {
      insight::format_error(
        paste0(
          "The following column(s) don't exist in the dataset: ",
          text_concatenate(dont_exist),
          "."
        ),
        .misspelled_string(names(data), dont_exist, "Possibly misspelled?")
      )
    }
    select <- select[-which(select %in% dont_exist)]
  }

  if (length(select) == 0) {
    return(data)
  }

  already_sorted <- all(vapply(
    data[, select, drop = FALSE],
    .is_sorted,
    logical(1L)
  ))

  if (isTRUE(already_sorted)) {
    return(data)
  }

  out <- data

  # reverse order for variables that should be decreasing
  if (length(desc) > 0) {
    for (i in desc) {
      out[[i]] <- -xtfrm(out[[i]])
    }
  }

  # apply ordering
  if (length(select) == 1) {
    out <- data[order(out[[select]]), , drop = FALSE]
  } else {
    out <- data[do.call(order, out[, select]), , drop = FALSE]
  }

  if (!insight::object_has_rownames(data)) {
    rownames(out) <- NULL
  }

  # add back custom attributes
  out <- .replace_attrs(out, attributes(original_x))

  out
}


#' @export
data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) {
  original_x <- data
  grps <- attr(data, "groups", exact = TRUE)
  grps <- grps[[".rows"]]

  # Remove tidyverse attributes, will add them back at the end
  if (inherits(data, "tbl_df")) {
    tbl_input <- TRUE
    data <- as.data.frame(data, stringsAsFactors = FALSE)
  } else {
    tbl_input <- FALSE
  }

  out <- lapply(grps, function(x) {
    data_arrange.default(data[x, ], select = select, safe = safe)
  })

  out <- do.call(rbind, out)

  if (!insight::object_has_rownames(data)) {
    rownames(out) <- NULL
  }

  # add back tidyverse attributes
  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }

  # add back custom attributes
  out <- .replace_attrs(out, attributes(original_x))

  out
}


================================================
FILE: R/data_codebook.R
================================================
#' Generate a codebook of a data frame.
#'
#' `data_codebook()` generates codebooks from data frames, i.e. overviews
#' of all variables and some more information about each variable (like
#' labels, values or value range, frequencies, amount of missing values).
#'
#' @param data A data frame, or an object that can be coerced to a data frame.
#' @param variable_label_width Length of variable labels. Longer labels will be
#' wrapped at `variable_label_width` chars. If `NULL`, longer labels will not
#' be split into multiple lines. Only applies to _labelled data_.
#' @param value_label_width Length of value labels. Longer labels will be
#' shortened, where the remaining part is truncated. Only applies to
#' _labelled data_ or factor levels.
#' @param range_at Indicates how many unique values in a numeric vector are
#' needed in order to print a range for that variable instead of a frequency
#' table for all numeric values. Can be useful if the data contains numeric
#' variables with only a few unique values and where full frequency tables
#' instead of value ranges should be displayed.
#' @param max_values Number of maximum values that should be displayed. Can be
#' used to avoid too many rows when variables have lots of unique values.
#' @param font_size For HTML tables, the font size.
#' @param line_padding For HTML tables, the distance (in pixel) between lines.
#' @param row_color For HTML tables, the fill color for odd rows.
#' @inheritParams standardize.data.frame
#' @inheritParams extract_column_names
#' @inheritParams data_tabulate
#'
#' @return A formatted data frame, summarizing the content of the data frame.
#' Returned columns include the column index of the variables in the original
#' data frame (`ID`), column name, variable label (if data is labelled), type
#' of variable, number of missing values, unique values (or value range),
#' value labels (for labelled data), and a frequency table (N for each value).
#' Most columns are formatted as character vectors.
#'
#' @note There are methods to `print()` the data frame in a nicer output, as
#' well methods for printing in markdown or HTML format (`print_md()` and
#' `print_html()`). The `print()` method for text outputs passes arguments in
#' `...` to [`insight::export_table()`].
#'
#' @examples
#' data(iris)
#' data_codebook(iris, select = starts_with("Sepal"))
#'
#' data(efc)
#' data_codebook(efc)
#'
#' # shorten labels
#' data_codebook(efc, variable_label_width = 20, value_label_width = 15)
#'
#' # automatic range for numerics at more than 5 unique values
#' data(mtcars)
#' data_codebook(mtcars, select = starts_with("c"))
#'
#' # force all values to be displayed
#' data_codebook(mtcars, select = starts_with("c"), range_at = 100)
#' @export
data_codebook <- function(
  data,
  select = NULL,
  exclude = NULL,
  variable_label_width = NULL,
  value_label_width = NULL,
  max_values = 10,
  range_at = 6,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  data_name <- insight::safe_deparse(substitute(data))

  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # check for emtpy columns, and remove
  empty <- empty_columns(data[select])
  if (length(empty)) {
    if (verbose) {
      insight::format_warning(
        sprintf(
          "Following %i columns were empty and have been removed:",
          length(empty)
        ),
        text_concatenate(names(empty))
      )
    }
    select <- select[-empty]
  }

  # check if any columns left, or found
  if (!length(select) || is.null(select)) {
    if (isTRUE(verbose)) {
      insight::format_warning(
        "No column names that matched the required search pattern were found."
      )
    }
    return(NULL)
  }

  # needed for % NA
  rows <- nrow(data)
  max_values <- max_values + 1

  out <- lapply(seq_along(select), function(id) {
    # variable
    x <- data[[select[id]]]
    x_na <- is.na(x)
    x_inf <- is.infinite(x)

    # inital data frame for codebook
    d <- data.frame(
      ID = which(colnames(data) == select[id]),
      Name = select[id],
      Type = .variable_type(x),
      Missings = sprintf("%g (%.1f%%)", sum(x_na), 100 * (sum(x_na) / rows)),
      stringsAsFactors = FALSE,
      row.names = NULL,
      check.names = FALSE
    )

    # check if there are variable labels
    variable_label <- .extract_variable_labels(x, variable_label_width)

    # we may need to remove duplicated value range elements
    flag_range <- FALSE

    # save value labels
    vallab <- attr(x, "labels", exact = TRUE)

    # do we have labelled NA values? If so, include labelled NAs in count table
    # we do this by converting NA values into character strings
    if (anyNA(vallab) && insight::check_if_installed("haven", quietly = TRUE)) {
      # get na-tags, i.e. the value labels for the different NA values
      na_labels <- haven::na_tag(vallab)
      # replace NA in labels with NA tags
      vallab[!is.na(na_labels)] <- stats::setNames(
        paste0("NA(", na_labels[!is.na(na_labels)], ")"),
        names(vallab[!is.na(na_labels)])
      )
      # replace tagged NAs in variable with their values, tagged as NA(value)
      na_values <- haven::na_tag(x)
      # need to convert, we still have haven-class, which cannot coerce
      x <- as.character(x)
      x[!is.na(na_values)] <- paste0("NA(", na_values[!is.na(na_values)], ")")
      # update information on NA - we still might have non-labelled (regular) NA
      x_na <- is.na(x)
    }

    # remove NA and Inf, for tabulate(). as.factor() will convert NaN
    # to a factor level "NaN", which we don't want here (same for Inf),
    # because tabulate() will then return frequencies for that level, too
    x <- x[!(x_na | x_inf)]

    # get unique values, to remove non labelled data
    unique_values <- unique(x)

    # coerce to factor, for tabulate(). We will coerce numerics to factor later
    # which is required because tabulate() doesn't return frequencies for values
    # lower than 1
    if (!is.numeric(x) && !is.factor(x)) {
      x <- as.factor(x)
    }

    # for ranges, we don't want the N% value, so use this to flag range-values
    is_range <- FALSE

    # handle labelled data - check if there are value labels or factor levels,
    # and extract values and N
    if (!is.null(vallab) && length(vallab)) {
      # if not all values are labelled, fill in value labels
      if (!all(unique_values %in% vallab)) {
        new_vals <- setdiff(unique_values, vallab)
        vallab <- c(vallab, stats::setNames(new_vals, new_vals))
      }
      # if not all value labels are present in the data, remove unused value labels
      if (!all(vallab %in% unique_values)) {
        not_needed <- setdiff(vallab, unique_values)
        # match not needed values in vallab vector - values from labels
        # may not be in sorted order (e.g. 1, 2, 3, -9), or may be character
        # vectors in case of tagged NAs, so we have to make sure we know which
        # values can be removed from vallab
        not_needed <- stats::na.omit(match(not_needed, vallab))
        vallab <- vallab[-not_needed]
      }
      # we now should have the same length of value labels and labelled values
      # which should also match the numberof unique values in the vector.
      # "tabulate" creates frequency tables by sorting by values/levels, so
      # we need to make sure that labels are also in sorted order.
      value_labels <- names(vallab)[order(unname(vallab))]
      values <- sort(unname(vallab))
      frq <- tabulate(as.factor(x))

      # handle factors
    } else if (is.factor(x)) {
      values <- levels(x)
      value_labels <- NA
      frq <- tabulate(x)

      # handle numerics
    } else {
      value_labels <- NA
      # only range for too many unique values
      if (length(unique_values) >= range_at) {
        r <- range(x, na.rm = TRUE)
        values <- sprintf("[%g, %g]", round(r[1], 2), round(r[2], 2))
        frq <- sum(!x_na)
        flag_range <- length(variable_label) > 1
        is_range <- TRUE
        # if we have few values, we can print whole freq. table
      } else {
        values <- sort(unique_values)
        frq <- tabulate(as.factor(x))
      }
    }

    # tabulate fills 0 for non-existing values, remove those
    frq <- frq[frq != 0]

    # add Inf values?
    if (any(x_inf) && length(frq) <= max_values) {
      values <- c(values, Inf)
      if (!is.na(value_labels)) {
        value_labels <- c(value_labels, "infinite")
      }
      frq <- c(frq, sum(x_inf))
      # Inf are added as value, so don't flag range any more,
      # since we now have proportions for the range and the inf values.
      is_range <- FALSE
    }

    # add proportions, but not for ranges, since these are always 100%
    if (is_range) {
      frq_proportions <- ""
    } else {
      frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
    }

    # make sure we have not too long rows, e.g. for variables that
    # have dozens of unique values
    if (length(value_labels) > max_values) {
      value_labels <- value_labels[1:max_values]
      value_labels[max_values] <- "(...)"
    }
    if (length(frq) > max_values) {
      frq <- frq[1:max_values]
      frq_proportions <- frq_proportions[1:max_values]
      frq[max_values] <- NA
      frq_proportions[max_values] <- NA
    }
    if (length(values) > max_values) {
      values <- values[1:max_values]
      values[max_values] <- "(...)"
    }

    # make sure length recycling doesn't fail, e.g. if we have split
    # variable_label into two lines (i.e. vector of length 2), but we have
    # 7 values in "frq", creating the data frame will fail. In this case,
    # we have to make sure that recycling shorter vectors works.
    if (length(variable_label) > 1 && !flag_range) {
      variable_label <- variable_label[seq_along(frq)]
    }

    # shorten value labels
    if (!is.null(value_label_width)) {
      value_labels <- insight::format_string(
        value_labels,
        length = value_label_width
      )
    }

    # add values, value labels and frequencies to data frame
    d <- cbind(
      d,
      data.frame(
        variable_label,
        values,
        value_labels,
        frq,
        proportions = frq_proportions,
        stringsAsFactors = FALSE
      )
    )

    # which columns need to be checked for duplicates?
    duplicates <- c("ID", "Name", "Type", "Missings", "variable_label")
    if (isTRUE(flag_range)) {
      # when we have numeric variables with value range as values, and when
      # these variables had long variable labels that have been wrapped,
      # the range value is duplicated (due to recycling), so we need to fix
      # this here.
      duplicates <- c(duplicates, c("values", "frq", "proportions"))
    }

    # clear duplicates due to recycling
    for (i in duplicates) {
      d[[i]][duplicated(d[[i]])] <- ""
    }

    # remove empty rows
    d <- remove_empty_rows(d)

    # add empty row at the end, as separator
    d[nrow(d) + 1, ] <- rep("", ncol(d))

    # add row ID
    d$.row_id <- id
    d
  })

  # clean-up (column order, rename, ...)
  out <- .finalize_result(do.call(rbind, out))

  # add attributes
  .add_codebook_attributes(out, data_name, data, select)
}


# helper -----------------------

#' @keywords internal
.extract_variable_labels <- function(x, variable_label_width = NULL) {
  varlab <- attr(x, "label", exact = TRUE)
  if (!is.null(varlab) && length(varlab)) {
    variable_label <- varlab
    # if variable labels are too long, split into multiple elements
    if (
      !is.null(variable_label_width) &&
        nchar(variable_label) > variable_label_width
    ) {
      variable_label <- insight::trim_ws(unlist(
        strsplit(
          text_wrap(variable_label, width = variable_label_width),
          "\n",
          fixed = TRUE
        ),
        use.names = FALSE
      ))
    }
  } else {
    variable_label <- NA
  }
  variable_label
}


#' @keywords internal
.finalize_result <- function(out) {
  # rename
  pattern <- c("variable_label", "values", "value_labels", "frq", "proportions")
  replacement <- c("Label", "Values", "Value Labels", "N", "Prop")
  for (i in seq_along(pattern)) {
    names(out) <- replace(names(out), names(out) == pattern[i], replacement[i])
  }

  # remove all empty columns
  out <- remove_empty_columns(out)

  # reorder
  column_order <- c(
    "ID",
    "Name",
    "Label",
    "Type",
    "Missings",
    "Values",
    "Value Labels",
    "N",
    "Prop",
    ".row_id"
  )
  out[union(intersect(column_order, names(out)), names(out))]
}


#' @keywords internal
.add_codebook_attributes <- function(out, data_name, data, select) {
  attr(out, "data_name") <- data_name
  attr(out, "n_rows") <- nrow(data)
  attr(out, "n_cols") <- ncol(data)
  attr(out, "n_shown") <- length(select)
  class(out) <- c("data_codebook", "data.frame")

  out
}


# methods ----------------------

#' @export
format.data_codebook <- function(x, format = "text", ...) {
  # use [["N"]] to avoid partial matching
  if (any(stats::na.omit(nchar(x[["N"]]) > 5))) {
    x[["N"]] <- insight::trim_ws(prettyNum(x[["N"]], big.mark = ","))
    x[["N"]][x[["N"]] == "NA" | is.na(x[["N"]])] <- ""
  }
  # merge N and %
  if (!is.null(x$Prop)) {
    x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- ""
    # align only for text format
    if (identical(format, "text")) {
      x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right") # nolint
    }
    x[["N"]][x$Prop != ""] <- sprintf(
      # nolint
      "%s (%s)",
      as.character(x[["N"]][x$Prop != ""]), # nolint
      x$Prop[x$Prop != ""] # nolint
    )
    x$Prop <- NULL
  }
  x
}


#' @export
print.data_codebook <- function(x, ...) {
  caption <- c(.get_codebook_caption(x), "blue")
  x$.row_id <- NULL
  cat(
    insight::export_table(
      format(x),
      title = caption,
      empty_line = "-",
      cross = "+",
      align = .get_codebook_align(x),
      ...
    )
  )
}


#' @rdname data_codebook
#' @export
print_html.data_codebook <- function(
  x,
  font_size = "100%",
  line_padding = 3,
  row_color = "#eeeeee",
  ...
) {
  caption <- .get_codebook_caption(x)
  attr(x, "table_caption") <- caption
  # since we have each value at its own row, the HTML table contains
  # horizontal borders for each cell/row. We want to remove those borders
  # from rows that actually belong to one variable
  separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint
  # remove separator lines, as we don't need these for HTML tables
  x <- x[-separator_lines, ]
  # check row IDs, and find odd rows
  odd_rows <- (x$.row_id %% 2 == 1)
  x$.row_id <- NULL
  # create basic table
  backend <- .check_format_backend(...)
  out <- insight::export_table(
    format(x, format = "html"),
    title = caption,
    format = backend,
    align = .get_codebook_align(x)
  )

  # for tiny table output, we don't need to do any further formatting
  if (identical(backend, "tt")) {
    return(out)
  }

  insight::check_if_installed("gt")
  # no border for rows which are not separator lines
  out <- gt::tab_style(
    out,
    style = list(gt::cell_borders(sides = "top", style = "hidden")),
    locations = gt::cells_body(rows = which(x$ID == "")) # nolint
  )
  # highlight odd rows
  if (!is.null(row_color)) {
    out <- gt::tab_style(
      out,
      style = list(gt::cell_fill(color = row_color)),
      locations = gt::cells_body(rows = odd_rows)
    )
  }
  # set up additonal HTML options
  gt::tab_options(
    out,
    table.font.size = font_size,
    data_row.padding = gt::px(line_padding)
  )
}


#' @rdname data_codebook
#' @export
display.data_codebook <- function(
  object,
  format = "markdown",
  font_size = "100%",
  line_padding = 3,
  row_color = "#eeeeee",
  ...
) {
  format <- .display_default_format(format)

  fun_args <- list(
    x = object,
    font_size = font_size,
    line_padding = line_padding,
    row_color = row_color,
    ...
  )

  # print table in HTML or markdown format
  if (format %in% c("html", "tt")) {
    fun_args$backend <- format
    do.call(print_html, fun_args)
  } else {
    do.call(print_md, fun_args)
  }
}


#' @export
print_md.data_codebook <- function(x, ...) {
  caption <- .get_codebook_caption(x)
  x$.row_id <- NULL
  attr(x, "table_caption") <- caption
  insight::export_table(
    format(x, format = "markdown"),
    title = caption,
    align = .get_codebook_align(x),
    format = "markdown"
  )
}


# helper ---------

.get_codebook_caption <- function(x) {
  n_rows <- as.character(attributes(x)$n_rows)
  if (nchar(n_rows) > 5) {
    n_rows <- prettyNum(n_rows, big.mark = ",")
  }
  sprintf(
    "%s (%s rows and %i variables, %i shown)",
    attributes(x)$data_name,
    n_rows,
    attributes(x)$n_cols,
    attributes(x)$n_shown
  )
}

.get_codebook_align <- function(x) {
  # need to remove this one
  x$Prop <- NULL
  align <- c(
    ID = "l",
    Name = "l",
    Label = "l",
    Type = "l",
    Missings = "r",
    Values = "r",
    `Value Labels` = "l",
    N = "r"
  )
  align <- align[colnames(x)]
  paste(unname(align), collapse = "")
}


================================================
FILE: R/data_duplicated.R
================================================
#' @title Extract all duplicates
#'
#' @description Extract all duplicates, for visual inspection.
#' Note that it also contains the first occurrence of future
#' duplicates, unlike [duplicated()] or [dplyr::distinct()]). Also
#' contains an additional column reporting the number of missing
#' values for that row, to help in the decision-making when
#' selecting which duplicates to keep.
#'
#' @inheritParams extract_column_names
#'
#' @keywords duplicates
#' @export
#' @seealso
#' [data_unique()]
#' @return A dataframe, containing all duplicates.
#' @examples
#' df1 <- data.frame(
#'   id = c(1, 2, 3, 1, 3),
#'   year = c(2022, 2022, 2022, 2022, 2000),
#'   item1 = c(NA, 1, 1, 2, 3),
#'   item2 = c(NA, 1, 1, 2, 3),
#'   item3 = c(NA, 1, 1, 2, 3)
#' )
#'
#' data_duplicated(df1, select = "id")
#'
#' data_duplicated(df1, select = c("id", "year"))
#'
#' # Filter to exclude duplicates
#' df2 <- df1[-c(1, 5), ]
#' df2
#'
data_duplicated <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  UseMethod("data_duplicated")
}

#' @export
data_duplicated.data.frame <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  select <- .select_nse(
    select,
    data,
    exclude = exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  data$temporary_id <- do.call(paste, c(data_select(data, select), sep = "_"))

  data <- cbind(Row = seq_len(nrow(data)), data)
  dups.index <- data$temporary_id %in%
    data$temporary_id[duplicated(data$temporary_id)]
  dups <- data[dups.index, ]
  dups$count_na <- rowSums(is.na(dups))
  dups <- data_arrange(dups, select)
  dups <- data_remove(dups, "temporary_id")
  dups
}

#' @export
data_duplicated.grouped_df <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  select <- .select_nse(
    select,
    data,
    exclude = exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  grps <- attr(data, "groups", exact = TRUE)
  grps <- grps[[".rows"]]

  data <- as.data.frame(data)

  out <- lapply(grps, function(x) {
    data_duplicated.data.frame(data[x, ], select = select)
  })

  out <- do.call(rbind, out)

  out
}


================================================
FILE: R/data_extract.R
================================================
#' Extract one or more columns or elements from an object
#'
#' `data_extract()` (or its alias `extract()`) is similar to `$`. It extracts
#' either a single column or element from an object (e.g., a data frame, list),
#' or multiple columns resp. elements.
#'
#' @param data The object to subset. Methods are currently available for data frames
#'   and data frame extensions (e.g., tibbles).
#' @param name An optional argument that specifies the column to be used as
#'   names for the vector elements after extraction. Must be specified either
#'   as literal variable name (e.g., `column_name`) or as string
#'   (`"column_name"`). `name` will be ignored when a data frame is returned.
#' @param extract String, indicating which element will be extracted when `select`
#'   matches multiple variables. Can be `"all"` (the default) to return all
#'   matched variables, `"first"` or `"last"` to return the first or last match,
#'   or `"odd"` and `"even"` to return all odd-numbered or even-numbered
#'   matches. Note that `"first"` or `"last"` return a vector (unless
#'   `as_data_frame = TRUE`), while `"all"` can return a vector (if only one
#'   match was found) *or* a data frame (for more than one match). Type safe
#'   return values are only possible when `extract` is `"first"` or `"last"` (will
#'   always return a vector) or when `as_data_frame = TRUE` (always returns a
#'   data frame).
#' @param as_data_frame Logical, if `TRUE`, will always return a data frame,
#'   even if only one variable was matched. If `FALSE`, either returns a vector
#'   or a data frame. See `extract` for details.
#' @param verbose Toggle warnings.
#' @param ... For use by future methods.
#'
#' @inheritParams extract_column_names
#'
#' @details `data_extract()` can be used to select multiple variables or pull a
#' single variable from a data frame. Thus, the return value is by default not
#' type safe - `data_extract()` either returns a vector or a data frame.
#' \subsection{Extracting single variables (vectors)}{
#' When `select` is the name of a single column, or when select only matches
#' one column, a vector is returned. A single variable is also returned when
#' `extract` is either `"first` or `"last"`. Setting `as_data_frame` to `TRUE`
#' overrides this behaviour and *always* returns a data frame.
#' }
#' \subsection{Extracting a data frame of variables}{
#' When `select` is a character vector containing more than one column name (or
#' a numeric vector with more than one valid column indices), or when `select`
#' uses one of the supported select-helpers that match multiple columns, a
#' data frame is returned. Setting `as_data_frame` to `TRUE` *always* returns
#' a data frame.
#' }
#'
#' @return A vector (or a data frame) containing the extracted element, or
#'   `NULL` if no matching variable was found.
#' @export
#'
#' @examples
#' # single variable
#' data_extract(mtcars, cyl, name = gear)
#' data_extract(mtcars, "cyl", name = gear)
#' data_extract(mtcars, -1, name = gear)
#' data_extract(mtcars, cyl, name = 0)
#' data_extract(mtcars, cyl, name = "row.names")
#'
#' # selecting multiple variables
#' head(data_extract(iris, starts_with("Sepal")))
#' head(data_extract(iris, ends_with("Width")))
#' head(data_extract(iris, 2:4))
#'
#' # select first of multiple variables
#' data_extract(iris, starts_with("Sepal"), extract = "first")
#'
#' # select first of multiple variables, return as data frame
#' head(data_extract(iris, starts_with("Sepal"), extract = "first", as_data_frame = TRUE))
data_extract <- function(data, select, ...) {
  UseMethod("data_extract")
}

#' @rdname data_extract
#' @export
data_extract.data.frame <- function(
  data,
  select,
  name = NULL,
  extract = "all",
  as_data_frame = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  extract <- match.arg(
    tolower(extract),
    choices = c("all", "first", "last", "odd", "even")
  )

  # evaluate arguments
  select <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # nothing to select?
  if (!length(select)) {
    return(NULL)
  }

  nl <- as.list(seq_along(data))
  names(nl) <- names(data)
  name <- eval(substitute(name), nl, parent.frame())

  if (is.numeric(name) && length(name) == 1L) {
    if (name < 0L) {
      name <- ncol(data) + name + 1L
    } else if (name == 0L) {
      name <- rownames(data)
    }
  } else if (is.character(name) && identical(name, "row.names")) {
    name <- rownames(data)
  }

  # chose which matched variables to extract
  select <- switch(
    extract,
    first = select[1L],
    last = select[length(select)],
    odd = select[seq(1L, length(select), 2L)],
    even = select[seq(2L, length(select), 2L)],
    select
  )

  # "name" only used for naming elements in a vector, not data frame
  needs_no_names <- isTRUE(as_data_frame) ||
    # more than one variable means data frame, so no name
    length(select) > 1L ||
    # if we have only one variable, but number of observations not equal to
    # length of names, we have no proper match, so no naming, too.
    (length(select) == 1L &&
      length(name) > 1L &&
      length(data[[select]]) != length(name))

  if (needs_no_names) {
    name <- NULL
  }

  # we definitely should have a vector here when name not NULL
  if (is.null(name)) {
    data[, select, drop = !as_data_frame]
  } else {
    # if name indicates a variable, extract values for naming now
    if (length(name) == 1L) {
      name <- data[[name]]
    }
    stats::setNames(data[[select]], name)
  }
}


================================================
FILE: R/data_group.R
================================================
#' @title Create a grouped data frame
#' @name data_group
#'
#' @description This function is comparable to `dplyr::group_by()`, but just
#' following the **datawizard** function design. `data_ungroup()` removes the
#' grouping information from a grouped data frame.
#'
#' @param data A data frame
#' @inheritParams extract_column_names
#'
#' @return A grouped data frame, i.e. a data frame with additional information
#' about the grouping structure saved as attributes.
#'
#' @examplesIf requireNamespace("poorman")
#' data(efc)
#' suppressPackageStartupMessages(library(poorman, quietly = TRUE))
#'
#' # total mean
#' efc %>%
#'   summarize(mean_hours = mean(c12hour, na.rm = TRUE))
#'
#' # mean by educational level
#' efc %>%
#'   data_group(c172code) %>%
#'   summarize(mean_hours = mean(c12hour, na.rm = TRUE))
#' @export
data_group <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # variables for grouping
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )
  # create grid with combinations of all levels
  my_grid <- as.data.frame(expand.grid(lapply(data[select], unique)))
  # sort grid
  my_grid <- my_grid[do.call(order, my_grid), , drop = FALSE]

  .rows <- lapply(seq_len(nrow(my_grid)), function(i) {
    as.integer(data_match(
      data,
      to = my_grid[i, , drop = FALSE],
      match = "and",
      return_indices = TRUE,
      remove_na = FALSE
    ))
  })
  my_grid[[".rows"]] <- .rows

  # remove data_match attributes
  attr(my_grid, "out.attrs") <- NULL
  attr(my_grid, ".drop") <- TRUE

  attr(data, "groups") <- my_grid
  class(data) <- unique(c("grouped_df", "data.frame"), class(data))

  data
}


#' @rdname data_group
#' @export
data_ungroup <- function(data, verbose = TRUE, ...) {
  attr(data, "groups") <- NULL
  class(data) <- unique(setdiff(class(data), "grouped_df"))

  data
}


================================================
FILE: R/data_match.R
================================================
#' Return filtered or sliced data frame, or row indices
#'
#' Return a filtered (or sliced) data frame or row indices of a data frame that
#' match a specific condition. `data_filter()` works like `data_match()`, but works
#' with logical expressions or row indices of a data frame to specify matching
#' conditions.
#'
#' @param x A data frame.
#' @param to A data frame matching the specified conditions. Note that if
#'   `match` is a value other than `"and"`, the original row order might be
#'   changed. See 'Details'.
#' @param match String, indicating with which logical operation matching
#'   conditions should be combined. Can be `"and"` (or `"&"`), `"or"` (or `"|"`)
#'   or `"not"` (or `"!"`).
#' @param return_indices Logical, if `TRUE`, return the vector of rows that
#'   can be used to filter the original data frame. If `FALSE` (default),
#'   returns directly the filtered data frame instead of the row indices.
#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before
#'   filtering the data. This is the default behaviour, however, sometimes when
#'   row indices are requested (i.e. `return_indices=TRUE`), it might be useful
#'   to preserve `NA` values, so returned row indices match the row indices of
#'   the original data frame.
#' @param ... A sequence of logical expressions indicating which rows to keep,
#'   or a numeric vector indicating the row indices of rows to keep. Can also be
#'   a string representation of a logical expression (e.g. `"x > 4"`), a
#'   character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains
#'   the string representation of a logical expression. These might be useful
#'   when used in packages to avoid defining undefined global variables.
#'
#' @return A filtered data frame, or the row indices that match the specified
#' configuration.
#'
#' @details For `data_match()`, if `match` is either `"or"` or `"not"`, the
#' original row order from `x` might be changed. If preserving row order is
#' required, use `data_filter()` instead.
#'
#' ```
#' # mimics subset() behaviour, preserving original row order
#' head(data_filter(mtcars[c("mpg", "vs", "am")], vs == 0 | am == 1))
#' #>                    mpg vs am
#' #> Mazda RX4         21.0  0  1
#' #> Mazda RX4 Wag     21.0  0  1
#' #> Datsun 710        22.8  1  1
#' #> Hornet Sportabout 18.7  0  0
#' #> Duster 360        14.3  0  0
#' #> Merc 450SE        16.4  0  0
#'
#' # re-sorting rows
#' head(data_match(mtcars[c("mpg", "vs", "am")],
#'                 data.frame(vs = 0, am = 1),
#'                 match = "or"))
#' #>                    mpg vs am
#' #> Mazda RX4         21.0  0  1
#' #> Mazda RX4 Wag     21.0  0  1
#' #> Hornet Sportabout 18.7  0  0
#' #> Duster 360        14.3  0  0
#' #> Merc 450SE        16.4  0  0
#' #> Merc 450SL        17.3  0  0
#' ```
#'
#' While `data_match()` works with data frames to match conditions against,
#' `data_filter()` is basically a wrapper around `subset(subset = <filter>)`.
#' However, unlike `subset()`, it preserves label attributes and is useful when
#' working with labelled data.
#'
#' @examples
#' data_match(mtcars, data.frame(vs = 0, am = 1))
#' data_match(mtcars, data.frame(vs = 0, am = c(0, 1)))
#'
#' # observations where "vs" is NOT 0 AND "am" is NOT 1
#' data_match(mtcars, data.frame(vs = 0, am = 1), match = "not")
#' # equivalent to
#' data_filter(mtcars, vs != 0 & am != 1)
#'
#' # observations where EITHER "vs" is 0 OR "am" is 1
#' data_match(mtcars, data.frame(vs = 0, am = 1), match = "or")
#' # equivalent to
#' data_filter(mtcars, vs == 0 | am == 1)
#'
#' # slice data frame by row indices
#' data_filter(mtcars, 5:10)
#'
#' # Define a custom function containing data_filter()
#' my_filter <- function(data, variable) {
#'   data_filter(data, variable)
#' }
#' my_filter(mtcars, "cyl == 6")
#'
#' # Pass complete filter-condition as string.
#' my_filter <- function(data, condition) {
#'   data_filter(data, condition)
#' }
#' my_filter(mtcars, "am != 0")
#'
#' # string can also be used directly as argument
#' data_filter(mtcars, "am != 0")
#'
#' # or as variable
#' fl <- "am != 0"
#' data_filter(mtcars, fl)
#' @inherit data_rename seealso
#' @export
data_match <- function(
  x,
  to,
  match = "and",
  return_indices = FALSE,
  remove_na = TRUE,
  ...
) {
  if (!is.data.frame(to)) {
    to <- as.data.frame(to)
  }
  original_x <- x

  # evaluate
  match <- match.arg(
    tolower(match),
    c("and", "&", "&&", "or", "|", "||", "!", "not")
  )
  match <- switch(
    match,
    `&` = ,
    `&&` = ,
    and = "and",
    `!` = ,
    not = "not",
    "or"
  )

  # validation check
  shared_columns <- intersect(colnames(x), colnames(to))
  if (is.null(shared_columns) || length(shared_columns) == 0L) {
    insight::format_error(
      "None of the columns from the data frame with matching conditions were found in `x`."
    )
  }

  # only select common columns
  x <- x[shared_columns]

  # prepare
  if (identical(match, "or")) {
    idx <- vector("numeric", length = 0L)
  } else {
    # remove missings before matching
    if (isTRUE(remove_na)) {
      x <- x[stats::complete.cases(x), , drop = FALSE]
    }
    idx <- seq_len(nrow(x))
  }

  # Find matching rows
  for (col in names(to)) {
    values <- x[[col]]
    if (match == "or") {
      idx <- union(idx, which(values %in% to[[col]]))
    } else if (match == "not") {
      idx <- idx[!values[idx] %in% to[[col]]]
    } else {
      idx <- idx[values[idx] %in% to[[col]]]
    }
  }

  # prepare output
  if (isFALSE(return_indices)) {
    out <- original_x[idx, , drop = FALSE]
    # restore value and variable labels
    for (i in colnames(out)) {
      attr(out[[i]], "label") <- attr(original_x[[i]], "label", exact = TRUE)
      attr(out[[i]], "labels") <- attr(original_x[[i]], "labels", exact = TRUE)
    }
  } else {
    out <- idx
  }

  # add back custom attributes
  out <- .replace_attrs(out, attributes(original_x))
  out
}


#' @rdname data_match
#' @export
data_filter <- function(x, ...) {
  UseMethod("data_filter")
}

#' @export
data_filter.data.frame <- function(x, ...) {
  out <- x

  # convert tibble to data.frame
  if (inherits(x, "tbl_df")) {
    out <- as.data.frame(out, stringsAsFactors = FALSE)
    tbl_input <- TRUE
  } else {
    tbl_input <- FALSE
  }

  dots <- match.call(expand.dots = FALSE)[["..."]]

  if (any(nzchar(names(dots), keepNA = TRUE))) {
    insight::format_error(
      "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison."
    )
  }

  # turn character vector (like `c("mpg <= 20", "cyl == 6")`) into symbols
  if (length(dots) == 1) {
    character_vector <- .dynEval(dots[[1]], ifnotfound = NULL)
    if (is.character(character_vector) && length(character_vector) > 1) {
      dots <- lapply(character_vector, str2lang)
    }
  }

  # Check syntax of the filter. Must be done *before* calling subset()
  # (cf easystats/datawizard#237)
  for (.fcondition in dots) {
    .check_filter_syntax(insight::safe_deparse(.fcondition))
  }

  for (i in seq_along(dots)) {
    # only proceed when result is still valid
    if (!is.null(out)) {
      symbol <- dots[[i]]
      # evaluate, we may have a variable with filter expression
      eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
      # validation check: is variable named like a function?
      if (is.function(eval_symbol)) {
        eval_symbol <- .dynGet(symbol, ifnotfound = NULL)
      }
      eval_symbol_numeric <- NULL
      if (!is.null(eval_symbol)) {
        # when possible to evaluate, do we have a numeric vector provided
        # as string? (e.g. `"5:10"`) - then try to coerce to numeric
        eval_symbol_numeric <- tryCatch(
          eval(parse(text = eval_symbol)),
          error = function(e) NULL
        )
      }

      # here we go when we have a filter expression, and no numeric vector to slice
      if (
        is.null(eval_symbol) ||
          (!is.numeric(eval_symbol) && !is.numeric(eval_symbol_numeric))
      ) {
        # could be evaluated? Then filter expression is a string and we need
        # to convert into symbol
        if (is.character(eval_symbol)) {
          symbol <- str2lang(eval_symbol)
        }
        # filter data
        out <- tryCatch(
          subset(out, subset = eval(symbol, envir = new.env())),
          warning = function(e) e,
          error = function(e) e
        )
      } else if (is.numeric(eval_symbol)) {
        # if symbol could be evaluated and is numeric, slice
        out <- tryCatch(out[eval_symbol, , drop = FALSE], error = function(e) {
          NULL
        })
      } else if (is.numeric(eval_symbol_numeric)) {
        # if symbol could be evaluated, was string and could be converted to numeric, slice
        out <- tryCatch(
          out[eval_symbol_numeric, , drop = FALSE],
          error = function(e) NULL
        )
      }

      if (inherits(out, c("simpleError", "objectNotFoundError"))) {
        error_msg <- out$message[1]
        # try to find out which variable was the cause for the error
        if (grepl("object '(.*)' not found", error_msg)) {
          error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
          # some syntax errors do not relate to misspelled variables...
          if (!error_var %in% colnames(x)) {
            insight::format_error(
              paste0(
                "Variable \"",
                error_var,
                "\" was not found in the dataset."
              ),
              .misspelled_string(colnames(x), error_var, "Possibly misspelled?")
            )
          }
        }
        out <- NULL
      }
    }
  }

  if (is.null(out)) {
    insight::format_error(
      "Filtering did not work. Please check the syntax of your conditions."
    )
  }

  # restore value and variable labels
  for (i in colnames(out)) {
    attr(out[[i]], "label") <- attr(x[[i]], "label", exact = TRUE)
    attr(out[[i]], "labels") <- attr(x[[i]], "labels", exact = TRUE)
  }

  # add back custom attributes
  out <- .replace_attrs(out, attributes(x))

  # add back tidyverse attributes
  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }

  out
}


#' @export
data_filter.grouped_df <- function(x, ...) {
  original_x <- x
  grps <- attr(x, "groups", exact = TRUE)
  grps <- grps[[".rows"]]

  # Remove tidyverse attributes, will add them back at the end
  if (inherits(x, "tbl_df")) {
    tbl_input <- TRUE
    x <- as.data.frame(x, stringsAsFactors = FALSE)
  } else {
    tbl_input <- FALSE
  }

  dots <- match.call(expand.dots = FALSE)[["..."]]
  out <- lapply(grps, function(grp) {
    arguments <- list(x[grp, ])
    arguments <- c(arguments, dots)
    do.call("data_filter.data.frame", arguments)
  })

  out <- do.call(rbind, out)

  if (!insight::object_has_rownames(x)) {
    rownames(out) <- NULL
  }

  # add back tidyverse attributes
  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }

  # add back custom attributes
  out <- .replace_attrs(out, attributes(original_x))

  out
}


# helper -------------------

.check_filter_syntax <- function(.fcondition) {
  # NOTE: We cannot check for `=` when "filter" is not a character vector
  # because the function will then fail in general. I.e.,
  # "data_filter(mtcars, filter = mpg > 10 & cyl = 4)" will not start
  # running this function and never reaches the first code line,
  # but immediately stops...
  tmp <- gsub("==", "", .fcondition, fixed = TRUE)
  tmp <- gsub("<=", "", tmp, fixed = TRUE)
  tmp <- gsub(">=", "", tmp, fixed = TRUE)
  tmp <- gsub("!=", "", tmp, fixed = TRUE)

  # We want to check whether user used a "=" in the filter syntax. This
  # typically indicates that the comparison "==" is probably wrong by using
  # a "=" instead of `"=="`. However, if a function was provided, we indeed
  # may have "=", e.g. if the pattern was
  # `data_filter(out, grep("pattern", x = value))`. We thus first check if we
  # can identify a function call, and only continue checking for wrong syntax
  # when we have not identified a function.

  if (
    !is.function(tryCatch(
      get(gsub("^(.*?)\\((.*)", "\\1", tmp)),
      error = function(e) NULL
    ))
  ) {
    # Give more informative message to users
    # about possible misspelled comparisons / logical conditions
    # check if "=" instead of "==" was used?
    if (any(grepl("=", tmp, fixed = TRUE))) {
      insight::format_error(
        "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison."
      )
    }
    # check if "&&" etc instead of "&" was used?
    logical_operator <- NULL
    if (any(grepl("&&", .fcondition, fixed = TRUE))) {
      logical_operator <- "&&"
    }
    if (any(grepl("||", .fcondition, fixed = TRUE))) {
      logical_operator <- "||"
    }
    if (!is.null(logical_operator)) {
      insight::format_error(
        paste0(
          "Filtering did not work. Please check if you need `",
          substr(logical_operator, 0, 1),
          "` (instead of `",
          logical_operator,
          "`) as logical operator."
        )
      )
    }
  }
}


================================================
FILE: R/data_merge.R
================================================
#' @title Merge (join) two data frames, or a list of data frames
#' @name data_merge
#'
#' @description
#' Merge (join) two data frames, or a list of data frames. However, unlike
#' base R's `merge()`, `data_merge()` offers a few more methods to join data
#' frames, and it does not drop data frame nor column attributes.
#'
#' @param x,y A data frame to merge. `x` may also be a list of data frames
#'   that will be merged. Note that the list-method has no `y` argument.
#' @param join Character vector, indicating the method of joining the data frames.
#'   Can be `"full"`, `"left"` (default), `"right"`, `"inner"`, `"anti"`, `"semi"`
#'   or `"bind"`. See details below.
#' @param by Specifications of the columns used for merging.
#' @param id Optional name for ID column that will be created to indicate the
#'   source data frames for appended rows. Only applies if `join = "bind"`.
#' @param verbose Toggle warnings.
#' @param ... Not used.
#'
#' @return
#' A merged data frame.
#'
#' @section Merging data frames:
#'
#' Merging data frames is performed by adding rows (cases), columns
#' (variables) or both from the source data frame (`y`) to the target
#' data frame (`x`). This usually requires one or more variables which
#' are included in both data frames and that are used for merging, typically
#' indicated with the `by` argument. When `by` contains a variable present
#' in both data frames, cases are matched and filtered by identical values
#' of `by` in `x` and `y`.
#'
#' @section Left- and right-joins:
#'
#' Left- and right joins usually don't add new rows (cases), but only new
#' columns (variables) for existing cases in `x`. For `join = "left"` or
#' `join = "right"` to work, `by` *must* indicate one or more columns that
#' are included in both data frames. For `join = "left"`, if `by` is an
#' identifier variable, which is included in both `x` and `y`, all variables
#' from `y` are copied to `x`, but only those cases from `y` that have
#' matching values in their identifier variable in `x` (i.e. all cases
#' in `x` that are also found in `y` get the related values from the new
#' columns in `y`). If there is no match between identifiers in `x` and `y`,
#' the copied variable from `y` will get a `NA` value for this particular
#' case. Other variables that occur both in `x` and `y`, but are not used
#' as identifiers (with `by`), will be renamed to avoid multiple identical
#' variable names. Cases in `y` where values from the identifier have no
#' match in `x`'s identifier are removed. `join = "right"` works in
#' a similar way as `join = "left"`, just that only cases from `x` that
#' have matching values in their identifier variable in `y` are chosen.
#'
#' In base R, these are equivalent to `merge(x, y, all.x = TRUE)` and
#' `merge(x, y, all.y = TRUE)`.
#'
#' @section Full joins:
#'
#' Full joins copy all cases from `y` to `x`. For matching cases in both
#' data frames, values for new variables are copied from `y` to `x`. For
#' cases in `y` not present in `x`, these will be added as new rows to `x`.
#' Thus, full joins not only add new columns (variables), but also might
#' add new rows (cases).
#'
#' In base R, this is equivalent to `merge(x, y, all = TRUE)`.
#'
#' @section Inner joins:
#'
#' Inner joins merge two data frames, however, only those rows (cases) are
#' kept that are present in both data frames. Thus, inner joins usually
#' add new columns (variables), but also remove rows (cases) that only
#' occur in one data frame.
#'
#' In base R, this is equivalent to `merge(x, y)`.
#'
#' @section Binds:
#'
#' `join = "bind"` row-binds the complete second data frame `y` to `x`.
#' Unlike simple `rbind()`, which requires the same columns for both data
#' frames, `join = "bind"` will bind shared columns from `y` to `x`, and
#' add new columns from `y` to `x`.
#'
#' @examples
#'
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4)
#'
#' x
#' y
#'
#' # "by" will default to all shared columns, i.e. "c" and "id". new columns
#' # "d" and "e" will be copied from "y" to "x", but there are only two cases
#' # in "x" that have the same values for "c" and "id" in "y". only those cases
#' # have values in the copied columns, the other case gets "NA".
#' data_merge(x, y, join = "left")
#'
#' # we change the id-value here
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 3:5)
#'
#' x
#' y
#'
#' # no cases in "y" have the same matching "c" and "id" as in "x", thus
#' # copied variables from "y" to "x" copy no values, all get NA.
#' data_merge(x, y, join = "left")
#'
#' # one case in "y" has a match in "id" with "x", thus values for this
#' # case from the remaining variables in "y" are copied to "x", all other
#' # values (cases) in those remaining variables get NA
#' data_merge(x, y, join = "left", by = "id")
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, 4:6]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, add new variables and copy values from y to x,
#' # where "id" values match
#' data_merge(x, y)
#'
#' # right-join, add new variables and copy values from x to y,
#' # where "id" values match
#' data_merge(x, y, join = "right")
#'
#' # full-join
#' data_merge(x, y, join = "full")
#'
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, c(1, 4:5)]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, no matching rows (because columns "id" and "disp" are used)
#' # new variables get all NA values
#' data_merge(x, y)
#'
#' # one common value in "mpg", so one row from y is copied to x
#' data_merge(x, y, by = "mpg")
#'
#' # only keep rows with matching values in by-column
#' data_merge(x, y, join = "semi", by = "mpg")
#'
#' # only keep rows with non-matching values in by-column
#' data_merge(x, y, join = "anti", by = "mpg")
#'
#' # merge list of data frames. can be of different rows
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:31, 3:5]
#' z <- mtcars[11:18, c(1, 3:4, 6:8)]
#' x$id <- 1:5
#' y$id <- 4:7
#' z$id <- 3:10
#' data_merge(list(x, y, z), join = "bind", by = "id", id = "source")
#' @inherit data_rename seealso
#' @export
data_merge <- function(x, ...) {
  UseMethod("data_merge")
}

#' @rdname data_merge
#' @export
data_join <- data_merge

#' @rdname data_merge
#' @export
data_merge.data.frame <- function(
  x,
  y,
  join = "left",
  by = NULL,
  id = NULL,
  verbose = TRUE,
  ...
) {
  class_x <- class(x)

  # save variable attributes
  attr_x_vars <- lapply(x, attributes)
  attr_y_vars <- lapply(y, attributes)
  attr_vars <- c(
    attr_x_vars,
    attr_y_vars[names(attr_y_vars)[!names(attr_y_vars) %in% names(attr_x_vars)]]
  )

  # check join-argument ----------------------

  join <- match.arg(
    join,
    choices = c("full", "left", "right", "inner", "semi", "anti", "bind")
  )

  # check id-argument ----------------------

  all_columns <- union(colnames(x), colnames(y))

  if (join == "bind" && !is.null(id) && id %in% all_columns) {
    # ensure unique ID
    id <- make.unique(c(all_columns, id), sep = "_")[length(all_columns) + 1]
    # and also tell user...
    if (isTRUE(verbose)) {
      insight::format_warning(
        sprintf(
          "Value of `id` already exists as column name. ID column was renamed to `%s`.",
          id
        )
      )
    }
  }

  if (!is.null(id) && join == "bind") {
    x[[id]] <- 1
    y[[id]] <- 2
  }

  # check merge columns ("by"-argument) ----------------------

  if (join != "bind") {
    # we need a value for "by". If not provided, use all shared column names
    if (is.null(by)) {
      by <- intersect(colnames(x), colnames(y))
    }

    # If not all column names specified in "by" are present, yield warning
    # and use all shared column names
    if (!all(by %in% colnames(x)) || !all(by %in% colnames(y))) {
      missing_in_x <- setdiff(by, colnames(x))
      missing_in_y <- setdiff(by, colnames(y))
      stop_message <- c(
        "Not all columns specified in `by` were found in the data frames.",
        if (length(missing_in_x) > 0L) {
          paste0(
            "Following columns are in `by` but absent in `x`: ",
            text_concatenate(missing_in_x)
          )
        },
        if (length(missing_in_y) > 0L) {
          paste0(
            "Following columns are in `by` but absent in `y`: ",
            text_concatenate(missing_in_y)
          )
        }
      )
      if (isTRUE(verbose)) {
        insight::format_error(stop_message)
      }
    }

    # if still both data frames have no common columns, do a full join
    if (!length(by)) {
      if (isTRUE(verbose)) {
        insight::format_warning(
          "Found no matching columns in the data frames. Fully merging both data frames now.",
          "Note that this can lead to unintended results, because rows in `x` and `y` are possibly duplicated.",
          "You probably want to use `data_merge(x, y, join = \"bind\")` instead."
        )
      }
      by <- NULL
      join <- "full"
    }
  }

  # check valid combination of "join" and "by" -----------------------

  if (join %in% c("anti", "semi") && (is.null(by) || length(by) != 1)) {
    insight::format_error(
      sprintf(
        "For `join = \"%s\"`, `by` needs to be a name of only one variable that is present in both data frames.",
        join
      )
    )
  }

  # merge --------------------

  # for later sorting
  if (join != "bind") {
    if (nrow(x) > 0L) {
      x$.data_merge_id_x <- seq_len(nrow(x))
    }
    if (nrow(y) > 0L) {
      y$.data_merge_id_y <- (seq_len(nrow(y))) + nrow(x)
    }
  }
  all_columns <- union(colnames(x), colnames(y))

  out <- switch(
    join,
    full = merge(x, y, all = TRUE, sort = FALSE, by = by),
    left = merge(x, y, all.x = TRUE, sort = FALSE, by = by),
    right = merge(x, y, all.y = TRUE, sort = FALSE, by = by),
    inner = merge(x, y, sort = FALSE, by = by),
    semi = x[x[[by]] %in% y[[by]], , drop = FALSE],
    anti = x[!x[[by]] %in% y[[by]], , drop = FALSE],
    bind = .bind_data_frames(x, y)
  )

  # sort rows, add attributes, and return results -------------------------

  if (".data_merge_id_x" %in% colnames(out)) {
    # for full joins, we have no complete sorting id, but NAs for each
    # data frame. we now "merge" the two sorting IDs from each data frame.
    if (anyNA(out$.data_merge_id_x) && ".data_merge_id_y" %in% colnames(out)) {
      out$.data_merge_id_x[is.na(
        out$.data_merge_id_x
      )] <- out$.data_merge_id_y[is.na(out$.data_merge_id_x)]
    }
    out <- out[order(out$.data_merge_id_x), ]
    out$.data_merge_id_x <- NULL
    out$.data_merge_id_y <- NULL
  }

  # try to restore original column order as good as possible. Therefore, we
  # first take all column names of the original input data frames, then
  # we add all new columns, like duplicated from merging (name.x and name.y,
  # if "name" was in both data frames, but not used in "by"), and then do a
  # final check that all column names are present in "out" (e.g., "name" would)
  # no longer be there if we have "name.x" and "name.y").

  all_columns <- c(all_columns, setdiff(colnames(out), all_columns))
  all_columns <- all_columns[all_columns %in% colnames(out)]
  out <- out[all_columns]

  # add back attributes
  out <- .replace_attrs(out, attributes(y))
  out <- .replace_attrs(out, attributes(x))

  for (i in colnames(out)) {
    if (is.list(attr_vars[[i]])) {
      if (is.list(attributes(out[[i]]))) {
        attributes(out[[i]]) <- utils::modifyList(
          attr_vars[[i]],
          attributes(out[[i]])
        )
      } else {
        attributes(out[[i]]) <- attr_vars[[i]]
      }
    }
  }

  class(out) <- unique(c(class_x, "data.frame"))
  out
}


#' @rdname data_merge
#' @export
data_merge.list <- function(
  x,
  join = "left",
  by = NULL,
  id = NULL,
  verbose = TRUE,
  ...
) {
  out <- x[[1]]
  df_id <- rep(1, times = nrow(out))

  for (i in 2:length(x)) {
    out <- data_merge(
      out,
      x[[i]],
      join = join,
      by = by,
      id = NULL,
      verbose = verbose,
      ...
    )
    df_id <- c(df_id, rep(i, times = nrow(x[[i]])))
  }

  # we need separate handling for list of data frames and id-variable here
  if (!is.null(id) && join == "bind") {
    if (id %in% colnames(out)) {
      # ensure unique ID
      id <- make.unique(c(colnames(out), id), sep = "_")[
        length(colnames(out)) + 1
      ]
      # and also tell user...
      if (isTRUE(verbose)) {
        insight::format_warning(
          sprintf(
            "Value of `id` already exists as column name. ID column was renamed to `%s`.",
            id
          )
        )
      }
    }
    out[[id]] <- df_id
  }

  out
}


.bind_data_frames <- function(x, y) {
  # merge and sort. "rbind()" is faster than "merge()" if all columns present
  if (all(colnames(x) %in% colnames(y)) && ncol(x) == ncol(y)) {
    # we may have different column order
    out <- rbind(x, y[match(colnames(x), colnames(y))])
  } else {
    # add ID for merging
    if (nrow(x) > 0L) {
      x$.data_merge_row <- seq_len(nrow(x))
    }
    if (nrow(y) > 0L) {
      y$.data_merge_row <- (nrow(x) + 1):(nrow(x) + nrow(y))
    }
    merge_by <- intersect(colnames(x), colnames(y))
    out <- merge(x, y, all = TRUE, sort = FALSE, by = merge_by)
  }

  # for empty df's, merge() may return an empty character vector
  # make sure it's a data frame object.
  if (!is.data.frame(out)) {
    out <- as.data.frame(out)
  }

  if (".data_merge_row" %in% colnames(out)) {
    out <- out[order(out$.data_merge_row), ]
  }

  out$.data_merge_row <- NULL
  out
}


================================================
FILE: R/data_modify.R
================================================
#' Create new variables in a data frame
#'
#' Create new variables or modify existing variables in a data frame. Unlike `base::transform()`, `data_modify()`
#' can be used on grouped data frames, and newly created variables can be directly
#' used.
#'
#' @param data A data frame
#' @param ... One or more expressions that define the new variable name and the
#' values or recoding of those new variables. These expressions can be one of:
#' - A sequence of named, literal expressions, where the left-hand side refers
#'   to the name of the new variable, while the right-hand side represent the
#'   values of the new variable. Example: `Sepal.Width = center(Sepal.Width)`.
#' - A vector of length 1 (which will be recycled to match the number of rows
#'   in the data), or of same length as the data.
#' - A variable that contains a value to be used. Example:
#'   ```r
#'   a <- "abc"
#'   data_modify(iris, var_abc = a) # var_abc contains "abc"
#'   ```
#' - An expression can also be provided as string and wrapped in
#'   `as_expr()`. Example:
#'   ```r
#'   data_modify(iris, as_expr("Sepal.Width = center(Sepal.Width)"))
#'   # or
#'   a <- "center(Sepal.Width)"
#'   data_modify(iris, Sepal.Width = as_expr(a))
#'   # or
#'   a <- "Sepal.Width = center(Sepal.Width)"
#'   data_modify(iris, as_expr(a))
#'   ```
#'   Note that `as_expr()` is no real function, which cannot be used outside
#'   of `data_modify()`, and hence it is not exported nor documented. Rather,
#'   it is only used for internally processing expressions.
#' - Using `NULL` as right-hand side removes a variable from the data frame.
#'   Example: `Petal.Width = NULL`.
#' - For data frames (including grouped ones), the function `n()` can be used to
#'   count the number of observations and thereby, for instance, create index
#'   values by using `id = 1:n()` or `id = 3:(n()+2)` and similar. Note that,
#'   like `as_expr()`, `n()` is also no true function and cannot be used outside
#'   of `data_modify()`.
#'
#' Note that newly created variables can be used in subsequent expressions,
#' including `.at` or `.if`. See also 'Examples'.
#'
#' @param .at A character vector of variable names that should be modified. This
#' argument is used in combination with the `.modify` argument. Note that only one
#' of `.at` or `.if` can be provided, but not both at the same time. Newly created
#' variables in `...` can also be selected, see 'Examples'.
#' @param .if A function that returns `TRUE` for columns in the data frame where
#' `.if` applies. This argument is used in combination with the `.modify` argument.
#' Note that only one of `.at` or `.if` can be provided, but not both at the same
#' time. Newly created variables in `...` can also be selected, see 'Examples'.
#' @param .modify A function that modifies the variables defined in `.at` or `.if`.
#' This argument is used in combination with either the `.at` or the `.if` argument.
#' Note that the modified variable (i.e. the result from `.modify`) must be either
#' of length 1 or of same length as the input variable.
#'
#' @note `data_modify()` can also be used inside functions. However, it is
#' recommended to pass the recode-expression as character vector or list of
#' characters.
#'
#' @examples
#' data(efc)
#' new_efc <- data_modify(
#'   efc,
#'   c12hour_c = center(c12hour),
#'   c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
#'   c12hour_z2 = standardize(c12hour)
#' )
#' head(new_efc)
#'
#' # using strings instead of literal expressions
#' new_efc <- data_modify(
#'   efc,
#'   as_expr("c12hour_c = center(c12hour)"),
#'   as_expr("c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"),
#'   as_expr("c12hour_z2 = standardize(c12hour)")
#' )
#' head(new_efc)
#'
#' # using a character vector, provided a variable
#' xpr <- c(
#'   "c12hour_c = center(c12hour)",
#'   "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
#'   "c12hour_z2 = standardize(c12hour)"
#' )
#' new_efc <- data_modify(efc, as_expr(xpr))
#' head(new_efc)
#'
#' # using character strings, provided as variable
#' stand <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
#' new_efc <- data_modify(
#'   efc,
#'   c12hour_c = center(c12hour),
#'   c12hour_z = as_expr(stand)
#' )
#' head(new_efc)
#'
#' # attributes - in this case, value and variable labels - are preserved
#' str(new_efc)
#'
#' # using `paste()` to build a string-expression
#' to_standardize <- c("Petal.Length", "Sepal.Length")
#' out <- data_modify(
#'   iris,
#'   as_expr(
#'     paste0(to_standardize, "_stand = standardize(", to_standardize, ")")
#'   )
#' )
#' head(out)
#'
#' # overwrite existing variable, remove old variable
#' out <- data_modify(iris, Petal.Length = 1 / Sepal.Length, Sepal.Length = NULL)
#' head(out)
#'
#' # works on grouped data
#' grouped_efc <- data_group(efc, "c172code")
#' new_efc <- data_modify(
#'   grouped_efc,
#'   c12hour_c = center(c12hour),
#'   c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
#'   c12hour_z2 = standardize(c12hour),
#'   id = 1:n()
#' )
#' head(new_efc)
#'
#' # works from inside functions
#' foo1 <- function(data, ...) {
#'   head(data_modify(data, ...))
#' }
#' foo1(iris, SW_fraction = Sepal.Width / 10)
#' # or
#' foo1(iris, as_expr("SW_fraction = Sepal.Width / 10"))
#'
#' # also with string arguments, using `as_expr()`
#' foo2 <- function(data, modification) {
#'   head(data_modify(data, as_expr(modification)))
#' }
#' foo2(iris, "SW_fraction = Sepal.Width / 10")
#'
#' # modify at specific positions or if condition is met
#' d <- iris[1:5, ]
#' data_modify(d, .at = "Species", .modify = as.numeric)
#' data_modify(d, .if = is.factor, .modify = as.numeric)
#'
#' # can be combined with dots
#' data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric)
#'
#' # new variables used in `.at` or `.if`
#' data_modify(
#'   d,
#'   new_length = Petal.Length * 2,
#'   .at = c("Petal.Length", "new_length"),
#'   .modify = round
#' )
#'
#' # combine "extract_column_names()" and ".at" argument
#' out <- data_modify(
#'   d,
#'   .at = extract_column_names(d, select = starts_with("Sepal")),
#'   .modify = as.factor
#' )
#' # "Sepal.Length" and "Sepal.Width" are now factors
#' str(out)
#'
#' @export
data_modify <- function(data, ...) {
  UseMethod("data_modify")
}


#' @export
data_modify.default <- function(data, ...) {
  insight::format_error("`data` must be a data frame.")
}


#' @rdname data_modify
#' @export
data_modify.data.frame <- function(
  data,
  ...,
  .if = NULL,
  .at = NULL,
  .modify = NULL
) {
  dots <- eval(substitute(alist(...)))

  # error for data frames with no rows...
  if (nrow(data) == 0) {
    insight::format_error(
      "`data` is an empty data frame. `data_modify()` only works for data frames with at least one row."
    ) # nolint
  }

  # check if we have dots, or only at/modify ----

  if (length(dots)) {
    # Check if dots are named. Usually, all dots should be named, i.e. include
    # the name of the new variable. There's only one exception, if a string is
    # masked as expression, and this string includes the new name, e.g.
    #
    # data_modify(iris, as_expr("sepwid = 2 * Sepal.Width"))
    # a <- "sepwid = 2 * Sepal.Width"
    # data_modify(iris, as_expr(a))
    #
    dots <- .process_unnamed_expressions(dots, data)

    # next, we check for named expression-tags and convert these into regular
    # expressions, e.g.
    #
    # data_modify(iris, sepwid =  = as_expr("2 * Sepal.Width"))
    # a <- "2 * Sepal.Width"
    # data_modify(iris, sepwid = as_expr(a))
    #
    dots <- .process_named_expressions(dots, data)

    for (i in seq_along(dots)) {
      # create new variable
      new_variable <- .get_new_dots_variable(dots, i, data)
      # give informative error when new variable doesn't match number of rows
      if (
        !is.null(new_variable) &&
          length(new_variable) != nrow(data) &&
          (nrow(data) %% length(new_variable)) != 0
      ) {
        insight::format_error(
          "New variable has not the same length as the other variables in the data frame and cannot be recycled."
        )
      }
      data[[names(dots)[i]]] <- new_variable
    }
  }

  # check if we have at/modify ----
  data <- .modify_at(data, .at, .if, .modify)

  data
}


#' @export
data_modify.grouped_df <- function(
  data,
  ...,
  .if = NULL,
  .at = NULL,
  .modify = NULL
) {
  # we need to evaluate dots here, and pass them with "do.call" to
  # the data.frame method later...
  dots <- match.call(expand.dots = FALSE)[["..."]]

  # error for data frames with no rows...
  if (nrow(data) == 0) {
    insight::format_error(
      "`data` is an empty data frame. `data_modify()` only works for data frames with at least one row."
    ) # nolint
  }

  grps <- attr(data, "groups", exact = TRUE)
  grps <- grps[[".rows"]]
  attr_data <- attributes(data)

  # remove conflicting class attributes
  class_attr <- class(data)
  data <- as.data.frame(data)

  if (length(dots)) {
    # check is dots are named. Usually, all dots should be named, i.e. include
    # the name of the new variable. There's only one exception, if a string is
    # masked as expression, and this string includes the new name, e.g.
    #
    # data_modify(iris, as_expr("sepwid = 2 * Sepal.Width"))
    # a <- "sepwid = 2 * Sepal.Width"
    # data_modify(iris, as_expr(a))
    #
    dots <- .process_unnamed_expressions(dots, data)

    # next, we check for named expression-tags and convert these into regular
    # expressions, e.g.
    #
    # data_modify(iris, sepwid =  = as_expr("2 * Sepal.Width"))
    # a <- "2 * Sepal.Width"
    # data_modify(iris, sepwid = as_expr(a))
    #
    dots <- .process_named_expressions(dots, data)
  }

  # create new variables as dummys, do for-loop works
  for (i in names(dots)) {
    # don't overwrite / fill existing variables with NA,
    # e.g. if we have "data_modify(iris, Sepal.Length = normalize(Sepal.Length))"
    # normalize() won't work when we fill with NA
    if (!i %in% colnames(data)) {
      data[[i]] <- NA
    }
  }

  # create new variables per group
  for (rows in grps) {
    data[rows, ] <- data_modify.data.frame(data[rows, ], ...)
  }

  # check if we have at/modify ----
  data <- .modify_at(data, .at, .if, .modify)

  # set back attributes and class
  data <- .replace_attrs(data, attr_data)
  class(data) <- class_attr
  data
}


# expression processing ----------------------------------------------------

.process_unnamed_expressions <- function(dots, data) {
  # dots are only unnamed, when the full expression is saved in a string,
  # e.g. data_modify(iris, as_expr("sepwid = 2 * Sepal.Width")).
  # Thus, we know we *have to* find an expression here, and the string value
  # *must* contain a name definition. If not, fail. If yes, convert string
  # into a language expression...

  if (!is.null(names(dots)) && all(nzchar(names(dots)))) {
    # if all elements are named, return early
    return(dots)
  }

  # find which dots are unnamed, check those for expressions
  if (is.null(names(dots))) {
    unnamed_dots <- seq_along(dots)
  } else {
    unnamed_dots <- which(!nzchar(names(dots)))
  }

  for (i in rev(unnamed_dots)) {
    # copy dot-element and convert to string for manipulation
    dot_element <- dots[[i]]
    symbol_string <- insight::safe_deparse(dot_element)
    # sanity check - this may happen when user wants to remove a variable,
    # e.g. data_modify(iris, as_expr("Species = NULL"))
    if (is.null(symbol_string)) {
      next
    }
    # we only allow unnamed elements if these are masked as expression. String
    # values or numeric values require a named element, i.e. we can only have
    # data_modify(iris, newvar = "a"), but we cannot have data_modify(iris, "a").
    # For expression, missing name is possible.
    if (!startsWith(symbol_string, "as_expr")) {
      insight::format_error(paste0(
        "A variable name for the expression `",
        symbol_string,
        "` is missing. ",
        "Please use something like `new_name = ",
        symbol_string,
        "`."
      ))
    }
    # next, check if the string-expression includes a name for the new variable
    # therefore, we remove the "as_expr()" token
    if (startsWith(symbol_string, "as_expr")) {
      symbol_string <- insight::trim_ws(
        gsub("as_expr\\((.*)\\)", "\\1", symbol_string)
      )
    }
    # remove c(), split at comma, if we have a vector of expressions
    if (startsWith(symbol_string, "c(")) {
      symbol_string <- gsub("c\\((.*)\\)", "\\1", symbol_string)
      # only split at highest-level comma
      pattern <- ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)" # suggestion by Co-pilot
      # Locate commas not inside quotes
      symbol_string <- insight::trim_ws(unlist(
        strsplit(symbol_string, pattern, perl = TRUE),
        use.names = FALSE
      ))
    }
    # check if we have any symbols instead of strings as expression, e.g.
    # xpr <- "sepwid = 2 * Sepal.Width"
    # data_modify(iris, as_expr(xpr))
    #
    # in this case, we need to evaluate the symbol (i.e. convert symbol string
    # into a language expression and then evaluate)
    symbol_string <- .evaluate_expression_in_string(symbol_string, data)
    # check whether we have exact one = sign. We need to have a name definition,
    # i.e. something like "var = a+b" - if the string has no "=" sign, name is
    # definitely missing
    pattern <- "(?<!=)=(?!=)"
    has_names <- grepl(pattern, symbol_string, perl = TRUE)
    if (!all(has_names)) {
      insight::format_error(paste0(
        "A variable name for the expression `",
        symbol_string[!has_names[1]],
        "` is missing. ",
        "Please use something like `new_name = ",
        symbol_string[!has_names[1]],
        "`."
      ))
    }
    # extract names (split at =), separate name from new variable from its
    # expression. we need this to create a named list of expressions for the dots
    symbol_string <- lapply(
      strsplit(symbol_string, "=", fixed = TRUE),
      function(split_result) {
        # we may have multiple "=" signs, e.g. when we have the pattern
        # "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)". In this case
        # paste remaining parts to one string again
        first_part <- split_result[1]
        second_part <- paste(split_result[-1], collapse = "=")
        insight::trim_ws(c(first_part, second_part))
      }
    )
    # extract names (LHS)
    symbol_names <- vapply(symbol_string, function(i) i[1], character(1))
    # extract expressions (RHS)
    symbol_string <- lapply(symbol_string, function(i) {
      str2lang(.fix_quotes(i[2]))
    })
    names(symbol_string) <- symbol_names
    # copy to dots... if we have a character vector, one dot element may
    # return more than one expression elements. Thus, we have to insert /
    # replace the old element by one or more new elements
    if (length(dots) == 1) {
      return_value <- symbol_string
    } else if (i == 1) {
      return_value <- c(symbol_string, dots[(i + 1):length(dots)])
    } else if (i == length(dots)) {
      return_value <- c(dots[1:(i - 1)], symbol_string)
    } else {
      return_value <- c(
        dots[1:(i - 1)],
        symbol_string,
        dots[(i + 1):length(dots)]
      )
    }
    dots <- return_value
  }
  dots
}


.process_named_expressions <- function(dots, data) {
  # this is basically a shorter version of ".process_unnamed_expressions()",
  # because we don't need to extract the name definition of the string, which
  # makes the handling easier. See ".process_unnamed_expressions()" for a more
  # comprehensive documentation of the single steps.

  for (i in seq_along(dots)) {
    dot_element <- dots[[i]]
    symbol_string <- insight::safe_deparse(dot_element)
    # sanity check, this may happen when user wants to remove a variable
    # e.g. data_modify(iris, Species = NULL)
    if (is.null(symbol_string)) {
      next
    }
    # extract string-expression, if we have any
    if (startsWith(symbol_string, "as_expr")) {
      symbol_string <- gsub("as_expr\\((.*)\\)", "\\1", symbol_string)
    } else {
      # no expression token found
      symbol_string <- NULL
    }
    # here we found an expression token - convert string into a regular expression
    if (!is.null(symbol_string)) {
      # check if we have any symbols instead of strings as expression
      symbol_string <- .evaluate_expression_in_string(symbol_string, data)
      # remove quotes from strings and save symbol name
      symbol_string <- .fix_quotes(symbol_string)
      symbol_name <- names(dots)[i]
      # convert string into language and replace in dots
      return_value <- try(str2lang(symbol_string), silent = TRUE)
      # sanity check - for invalid expressions, like
      # data_modify(iris, a = as_expr(c("1 + 1", "2 + 2")))
      # we get an error here
      if (inherits(return_value, "try-error")) {
        insight::format_error(paste0(
          "Could not evaluate expression `",
          symbol_string[1],
          "`. ",
          "Please check if it's correctly specified. If you think there's a bug ",
          "in `data_modify()`, please file an issue at {.url https://github.com/easystats/datawizard/issues}"
        ))
      }
      dots[[i]] <- return_value
      names(dots)[i] <- symbol_name
    }
  }
  dots
}


# helper -------------

.evaluate_expression_in_string <- function(symbol_string, data) {
  # check if we have any symbols instead of strings as expression, e.g.
  # xpr <- "sepwid = 2 * Sepal.Width"
  # data_modify(iris, as_expr(xpr))
  #
  # in this case, we need to evaluate the symbol (i.e. convert symbol string
  # into a language expression and then evaluate)
  symbol_string <- unlist(
    lapply(symbol_string, function(symbol_element) {
      if (startsWith(symbol_element, "\"")) {
        symbol_element
      } else {
        return_value <- .dynEval(str2lang(symbol_element))
        # dynEval might fail if we don't look in data - sanity check
        if (identical(return_value, symbol_element)) {
          return_value <- .dynEval(str2lang(symbol_element), data = data)
        }
        return_value
      }
    }),
    use.names = FALSE
  )
  # now we should have the expression as character string. Next, we
  # # remove quotes from strings
  gsub("^\"(.*)\"$", "\\1", symbol_string)
}


.fix_quotes <- function(symbol_string) {
  # if user uses double-quotes inside double-quotes, these are escaped by
  # "\", e.g. data_modify(iris, foo = as_expr("grepl(\"a\", Species)"))
  # In this case, we have double-backslashes, which need to be removed.
  # Furthermore, to avoid adding back backslashes, we replace by single quotes
  # Using single quotes inside a string, even if escaped with backslash, is no
  # problem here. Main issue is that if a string is parsed, double-quotes are
  # *always* escaped, so we just need to make sure we only have single quotes
  # and then remove all backslashes
  gsub("\\", "", gsub("\"", "'", symbol_string, fixed = TRUE), fixed = TRUE)
}


.modify_at <- function(data, .at, .if, .modify) {
  # check if ".at" or ".if" is defined, but not ".modify"
  if (is.null(.modify)) {
    if (!is.null(.at) || !is.null(.if)) {
      insight::format_error(
        "You need to specify `.modify` when using `.at` or `.if`."
      )
    }
    return(data)
  }
  # make sure "modify" is a function
  if (!is.function(.modify)) {
    insight::format_error("`.modify` must be a function.")
  }
  # make sure either .at or .if is defined, not both
  if (!is.null(.at) && !is.null(.if)) {
    insight::format_error(
      "You cannot use both `.at` and `.if` at the same time."
    )
  }
  # make sure at least one of .at or .if is defined
  if (is.null(.at) && is.null(.if)) {
    insight::format_error("You need to specify either `.at` or `.if`.")
  }

  column_names <- colnames(data)

  # if we have ".if" defined, specify ".at"
  if (!is.null(.if)) {
    .at <- column_names[vapply(data, .if, logical(1))]
  }
  # check for valid defined column names
  if (!all(.at %in% column_names)) {
    not_found <- .at[!.at %in% column_names]
    insight::format_error(
      paste0(
        "Variable",
        ifelse(length(not_found) > 1, "s ", " "),
        text_concatenate(not_found, enclose = "\""),
        ifelse(length(not_found) > 1, " were", " was"),
        " not found in the dataset."
      ),
      .misspelled_string(
        column_names,
        not_found,
        "Possibly misspelled or not yet defined?"
      )
    )
  }
  for (i in .at) {
    result <- tryCatch(
      .modify(data[[i]]),
      warning = function(e) e,
      error = function(e) e
    )
    if (inherits(result, c("error", "warning"))) {
      insight::format_error(
        paste0("Error in modifying variable \"", i, "\": ", result$message),
        "Please check if you correctly specified the `.modify` function."
      )
    } else {
      data[[i]] <- result
    }
  }

  data
}


.get_new_dots_variable <- function(dots, i, data) {
  # iterate expressions for new variables
  symbol <- dots[[i]]

  # we evaluate the content of "symbol", hence, "eval_symbol" either contains
  # the values of the expression, or the expression itself as string
  eval_symbol <- .dynEval(symbol, ifnotfound = NULL, data = data)

  # finally, we can evaluate expression and get values for new variables
  symbol_string <- insight::safe_deparse(symbol)
  if (!is.null(symbol_string) && all(symbol_string == "n()")) {
    # "special" functions - using "n()" just returns number of rows
    new_variable <- nrow(data)
  } else if (
    !is.null(symbol_string) &&
      length(symbol_string) == 1 &&
      grepl("\\bn\\(\\)", symbol_string)
  ) {
    # "special" functions, like "1:n()" or similar - but not "1:fun()"
    symbol_string <- str2lang(gsub(
      "n()",
      "nrow(data)",
      symbol_string,
      fixed = TRUE
    ))
    new_variable <- try(with(data, eval(symbol_string)), silent = TRUE)
  } else {
    # evaluate symbol
    new_variable <- try(with(data, eval(symbol)), silent = TRUE)
    # if evaluation fails, we have a value - and directly use it
    if (inherits(new_variable, "try-error") && !is.null(eval_symbol)) {
      new_variable <- eval_symbol
    }
  }

  # successful, or any errors, like misspelled variable name?
  if (inherits(new_variable, "try-error")) {
    # in which step did error happen?
    step_number <- switch(
      as.character(i),
      "1" = "the first expression",
      "2" = "the second expression",
      "3" = "the third expression",
      paste("expression", i)
    )
    step_msg <- paste0("There was an error in ", step_number, ".")
    # try to find out which variable was the cause for the error
    error_msg <- attributes(new_variable)$condition$message
    if (grepl("object '(.*)' not found", error_msg)) {
      error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
      insight::format_error(
        paste0(
          step_msg,
          " Variable \"",
          error_var,
          "\" was not found in the dataset or in the environment."
        ),
        .misspelled_string(
          colnames(data),
          error_var,
          "Possibly misspelled or not yet defined?"
        )
      )
    } else {
      insight::format_error(paste0(
        step_msg,
        " ",
        insight::format_capitalize(error_msg),
        ". Possibly misspelled or not yet defined?"
      ))
    }
  }

  new_variable
}


================================================
FILE: R/data_partition.R
================================================
#' Partition data
#'
#' Creates data partitions (for instance, a training and a test set) based on a
#' data frame that can also be stratified (i.e., evenly spread a given factor)
#' using the `by` argument.
#'
#' @inheritParams data_rename
#' @param proportion Scalar (between 0 and 1) or numeric vector, indicating the
#'   proportion(s) of the training set(s). The sum of `proportion` must not be
#'   greater than 1. The remaining part will be used for the test set.
#' @param by A character vector indicating the name(s) of the column(s) used
#'   for stratified partitioning.
#' @param seed A random number generator seed. Enter an integer (e.g. 123) so
#'   that the random sampling will be the same each time you run the function.
#' @param row_id Character string, indicating the name of the column that
#'   contains the row-id's.
#' @param verbose Toggle messages and warnings.
#'
#' @return A list of data frames. The list includes one training set per given
#'   proportion and the remaining data as test set. List elements of training
#'   sets are named after the given proportions (e.g., `$p_0.7`), the test set
#'   is named `$test`.
#'
#' @examples
#' data(iris)
#' out <- data_partition(iris, proportion = 0.9)
#' out$test
#' nrow(out$p_0.9)
#'
#' # Stratify by group (equal proportions of each species)
#' out <- data_partition(iris, proportion = 0.9, by = "Species")
#' out$test
#'
#' # Create multiple partitions
#' out <- data_partition(iris, proportion = c(0.3, 0.3))
#' lapply(out, head)
#'
#' # Create multiple partitions, stratified by group - 30% equally sampled
#' # from species in first training set, 50% in second training set and
#' # remaining 20% equally sampled from each species in test set.
#' out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species")
#' lapply(out, function(i) table(i$Species))
#'
#' @inherit data_rename seealso
#' @export
data_partition <- function(
  data,
  proportion = 0.7,
  by = NULL,
  seed = NULL,
  row_id = ".row_id",
  verbose = TRUE,
  ...
) {
  # validation checks
  data <- .coerce_to_dataframe(data)

  if (sum(proportion) > 1) {
    insight::format_error("Sum of `proportion` cannot be higher than 1.")
  }
  if (any(proportion < 0)) {
    insight::format_error("Values in `proportion` cannot be negative.")
  }
  if (sum(proportion) == 1 && isTRUE(verbose)) {
    insight::format_warning(
      "Proportions of sampled training sets (`proportion`) sums up to 1, so no test set will be generated."
    )
  }

  if (is.null(row_id)) {
    row_id <- ".row_id"
  }

  # check that name of row-id doesn't exist to prevent existing data
  # from overwriting. create new unique name for row-id then...
  if (row_id %in% colnames(data)) {
    if (isTRUE(verbose)) {
      insight::format_warning(
        paste0("A variable named \"", row_id, "\" already exists."),
        "Changing the value of `row_id` to a unique variable name now."
      )
    }
    unique_names <- make.unique(c(colnames(data), row_id), sep = "_")
    row_id <- unique_names[length(unique_names)]
  }

  if (!is.null(seed)) {
    set.seed(seed)
  }

  # add row-id column
  data[[row_id]] <- seq_len(nrow(data))

  # Create list of data groups. We generally lapply over list of
  # sampled row-id's by group, thus, we even create a list if not grouped.
  if (is.null(by)) {
    indices_list <- list(seq_len(nrow(data)))
  } else {
    # else, split by group(s) and extract row-ids per group
    indices_list <- lapply(
      split(data, data[by]),
      data_extract,
      select = row_id,
      as_data_frame = FALSE
    )
  }

  # iterate over (grouped) row-id's
  training_sets <- lapply(indices_list, function(i) {
    # return value, list of data frames
    d <- list()

    # row-id's by groups
    indices <- i

    # check length of group (= data)
    n <- length(indices)

    # iterate probabilities. we use for/next, so we can change
    # the "indices" variable, where we remove already sampled id's
    for (p in proportion) {
      # training-id's, sampled from id's per group - size is % within each group
      training <- sort(sample(indices, round(n * p)))

      # remove already sampled id's from group-indices
      indices <- setdiff(indices, training)

      # each training set data frame as one list element
      d[[length(d) + 1]] <- data[training, ]
    }
    d
  })

  # we need to move all list elements one level higher.
  if (is.null(by)) {
    training_sets <- training_sets[[1]]
  } else {
    # for grouped training sets, we need to row-bind all sampled training
    # sets from each group. currently, we have a list of data frames,
    # grouped by "group"; but we want one data frame per proportion that
    # contains sampled rows from all groups.
    training_sets <- lapply(seq_along(proportion), function(p) {
      do.call(rbind, lapply(training_sets, function(i) i[[p]]))
    })
  }

  # use probabilies as element names
  names(training_sets) <- sprintf("p_%g", proportion)

  # remove all training set id's from data, add remaining data (= test set)
  all_ids <- lapply(
    training_sets,
    data_extract,
    select = row_id,
    as_data_frame = FALSE
  )
  out <- c(
    training_sets,
    list(test = data[-unlist(all_ids, use.names = FALSE), ])
  )

  lapply(out, `row.names<-`, NULL)
}


================================================
FILE: R/data_peek.R
================================================
#' @title Peek at values and type of variables in a data frame
#' @name data_peek
#'
#' @description This function creates a table a data frame, showing all
#' column names, variable types and the first values (as many as fit into
#' the screen).
#'
#' @param x A data frame.
#' @param width Maximum width of line length to display. If `NULL`, width will
#' be determined using `options()$width`.
#' @param ... not used.
#' @inheritParams extract_column_names
#'
#' @note To show only specific or a limited number of variables, use the
#' `select` argument, e.g. `select = 1:5` to show only the first five variables.
#'
#' @return A data frame with three columns, containing information about
#' the name, type and first values of the input data frame.
#'
#' @examples
#' data(efc)
#' data_peek(efc)
#' # show variables two to four
#' data_peek(efc, select = 2:4)
#' @export
data_peek <- function(x, ...) {
  UseMethod("data_peek")
}


#' @rdname data_peek
#' @export
data_peek.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  width = NULL,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  out <- do.call(
    rbind,
    lapply(select, function(i) {
      .data_peek(x, i, width, verbose = verbose, ...)
    })
  )

  class(out) <- c("dw_data_peek", class(out))
  attr(out, "n_cols") <- ncol(x)
  attr(out, "n_rows") <- nrow(x)
  attr(out, "max_width") <- ifelse(is.null(width), 0.9 * options()$width, width)

  out
}


# methods -----------------

#' @export
print.dw_data_peek <- function(x, ...) {
  x <- format(x, ...)
  caption <- sprintf(
    "Data frame with %i rows and %i variables",
    attributes(x)$n_rows,
    attributes(x)$n_cols
  )
  cat(insight::export_table(x, align = "lll", caption = caption, ...))
}

#' @export
print_md.dw_data_peek <- function(x, ...) {
  x <- format(x, ...)
  caption <- sprintf(
    "Data frame with %i rows and %i variables",
    attributes(x)$n_rows,
    attributes(x)$n_cols
  )
  insight::export_table(
    x,
    align = "lll",
    format = "markdown",
    caption = caption,
    ...
  )
}

#' @export
print_html.dw_data_peek <- function(x, ...) {
  x <- format(x, ...)
  caption <- sprintf(
    "Data frame with %i rows and %i variables",
    attributes(x)$n_rows,
    attributes(x)$n_cols
  )
  insight::export_table(
    x,
    align = "lll",
    format = "html",
    caption = caption,
    ...
  )
}

#' @export
format.dw_data_peek <- function(x, ...) {
  width_col1 <- max(nchar(x$Variable))
  width_col2 <- max(nchar(x$Type))
  max_width <- attributes(x)$max_width
  if (is.null(max_width)) {
    max_width <- 0.9 * options()$width
  }
  width_col3 <- max_width - (width_col1 + width_col2 + 10) # 10 = separator chars in table

  # shorten value-string
  x$Values <- substr(x$Values, 0, width_col3)
  # make sure we have a clear truncation, at last "comma"
  x$Values <- gsub("(.+)(,.+)$", "\\1", x$Values)
  # add "..."
  x$Values <- paste0(x$Values, ", ...")

  x
}


# helper -----------------

.data_peek <- function(x, variable, width = NULL, verbose = TRUE, ...) {
  v_name <- variable
  v_type <- .variable_type(x[[variable]])
  v_type[v_type == "categorical"] <- "factor"

  max_width <- ifelse(is.null(width), 0.9 * options()$width, width)
  v_values <- toString(x[[variable]][1:max_width])

  data.frame(
    Variable = v_name,
    Type = v_type,
    Values = v_values,
    stringsAsFactors = FALSE
  )
}


================================================
FILE: R/data_read.R
================================================
#' @title Read (import) data files from various sources
#' @name data_read
#'
#' @description
#' This functions imports data from various file types. It is a small wrapper
#' around `haven::read_spss()`, `haven::read_stata()`, `haven::read_sas()`,
#' `readxl::read_excel()` and `data.table::fread()` resp. `readr::read_delim()`
#' (the latter if package **data.table** is not installed). Thus, supported file
#' types for importing data are data files from SPSS, SAS or Stata, Excel files
#' or text files (like '.csv' files). All other file types are passed to
#' `rio::import()`. `data_write()` works in a similar way.
#'
#' @param path Character string, the file path to the data file.
#' @param path_catalog Character string, path to the catalog file. Only relevant
#' for SAS data files.
#' @param encoding The character encoding used for the file. Usually not needed.
#' @param convert_factors If `TRUE` (default), numeric variables, where all
#' values have a value label, are assumed to be categorical and converted into
#' factors. If `FALSE`, no variable types are guessed and no conversion of
#' numeric variables into factors will be performed. For `data_read()`, this
#' argument only applies to file types with *labelled data*, e.g. files from
#' SPSS, SAS or Stata. See also section 'Differences to other packages'. For
#' `data_write()`, this argument only applies to the text (e.g. `.txt` or
#' `.csv`) or spreadsheet file formats (like `.xlsx`). Converting to factors
#' might be useful for these formats because labelled numeric variables are then
#' converted into factors and exported as character columns - else, value labels
#' would be lost and only numeric values are written to the file.
#' @param password Password for data encryption. If not `NULL`, the data will be
#' encrypted (for `data_write()`) or decrypted (for `data_read()`) using the
#' provided password. Encryption is currently only supported for R file formats
#' (`.rds`, `.rda` and `.rdata`). See the section "Data encryption" below for more
#' information on the encryption method used.
#' @param verbose Toggle warnings and messages.
#' @param ... Arguments passed to the related `read_*()` or `write_*()` functions.
#'
#' @return A data frame.
#'
#' @section Supported file types:
#' - `data_read()` is a wrapper around the **haven**, **data.table**, **readr**
#'   **readxl**, **nanoparquet** and **rio** packages. Currently supported file
#'   types are `.txt`, `.csv`, `.xls`, `.xlsx`, `.sav`, `.por`, `.dta`, `.sas`,
#'   `.rda`, `.parquet`, `.rdata`, and `.rds` (and related files). All other file
#'   types are passed to `rio::import()`.
#' - `data_write()` is a wrapper around **haven**, **readr**, **nanoparquet**,
#'   and **rio** packages, and supports writing files into all formats supported
#'   by these packages.
#'
#' @section Compressed files (zip) and URLs:
#' `data_read()` can also read the above mentioned files from URLs or from
#' inside zip-compressed files. Thus, `path` can also be a URL to a file like
#' `"http://www.url.com/file.csv"`. When `path` points to a zip-compressed file,
#' and there are multiple files inside the zip-archive, then the first supported
#' file is extracted and loaded.
#'
#' @section General behaviour:
#' `data_read()` detects the appropriate `read_*()` function based on the
#' file-extension of the data file. Thus, in most cases it should be enough to
#' only specify the `path` argument. However, if more control is needed, all
#' arguments in `...` are passed down to the related `read_*()` function. The
#' same applies to `data_write()`, i.e. based on the file extension provided in
#' `path`, the appropriate `write_*()` function is used automatically.
#'
#' @section SPSS specific behaviour:
#' `data_read()` does *not* import user-defined ("tagged") `NA` values from
#' SPSS, i.e. argument `user_na` is always set to `FALSE` when importing SPSS
#' data with the **haven** package. Use `convert_to_na()` to define missing
#' values in the imported data, if necessary. Furthermore, `data_write()`
#' compresses SPSS files by default. If this causes problems with (older) SPSS
#' versions, use `compress = "none"`, for example
#' `data_write(data, "myfile.sav", compress = "none")`.
#'
#' @section Differences to other packages that read foreign data formats:
#' `data_read()` is most comparable to `rio::import()`. For data files from
#' SPSS, SAS or Stata, which support labelled data, variables are converted into
#' their most appropriate type. The major difference to `rio::import()` is for
#' data files from SPSS, SAS, or Stata, i.e. file types that support
#' *labelled data*. `data_read()` automatically converts fully labelled numeric
#' variables into factors, where imported value labels will be set as factor
#' levels. If a numeric variable has _no_ value labels or less value labels than
#' values, it is not converted to factor. In this case, value labels are
#' preserved as `"labels"` attribute. Character vectors are preserved. Use
#' `convert_factors = FALSE` to remove the automatic conversion of numeric
#' variables to factors.
#'
#' @section Data encryption:
#' `data_read()` and `data_write()` support data encryption for R file formats
#' (`.rds`, `.rda` and `.rdata`). To encrypt a file, provide a password to the
#' `password` argument in `data_write()`. To decrypt the file, provide the same
#' password to `data_read()`. The encryption is based on the **openssl** package
#' and uses the AES-GCM algorithm (see `?openssl::aes_gcm_encrypt`) with a
#' 256-bit key (see `?openssl::sha256`). Thus, data can also be decrypted without
#' relying on the **datawizard** package, e.g. using following code:
#'
#' ```
#' encrypted_data <- readRDS(datafile)
#' key <- openssl::sha256(charToRaw("<password>"))
#' out <- openssl::aes_gcm_decrypt(encrypted_data, key = key)
#' decrypted_data <- unserialize(out)
#' ```
#'
#' **Warning:** Do not lose your `password`, else you will not be able to
#' decrypt the data again!
#'
#' @export
data_read <- function(
  path,
  path_catalog = NULL,
  encoding = NULL,
  convert_factors = TRUE,
  password = NULL,
  verbose = TRUE,
  ...
) {
  # extract first valid file from zip-file
  if (identical(.file_ext(path), "zip")) {
    path <- .extract_zip(path)
  }

  # check for valid file type
  file_type <- .file_ext(path)
  if (!is.character(file_type) || file_type == "") {
    insight::format_error(
      "Could not detect file type. The `path` argument has no file extension.",
      "Please provide a file path including extension, like \"myfile.csv\" or \"c:/Users/Default/myfile.sav\"."
    )
  }

  # read data
  out <- switch(
    file_type,
    txt = ,
    csv = .read_text(path, encoding, verbose, ...),
    rda = ,
    rdata = .read_base_rda(path, file_type, password, verbose, ...),
    rds = .read_base_rds(path, password, verbose, ...),
    xls = ,
    xlsx = .read_excel(path, encoding, verbose, ...),
    sav = ,
    por = .read_spss(path, encoding, convert_factors, verbose, ...),
    dta = .read_stata(path, encoding, convert_factors, verbose, ...),
    sas7bdat = .read_sas(
      path,
      path_catalog,
      encoding,
      convert_factors,
      verbose,
      ...
    ),
    parquet = .read_parquet(path, verbose, ...),
    .read_unknown(path, file_type, verbose, ...)
  )

  # tell user about empty columns
  if (verbose) {
    empty_cols <- empty_columns(out)
    # only message if we actually have empty columns
    if (length(empty_cols)) {
      insight::format_alert(
        sprintf("Following %i variables are empty:", length(empty_cols)),
        text_concatenate(names(empty_cols)),
        "\nUse `remove_empty_columns()` to remove them from the data frame."
      )
    }
  }

  out
}


# helper -----------------------

.file_ext <- function(x) {
  pos <- regexpr("\\.([[:alnum:]]+)$", x)
  ifelse(pos > -1L, tolower(substring(x, pos + 1L)), "")
}


.extract_zip <- function(path) {
  files <- utils::unzip(path, list = TRUE)
  files_ext <- vapply(files$Name, .file_ext, FUN.VALUE = character(1L))

  supported_filetypes <- c("txt", "csv", "xls", "xlsx", "sav", "por", "dta")
  dest <- files$Name[which(files_ext %in% supported_filetypes)]

  if (length(dest) > 0) {
    d <- tempfile()
    dir.create(d)
    utils::unzip(path, exdir = d)
    path <- file.path(d, dest[1])
  } else {
    insight::format_error(
      "The zip-file does not contain any supported file types."
    )
  }

  path
}


# process imported data from SPSS, SAS or Stata -----------------------

.post_process_imported_data <- function(x, convert_factors, verbose) {
  # user may decide whether we automatically detect variable type or not
  if (isTRUE(convert_factors)) {
    if (verbose) {
      msg <- "Variables where all values have associated labels are now converted into factors. If this is not intended, use `convert_factors = FALSE`." # nolint
      insight::format_alert(msg)
    }
    x[] <- lapply(x, function(i) {
      # only proceed if not all missing
      if (!all(is.na(i))) {
        # save labels
        value_labels <- attr(i, "labels", exact = TRUE)
        variable_labels <- attr(i, "label", exact = TRUE)

        # filter, so only matching value labels remain
        value_labels <- value_labels[value_labels %in% unique(i)]

        # guess variable type
        if (is.character(i)) {
          # we need this to drop haven-specific class attributes
          i <- as.character(i)
        } else if (
          !is.null(value_labels) && length(value_labels) == insight::n_unique(i)
        ) {
          # if all values are labelled, we assume factor. Use labels as levels
          if (is.numeric(i)) {
            i <- factor(i, labels = names(value_labels))
          } else {
            i <- factor(as.character(i), labels = names(value_labels))
          }
          value_labels <- NULL
          attr(i, "converted_to_factor") <- TRUE
        } else {
          # else, fall back to numeric or factor
          i <- as.numeric(i)
        }

        # drop unused value labels
        value_labels <- value_labels[value_labels %in% unique(i)]
        if (length(value_labels) > 0L) {
          attr(i, "labels") <- value_labels
        }

        # add back variable label
        attr(i, "label") <- variable_labels
      }
      i
    })
    # tell user how many variables were converted
    if (verbose) {
      cnt <- sum(vapply(
        x,
        function(i) isTRUE(attributes(i)$converted_to_factor),
        TRUE
      ))
      msg <- sprintf(
        "%i out of %i variables were fully labelled and converted into factors.",
        cnt,
        ncol(x)
      )
      insight::format_alert(msg)
    }
  } else {
    # drop haven class attributes
    x[] <- lapply(x, function(i) {
      # save labels
      class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr"))
      i
    })
  }

  class(x) <- "data.frame"
  x
}


# read functions -----------------------

.read_spss <- function(path, encoding, convert_factors, verbose, ...) {
  insight::check_if_installed(
    "haven",
    reason = paste0("to read files of type '", .file_ext(path), "'")
  )
  if (verbose) {
    insight::format_alert("Reading data...")
  }
  out <- haven::read_sav(file = path, encoding = encoding, user_na = FALSE, ...)
  .post_process_imported_data(out, convert_factors, verbose)
}


.read_stata <- function(path, encoding, convert_factors, verbose, ...) {
  insight::check_if_installed(
    "haven",
    reason = paste0("to read files of type '", .file_ext(path), "'")
  )
  if (verbose) {
    insight::format_alert("Reading data...")
  }
  out <- haven::read_dta(file = path, encoding = encoding, ...)
  .post_process_imported_data(out, convert_factors, verbose)
}


.read_sas <- function(
  path,
  path_catalog,
  encoding,
  convert_factors,
  verbose,
  ...
) {
  insight::check_if_installed(
    "haven",
    reason = paste0("to read files of type '", .file_ext(path), "'")
  )
  if (verbose) {
    insight::format_alert("Reading data...")
  }
  out <- haven::read_sas(
    data_file = path,
    catalog_file = path_catalog,
    encoding = encoding,
    ...
  )
  .post_process_imported_data(out, convert_factors, verbose)
}


.read_excel <- function(path, encoding, verbose, ...) {
  insight::check_if_installed(
    "readxl",
    reason = paste0("to read files of type '", .file_ext(path), "'")
  )
  if (verbose) {
    insight::format_alert("Reading data...")
  }
  out <- readxl::read_excel(path, ...)
  class(out) <- "data.frame"
  out
}


.read_text <- function(path, encoding, verbose, ...) {
  if (insight::check_if_installed("data.table", quietly = TRUE)) {
    # set proper default encoding-value for fread
    if (is.null(encoding)) {
      encoding <- "unknown"
    }
    out <- data.table::fread(input = path, encoding = encoding, ...)
    return(as.data.frame(out))
  }

  insight::check_if_installed(
    "readr",
    reason = paste0("to read files of type '", .file_ext(path), "'")
  )
  if (verbose) {
    insight::format_alert("Reading data...")
  }
  out <- readr::read_delim(path, ...)
  as.data.frame(out)
}


.read_unknown <- function(path, file_type, verbose, ...) {
  insight::check_if_installed(
    "rio",
    reason = paste0("to read files of type '", file_type, "'")
  )
  if (verbose) {
    insight::format_alert("Reading data...")
  }
  # set up arguments. for RDS, we set trust = TRUE, to avoid warnings
  rio_args <- list(file = path)
  # check if we have RDS, and if so, add trust = TRUE
  if (file_type %in% c("rds", "rdata", "rda")) {
    rio_args$trust <- TRUE
  }
  out <- do.call(rio::import, c(rio_args, list(...)))

  # check if loaded file is a data frame, or not (e.g. model objects)
  # it returns `NULL` if the file is no valid data file that contains a data
  # frame.frame, or cannot be coerced to a data frame. Else, if it was a data
  # frame or could be coerced into one, the (new) data frame is returned. In
  # this case, we overwrite "out", else we keep its original object.
  valid_data_object <- .get_data_from_loaded_file(out, verbose)
  # if file could be coerced to a data frame, overwrite out
  if (!is.null(valid_data_object)) {
    out <- valid_data_object
  }

  out
}


.read_base_rda <- function(path, file_type, password, verbose = TRUE, ...) {
  if (verbose) {
    insight::format_alert("Reading data...")
  }

  # check URLs
  path <- .check_path_url(path, file_type)

  # since RData and rda can keep multiple files, we load them into a
  # new environment and return them as list object then
  env <- new.env()
  load(file = path, envir = env)

  # if the RData file contains more than one object, we don't check the output
  # but just return everything
  if (length(ls(env)) > 1) {
    if (verbose) {
      insight::format_alert(
        "File contained more than one object, returning all objects."
      )
    }
    return(as.list(env))
  }

  # else, retrieve loaded object
  out <- get(ls(env)[1], env)

  # data decryption
  out <- .data_decryption(out, password)

  # check if loaded file is a data frame, or not (e.g. model objects)
  # it returns `NULL` if the file is no valid data file that contains a data
  # frame.frame, or cannot be coerced to a data frame. Else, if it was a data
  # frame or could be coerced into one, the (new) data frame is returned. In
  # this case, we overwrite "out", else we keep its original object.
  valid_data_object <- .get_data_from_loaded_file(out, verbose)
  # if file could be coerced to a data frame, overwrite out
  if (!is.null(valid_data_object)) {
    out <- valid_data_object
  }

  out
}


.read_base_rds <- function(path, password, verbose = TRUE, ...) {
  if (verbose) {
    insight::format_alert("Reading data...")
  }

  # check URLs
  path <- .check_path_url(path, file_type = "rds")
  out <- readRDS(file = path)

  # data decryption
  out <- .data_decryption(out, password)

  # check if loaded file is a data frame, or not (e.g. model objects)
  # it returns `NULL` if the file is no valid data file that contains a data
  # frame.frame, or cannot be coerced to a data frame. Else, if it was a data
  # frame or could be coerced into one, the (new) data frame is returned. In
  # this case, we overwrite "out", else we keep its original object.
  valid_data_object <- .get_data_from_loaded_file(out, verbose)
  # if file could be coerced to a data frame, overwrite out
  if (!is.null(valid_data_object)) {
    out <- valid_data_object
  }

  out
}


.read_parquet <- function(path, verbose = TRUE, ...) {
  # requires nanoparquet package
  insight::check_if_installed("nanoparquet")

  if (verbose) {
    insight::format_alert("Reading data...")
  }

  # check URLs
  path <- .check_path_url(path, file_type = "parquet")
  out <- nanoparquet::read_parquet(file = path, ...)
  as.data.frame(out)
}


# check input helper --------------------------------------------------------

# for URLs, we need to download the file and save it locally
.check_path_url <- function(path, file_type) {
  url_pattern <- "^(https?|ftp)://(.*)"
  # check if file path is an URL
  if (grepl(url_pattern, path)) {
    insight::check_if_installed("curl")
    if (curl::has_internet()) {
      # if yes, create temp file and save file locally
      temp_file <- tempfile(fileext = paste0(".", file_type))
      download <- curl::curl_fetch_memory(path)
      writeBin(object = download$content, con = temp_file)
      # return path to temp file
      path <- temp_file
    } else {
      insight::format_error(
        "No internet connection detected. Could not download file from URL."
      )
    }
  }
  path
}


.get_data_from_loaded_file <- function(out, verbose = TRUE) {
  # it is also possible to read in pre-compiled model objects with data_read()
  # in this case, just return as is. We do this check before we check with
  # "is.data.frame()", because some models (like brmsfit) have an `as.data.frame()`
  # method, which coerces the model object into a data frame, which is likely to
  # be not intentional
  if (insight::is_model(out)) {
    if (verbose) {
      insight::format_alert(
        paste0(
          "Imported file is a regression model object of class \"",
          class(out)[1],
          "\"."
        ),
        "Returning file as is."
      )
    }
    return(NULL)
  }

  # for "unknown" data formats (like .RDS), which still can be imported via
  # "rio::import()", we must check whether we actually have a data frame or
  # not. Else, tell user.
  if (!is.data.frame(out)) {
    tmp <- tryCatch(
      as.data.frame(out, stringsAsFactors = FALSE),
      error = function(e) NULL
    )
    if (is.null(tmp)) {
      if (verbose) {
        insight::format_warning(
          paste0(
            "Imported file is no data frame, but of class \"",
            class(out)[1],
            "\"."
          ),
          "Returning file as is. Please check if importing this file was intended."
        )
      }
      return(NULL)
    }
    out <- tmp
  }
  out
}

# decrypt data ---------------------------------

.data_decryption <- function(data, password = NULL) {
  # check if data should be decrypted
  if (!is.null(password)) {
    .validate_password(password)
    data <- .decrypt_data(data, password)
  }
  data
}

.decrypt_data <- function(data, password = NULL) {
  insight::check_if_installed("openssl", "for data decryption")
  # it is important to remember the phrase! else, you cannot decrypt the data
  passphrase <- charToRaw(password)
  key <- openssl::sha256(passphrase)
  # decrypt the data. in case of wrong password, `unserialize()` errors
  out <- tryCatch(
    unserialize(openssl::aes_gcm_decrypt(data, key = key)),
    error = function(e) NULL
  )
  # check if we had encrypted data at all?
  if (is.null(out)) {
    insight::format_error(
      "File does not appear to be encrypted with {datawizard}, or you provided the wrong password."
    )
  }
  out
}


================================================
FILE: R/data_relocate.R
================================================
#' @title Relocate (reorder) columns of a data frame
#' @name data_relocate
#'
#' @description
#' `data_relocate()` will reorder columns to specific positions, indicated by
#' `before` or `after`. `data_reorder()` will instead move selected columns to
#' the beginning of a data frame. Finally, `data_remove()` removes columns
#' from a data frame. All functions support select-helpers that allow flexible
#' specification of a search pattern to find matching columns, which should
#' be reordered or removed.
#'
#' @param data A data frame.
#' @param before,after Destination of columns. Supplying neither will move
#'   columns to the left-hand side; specifying both is an error. Can be a
#'   character vector, indicating the name of the destination column, or a
#'   numeric value, indicating the index number of the destination column.
#'   If `-1`, will be added before or after the last column.
#' @inheritParams extract_column_names
#' @inheritParams data_rename
#'
#' @inherit data_rename seealso
#'
#' @return A data frame with reordered columns.
#'
#' @examples
#' # Reorder columns
#' head(data_relocate(iris, select = "Species", before = "Sepal.Length"))
#' head(data_relocate(iris, select = "Species", before = "Sepal.Width"))
#' head(data_relocate(iris, select = "Sepal.Width", after = "Species"))
#' # which is same as
#' head(data_relocate(iris, select = "Sepal.Width", after = -1))
#'
#' # Reorder multiple columns
#' head(data_relocate(iris, select = c("Species", "Petal.Length"), after = "Sepal.Width"))
#' # which is same as
#' head(data_relocate(iris, select = c("Species", "Petal.Length"), after = 2))
#'
#' # Reorder columns
#' head(data_reorder(iris, c("Species", "Sepal.Length")))
#'
#' @export
data_relocate <- function(
  data,
  select,
  before = NULL,
  after = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # Sanitize
  if (!is.null(before) && !is.null(after)) {
    insight::format_error("You must supply only one of `before` or `after`.")
  }

  # allow numeric values
  if (!is.null(before) && is.numeric(before)) {
    if (before == -1) {
      before <- names(data)[ncol(data)]
    } else if (before >= 1 && before <= ncol(data)) {
      before <- names(data)[before]
    } else {
      insight::format_error("No valid position defined in `before`.")
    }
  }

  # allow numeric values
  if (!is.null(after) && is.numeric(after)) {
    if (after == -1) {
      after <- names(data)[ncol(data)]
    } else if (after >= 1 && after <= ncol(data)) {
      after <- names(data)[after]
    } else {
      insight::format_error("No valid position defined in `after`.")
    }
  }

  cols <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  # save attributes
  attr_data <- attributes(data)

  # Move columns to the right hand side
  data <- data[c(setdiff(names(data), cols), cols)]

  # Get columns and their original position
  data_cols <- names(data)
  position <- which(data_cols %in% cols)

  # remember original values, for more informative messages
  original_before <- before
  original_after <- after

  # Find new positions
  # nolint start
  if (!is.null(before)) {
    before <- before[before %in% data_cols][1] # Take first that exists (if vector is supplied)
    if (length(before) != 1 || is.na(before)) {
      # guess the misspelled column
      insight::format_error(
        "The column passed to `before` wasn't found.",
        .misspelled_string(
          data_cols,
          original_before[1],
          default_message = "Possibly misspelled?"
        )
      )
    }
    where <- min(match(before, data_cols))
    position <- c(setdiff(position, where), where)
  } else if (!is.null(after)) {
    after <- after[after %in% data_cols][1] # Take first that exists (if vector is supplied)
    if (length(after) != 1 || is.na(after)) {
      # guess the misspelled column
      insight::format_error(
        "The column passed to `after` wasn't found.",
        .misspelled_string(
          data_cols,
          original_after[1],
          default_message = "Possibly misspelled?"
        )
      )
    }
    where <- max(match(after, data_cols))
    position <- c(where, setdiff(position, where))
  } else {
    where <- 1
    position <- union(position, where)
  }
  # nolint end

  # Set left and right side
  lhs <- setdiff(seq(1, where - 1), position)
  rhs <- setdiff(seq(where + 1, ncol(data)), position)
  position <- unique(c(lhs, position, rhs))
  position <- position[position <= length(data_cols)]

  out <- data[position]
  out <- .replace_attrs(out, attr_data)

  out
}


#' @rdname data_relocate
#' @export
data_reorder <- function(
  data,
  select,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  cols <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )
  remaining_columns <- setdiff(colnames(data), cols)

  out <- data[c(cols, remaining_columns)]
  out <- .replace_attrs(out, attributes(data))
  out
}


================================================
FILE: R/data_remove.R
================================================
#' @inheritParams extract_column_names
#' @rdname data_relocate
#' @examples
#' # Remove columns
#' head(data_remove(iris, "Sepal.Length"))
#' head(data_remove(iris, starts_with("Sepal")))
#' @export
data_remove <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
) {
  ## TODO set verbose = TRUE by default in a later update?

  # evaluate arguments
  select <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  # nothing to remove?
  if (!length(select)) {
    return(data)
  }

  out <- data[!colnames(data) %in% select]
  out <- .replace_attrs(out, attributes(data))

  out
}


================================================
FILE: R/data_rename.R
================================================
#' @title Rename columns and variable names
#' @name data_rename
#'
#' @description Safe and intuitive functions to rename variables or rows in
#'   data frames. `data_rename()` will rename column names, i.e. it facilitates
#'   renaming variables. `data_rename_rows()` is a convenient shortcut
#'   to add or rename row names of a data frame, but unlike `row.names()`, its
#'   input and output is a data frame, thus, integrating smoothly into a
#'   possible pipe-workflow.
#'
#' @inheritParams extract_column_names
#' @param data A data frame.
#' @param replacement Character vector. Can be one of the following:
#'   - A character vector that indicates the new names of the columns selected
#'     in `select`. `select` and `replacement` must be of the same length.
#'   - A string (i.e. character vector of length 1) with a "glue" styled
#'     pattern. Currently supported tokens are:
#'     - `{col}` which will be replaced by the column name, i.e. the
#'       corresponding value in `select`.
#'     - `{n}` will be replaced by the number of the variable that is replaced.
#'     - `{letter}` will be replaced by alphabetical letters in sequential
#'       order.
#'       If more than 26 letters are required, letters are repeated, but have
#'       sequential numeric indices (e.g., `a1` to `z1`, followed by `a2` to
#'       `z2`).
#'     - Finally, the name of a user-defined object that is available in the
#'       environment can be used. Note that the object's name is not allowed to
#'       be one of the pre-defined tokens, `"col"`, `"n"` and `"letter"`.
#'
#'     An example for the use of tokens is...
#'     ```r
#'     data_rename(
#'       mtcars,
#'       select = c("am", "vs"),
#'       replacement = "new_name_from_{col}"
#'     )
#'     ```
#'     ... which would return new column names `new_name_from_am` and
#'     `new_name_from_vs`. See 'Examples'.
#'
#' If `select` is a named vector, `replacement` is ignored.
#' @param rows Vector of row names.
#' @param ... Other arguments passed to or from other functions.
#'
#' @details
#' `select` can also be a named character vector. In this case, the names are
#' used to rename the columns in the output data frame. If you have a named
#' list, use `unlist()` to convert it to a named vector. See 'Examples'.
#'
#' @return A modified data frame.
#'
#' @examples
#' # Rename columns
#' head(data_rename(iris, "Sepal.Length", "length"))
#'
#' # Use named vector to rename
#' head(data_rename(iris, c(length = "Sepal.Length", width = "Sepal.Width")))
#'
#' # Change all
#' head(data_rename(iris, replacement = paste0("Var", 1:5)))
#'
#' # Use glue-styled patterns
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}"))
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "{col}_is_column_{n}"))
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}"))
#'
#' # User-defined glue-styled patterns from objects in environment
#' x <- c("hi", "there", "!")
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}"))
#' @seealso
#' - Add a prefix or suffix to column names: [data_addprefix()], [data_addsuffix()]
#' - Functions to reorder or remove columns: [data_reorder()], [data_relocate()],
#'   [data_remove()]
#' - Functions to reshape, pivot or rotate data frames: [data_to_long()],
#'   [data_to_wide()], [data_rotate()]
#' - Functions to recode data: [rescale()], [reverse()], [categorize()],
#'   [recode_values()], [slide()]
#' - Functions to standardize, normalize, rank-transform: [center()], [standardize()],
#'   [normalize()], [ranktransform()], [winsorize()]
#' - Split and merge data frames: [data_partition()], [data_merge()]
#' - Functions to find or select columns: [data_select()], [extract_column_names()]
#' - Functions to filter rows: [data_match()], [data_filter()]
#'
#' @export
data_rename <- function(data, select = NULL, replacement = NULL, ...) {
  # check for valid input
  if (!is.data.frame(data)) {
    insight::format_error("Argument `data` must be a data frame.")
  }

  # change all names if no pattern specified
  select <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case = NULL,
    regex = NULL,
    allow_rename = TRUE,
    ifnotfound = "error"
  )

  # Forbid partially named "select",
  # Ex: if select = c("foo" = "Species", "Sepal.Length") then the 2nd name and
  # 2nd value are "Sepal.Length"
  if (!is.null(names(select)) && any(names(select) == select)) {
    insight::format_error(
      "When `select` is a named vector, all elements must be named."
    )
  }

  # check if `select` has names, and if so, use as "replacement"
  if (!is.null(names(select))) {
    replacement <- names(select)
  }

  # coerce to character
  replacement <- as.character(replacement)

  # check if `replacement` has no empty strings and no NA values
  invalid_replacement <- is.na(replacement) | !nzchar(replacement)
  if (any(invalid_replacement)) {
    if (is.null(names(select))) {
      # when user did not match `select` with `replacement`
      msg <- c(
        "`replacement` is not allowed to have `NA` or empty strings.",
        sprintf(
          "Following values in `select` have no match in `replacement`: %s",
          toString(select[invalid_replacement])
        )
      )
    } else {
      # when user did not name all elements of `select`
      msg <- c(
        "Either name all elements of `select` or use `replacement`.",
        sprintf(
          "Following values in `select` were not named: %s",
          toString(select[invalid_replacement])
        )
      )
    }
    insight::format_error(msg)
  }

  # if duplicated names in replacement, append ".2", ".3", etc. to duplicates
  # ex: c("foo", "foo") -> c("foo", "foo.2")
  if (anyDuplicated(replacement) > 0L) {
    dup <- as.data.frame(table(replacement))
    dup <- dup[dup$Freq > 1, ]
    for (i in dup$replacement) {
      to_replace <- which(replacement == i)[-1]
      new_replacement <- paste0(i, ".", 1 + seq_along(to_replace))
      replacement[to_replace] <- new_replacement
    }
  }

  # check if we have "glue" styled replacement-string
  glue_style <- length(replacement) == 1 &&
    grepl("{", replacement, fixed = TRUE)

  if (length(replacement) > length(select)) {
    insight::format_error(
      "There are more names in `replacement` than in `select`."
    )
  } else if (length(replacement) < length(select) && !glue_style) {
    insight::format_error(
      "There are more names in `select` than in `replacement`"
    )
  }

  # if we have glue-styled replacement-string, create replacement select now
  if (glue_style) {
    replacement <- .glue_replacement(select, replacement)
  }

  for (i in seq_along(select)) {
    if (!is.na(replacement[i])) {
      data <- .data_rename(data, select[i], replacement[i])
    }
  }

  data
}

#' @keywords internal
.data_rename <- function(data, pattern, replacement) {
  if (!pattern %in% names(data)) {
    insight::format_error(paste0(
      "Variable `",
      pattern,
      "` is not in your data frame :/"
    ))
  }

  names(data) <- replace(names(data), names(data) == pattern, replacement)

  data
}


.glue_replacement <- function(pattern, replacement) {
  # this function replaces "glue" tokens into their related
  # real names/values. Currently, following tokens are accepted:
  # - {col}: replacement is the name of the column (indicated in "pattern")
  # - {letter}: replacement is lower-case alphabetically letter, in sequential order
  # - {n}: replacement is the number of the variable out of n, that should be renamed
  out <- rep_len("", length(pattern))

  # for alphabetical letters, we prepare a string if we have more than
  # 26 columns to rename
  if (length(out) > 26) {
    long_letters <- paste0(
      rep.int(letters[1:26], times = ceiling(length(out) / 26)),
      rep(1:ceiling(length(out) / 26), each = 26)
    )
  } else {
    long_letters <- letters[1:26]
  }
  long_letters <- long_letters[seq_along(out)]

  for (i in seq_along(out)) {
    # prepare pattern
    column_name <- pattern[i]
    out[i] <- replacement
    # replace first pre-defined token
    out[i] <- gsub(
      "(.*)(\\{col\\})(.*)",
      replacement = paste0("\\1", column_name, "\\3"),
      x = out[i]
    )
    # replace second pre-defined token
    out[i] <- gsub(
      "(.*)(\\{n\\})(.*)",
      replacement = paste0("\\1", i, "\\3"),
      x = out[i]
    )
    # replace third pre-defined token
    out[i] <- gsub(
      "(.*)(\\{letter\\})(.*)",
      replacement = paste0("\\1", long_letters[i], "\\3"),
      x = out[i]
    )
    # extract all non-standard tokens
    matches <- unlist(
      regmatches(out[i], gregexpr("\\{([^}]*)\\}", out[i])),
      use.names = FALSE
    )
    # do we have any additional tokens, i.e. variable names from the environment?
    # users can also specify variable names, where the
    if (length(matches)) {
      # if so, iterate all tokens
      for (token in matches) {
        # evaluate token-object from the environment
        values <- .dynEval(
          str2lang(gsub("\\{(.*)\\}", "\\1", token)),
          ifnotfound = insight::format_error(paste0(
            "The object `",
            token,
            "` was not found. Please check if it really exists."
          ))
        )
        # check for correct length
        if (length(values) != length(pattern)) {
          insight::format_error(paste0(
            "The number of values provided in `",
            token,
            "` (",
            length(values),
            " values) do not match the number of columns to rename (",
            length(pattern),
            " columns)."
          ))
        }
        # replace token with values from the object
        if (length(values)) {
          out[i] <- gsub(token, values[i], out[i], fixed = TRUE)
        }
      }
    }
  }
  out
}


# Row.names ----------------------------------------------------------------

#' @rdname data_rename
#' @export
data_rename_rows <- function(data, rows = NULL) {
  row.names(data) <- rows
  data
}


================================================
FILE: R/data_replicate.R
================================================
#' @title Expand (i.e. replicate rows) a data frame
#' @name data_replicate
#'
#' @description
#' Expand a data frame by replicating rows based on another variable that
#' contains the counts of replications per row.
#'
#' @param data A data frame.
#' @param expand The name of the column that contains the counts of replications
#' for each row. Can also be a numeric value, indicating the position of that
#' column. Note that the variable indicated by `expand` must be an integer vector.
#' @param remove_na Logical. If `TRUE`, missing values in the column
#' provided in `expand` are removed from the data frame. If `FALSE` and `expand`
#' contains missing values, the function will throw an error.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#'
#' @return A dataframe with each row replicated as many times as defined in `expand`.
#'
#' @examples
#' data(mtcars)
#' data_replicate(head(mtcars), "carb")
#' @export
data_replicate <- function(
  data,
  expand = NULL,
  select = NULL,
  exclude = NULL,
  remove_na = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
) {
  # we need a name for the new column
  if (is.null(expand)) {
    insight::format_error(
      "No column that should be used to expand the data frame was provided. Please use `expand` to define a column."
    )
  }

  # only one column name
  if (length(expand) > 1) {
    insight::format_error(
      "Please provide only a single string for `expand`, no character vector with multiple values."
    )
  }

  # check if numerics, and if so, use column name
  if (is.numeric(expand)) {
    expand <- colnames(data)[expand]
  }

  # check if in data
  if (!expand %in% colnames(data)) {
    insight::format_error(
      "The column provided in `expand` does not exist in the data frame.",
      .misspelled_string(colnames(data), expand, "Possibly misspelled?")
    )
  }

  # check that "expand" contains no Inf
  if (any(is.infinite(data[[expand]]))) {
    insight::format_error(
      "The column provided in `expand` contains infinite values. Please provide a column that does not contain infinite values." # nolint
    )
  }

  # check that "expand" is integer
  if (!.is_integer(data[[expand]])) {
    insight::format_error(
      "The column provided in `expand` is not of type integer. Please provide a column that contains integer values." # nolint
    )
  }

  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # extract variable that contains the counts of replicates
  replicates <- data[[expand]]
  # we can remove that column now
  data[[expand]] <- NULL

  # also remove "expand" from "select" string
  select <- setdiff(select, expand)

  # if user doesn't want to remove "NA", but replicates contain "NA",
  # give informative error here
  if (!remove_na && anyNA(replicates)) {
    insight::format_error(
      "The column provided in `expand` contains missing values, but `remove_na` is set to `FALSE`.",
      "Please set `remove_na` to `TRUE` or remove the missing values from the `expand` variable."
    )
  }

  # remove rows where "expand" is NA
  data <- data[!is.na(replicates), , drop = FALSE]
  replicates <- replicates[!is.na(replicates)]

  # fin
  as.data.frame(do.call(
    cbind,
    lapply(data[select], rep.int, times = replicates)
  ))
}


# is.integer(c(1, 2)) -> FALSE
# all(c(1, 2) %% 1 == 0) -> TRUE
.is_integer <- function(x, remove_na = TRUE) {
  if (remove_na) {
    x <- x[!is.na(x)]
  }
  tryCatch(
    all(x %% 1 == 0),
    warning = function(w) is.integer(x),
    error = function(e) FALSE
  )
}


================================================
FILE: R/data_rescale.R
================================================
#' @title Rescale Variables to a New Range
#' @name rescale
#'
#' @description
#' Rescale variables to a new range. Can also be used to reverse-score variables
#' (change the keying/scoring direction), or to expand a range.
#'
#' @inheritParams categorize
#' @inheritParams extract_column_names
#' @inheritParams standardize.data.frame
#' @param to Numeric vector of length 2 giving the new range that the variable
#'   will have after rescaling. To reverse-score a variable, the range should
#'   be given with the maximum value first. See examples.
#' @param multiply If not `NULL`, `to` is ignored and `multiply` will be used,
#'   giving the factor by which the actual range of `x` should be expanded.
#'   For example, if a vector ranges from 5 to 15 and `multiply = 1.1`, the current
#'   range of 10 will be expanded by the factor of 1.1, giving a new range of
#'   11. Thus, the rescaled vector would range from 4.5 to 15.5.
#' @param add A vector of length 1 or 2. If not `NULL`, `to` is ignored and `add`
#'   will be used, giving the amount by which the minimum and maximum of the
#'   actual range of `x` should be expanded. For example, if a vector ranges from
#'   5 to 15 and `add = 1`, the range will be expanded from 4 to 16. If `add` is
#'   of length 2, then the first value is used for the lower bound and the second
#'   value for the upper bound.
#' @param range Initial (old) range of values. If `NULL`, will take the range of
#'   the input vector (`range(x)`).
#' @param ... Arguments passed to or from other methods.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @seealso See [makepredictcall.dw_transformer()] for use in model formulas.
#' @family transform utilities
#'
#' @return A rescaled object.
#'
#' @examples
#' rescale(c(0, 1, 5, -5, -2))
#' rescale(c(0, 1, 5, -5, -2), to = c(-5, 5))
#' rescale(c(1, 2, 3, 4, 5), to = c(-2, 2))
#'
#' # Specify the "theoretical" range of the input vector
#' rescale(c(1, 3, 4), to = c(0, 40), range = c(0, 4))
#'
#' # Reverse-score a variable
#' rescale(c(1, 2, 3, 4, 5), to = c(5, 1))
#' rescale(c(1, 2, 3, 4, 5), to = c(2, -2))
#'
#' # Data frames
#' head(rescale(iris, to = c(0, 1)))
#' head(rescale(iris, to = c(0, 1), select = "Sepal.Length"))
#'
#' # One can specify a list of ranges
#' head(rescale(iris, to = list(
#'   "Sepal.Length" = c(0, 1),
#'   "Petal.Length" = c(-1, 0)
#' )))
#'
#' # "expand" ranges by a factor or a given value
#' x <- 5:15
#' x
#' # both will expand the range by 10%
#' rescale(x, multiply = 1.1)
#' rescale(x, add = 0.5)
#'
#' # expand range by different values
#' rescale(x, add = c(1, 3))
#'
#' # Specify list of multipliers
#' d <- data.frame(x = 5:15, y = 5:15)
#' rescale(d, multiply = list(x = 1.1, y = 0.5))
#' @export
rescale <- function(x, ...) {
  UseMethod("rescale")
}


#' @rdname rescale
#' @export
change_scale <- function(x, ...) {
  # Alias for rescale()
  rescale(x, ...)
}


#' @export
rescale.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      paste0(
        "Variables of class `",
        class(x)[1],
        "` can't be rescaled and remain unchanged."
      )
    )
  }
  x
}


#' @rdname rescale
#' @export
rescale.numeric <- function(
  x,
  to = c(0, 100),
  multiply = NULL,
  add = NULL,
  range = NULL,
  verbose = TRUE,
  ...
) {
  if (is.null(to)) {
    return(x)
  }

  # Warning if all NaNs
  if (all(is.na(x))) {
    return(x)
  }

  if (is.null(range)) {
    range <- c(min(x, na.rm = TRUE), max(x, na.rm = TRUE))
  }

  # check if user specified "multiply" or "add", and then update "to"
  to <- .update_to(x, to, multiply, add)

  # called from "makepredictcal()"? Then we have additional arguments
  dot_args <- list(...)
  required_dot_args <- c("min_value", "max_value", "new_min", "new_max")
  flag_predict <- FALSE

  if (all(required_dot_args %in% names(dot_args))) {
    # we gather informatiom about the original data, which is needed
    # for "predict()" to work properly when "rescale()" is called
    # in formulas on-the-fly, e.g. "lm(mpg ~ rescale(hp), data = mtcars)"
    min_value <- dot_args$min_value
    max_value <- dot_args$max_value
    new_min <- dot_args$new_min
    new_max <- dot_args$new_max
    flag_predict <- TRUE
  } else {
    min_value <- ifelse(is.na(range[1]), min(x, na.rm = TRUE), range[1])
    max_value <- ifelse(is.na(range[2]), max(x, na.rm = TRUE), range[2])
    new_min <- ifelse(is.na(to[1]), min_value, to[1])
    new_max <- ifelse(is.na(to[2]), max_value, to[2])
  }

  # Warning if only one value
  if (!flag_predict && insight::has_single_value(x) && is.null(range)) {
    if (verbose) {
      insight::format_warning(
        "A `range` must be provided for data with only one unique value."
      )
    }
    return(x)
  }

  out <- as.vector(
    (new_max - new_min) / (max_value - min_value) * (x - min_value) + new_min
  )

  attr(out, "min_value") <- min_value
  attr(out, "max_value") <- max_value
  attr(out, "new_min") <- new_min
  attr(out, "new_max") <- new_max
  attr(out, "range_difference") <- max_value - min_value
  attr(out, "to_range") <- c(new_min, new_max)
  # don't add attribute when we call data frame methods
  if (!isFALSE(dot_args$add_transform_class)) {
    class(out) <- c("dw_transformer", class(out))
  }

  out
}


#' @export
rescale.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  to = c(0, 100),
  multiply = NULL,
  add = NULL,
  range = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
) {
  info <- attributes(x)

  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    remove_group_var = TRUE,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r",
      preserve_value_labels = TRUE
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x <- as.data.frame(x)
  for (rows in grps) {
    x[rows, ] <- rescale(
      x[rows, , drop = FALSE],
      select = select,
      exclude = exclude,
      to = to,
      multiply = multiply,
      add = add,
      range = range,
      append = FALSE, # need to set to FALSE here, else variable will be doubled
      add_transform_class = FALSE,
      ...
    )
  }
  # set back class, so data frame still works with dplyr
  attributes(x) <- utils::modifyList(info, attributes(x))
  x
}


#' @rdname rescale
#' @export
rescale.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  to = c(0, 100),
  multiply = NULL,
  add = NULL,
  range = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r"
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  # Transform the range so that it is a list now
  if (!is.null(range) && !is.list(range)) {
    range <- stats::setNames(rep(list(range), length(select)), select)
  }
  # Transform the 'to' so that it is a list now
  if (!is.list(to)) {
    to <- stats::setNames(rep(list(to), length(select)), select)
  }
  # Transform the 'multiply' so that it is a list now
  if (!is.null(multiply) && !is.list(multiply)) {
    multiply <- stats::setNames(rep(list(multiply), length(select)), select)
  }
  # Transform the 'add' so that it is a list now
  if (!is.null(add) && !is.list(add)) {
    add <- stats::setNames(rep(list(add), length(select)), select)
  }
  # update "to" if user specified "multiply" or "add"
  to[] <- lapply(names(to), function(i) {
    .update_to(x[[i]], to[[i]], multiply[[i]], add[[i]])
  })

  x[select] <- as.data.frame(sapply(
    select,
    function(n) {
      rescale(
        x[[n]],
        to = to[[n]],
        range = range[[n]],
        add_transform_class = FALSE
      )
    },
    simplify = FALSE
  ))
  x
}


# helper ----------------------------------------------------------------------

# expand the new target range by multiplying or adding
.update_to <- function(x, to, multiply, add) {
  # check if user specified "multiply" or "add", and if not, return "to"
  if (is.null(multiply) && is.null(add)) {
    return(to)
  }
  # only one of "multiply" or "add" can be specified
  if (!is.null(multiply) && !is.null(add)) {
    insight::format_error("Only one of `multiply` or `add` can be specified.")
  }
  # multiply? If yes, calculate the "add" value
  if (!is.null(multiply)) {
    # check for correct length
    if (length(multiply) > 1) {
      insight::format_error("The length of `multiply` must be 1.")
    }
    add <- (diff(range(x, na.rm = TRUE)) * (multiply - 1)) / 2
  }
  # add?
  if (!is.null(add)) {
    # add must be of length 1 or 2
    if (length(add) > 2) {
      insight::format_error("The length of `add` must be 1 or 2.")
    }
    # if add is of length 2, then the first value is used for the lower bound
    # and the second value for the upper bound
    if (length(add) == 2) {
      add_low <- add[1]
      add_high <- add[2]
    } else {
      add_low <- add_high <- add
    }
    to <- c(min(x, na.rm = TRUE) - add_low, max(x, na.rm = TRUE) + add_high)
  }
  to
}


================================================
FILE: R/data_restoretype.R
================================================
#' Restore the type of columns according to a reference data frame
#'
#' @param data A data frame for which to restore the column types.
#' @inheritParams data_to_long
#' @inheritParams data_rename
#' @param reference A reference data frame from which to find the correct
#'   column types. If `NULL`, each column is converted to numeric if it doesn't
#'   generate `NA`s. For example, `c("1", "2")` can be converted to numeric but not
#'   `c("Sepal.Length")`.
#' @return
#'
#' A data frame with columns whose types have been restored based on the
#' reference data frame.
#'
#' @examples
#' data <- data.frame(
#'   Sepal.Length = c("1", "3", "2"),
#'   Species = c("setosa", "versicolor", "setosa"),
#'   New = c("1", "3", "4")
#' )
#'
#' fixed <- data_restoretype(data, reference = iris)
#' summary(fixed)
#' @export

data_restoretype <- function(data, reference = NULL, ...) {
  for (col in names(data)) {
    # No reference data (regular fixing) ----------------
    if (is.null(reference)) {
      if (is.character(data[[col]])) {
        data[[col]] <- coerce_to_numeric(data[[col]])
      }
    } else {
      if (is.factor(reference[[col]]) && !is.factor(data[[col]])) {
        # Restore factor levels
        data[[col]] <- factor(data[[col]], levels = levels(reference[[col]]))
      }

      if (is.numeric(reference[[col]]) && !is.numeric(data[[col]])) {
        data[[col]] <- coerce_to_numeric(as.character(data[[col]]))
      }

      if (is.character(reference[[col]]) && !is.character(data[[col]])) {
        data[[col]] <- as.character(data[[col]])
      }
    }
  }

  data
}


================================================
FILE: R/data_reverse.R
================================================
#' Reverse-Score Variables
#'
#' Reverse-score variables (change the keying/scoring direction).
#'
#' @param range Range of values that is used as reference for reversing the
#'   scale. For numeric variables, can be `NULL` or a numeric vector of length
#'   two, indicating the lowest and highest value of the reference range. If
#'   `NULL`, will take the range of the input vector (`range(x)`). For factors,
#'   `range` can be `NULL`, a numeric vector of length two, or a (numeric)
#'   vector of at least the same length as factor levels (i.e. must be equal
#'   to or larger than `nlevels(x)`). Note that providing a `range` for factors
#'   usually only makes sense when factor levels are numeric, not characters.
#' @param ... Arguments passed to or from other methods.
#' @inheritParams categorize
#' @inheritParams extract_column_names
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @examples
#' reverse(c(1, 2, 3, 4, 5))
#' reverse(c(-2, -1, 0, 2, 1))
#'
#' # Specify the "theoretical" range of the input vector
#' reverse(c(1, 3, 4), range = c(0, 4))
#'
#' # Factor variables
#' reverse(factor(c(1, 2, 3, 4, 5)))
#' reverse(factor(c(1, 2, 3, 4, 5)), range = 0:10)
#'
#' # Data frames
#' head(reverse(iris))
#' head(reverse(iris, select = "Sepal.Length"))
#'
#' @return A reverse-scored object.
#'
#' @family transform utilities
#'
#' @inherit data_rename seealso
#'
#' @export
reverse <- function(x, ...) {
  UseMethod("reverse")
}


#' @rdname reverse
#' @export
reverse_scale <- reverse


#' @export
reverse.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      paste0(
        "Variables of class '",
        class(x)[1],
        "' can't be recoded and remain unchanged."
      )
    )
  }

  x
}


#' @rdname reverse
#' @export
reverse.numeric <- function(x, range = NULL, verbose = TRUE, ...) {
  # Warning if all NaNs
  if (all(is.na(x))) {
    return(x)
  }

  # Warning if only one value
  if (insight::has_single_value(x) && is.null(range)) {
    if (verbose) {
      insight::format_warning(
        "A `range` must be provided for data with only one unique value."
      )
    }
    return(x)
  }

  # no missing values allowed
  if (anyNA(range)) {
    insight::format_error("`range` is not allowed to have missing values.")
  }

  if (is.null(range)) {
    range <- c(min(x, na.rm = TRUE), max(x, na.rm = TRUE))
  }

  # old minimum and maximum
  min_value <- min(range)
  max_value <- max(range)

  # check if a valid range (i.e. vector of length 2) is provided
  if (length(range) > 2) {
    insight::format_error(
      "`range` must be a numeric vector of length two, indicating lowest and highest value of the required range.",
      sprintf(
        "Did you want to provide `range = c(%g, %g)`?",
        min_value,
        max_value
      )
    )
  }

  new_min <- max_value
  new_max <- min_value

  out <- as.vector(
    (new_max - new_min) / (max_value - min_value) * (x - min_value) + new_min
  )

  # labelled data?
  out <- .set_back_labels(out, x, reverse_values = TRUE)
  out
}


#' @export
reverse.factor <- function(x, range = NULL, verbose = TRUE, ...) {
  # Warning if all NaNs
  if (all(is.na(x))) {
    return(x)
  }

  # Warning if only one value
  if (insight::has_single_value(x) && is.null(range)) {
    if (verbose) {
      insight::format_warning(
        "A `range` must be provided for data with only one unique value."
      )
    }
    return(x)
  }

  # save for later use
  original_x <- x

  if (is.null(range)) {
    old_levels <- levels(x)
  } else {
    # no missing values allowed
    if (anyNA(range)) {
      insight::format_error("`range` is not allowed to have missing values.")
    }
    range_ok <- TRUE
    # if we have a vector of length 2 for range, and more factor levels,
    # we assume `range` indicates minimum and maximum of range values
    if (length(range) == 2 && nlevels(droplevels(x)) > 2) {
      if (is.numeric(range)) {
        range <- min(range):max(range)
      } else {
        # if range is of length 2, and we have more than 2 number of levels,
        # range must be numeric to indicate minima and maxima. if not, stop.
        range_ok <- FALSE
      }
    }
    if (length(range) > 2 && length(range) < nlevels(droplevels(x))) {
      # if range has more than two values, but fewer values than number of
      # factor levels, we cannot associate the reversed scale, so stop
      range_ok <- FALSE
    }
    if (!range_ok) {
      insight::format_error(
        "`range` must be one of the following:",
        "- a numeric vector of length two, indicating lowest and highest value of the required range,",
        "- a vector (numeric or character) of values with at least as many values as number of levels in `x`,",
        "- or `NULL`."
      )
    }
    # check if no or not all old levels are in new range
    if (verbose) {
      if (!any(levels(x) %in% as.character(range))) {
        insight::format_warning(
          "No current factor level is included in `range`.",
          "Returned factor will only contain missing values."
        )
      } else if (!all(levels(x) %in% as.character(range))) {
        insight::format_warning(
          "Not all current factor levels are included in `range`.",
          "Returned factor will contain missing values."
        )
      }
    }
    old_levels <- range
    x <- factor(x, levels = range)
  }

  int_x <- as.integer(x)
  rev_x <- reverse(int_x, range = c(1, length(old_levels)))
  x <- factor(rev_x, levels = seq_along(old_levels), labels = old_levels)

  # labelled data?
  x <- .set_back_labels(x, original_x, reverse_values = TRUE)

  x
}


#' @export
reverse.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  range = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
) {
  info <- attributes(x)

  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    remove_group_var = TRUE,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    arguments <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r",
      preserve_value_labels = TRUE
    )
    # update processed arguments
    x <- arguments$x
    select <- arguments$select
  }

  x <- as.data.frame(x)
  for (rows in grps) {
    x[rows, ] <- reverse(
      x[rows, , drop = FALSE],
      select = select,
      exclude = exclude,
      range = range,
      append = FALSE, # need to set to FALSE here, else variable will be doubled
      ...
    )
  }
  # set back class, so data frame still works with dplyr
  attributes(x) <- utils::modifyList(info, attributes(x))
  x
}


#' @rdname reverse
#' @export
reverse.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  range = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    arguments <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r",
      preserve_value_labels = TRUE
    )
    # update processed arguments
    x <- arguments$x
    select <- arguments$select
  }

  # Transform the range so that it is a list now
  if (!is.null(range) && !is.list(range)) {
    range <- stats::setNames(rep(list(range), length(select)), select)
  }

  x[select] <- lapply(select, function(n) {
    reverse(x[[n]], range = range[[n]])
  })
  x
}


================================================
FILE: R/data_rotate.R
================================================
#' @title Rotate a data frame
#' @name data_rotate
#'
#' @description
#' This function rotates a data frame, i.e. columns become rows and vice versa.
#'   It's the equivalent of using `t()` but restores the `data.frame` class,
#'   preserves attributes and prints a warning if the data type is
#'   modified (see example).
#'
#' @param data A data frame.
#' @param rownames Character vector (optional). If not `NULL`, the data frame's
#'   rownames will be added as (first) column to the output, with `rownames`
#'   being the name of this column.
#' @param colnames Logical or character vector (optional). If `TRUE`, the values
#'   of the first column in `x` will be used as column names in the rotated data
#'   frame. If a character vector, values from that column are used as column
#'   names.
#' @param verbose Toggle warnings.
#'
#' @inherit data_rename seealso
#'
#' @return A (rotated) data frame.
#'
#' @examples
#' x <- mtcars[1:3, 1:4]
#'
#' x
#'
#' data_rotate(x)
#' data_rotate(x, rownames = "property")
#'
#' # use values in 1. column as column name
#' data_rotate(x, colnames = TRUE)
#' data_rotate(x, rownames = "property", colnames = TRUE)
#'
#' # use either first column or specific column for column names
#' x <- data.frame(a = 1:5, b = 11:15, c = 21:25)
#' data_rotate(x, colnames = TRUE)
#' data_rotate(x, colnames = "c")
#'
#' @export
data_rotate <- function(
  data,
  rownames = NULL,
  colnames = FALSE,
  verbose = TRUE
) {
  # copy attributes
  attr_data <- attributes(data)

  # check if first column has column names to be used for rotated data
  if (isTRUE(colnames)) {
    colnames <- data[[1]]
    data <- data[-1]
  } else if (
    !is.null(colnames) && is.character(colnames) && colnames %in% colnames(data)
  ) {
    cn_col <- which(colnames(data) == colnames)
    colnames <- data[[colnames]]
    data <- data[-cn_col]
  } else {
    colnames <- row.names(data)
  }

  # warning after possible removal of columns
  if (
    verbose &&
      insight::n_unique(vapply(data, typeof, FUN.VALUE = character(1L))) > 1L
  ) {
    insight::format_warning(
      "Your data frame contains mixed types of data. After transposition, all variables will be transformed into characters."
    ) # nolint
  }

  # rotate data frame by 90 degrees
  out <- as.data.frame(t(as.data.frame(data)))

  # add column names, if requested
  if (!is.null(colnames)) {
    # check if we have correct length of column names
    if (length(colnames) != ncol(out)) {
      insight::format_warning(
        "Length of provided column names does not match number of columns. No column names changed."
      )
    } else {
      colnames(out) <- colnames
    }
  }

  # add rownames as a new column, if requested
  if (!is.null(rownames)) {
    out <- rownames_as_column(out, var = rownames)
  }

  out <- .replace_attrs(out, attr_data)

  out
}


#' @rdname data_rotate
#' @export
data_transpose <- data_rotate


================================================
FILE: R/data_seek.R
================================================
#' @title Find variables by their names, variable or value labels
#' @name data_seek
#'
#' @description This functions seeks variables in a data frame, based on patterns
#' that either match the variable name (column name), variable labels, value labels
#' or factor levels. Matching variable and value labels only works for "labelled"
#' data, i.e. when the variables either have a `label` attribute or `labels`
#' attribute.
#'
#' `data_seek()` is particular useful for larger data frames with labelled
#' data - finding the correct variable name can be a challenge. This function
#' helps to find the required variables, when only certain patterns of variable
#' names or labels are known.
#'
#' @param data A data frame.
#' @param pattern Character string (regular expression) to be matched in `data`.
#' May also be a character vector of length > 1. `pattern` is searched for in
#' column names, variable label and value labels attributes, or factor levels of
#' variables in `data`.
#' @param seek Character vector, indicating where `pattern` is sought. Use one
#' or more of the following options:
#'
#' - `"names"`: Searches in column names. `"column_names"` and `"columns"` are
#'   aliases for `"names"`.
#' - `"labels"`: Searches in variable labels. Only applies when a `label` attribute
#'   is set for a variable.
#' - `"values"`: Searches in value labels or factor levels. Only applies when a
#'   `labels` attribute is set for a variable, or if a variable is a factor.
#'   `"levels"` is an alias for `"values"`.
#' - `"all"`: Searches in all of the above.
#' @param fuzzy Logical. If `TRUE`, "fuzzy matching" (partial and close distance
#' matching) will be used to find `pattern`.
#'
#' @return A data frame with three columns: the column index, the column name
#' and - if available - the variable label of all matched variables in `data`.
#'
#' @examples
#' # seek variables with "Length" in variable name or labels
#' data_seek(iris, "Length")
#'
#' # seek variables with "dependency" in names or labels
#' # column "e42dep" has a label-attribute "elder's dependency"
#' data(efc)
#' data_seek(efc, "dependency")
#'
#' # "female" only appears as value label attribute - default search is in
#' # variable names and labels only, so no match
#' data_seek(efc, "female")
#' # when we seek in all sources, we find the variable "e16sex"
#' data_seek(efc, "female", seek = "all")
#'
#' # typo, no match
#' data_seek(iris, "Lenght")
#' # typo, fuzzy match
#' data_seek(iris, "Lenght", fuzzy = TRUE)
#' @export
data_seek <- function(
  data,
  pattern,
  seek = c("names", "labels"),
  fuzzy = FALSE
) {
  # check valid args
  if (!is.data.frame(data)) {
    insight::format_error("`data` must be a data frame.")
  }

  # check valid args
  seek <- intersect(
    seek,
    c("names", "labels", "values", "levels", "column_names", "columns", "all")
  )
  if (is.null(seek) || !length(seek)) {
    insight::format_error(
      "`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\"."
    ) # nolint
  }

  pos1 <- pos2 <- pos3 <- NULL

  pos <- unlist(lapply(pattern, function(search_pattern) {
    # search in variable names?
    if (any(seek %in% c("names", "columns", "column_names", "all"))) {
      pos1 <- grep(search_pattern, colnames(data))
      # find in near distance?
      if (fuzzy) {
        pos1 <- c(
          pos1,
          .fuzzy_grep(x = colnames(data), pattern = search_pattern)
        )
      }
    }

    # search in variable labels?
    if (any(seek %in% c("labels", "all"))) {
      var_labels <- insight::compact_character(unlist(lapply(
        data,
        attr,
        which = "label",
        exact = TRUE
      )))
      if (!is.null(var_labels) && length(var_labels)) {
        found <- grepl(search_pattern, var_labels)
        pos2 <- match(names(var_labels)[found], colnames(data))
        # find in near distanc?
        if (fuzzy) {
          found <- .fuzzy_grep(x = var_labels, pattern = search_pattern)
          if (length(found)) {
            pos2 <- c(pos2, match(names(var_labels)[found], colnames(data)))
          }
        }
      }
    }

    # search for pattern in value labels or levels?
    if (any(seek %in% c("values", "levels", "all"))) {
      values <- insight::compact_list(lapply(data, function(i) {
        l <- attr(i, "labels", exact = TRUE)
        if (is.null(l) && is.factor(i)) {
          levels(i)
        } else {
          names(l)
        }
      }))
      if (!is.null(values) && length(values)) {
        found <- vapply(
          values,
          function(i) any(grepl(search_pattern, i)),
          logical(1)
        )
        pos3 <- match(names(found)[found], colnames(data))
        # find in near distance
        if (fuzzy) {
          found <- vapply(
            values,
            function(i) {
              length(.fuzzy_grep(x = i, pattern = search_pattern)) > 0
            },
            logical(1)
          )
          if (any(found)) {
            pos3 <- c(pos3, match(names(found)[found], colnames(data)))
          }
        }
      }
    }
    c(pos1, pos2, pos3)
  }))

  # clean up
  pos <- unique(pos)

  # variable labels of matching variables
  var_labels <- vapply(
    colnames(data[pos]),
    function(i) {
      l <- attr(data[[i]], "label", exact = TRUE)
      if (is.null(l)) {
        i
      } else {
        l
      }
    },
    character(1)
  )

  out <- data.frame(
    index = pos,
    column = colnames(data)[pos],
    labels = var_labels,
    stringsAsFactors = FALSE
  )
  # no row names
  rownames(out) <- NULL

  class(out) <- c("data_seek", "data.frame")
  out
}


# methods ---------------------------------------------------------------------

#' @export
print.data_seek <- function(x, ...) {
  if (nrow(x) == 0) {
    cat("No matches found.\n")
  } else {
    cat(insight::export_table(x, ...))
  }
}


================================================
FILE: R/data_select.R
================================================
#' @rdname extract_column_names
#' @export
data_select <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  columns <- .select_nse(
    select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    allow_rename = TRUE,
    verbose = FALSE
  )

  # save attributes
  a <- attributes(data)

  if (!length(columns) || is.null(columns)) {
    if (isTRUE(verbose)) {
      insight::format_warning(
        "No column names that matched the required search pattern were found."
      )
    }
    return(NULL)
  }

  out <- data[columns]

  # for named character vectors, we offer the service to directly rename the columns
  if (!is.null(names(columns))) {
    colnames(out) <- names(columns)
  }

  # add back attributes
  out <- .replace_attrs(out, a)
  out
}


================================================
FILE: R/data_separate.R
================================================
#' @title Separate single variable into multiple variables
#' @name data_separate
#'
#' @description
#' Separates a single variable into multiple new variables.
#'
#' @param data A data frame.
#' @param new_columns The names of the new columns, as character vector. If
#' more than one variable was selected (in `select`), the new names are prefixed
#' with the name of the original column. `new_columns` can also be a list of
#' (named) character vectors when multiple variables should be separated. See
#' 'Examples'.
#' @param separator Separator between columns. Can be a character vector, which
#' is then treated as regular expression, or a numeric vector that indicates at
#' which positions the string values will be split.
#' @param append Logical, if `FALSE` (default), removes original columns that
#' were separated. If `TRUE`, all columns are preserved and the new columns are
#' appended to the data frame.
#' @param guess_columns If `new_columns` is not given, the required number of
#' new columns is guessed based on the results of value splitting. For example,
#' if a variable is split into three new columns, this will be considered as
#' the required number of new columns, and columns are named `"split_1"`,
#' `"split_2"` and `"split_3"`. When values from a variable are split into
#' different amount of new columns, the `guess_column` can be either `"mode"`
#' (number of new columns is based on the most common number of splits), `"min"`
#' or `"max"` to use the minimum resp. maximum number of possible splits as
#' required number of columns.
#' @param fill How to deal with values that return fewer new columns after
#' splitting? Can be `"left"` (fill missing columns from the left with `NA`),
#' `"right"` (fill missing columns from the right with `NA`) or `"value_left"`
#' or `"value_right"` to fill missing columns from left or right with the
#' left-most or right-most values.
#' @param extra How to deal with values that return too many new columns after
#' splitting? Can be `"drop_left"` or `"drop_right"` to drop the left-most or
#' right-most values, or `"merge_left"` or `"merge_right"` to merge the left-
#' or right-most value together, and keeping all remaining values as is.
#' @param merge_multiple Logical, if `TRUE` and more than one variable is selected
#' for separating, new columns can be merged. Value pairs of all split variables
#' are merged.
#' @param merge_separator Separator string when `merge_multiple = TRUE`. Defines
#' the string that is used to merge values together.
#' @param convert_na Logical, if `TRUE`, character `"NA"` values are converted
#' into real `NA` values.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#'
#' @seealso [`data_unite()`]
#'
#' @return A data frame with the newly created variable(s), or - when `append = TRUE` -
#' `data` including new variables.
#'
#' @examples
#' # simple case
#' d <- data.frame(
#'   x = c("1.a.6", "2.b.7", "3.c.8"),
#'   stringsAsFactors = FALSE
#' )
#' d
#' data_separate(d, new_columns = c("a", "b", "c"))
#'
#' # guess number of columns
#' d <- data.frame(
#'   x = c("1.a.6", NA, "2.b.6.7", "3.c", "x.y.z"),
#'   stringsAsFactors = FALSE
#' )
#' d
#' data_separate(d, guess_columns = "mode")
#'
#' data_separate(d, guess_columns = "max")
#'
#' # drop left-most column
#' data_separate(d, guess_columns = "mode", extra = "drop_left")
#'
#' # merge right-most column
#' data_separate(d, guess_columns = "mode", extra = "merge_right")
#'
#' # fill columns with fewer values with left-most values
#' data_separate(d, guess_columns = "mode", fill = "value_left")
#'
#' # fill and merge
#' data_separate(
#'   d,
#'   guess_columns = "mode",
#'   fill = "value_left",
#'   extra = "merge_right"
#' )
#'
#' # multiple columns to split
#' d <- data.frame(
#'   x = c("1.a.6", "2.b.7", "3.c.8"),
#'   y = c("x.y.z", "10.11.12", "m.n.o"),
#'   stringsAsFactors = FALSE
#' )
#' d
#' # split two columns, default column names
#' data_separate(d, guess_columns = "mode")
#'
#' # split into new named columns, repeating column names
#' data_separate(d, new_columns = c("a", "b", "c"))
#'
#' # split selected variable new columns
#' data_separate(d, select = "y", new_columns = c("a", "b", "c"))
#'
#' # merge multiple split columns
#' data_separate(
#'   d,
#'   new_columns = c("a", "b", "c"),
#'   merge_multiple = TRUE
#' )
#'
#' # merge multiple split columns
#' data_separate(
#'   d,
#'   new_columns = c("a", "b", "c"),
#'   merge_multiple = TRUE,
#'   merge_separator = "-"
#' )
#'
#' # separate multiple columns, give proper column names
#' d_sep <- data.frame(
#'   x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
#'   y = c("m.n.99.22", "77.f.g.34", "44.9", NA),
#'   stringsAsFactors = FALSE
#' )
#'
#' data_separate(
#'   d_sep,
#'   select = c("x", "y"),
#'   new_columns = list(
#'     x = c("A", "B", "C"), # separate "x" into three columns
#'     y = c("EE", "FF", "GG", "HH") # separate "y" into four columns
#'   ),
#'   verbose = FALSE
#' )
#' @export
data_separate <- function(
  data,
  select = NULL,
  new_columns = NULL,
  separator = "[^[:alnum:]]+",
  guess_columns = NULL,
  merge_multiple = FALSE,
  merge_separator = "",
  fill = "right",
  extra = "drop_right",
  convert_na = TRUE,
  exclude = NULL,
  append = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
) {
  # we need at least one explicit choice for either `new_columns` or `guess_columns`
  if (is.null(new_columns) && is.null(guess_columns)) {
    insight::format_error(
      "Cannot separate values. Either `new_columns` or `guess_columns` must be provided."
    )
  }
  # in case user did not provide names of new columns, we can try
  # to guess number of columns per variable
  guess_columns <- match.arg(guess_columns, choices = c("min", "max", "mode"))

  # make sure we have valid options for fill and extra
  fill <- match.arg(
    fill,
    choices = c("left", "right", "value_left", "value_right")
  )
  extra <- match.arg(
    extra,
    choices = c("drop_left", "drop_right", "merge_left", "merge_right")
  )

  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # make new_columns as list, this works with single and multiple columns
  if (!is.null(new_columns) && !is.list(new_columns)) {
    new_columns <- rep(list(new_columns), times = length(select))
    # if we have multiple columns that were separated, we avoid duplicated
    # column names of created variables by appending name of original column
    # however, we don't have duplicated column names when we merge them together
    # so don't create new column names when "merge_multiple" is FALSE.
    make_unique_colnames <- length(select) > 1 && !merge_multiple
  } else {
    # we don't want to create own unique column names when user explicitly
    # provided column names as a list, i.e. column names for each separated
    # variable
    make_unique_colnames <- FALSE
  }

  # make sure list of new column names is named
  if (!is.null(new_columns) && is.null(names(new_columns))) {
    names(new_columns) <- select
  }

  # iterate columns that should be split
  split_data <- lapply(select, function(sep_column) {
    # do we have known number of columns?
    if (is.null(new_columns)) {
      n_columns <- NULL
    } else {
      n_columns <- length(new_columns[[sep_column]])
    }

    # make sure we have a character that we can split
    x <- data[[sep_column]]
    if (!is.character(x)) {
      x <- as.character(x)
    }

    # separate column into multiple strings
    if (is.numeric(separator)) {
      maxlen <- max(nchar(x), na.rm = TRUE)
      starts <- c(0, separator)
      ends <- c(separator - 1, maxlen)
      separated_columns <- lapply(seq_along(starts), function(i) {
        substr(x, starts[i], ends[i])
      })
      separated_columns <- as.data.frame(
        do.call(rbind, separated_columns),
        stringsAsFactors = FALSE
      )
    } else {
      separated_columns <- strsplit(x, separator, perl = TRUE)
    }

    # how many new columns do we need?
    if (is.null(n_columns)) {
      # lengths of all split strings
      l <- lengths(separated_columns)
      # but without NA values
      l <- l[!vapply(l, function(i) all(is.na(i)), TRUE)]
      # define number of new columns, based on user-choice
      n_cols <- switch(
        guess_columns,
        min = min(l, na.rm = TRUE),
        max = max(l, na.rm = TRUE),
        mode = distribution_mode(l),
      )
      # tell user
      if (verbose && insight::n_unique(l) != 1 && !is.numeric(separator)) {
        insight::format_alert(paste0(
          "Column `",
          sep_column,
          "` had different number of values after splitting. Variable was split into ",
          n_cols,
          " column",
          ifelse(n_cols > 1, "s", ""),
          "."
        ))
      }
    } else {
      # else, if we know number of columns, use that number
      n_cols <- n_columns
    }

    # main task here - fill or drop values for all columns
    separated_columns <- tryCatch(
      .fix_separated_columns(
        separated_columns,
        fill,
        extra,
        n_cols,
        sep_column,
        verbose
      ),
      error = function(e) NULL
    )

    # catch error
    if (is.null(separated_columns)) {
      insight::format_error(
        "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" # nolint
      )
    }

    # bind separated columns into data frame and set column names
    out <- as.data.frame(do.call(rbind, separated_columns))

    # if no column names provided, use standard names
    if (is.null(new_columns[[sep_column]])) {
      new_column_names <- paste0(sep_column, "_", seq_along(out))
    } else if (make_unique_colnames) {
      # if we have multiple columns that were separated, we avoid duplicated
      # column names of created variables by appending name of original column
      new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]])
    } else {
      new_column_names <- new_columns[[sep_column]]
    }

    colnames(out) <- new_column_names
    out
  })

  # any split performed?
  if (all(lengths(split_data) == 1)) {
    if (verbose) {
      insight::format_alert(
        "Separator probably not found. No values were split. Returning original data."
      )
    }
    return(data)
  }

  # final preparation, bind or merge columns, make unique columm names
  if (isTRUE(merge_multiple) && length(split_data) > 1) {
    # we merge all split columns, which are currently saved as list
    # of data frames, together into one data frame
    for (i in 2:length(split_data)) {
      for (j in seq_along(split_data[[1]])) {
        split_data[[1]][[j]] <- gsub(
          " ",
          "",
          paste(
            split_data[[1]][[j]],
            split_data[[i]][[j]],
            sep = merge_separator
          ),
          fixed = TRUE
        )
      }
    }
    split_data <- split_data[[1]]
  } else {
    # bind all columns
    split_data <- do.call(cbind, split_data)
  }

  # convert "NA" strings into real NA?
  if (convert_na) {
    split_data[] <- lapply(split_data, function(i) {
      i[i == "NA"] <- NA_character_
      i
    })
  }

  data <- cbind(data, split_data)
  if (!isTRUE(append)) {
    data[select] <- NULL
  }

  # fin
  data
}


#' @keywords internal
.fix_separated_columns <- function(
  separated_columns,
  fill,
  extra,
  n_cols,
  sep_column,
  verbose = TRUE
) {
  warn_extra <- warn_fill <- FALSE
  for (sc in seq_along(separated_columns)) {
    i <- separated_columns[[sc]]
    # determine number of values in separated column
    n_values <- length(i)
    if (all(is.na(i))) {
      # we have NA values - so fill everything with NA
      out <- rep(NA_character_, times = n_cols)
    } else if (n_values > n_cols) {
      # we have more values than required - drop extra columns
      out <- switch(
        extra,
        drop_left = i[(n_values - n_cols + 1):n_values],
        drop_right = i[1:n_cols],
        merge_left = {
          tmp <- paste(i[1:(n_values - n_cols + 1)], collapse = " ")
          c(tmp, i[(n_values - n_cols + 2):n_values])
        },
        {
          tmp <- i[1:(n_cols - 1)]
          c(tmp, paste(i[n_cols:n_values], collapse = " "))
        }
      )
      warn_extra <- TRUE
    } else if (n_values < n_cols) {
      # we have fewer values than required - fill columns
      out <- switch(
        fill,
        left = c(rep(NA_character_, times = n_cols - n_values), i),
        right = c(i, rep(NA_character_, times = n_cols - n_values)),
        value_left = c(rep(i[1], times = n_cols - n_values), i),
        c(i, rep(i[length(i)], times = n_cols - n_values))
      )
      warn_fill <- TRUE
    } else {
      out <- i
    }
    separated_columns[[sc]] <- out
  }

  if (verbose) {
    if (warn_extra) {
      insight::format_alert(paste0(
        "`",
        sep_column,
        "`",
        " returned more columns than expected after splitting. ",
        switch(
          extra,
          drop_left = "Left-most columns have been dropped.",
          drop_right = "Right-most columns have been dropped.",
          merge_left = "Left-most columns have been merged together.",
          merge_right = "Right-most columns have been merged together."
        )
      ))
    }
    if (warn_fill) {
      insight::format_alert(paste0(
        "`",
        sep_column,
        "`",
        "returned fewer columns than expected after splitting. ",
        switch(
          fill,
          left = "Left-most columns were filled with `NA`.",
          right = "Right-most columns were filled with `NA`.",
          value_left = "Left-most columns were filled with first value.",
          value_right = "Right-most columns were filled with last value."
        )
      ))
    }
  }

  separated_columns
}


================================================
FILE: R/data_summary.R
================================================
#' @title Summarize data
#' @name data_summary
#'
#' @description This function can be used to compute summary statistics for a
#' data frame or a matrix.
#'
#' @param x A (grouped) data frame.
#' @param by Optional character string, indicating the names of one or more
#' variables in the data frame. If supplied, the data will be split by these
#' variables and summary statistics will be computed for each group.
#' @param remove_na Logical. If `TRUE`, missing values are omitted from the
#' grouping variable. If `FALSE` (default), missing values are included as a
#' level in the grouping variable.
#' @param suffix Optional, suffixes to be added to the new variable names,
#' especially useful when a function returns several values (e.g. `quantile()`).
#' Can be:
#' * a character vector: all expressions in `...` must return the same number
#'    of values as elements in `suffix`.
#' * a list of named character vectors: the names of elements in `suffix` must
#'    match the names of the expressions. It is also allowed to specify suffixes
#'    for selected expressions only.
#'
#' The new column names are a combination of the left-hand side (i.e.,
#' the name) of the expression and the related suffixes. If `suffix = NULL` (the
#' default), and a summary expression returns multiple values, either the names
#' of the returned values (if any) or automatically numbered suffixes such as
#' `_1`, `_2`, etc. are used. See 'Examples'.
#' @param ... One or more named expressions that define the new variable name
#' and the function to compute the summary statistic. Example:
#' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
#' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`. The
#' summary function `n()` can be used to count the number of observations.
#'
#' @return A data frame with the requested summary statistics.
#'
#' @examples
#' data(iris)
#' data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
#' data_summary(
#'   iris,
#'   MW = mean(Sepal.Width),
#'   SD = sd(Sepal.Width),
#'   by = "Species"
#' )
#'
#' # same as
#' d <- data_group(iris, "Species")
#' data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
#'
#' # multiple groups
#' data(mtcars)
#' data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
#'
#' # expressions can also be supplied as character strings
#' data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear"))
#'
#' # count observations within groups
#' data_summary(mtcars, observations = n(), by = c("am", "gear"))
#'
#' # first and last observations of "mpg" within groups
#' data_summary(
#'   mtcars,
#'   first = mpg[1],
#'   last = mpg[length(mpg)],
#'   by = c("am", "gear")
#' )
#'
#' # allow more than one-column-summaries for expressions
#' d <- data.frame(
#'   x = rnorm(100, 1, 1),
#'   y = rnorm(100, 2, 2),
#'   groups = rep(1:4, each = 25)
#' )
#'
#' # since we have multiple columns for one expression, the names of the
#' # returned summary results are used as suffix by default
#' data_summary(
#'   d,
#'   quant_x = quantile(x, c(0.25, 0.75)),
#'   mean_x = mean(x),
#'   quant_y = quantile(y, c(0.25, 0.5, 0.75))
#' )
#'
#' # if a summary function, like `fivenum()`, returns no named vector, suffixes
#' # are automatically numbered
#' data_summary(
#'   d,
#'   quant_x = quantile(x, c(0.25, 0.75)),
#'   mean_x = mean(x),
#'   fivenum_y = fivenum(y)
#' )
#'
#' # specify column suffix for expressions, matching by names
#' data_summary(
#'   d,
#'   quant_x = quantile(x, c(0.25, 0.75)),
#'   mean_x = mean(x),
#'   quant_y = quantile(y, c(0.25, 0.5, 0.75)),
#'   suffix = list(quant_y = c("_Q1", "_Q2", "_Q3"))
#' )
#'
#' # name multiple expression suffixes, grouped by variable
#' data_summary(
#'   d,
#'   quant_x = quantile(x, c(0.25, 0.75)),
#'   mean_x = mean(x),
#'   quant_y = quantile(y, c(0.25, 0.5, 0.75)),
#'   suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q3")),
#'   by = "groups"
#' )
#'
#' @export
data_summary <- function(x, ...) {
  UseMethod("data_summary")
}


#' @export
data_summary.matrix <- function(
  x,
  ...,
  by = NULL,
  remove_na = FALSE,
  suffix = NULL
) {
  data_summary(
    as.data.frame(x),
    ...,
    by = by,
    remove_na = remove_na,
    suffix = suffix
  )
}


#' @export
data_summary.default <- function(x, ...) {
  insight::format_error(
    "`data_summary()` only works for (grouped) data frames and matrices."
  )
}


#' @rdname data_summary
#' @export
data_summary.data.frame <- function(
  x,
  ...,
  by = NULL,
  remove_na = FALSE,
  suffix = NULL
) {
  dots <- eval(substitute(alist(...)))

  # do we have any expression at all?
  if (length(dots) == 0) {
    insight::format_error(
      "No expressions for calculating summary statistics provided."
    )
  }

  if (is.null(by)) {
    # when we have no grouping, just compute a one-row summary
    summarise <- .process_datasummary_dots(dots, x, suffix)
    # coerce to data frame
    out <- as.data.frame(t(summarise))
    colnames(out) <- names(summarise)
  } else {
    # sanity check - is "by" a character string?
    if (!is.character(by)) {
      insight::format_error(
        "Argument `by` must be a character string indicating the name of variables in the data."
      )
    }
    # is "by" in the data?
    if (!all(by %in% colnames(x))) {
      by_not_found <- by[!by %in% colnames(x)]
      insight::format_error(
        paste0(
          "Variable",
          ifelse(length(by_not_found) > 1, "s ", " "),
          text_concatenate(by_not_found, enclose = "\""),
          " not found in the data."
        ),
        .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
      )
    }
    # split data, add NA levels, if requested
    l <- lapply(x[by], function(i) {
      if (remove_na || !anyNA(i)) {
        i
      } else {
        addNA(i)
      }
    })
    split_data <- split(x, l, drop = TRUE)
    out <- lapply(split_data, function(s) {
      # no data for combination? Return NULL
      if (nrow(s) == 0) {
        return(NULL)
      }
      # summarize data
      summarise <- .process_datasummary_dots(dots, s, suffix)
      # coerce to data frame
      summarised_data <- as.data.frame(t(summarise))
      # bind grouping-variables and values
      summarised_data <- cbind(s[1, by], summarised_data)
      # make sure we have proper column names
      colnames(summarised_data) <- c(by, names(summarise))
      summarised_data
    })
    # check for correct number of columns. If one expression returns different
    # number of values (which now means, we have different number of columns
    # to bind) for each group, tell user
    if (!all(lengths(out) == lengths(out)[1])) {
      insight::format_error(
        "Each expression must return the same number of values for each group. Some of the expressions seem to return varying numbers of values."
      )
    }
    out <- do.call(rbind, out)
  }
  # sort data
  out <- data_arrange(out, select = by)
  # data attributes
  class(out) <- c("dw_data_summary", "data.frame")
  rownames(out) <- NULL
  out
}


#' @export
data_summary.grouped_df <- function(
  x,
  ...,
  by = NULL,
  remove_na = FALSE,
  suffix = NULL
) {
  # extract group variables
  grps <- attr(x, "groups", exact = TRUE)
  group_variables <- data_remove(grps, ".rows")
  # if "by" is not supplied, use group variables
  if (is.null(by)) {
    by <- colnames(group_variables)
  }
  # remove information specific to grouped df's
  attr(x, "groups") <- NULL
  class(x) <- "data.frame"
  data_summary(x, ..., by = by, remove_na = remove_na, suffix = suffix)
}


# helper -----------------------------------------------------------------------

.process_datasummary_dots <- function(dots, data, suffix = NULL) {
  out <- NULL
  if (length(dots)) {
    # we check for character vector of expressions, in which case
    # "dots" should be unnamed
    if (is.null(names(dots))) {
      # if we have multiple strings, concatenate them to a character vector
      # and put it into a list...
      if (length(dots) > 1) {
        if (all(vapply(dots, is.character, logical(1)))) {
          dots <- list(unlist(dots))
        } else {
          insight::format_error(
            "You cannot mix string and literal representation of expressions."
          )
        }
      }
      # expression is given as character string, e.g.
      # a <- "mean_sepwid = mean(Sepal.Width)"
      # data_summary(iris, a, by = "Species")
      # or as character vector, e.g.
      # data_summary(iris, c("var_a = mean(Sepal.Width)", "var_b = sd(Sepal.Width)"))
      character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) {
        NULL
      })
      # do we have a character vector? Then we can proceed
      if (is.character(character_symbol)) {
        dots <- lapply(character_symbol, function(s) {
          # turn value from character vector into expression
          str2lang(.dynEval(s))
        })
        names(dots) <- vapply(
          dots,
          function(n) insight::safe_deparse(n[[2]]),
          character(1)
        )
      }
    }

    # sanity check: check the input for the `suffix` argument
    # `suffix` can be NULL, or must be a (named) list
    if (!is.null(suffix)) {
      # if `suffix` is a character vector, we transform it into a list,
      # matching the names of the expressions
      if (is.character(suffix)) {
        suffix <- rep(list(suffix), length(dots))
        names(suffix) <- names(dots)
      }
      # no list? error
      if (!is.list(suffix)) {
        insight::format_error(
          "Argument `suffix` must be a list of (named) character vectors, where the names match the names of the expressions, e.g.:",
          paste0(
            "`suffix = list(",
            names(dots)[1],
            " = c(\"_suffix1\", \"_suffix2\")`."
          )
        )
      }
      # not all elements named? error
      if (!length(which(nzchar(names(suffix), keepNA = TRUE)))) {
        insight::format_error("All elements of `suffix` must have names.")
      }
      # names of suffix do not match names of expressions? error
      if (!all(names(suffix) %in% names(dots))) {
        wrong_name <- which(!names(suffix) %in% names(dots))[1]
        insight::format_error(
          paste0(
            "Names of `suffix` must match the names of the expressions. Suffix `",
            names(suffix)[wrong_name],
            "` has no corresponding expression."
          )
        )
      }
      # identical suffixes for one expression? error
      identical_suffix <- vapply(
        suffix,
        function(i) insight::n_unique(i) != length(i),
        logical(1)
      )
      if (any(identical_suffix)) {
        insight::format_error(
          paste0(
            "All suffixes for a single expression must be unique. Suffix for element `",
            names(identical_suffix)[which(identical_suffix)][1],
            "` has duplicate values."
          )
        )
      }
    }

    out <- lapply(seq_along(dots), function(i) {
      new_variable <- .get_new_dots_variable(dots, i, data)
      # check special case here - we want bayestestR::ci to work with
      # data summary, to easily create CIs for, say, posterior draws
      if (inherits(new_variable, c("bayestestR_ci", "bayestestR_eti"))) {
        stats::setNames(new_variable, c("CI", "CI_low", "CI_high"))
      } else {
        # init
        current_suffix <- NULL
        # find matches and set use suffix if found
        matching_names <- which(names(suffix) == names(dots)[i])
        # either use suffixes based on matching names, or try to extract
        # names from the returned summary expression (saved in "new_variable"),
        # if the summary function returned a named vector
        if (length(matching_names) > 0) {
          current_suffix <- suffix[[matching_names]]
        } else if (
          length(new_variable) > 1 &&
            all(nzchar(names(new_variable), keepNA = TRUE))
        ) {
          current_suffix <- names(new_variable)
        }
        # if we don't have suffixes for multiple columns, but expression
        # returns multiple columns, we get NA column names - we use
        # automatically numbered suffixes in this case
        if (is.null(current_suffix) && length(new_variable) > 1) {
          current_suffix <- paste0("_", seq_along(new_variable))
        }

        # if number of suffixes does not match the number of returned values
        # by the expression, error
        if (
          !is.null(current_suffix) &&
            length(current_suffix) != length(new_variable)
        ) {
          insight::format_error(
            paste0(
              "Argument `suffix` must have the same length as the result of the corresponding summary expression. `suffix` has ",
              length(current_suffix),
              " elements (",
              text_concatenate(current_suffix, enclose = "`"),
              ") for the expression `",
              insight::safe_deparse(dots[[i]]),
              "`, which returned ",
              length(new_variable),
              " values."
            )
          )
        }
        stats::setNames(new_variable, paste0(names(dots)[i], current_suffix))
      }
    })
  }

  unlist(out)
}


# methods ----------------------------------------------------------------------

#' @export
print.dw_data_summary <- function(x, ...) {
  if (nrow(x) == 0) {
    cat("No matches found.\n")
  } else {
    if (all(c("CI", "CI_low", "CI_high") %in% colnames(x))) {
      ci <- insight::format_table(x[c("CI", "CI_low", "CI_high")], ...)
      x$CI <- x$CI_low <- x$CI_high <- NULL
      x <- cbind(x, ci)
    }
    cat(insight::export_table(x, missing = "<NA>", ...))
  }
}


================================================
FILE: R/data_tabulate.R
================================================
#' @title Create frequency and crosstables of variables
#' @name data_tabulate
#'
#' @description This function creates frequency or crosstables of variables,
#' including the number of levels/values as well as the distribution of raw,
#' valid and cumulative percentages. For crosstables, row, column and cell
#' percentages can be calculated.
#'
#' @param x A (grouped) data frame, a vector or factor.
#' @param by Optional vector or factor. If supplied, a crosstable is created.
#' If `x` is a data frame, `by` can also be a character string indicating the
#' name of a variable in `x`.
#' @param drop_levels Logical, if `FALSE`, factor levels that do not occur in
#' the data are included in the table (with frequency of zero), else unused
#' factor levels are dropped from the frequency table.
#' @param name Optional character string, which includes the name that is used
#' for printing.
#' @param remove_na Logical, if `FALSE`, missing values are included in the
#' frequency or crosstable, else missing values are omitted. Note that the
#' default for the `as.table()` method is `remove_na = TRUE`, so that missing
#' values are not included in the returned table, which makes more sense for
#' post-processing of the table, e.g. using `chisq.test()`.
#' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
#' table for printing. This affects only printing, not the returned object.
#' @param weights Optional numeric vector of weights. Must be of the same length
#' as `x`. If `weights` is supplied, weighted frequencies are calculated.
#' @param proportions Optional character string, indicating the type of
#' percentages to be calculated. Only applies to crosstables, i.e. when `by` is
#' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages)
#' or `"full"` (to calculate relative frequencies for the full table).
#' @param big_mark Optional character string, indicating the big mark that is
#' used for large numbers. If `NULL` (default), a big mark is added automatically for
#' large numbers (i.e. numbers with more than 5 digits). If you want to remove
#' the big mark, set `big_mark = ""`.
#' @param object An object returned by `data_tabulate()`.
#' @param format String, indicating the output format. Can be `"markdown"`
#' `"html"`, or `"tt"`. `format = "html"` create an HTML table using the *gt*
#' package. `format = "tt"` creates a `tinytable` object, which is either
#' printed as markdown or HTML table, depending on the environment. See
#' [`insight::export_table()`] for details.
#' @param verbose Toggle warnings and messages.
#' @param ... not used.
#' @inheritParams extract_column_names
#'
#' @details
#' There is an `as.data.frame()` method, to return the frequency tables as a
#' data frame. The structure of the returned object is a nested data frame,
#' where the first column contains name of the variable for which frequencies
#' were calculated, and the second column is a list column that contains the
#' frequency tables as data frame. See [as.table.datawizard_table].
#'
#' There is also an `as.table()` method, which returns a table object with the
#' frequencies of the variable. This is useful for further statistical analysis,
#' e.g. for using `chisq.test()` on the frequency table. See
#' [as.table.datawizard_table].
#'
#' @section Crosstables:
#' If `by` is supplied, a crosstable is created. The crosstable includes `<NA>`
#' (missing) values by default. The first column indicates values of `x`, the
#' first row indicates values of `by` (including missing values). The last row
#' and column contain the total frequencies for each row and column, respectively.
#' Setting `remove_na = FALSE` will omit missing values from the crosstable.
#' Setting `proportions` to `"row"` or `"column"` will add row or column
#' percentages. Setting `proportions` to `"full"` will add relative frequencies
#' for the full table.
#'
#' @note
#' There are `print_html()` and `print_md()` methods available for printing
#' frequency or crosstables in HTML and markdown format, e.g.
#' `print_html(data_tabulate(x))`. The `print()` method for text outputs passes
#' arguments in `...` to [`insight::export_table()`].
#'
#' @return A data frame, or a list of data frames, with one frequency table
#' as data frame per variable.
#'
#' @seealso [as.prop.table]
#'
#' @examplesIf requireNamespace("poorman")
#' # frequency tables -------
#' # ------------------------
#' data(efc)
#'
#' # vector/factor
#' data_tabulate(efc$c172code)
#'
#' # drop missing values
#' data_tabulate(efc$c172code, remove_na = TRUE)
#'
#' # data frame
#' data_tabulate(efc, c("e42dep", "c172code"))
#'
#' # grouped data frame
#' suppressPackageStartupMessages(library(poorman, quietly = TRUE))
#' efc %>%
#'   group_by(c172code) %>%
#'   data_tabulate("e16sex")
#'
#' # collapse tables
#' efc %>%
#'   group_by(c172code) %>%
#'   data_tabulate("e16sex", collapse = TRUE)
#'
#' # for larger N's (> 100000), a big mark is automatically added
#' set.seed(123)
#' x <- sample(1:3, 1e6, TRUE)
#' data_tabulate(x, name = "Large Number")
#'
#' # to remove the big mark, use "print(..., big_mark = "")"
#' print(data_tabulate(x), big_mark = "")
#'
#' # weighted frequencies
#' set.seed(123)
#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
#' data_tabulate(efc$e42dep, weights = efc$weights)
#'
#' # crosstables ------
#' # ------------------
#'
#' # add some missing values
#' set.seed(123)
#' efc$e16sex[sample.int(nrow(efc), 5)] <- NA
#'
#' data_tabulate(efc, "c172code", by = "e16sex")
#'
#' # add row and column percentages
#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "row")
#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
#'
#' # omit missing values
#' data_tabulate(
#'   efc$c172code,
#'   by = efc$e16sex,
#'   proportions = "column",
#'   remove_na = TRUE
#' )
#'
#' # round percentages
#' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
#' print(out, digits = 0)
#' @export
data_tabulate <- function(x, ...) {
  UseMethod("data_tabulate")
}


#' @rdname data_tabulate
#' @export
data_tabulate.default <- function(
  x,
  by = NULL,
  drop_levels = FALSE,
  weights = NULL,
  remove_na = FALSE,
  proportions = NULL,
  name = NULL,
  verbose = TRUE,
  ...
) {
  # save label attribute, before it gets lost...
  var_label <- attr(x, "label", exact = TRUE)

  # save and fix variable name, check for grouping variable
  obj_name <- tryCatch(
    insight::safe_deparse(substitute(x)),
    error = function(e) NULL
  )
  if (identical(obj_name, "x[[i]]")) {
    obj_name <- name
  }
  group_variable <- list(...)$group_variable

  # check whether levels not present in data should be shown or not
  if (is.factor(x) && isTRUE(drop_levels)) {
    x <- droplevels(x)
  }

  # validate "weights"
  weights <- .validate_table_weights(
    weights,
    x,
    weights_expression = insight::safe_deparse(substitute(weights))
  )

  # we go into another function for crosstables here...
  if (!is.null(by)) {
    by <- .validate_by(by, x)
    return(.crosstable(
      x,
      by = by,
      weights = weights,
      remove_na = remove_na,
      proportions = proportions,
      obj_name = obj_name,
      group_variable = group_variable
    ))
  }

  # frequency table
  if (is.null(weights)) {
    if (remove_na) {
      # we have a `.default` and a `.data.frame` method for `data_tabulate()`.
      # since this is the default, `x` can be an object which cannot be used
      # with `table()`, that's why we add `tryCatch()` here. Below we give an
      # informative error message for non-supported objects.
      freq_table <- tryCatch(table(x), error = function(e) NULL)
    } else {
      freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
    }
  } else if (remove_na) {
    # weighted frequency table, excluding NA
    freq_table <- tryCatch(
      stats::xtabs(
        weights ~ x,
        data = data.frame(weights = weights, x = x),
        na.action = stats::na.omit,
        addNA = FALSE
      ),
      error = function(e) NULL
    )
  } else {
    # weighted frequency table, including NA
    freq_table <- tryCatch(
      stats::xtabs(
        weights ~ x,
        data = data.frame(weights = weights, x = addNA(x)),
        na.action = stats::na.pass,
        addNA = TRUE
      ),
      error = function(e) NULL
    )
  }

  if (is.null(freq_table)) {
    insight::format_warning(paste0(
      "Can't compute frequency tables for objects of class `",
      class(x)[1],
      "`."
    ))
    return(NULL)
  }

  # create data frame with freq table and cumulative percentages etc.
  out <- data_rename(
    data.frame(freq_table, stringsAsFactors = FALSE),
    replacement = c("Value", "N")
  )

  # we want to round N for weighted frequencies
  if (!is.null(weights)) {
    out$N <- round(out$N)
  }

  out$`Raw %` <- 100 * out$N / sum(out$N)
  # if we have missing values, we add a row with NA
  if (remove_na) {
    out$`Valid %` <- 100 * out$N / sum(out$N)
    valid_n <- sum(out$N, na.rm = TRUE)
  } else {
    out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
    valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE)
  }
  out$`Cumulative %` <- cumsum(out$`Valid %`)

  # add information about variable/group names
  if (!is.null(obj_name)) {
    if (is.null(group_variable)) {
      var_info <- data.frame(Variable = obj_name, stringsAsFactors = FALSE)
    } else {
      var_info <- data.frame(
        Variable = obj_name,
        Group = toString(lapply(colnames(group_variable), function(i) {
          sprintf("%s (%s)", i, group_variable[[i]])
        })),
        stringsAsFactors = FALSE
      )
    }
    out <- cbind(var_info, out)
  }

  # save information
  attr(out, "type") <- .variable_type(x)
  attr(out, "varname") <- name
  attr(out, "label") <- var_label
  attr(out, "object") <- obj_name
  attr(out, "group_variable") <- group_variable
  attr(out, "duplicate_varnames") <- duplicated(out$Variable)
  attr(out, "weights") <- weights

  attr(out, "total_n") <- sum(out$N, na.rm = TRUE)
  attr(out, "valid_n") <- valid_n

  class(out) <- c("datawizard_table", "data.frame")

  out
}


#' @rdname data_tabulate
#' @export
data_tabulate.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  by = NULL,
  drop_levels = FALSE,
  weights = NULL,
  remove_na = FALSE,
  proportions = NULL,
  collapse = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # validate "by"
  by <- .validate_by(by, x)
  # validate "weights"
  weights <- .validate_table_weights(weights, x)

  out <- lapply(select, function(i) {
    data_tabulate(
      x[[i]],
      by = by,
      proportions = proportions,
      drop_levels = drop_levels,
      weights = weights,
      remove_na = remove_na,
      name = i,
      verbose = verbose,
      ...
    )
  })

  if (is.null(by)) {
    class(out) <- c("datawizard_tables", "list")
  } else {
    class(out) <- c("datawizard_crosstabs", "list")
  }
  attr(out, "collapse") <- isTRUE(collapse)
  attr(out, "is_weighted") <- !is.null(weights)

  out
}


#' @export
data_tabulate.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  by = NULL,
  proportions = NULL,
  drop_levels = FALSE,
  weights = NULL,
  remove_na = FALSE,
  collapse = FALSE,
  verbose = TRUE,
  ...
) {
  grps <- attr(x, "groups", exact = TRUE)
  group_variables <- data_remove(grps, ".rows")
  grps <- grps[[".rows"]]

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  x <- as.data.frame(x)

  out <- list()
  for (i in seq_along(grps)) {
    rows <- grps[[i]]
    # save information about grouping factors
    if (is.null(group_variables)) {
      group_variable <- NULL
    } else {
      group_variable <- group_variables[i, , drop = FALSE]
    }
    out <- c(
      out,
      data_tabulate(
        data_filter(x, rows),
        select = select,
        exclude = exclude,
        ignore_case = ignore_case,
        verbose = verbose,
        drop_levels = drop_levels,
        weights = weights,
        remove_na = remove_na,
        by = by,
        proportions = proportions,
        group_variable = group_variable,
        ...
      )
    )
  }
  if (is.null(by)) {
    class(out) <- c("datawizard_tables", "list")
  } else {
    class(out) <- c("datawizard_crosstabs", "list")
  }
  attr(out, "collapse") <- isTRUE(collapse)
  attr(out, "is_weighted") <- !is.null(weights)

  out
}


# methods --------------------

#' @importFrom insight print_html
#' @export
insight::print_html


#' @importFrom insight print_md
#' @export
insight::print_md


#' @importFrom insight display
#' @export
insight::display

#' Convert a crosstable to a frequency or a propensity table
#'
#' @description
#' `as.prop.table()` is an S3 generic. It can be used on objects of class
#' `datawizard_crosstab` created by `data_tabulate()` when it was run with the
#' arguments `by` and `proportions`.
#'
#' @param x An object created by `data_tabulate()`. It must be of class
#' `datawizard_crosstab` for `as.prop.table()`.
#' @param simplify Logical, if `TRUE`, the returned table is simplified to a
#' single table object if there is only one frequency or contingency table
#' input. Else, always for multiple table inputs or when `simplify = FALSE`, a
#' list of tables is returned. This is only relevant for the `as.table()`
#' methods. To ensure consistent output, the default is `FALSE`.
#' @inheritParams data_tabulate
#'
#' @export
#' @seealso [data_tabulate]
#'
#' @examples
#' data(efc)
#'
#' # Some cross tabulation
#' cross <- data_tabulate(efc, select = "e42dep", by = "c172code", proportions = "row")
#' cross
#'
#' # Convert to a propensity table
#' as.prop.table(cross)
#'
#' # Convert to data.frame
#' result <- data_tabulate(efc, "c172code", by = "e16sex")
#' as.data.frame(result)
#' as.data.frame(result)$table
#' as.data.frame(result, add_total = TRUE)$table
#'
#' # Convert to a table that can be passed to chisq.test()
#'
#' out <- data_tabulate(efc, "c172code", by = "e16sex")
#' # we need to simplify the output, else we get a list of tables
#' tbl <- as.table(out, simplify = TRUE)
#' tbl
#' suppressWarnings(chisq.test(tbl))
#'
#' # apply chisq.test to each table
#' out <- data_tabulate(efc, c("c172code", "e16sex"))
#' suppressWarnings(lapply(as.table(out), chisq.test))
#'
#' # can also handle grouped data frames
#' d <- data_group(mtcars, "am")
#' x <- data_tabulate(d, "cyl", by = "gear")
#' as.table(x)
as.prop.table <- function(x, ...) {
  UseMethod("as.prop.table")
}

#' @rdname as.prop.table
#' @export
as.prop.table.datawizard_crosstab <- function(
  x,
  remove_na = TRUE,
  simplify = FALSE,
  verbose = TRUE,
  ...
) {
  # sanity check - the `.data.frame` method  returns a list, but not the
  # default method
  if (!is.data.frame(x)) {
    x <- x[[1]]
  }
  prop_table <- attributes(x)$prop_table

  if (is.null(prop_table)) {
    insight::format_warning("No proportions available.")
    return(NULL)
  }

  if (remove_na) {
    if (
      verbose &&
        ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table))
    ) {
      insight::format_alert("Removing NA values from frequency table.")
    }
    if (!is.null(prop_table[["NA"]])) {
      prop_table[["NA"]] <- NULL
    }
    if ("NA" %in% rownames(prop_table)) {
      prop_table <- prop_table[rownames(prop_table) != "NA", ]
    }
  }
  # coerce to table
  result <- as.table(as.matrix(prop_table))
  # if we don't want to simplify the table, we wrap it into a list
  if (!simplify) {
    result <- list(result)
  }

  result
}

#' @export
as.prop.table.datawizard_crosstabs <- function(
  x,
  remove_na = TRUE,
  simplify = FALSE,
  verbose = TRUE,
  ...
) {
  # only show message once we set `verbose = FALSE` in the lapply()
  if (remove_na && verbose) {
    prop_table <- attributes(x[[1]])$prop_table
    if ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table)) {
      insight::format_alert("Removing NA values from frequency table.")
    }
  }

  out <- insight::compact_list(lapply(
    x,
    as.prop.table.datawizard_crosstab,
    remove_na = remove_na,
    simplify = TRUE,
    verbose = FALSE,
    ...
  ))

  # if no proportions found, return NULL
  if (!length(out)) {
    return(NULL)
  }

  # if only one table is returned, "unlist"
  if (length(out) == 1 && simplify) {
    out <- out[[1]]
  }
  out
}


# as.data.frame --------------------

#' @rdname as.prop.table
#' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and
#' column with the total N values are added to the data frame. `add_total` has
#' no effect in `as.data.frame()` for simple frequency tables.
#' @inheritParams base::as.data.frame
#' @export
as.data.frame.datawizard_tables <- function(
  x,
  row.names = NULL,
  optional = FALSE,
  ...,
  stringsAsFactors = FALSE,
  add_total = FALSE
) {
  # extract variables of frequencies
  selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname))
  # coerce to data frame, remove rownames
  data_frames <- lapply(x, function(i) {
    # the `format()` methods for objects returned by `data_tabulate()` call
    # `as.data.frame()` - we have to pay attention to avoid infinite iterations
    # here. At the moment, this is no problem, as objects we have at this stage
    # are of class "datawizard_table" or "datawizard_crosstab", while this
    # `as.data.frame()` method is only called for "datawizard_tables" (the plural)
    # form). Else, we would need to modify the class attribute here,
    # e.g. class(i) <- "data.frame"
    if (add_total) {
      # to add the total column and row, we simply can call `format()`
      out <- as.data.frame(format(i))
      for (cols in 2:ncol(out)) {
        # since "format()" returns a character matrix, we want to convert
        # the columns to numeric. We have to exclude the first column, as the
        # first column is character, due to the added "Total" value.
        out[[cols]] <- as.numeric(out[[cols]])
      }
      # after formatting, we have a "separator" row for nicer printing.
      # this should also be removed
      out <- remove_empty_rows(out)
    } else {
      out <- as.data.frame(i)
    }
    rownames(out) <- NULL
    out
  })
  # create nested data frame
  result <- data.frame(
    var = selected_vars,
    table = I(data_frames),
    stringsAsFactors = stringsAsFactors
  )
  # consider additional arguments
  rownames(result) <- row.names
  result
}

#' @export
as.data.frame.datawizard_crosstabs <- as.data.frame.datawizard_tables


# as.table --------------------

#' @rdname as.prop.table
#' @export
as.table.datawizard_table <- function(
  x,
  remove_na = TRUE,
  simplify = FALSE,
  verbose = TRUE,
  ...
) {
  # sanity check - the `.data.frame` method (data_tabulate(mtcars, "cyl"))
  # returns a list, but not the default method (data_tabulate(mtcars$cyl))
  if (!is.data.frame(x)) {
    x <- x[[1]]
  }
  # check if any table has NA values - the column "Value" contains the value
  # "NA", and the column "N" contains the frequency of this value.
  if (remove_na) {
    # .check_table_na() works on lists of data frames, so we wrap the data frame
    # into a list here
    if (verbose && .check_table_na(list(x))) {
      insight::format_alert("Removing NA values from frequency table.")
    }
    # remove NA values from the table
    x <- x[!is.na(x$Value), , drop = FALSE]
  }
  # coerce to table
  result <- as.table(stats::setNames(x[["N"]], x$Value))
  # if we don't want to simplify the table, we wrap it into a list
  if (!simplify) {
    result <- list(result)
  }

  result
}

#' @export
as.table.datawizard_tables <- function(
  x,
  remove_na = TRUE,
  simplify = FALSE,
  verbose = TRUE,
  ...
) {
  # only show message once we set `verbose = FALSE` in the lapply()
  if (remove_na && verbose && .check_table_na(x)) {
    insight::format_alert("Removing NA values from frequency table.")
  }

  out <- lapply(
    x,
    as.table.datawizard_table,
    remove_na = remove_na,
    # no nested lists
    simplify = TRUE,
    # no multiple messages
    verbose = FALSE,
    ...
  )
  # if only one table is returned, "unlist"
  if (length(out) == 1 && simplify) {
    out <- out[[1]]
  }
  out
}

#' @export
as.table.datawizard_crosstab <- function(
  x,
  remove_na = TRUE,
  simplify = FALSE,
  verbose = TRUE,
  ...
) {
  # sanity check - the `.data.frame` method  returns a list, but not the
  # default method
  if (!is.data.frame(x)) {
    x <- x[[1]]
  }
  # check for grouped df - we need to remove the "Group" column
  if (.is_grouped_df_xtab(x)) {
    x$Group <- NULL
  }
  # first column contains the row names
  row_names <- as.character(x[[1]])
  row_names[is.na(row_names)] <- "NA"
  # remove first column, set rownames
  x[[1]] <- NULL
  rownames(x) <- row_names

  if (remove_na) {
    if (verbose && .check_xtable_na(list(x))) {
      insight::format_alert("Removing NA values from frequency table.")
    }
    if (!is.null(x[["NA"]])) {
      x[["NA"]] <- NULL
    }
    if ("NA" %in% row_names) {
      x <- x[row_names != "NA", ]
    }
  }
  # coerce to table
  result <- as.table(as.matrix(x))
  # if we don't want to simplify the table, we wrap it into a list
  if (!simplify) {
    result <- list(result)
  }

  result
}

#' @export
as.table.datawizard_crosstabs <- function(
  x,
  remove_na = TRUE,
  simplify = FALSE,
  verbose = TRUE,
  ...
) {
  # only show message once we set `verbose = FALSE` in the lapply()
  if (remove_na && verbose && .check_xtable_na(x)) {
    insight::format_alert("Removing NA values from frequency table.")
  }

  out <- lapply(
    x,
    as.table.datawizard_crosstab,
    remove_na = remove_na,
    simplify = TRUE,
    verbose = FALSE,
    ...
  )
  # if only one table is returned, "unlist"
  if (length(out) == 1 && simplify) {
    out <- out[[1]]
  }
  # if we have a grouped data frame, we save the grouping values as
  # names for the list
  if (.is_grouped_df_xtab(x)) {
    names(out) <- unlist(
      lapply(x, function(i) {
        i$Group[1]
      }),
      use.names = FALSE
    )
  }
  out
}


.is_grouped_df_xtab <- function(x) {
  if (!is.data.frame(x)) {
    x <- x[[1]]
  }
  isTRUE(attributes(x)$grouped_df)
}


.check_table_na <- function(x) {
  # check if any table has NA values - the column "Value" contains the value
  # "NA", and the column "N" contains the frequency of this value.
  any(vapply(x, function(i) any(i$N[is.na(i$Value)] > 0), logical(1)))
}


.check_xtable_na <- function(x) {
  any(vapply(
    x,
    function(i) {
      # need to extract rownames, to check if we have a "NA" row
      row_names <- as.character(i[[1]])
      row_names[is.na(row_names)] <- "NA"
      has_na <- FALSE
      # check for NA columns and rows
      if (!is.null(i[["NA"]])) {
        has_na <- any(i[["NA"]] > 0)
      }
      if ("NA" %in% row_names) {
        # for grouped data frames, we need to remove the "Group" column, else
        # the indexing -1 below won't work
        if (.is_grouped_df_xtab(i)) {
          i$Group <- NULL
        }
        # we need "as.data.frame()" for grouped df, else `as.vector()` fails
        has_na <- has_na |
          any(as.vector(as.data.frame(i[row_names == "NA", -1])) > 0)
      }
      has_na
    },
    logical(1)
  ))
}


# format --------------------

#' @export
format.datawizard_table <- function(x, format = "text", big_mark = NULL, ...) {
  # convert to character manually, else, for large numbers,
  # format_table() returns scientific notation
  x <- as.data.frame(x)
  x$N <- as.character(x$N)

  # format data frame
  ftab <- insight::format_table(x, ...)
  ftab[] <- lapply(ftab, function(i) {
    i[i == ""] <- ifelse(identical(format, "text"), "<NA>", "(NA)") # nolint
    i
  })
  ftab$N <- gsub("\\.00$", "", ftab$N)

  # insert big marks?
  ftab$N <- .add_commas_in_numbers(ftab$N, big_mark)

  ftab
}

.add_commas_in_numbers <- function(x, big_mark = NULL) {
  # sanity checks - for crosstables with `remove_na = FALSE`, nchar(x) fails,
  # and pretty() warns about non-numeric input. Thus, we skip if any NA value
  # is in `x`.
  if (anyNA(x)) {
    return(x)
  }
  # automatically add a big mark for large numbers
  if (is.null(big_mark) && any(nchar(x) > 5)) {
    big_mark <- ","
  }
  if (identical(big_mark, "")) {
    return(x)
  }
  if (!is.null(big_mark)) {
    x <- prettyNum(x, big.mark = big_mark)
  }

  x
}


# print --------------------

#' @rdname data_tabulate
#' @export
print.datawizard_table <- function(x, big_mark = NULL, ...) {
  a <- attributes(x)

  # "table" header with variable label/name, and type
  cat(.table_header(x, "text"))

  # grouped data? if yes, add information on grouping factor
  if (!is.null(a$group_variable)) {
    group_title <- paste0(
      "Grouped by ",
      toString(lapply(colnames(a$group_variable), function(i) {
        sprintf("%s (%s)", i, a$group_variable[[i]])
      }))
    )
    cat(insight::print_color(group_title, "blue"))
    cat("\n")
  }

  a$total_n <- .add_commas_in_numbers(a$total_n, big_mark)
  a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark)

  # summary of total and valid N (we may add mean/sd as well?)
  summary_line <- sprintf(
    "# total N=%s valid N=%s%s\n\n",
    a$total_n,
    a$valid_n,
    ifelse(is.null(a$weights), "", " (weighted)")
  )
  cat(insight::print_color(summary_line, "blue"))

  # remove information that goes into the header/footer
  x$Variable <- NULL
  x$Group <- NULL

  # print table
  cat(insight::export_table(
    format(x, big_mark = big_mark, ...),
    cross = "+",
    missing = "<NA>",
    ...
  ))
  invisible(x)
}


#' @export
print.datawizard_tables <- function(x, big_mark = NULL, ...) {
  # check if we have weights
  is_weighted <- isTRUE(attributes(x)$is_weighted)

  a <- attributes(x)
  if (!isTRUE(a$collapse) || length(x) == 1) {
    for (i in seq_along(x)) {
      print(x[[i]], big_mark = big_mark, ...)
      if (i < length(x)) cat("\n")
    }
  } else {
    x <- lapply(x, function(i) {
      i_attr <- attributes(i)
      i <- format(i, format = "text", big_mark = big_mark, ...)
      if (!is.null(i$Variable)) {
        i$Variable[i_attr$duplicate_varnames] <- ""
      }
      if (!is.null(i$Group)) {
        i$Group[i_attr$duplicate_varnames] <- ""
      }
      i[nrow(i) + 1, ] <- ""
      i
    })

    out <- do.call(rbind, x)
    if (is_weighted) {
      cat(insight::print_color("# Frequency Table (weighted)\n\n", "blue"))
    } else {
      cat(insight::print_color("# Frequency Table\n\n", "blue"))
    }

    # print table
    cat(insight::export_table(
      out,
      missing = "<NA>",
      cross = "+",
      empty_line = "-",
      ...
    ))
  }
}


# display --------------------

#' @rdname data_tabulate
#' @export
display.datawizard_table <- function(
  object,
  big_mark = NULL,
  format = "markdown",
  ...
) {
  format <- .display_default_format(format)

  fun_args <- list(
    x = object,
    big_mark = big_mark,
    ...
  )

  # print table in HTML or markdown format
  if (format %in% c("html", "tt")) {
    fun_args$backend <- format
    do.call(print_html, fun_args)
  } else {
    do.call(print_md, fun_args)
  }
}

#' @export
display.datawizard_tables <- display.datawizard_table

#' @export
display.datawizard_crosstab <- display.datawizard_table

#' @export
display.datawizard_crosstabs <- display.datawizard_table

.display_default_format <- function(format) {
  format <- getOption("easystats_display_format", format)
  insight::validate_argument(format, c("markdown", "html", "md", "tt"))
}


# print_html --------------------

#' @export
print_html.datawizard_table <- function(x, big_mark = NULL, ...) {
  .print_dw_table(x, format = "html", big_mark = big_mark, ...)
}


#' @export
print_html.datawizard_tables <- function(x, big_mark = NULL, ...) {
  # check if we have weights
  is_weighted <- isTRUE(attributes(x)$is_weighted)

  if (length(x) == 1) {
    print_html(x[[1]], big_mark = big_mark, ...)
  } else {
    x <- lapply(x, function(i) {
      i_attr <- attributes(i)
      i <- format(i, format = "html", big_mark = big_mark, ...)
      if (!is.null(i$Variable)) {
        i$Variable[i_attr$duplicate_varnames] <- ""
      }
      i
    })

    out <- do.call(rbind, x)

    # print table
    insight::export_table(
      out,
      missing = "<NA>",
      caption = ifelse(
        is_weighted,
        "Frequency Table (weighted)",
        "Frequency Table"
      ),
      format = .check_format_backend(...),
      group_by = "Group"
    )
  }
}


# print_md --------------------

#' @export
print_md.datawizard_table <- function(x, big_mark = NULL, ...) {
  .print_dw_table(x, format = "markdown", big_mark = big_mark, ...)
}


#' @export
print_md.datawizard_tables <- function(x, big_mark = NULL, ...) {
  # check if we have weights
  is_weighted <- isTRUE(attributes(x)$is_weighted)

  if (length(x) == 1) {
    print_md(x[[1]], big_mark = big_mark, ...)
  } else {
    x <- lapply(x, function(i) {
      i_attr <- attributes(i)
      i <- format(i, format = "markdown", big_mark = big_mark, ...)
      if (!is.null(i$Variable)) {
        i$Variable[i_attr$duplicate_varnames] <- ""
      }
      if (!is.null(i$Group)) {
        i$Group[i_attr$duplicate_varnames] <- ""
      }
      i[nrow(i) + 1, ] <- ""
      i
    })

    out <- do.call(rbind, x)

    # print table
    insight::export_table(
      out,
      missing = "(NA)",
      empty_line = "-",
      format = "markdown",
      title = ifelse(
        is_weighted,
        "Frequency Table (weighted)",
        "Frequency Table"
      )
    )
  }
}


# tools --------------------

.print_dw_table <- function(x, format = "markdown", big_mark = NULL, ...) {
  a <- attributes(x)

  # "table" header with variable label/name, and type
  caption <- .table_header(x, format)

  # summary of total and valid N (we may add mean/sd as well?)
  footer <- sprintf(
    "total N=%i valid N=%i%s%s",
    a$total_n,
    a$valid_n,
    ifelse(is.null(a$weights), "", " (weighted)"),
    ifelse(format == "markdown", "\n\n", "")
  )

  # remove information that goes into the header/footer
  x$Variable <- NULL
  x$Group <- NULL

  # this function is used by all four supported format, markdown, text, html
  # and tt (tinytable). For tt, we sometimes have format "html" and backend = "tt",
  # so we need to check for this special case
  backend <- switch(format, html = , tt = .check_format_backend(...), format)
  # print table
  insight::export_table(
    format(x, format = format, big_mark = big_mark, ...),
    title = caption,
    footer = footer,
    missing = "(NA)",
    format = backend
  )
}


# we allow exporting HTML format based on "gt" or "tinytable"
.check_format_backend <- function(...) {
  dots <- list(...)
  if (identical(dots$backend, "tt")) {
    "tt"
  } else {
    "html"
  }
}


.table_header <- function(x, format = "text") {
  a <- attributes(x)

  # assemble name, based on what information is available
  name <- NULL
  # fix object name
  if (identical(a$object, "x[[i]]")) {
    a$object <- NULL
  }
  if (!is.null(a$label)) {
    name <- a$label
    if (!is.null(a$varname)) {
      name <- paste0(name, " (", a$varname, ")")
    } else if (!is.null(a$object)) {
      name <- paste0(name, " (", a$object, ")")
    }
  } else if (!is.null(a$varname)) {
    name <- a$varname
    if (!is.null(a$object)) {
      name <- paste0(name, " (", a$object, ")")
    }
  }

  if (is.null(name) && !is.null(a$object)) {
    name <- a$object
  }

  # "table" header with variable label/name, and type
  if (identical(format, "text")) {
    out <- paste(
      insight::color_text(name, "red"),
      insight::color_text(sprintf("<%s>\n", a$type), "blue")
    )
  } else {
    out <- paste0(name, " (", a$type, ")")
  }

  out
}


.variable_type <- function(x) {
  if (is.ordered(x)) {
    vt <- "ord"
  } else if (is.factor(x)) {
    vt <- "fct"
  } else if (class(x)[1] == "Date") {
    vt <- "date"
  } else {
    vt <- switch(
      typeof(x),
      logical = "lgl",
      integer = "int",
      double = "dbl",
      character = "chr",
      complex = "cpl",
      closure = "fn",
      environment = "env",
      typeof(x)
    )
  }

  switch(
    vt,
    ord = "ordinal",
    fct = "categorical",
    dbl = "numeric",
    int = "integer",
    chr = "character",
    lbl = "labelled",
    cpl = "complex",
    lgl = "logical",
    vt
  )
}


================================================
FILE: R/data_to_long.R
================================================
#' @title Reshape (pivot) data from wide to long
#' @name data_to_long
#'
#' @description
#' This function "lengthens" data, increasing the number of rows and decreasing
#' the number of columns. This is a dependency-free base-R equivalent of
#' `tidyr::pivot_longer()`.
#'
#' @param data A data frame to convert to long format, so that it has more
#' rows and fewer columns after the operation.
#' @param names_to The name of the new column (variable) that will contain the
#' _names_ from columns in `select` as values, to identify the source of the
#' values. `names_to` can be a character vector with more than one column name,
#' in which case `names_sep` or `names_pattern` must be provided in order to
#' identify which parts of the column names go into newly created columns.
#' See also 'Examples'.
#' @param names_prefix A regular expression used to remove matching text from
#' the start of each variable name.
#' @param names_sep,names_pattern If `names_to` contains multiple values, this
#' argument controls how the column name is broken up. `names_pattern` takes a
#' regular expression containing matching groups, i.e. "()".
#' @param values_to The name of the new column that will contain the _values_ of
#' the columns in `select`.
#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA` in the
#' `values_to` column. This effectively converts explicit missing values to
#' implicit missing values, and should generally be used only when missing values
#' in data were created by its structure.
#' @param rows_to The name of the column that will contain the row names or row
#' numbers from the original data. If `NULL`, will be removed.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#' @param cols Identical to `select`. This argument is here to ensure compatibility
#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
#' is used.
#'
#' @inherit data_rename seealso
#'
#' @details
#' Reshaping data into long format usually means that the input data frame is
#' in _wide_ format, where multiple measurements taken on the same subject are
#' stored in multiple columns (variables). The long format stores the same
#' information in a single column, with each measurement per subject stored in
#' a separate row. The values of all variables that are not in `select` will
#' be repeated.
#'
#' The necessary information for `data_to_long()` is:
#'
#' - The columns that contain the repeated measurements (`select`).
#' - The name of the newly created column that will contain the names of the
#'   columns in `select` (`names_to`), to identify the source of the values.
#'   `names_to` can also be a character vector with more than one column name,
#'   in which case `names_sep` or `names_pattern` must be provided to specify
#'   which parts of the column names go into the newly created columns.
#' - The name of the newly created column that contains the values of the
#'   columns in `select` (`values_to`).
#'
#' In other words: repeated measurements that are spread across several columns
#' will be gathered into a single column (`values_to`), with the original column
#' names, that identify the source of the gathered values, stored in one or more
#' new columns (`names_to`).
#'
#' @return If a tibble was provided as input, `reshape_longer()` also returns a
#' tibble. Otherwise, it returns a data frame.
#'
#' @examplesIf all(insight::check_if_installed(c("psych", "tidyr"), quietly = TRUE))
#' wide_data <- setNames(
#'   data.frame(replicate(2, rnorm(8))),
#'   c("Time1", "Time2")
#' )
#' wide_data$ID <- 1:8
#' wide_data
#'
#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3))
#' # probably doesn't make much sense to mix "time" and "id"
#' data_to_long(wide_data)
#'
#' # Customizing the names
#' data_to_long(
#'   wide_data,
#'   select = c("Time1", "Time2"),
#'   names_to = "Timepoint",
#'   values_to = "Score"
#' )
#'
#' # Reshape multiple columns into long format.
#' mydat <- data.frame(
#'   age = c(20, 30, 40),
#'   sex = c("Female", "Male", "Male"),
#'   score_t1 = c(30, 35, 32),
#'   score_t2 = c(33, 34, 37),
#'   score_t3 = c(36, 35, 38),
#'   speed_t1 = c(2, 3, 1),
#'   speed_t2 = c(3, 4, 5),
#'   speed_t3 = c(1, 8, 6)
#' )
#' # The column names are split into two columns: "type" and "time". The
#' # pattern for splitting column names is provided in `names_pattern`. Values
#' # of all "score_*" and "speed_*" columns are gathered into a single column
#' # named "count".
#' data_to_long(
#'   mydat,
#'   select = 3:8,
#'   names_to = c("type", "time"),
#'   names_pattern = "(score|speed)_t(\\d+)",
#'   values_to = "count"
#' )
#'
#' # Full example
#' # ------------------
#' data <- psych::bfi # Wide format with one row per participant's personality test
#'
#' # Pivot long format
#' very_long_data <- data_to_long(data,
#'   select = regex("\\d"), # Select all columns that contain a digit
#'   names_to = "Item",
#'   values_to = "Score",
#'   rows_to = "Participant"
#' )
#' head(very_long_data)
#'
#' even_longer_data <- data_to_long(
#'   tidyr::who,
#'   select = new_sp_m014:newrel_f65,
#'   names_to = c("diagnosis", "gender", "age"),
#'   names_pattern = "new_?(.*)_(.)(.*)",
#'   values_to = "count"
#' )
#' head(even_longer_data)
#' @export
data_to_long <- function(
  data,
  select = "all",
  names_to = "name",
  names_prefix = NULL,
  names_sep = NULL,
  names_pattern = NULL,
  values_to = "value",
  values_drop_na = FALSE,
  rows_to = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  ...,
  cols
) {
  # nolint
  original_data <- data

  # Prefer "cols" over "select" for compat with tidyr::pivot_longer
  # nolint start
  if (!missing(cols)) {
    select <- substitute(cols)
    cols <- .select_nse(
      select,
      data,
      exclude = NULL,
      ignore_case = ignore_case,
      regex = regex,
      ifnotfound = "error"
    )
  } else if (!missing(select) || !is.null(select)) {
    cols <- .select_nse(
      select,
      data,
      exclude = NULL,
      ignore_case = ignore_case,
      regex = regex,
      ifnotfound = "error"
    )
  } else {
    insight::format_error(
      "You need to specify columns to pivot, either with `select` or `cols`."
    )
  }
  # nolint end

  if (length(names_to) > 1L && is.null(names_sep) && is.null(names_pattern)) {
    insight::format_error(
      "If you supply multiple names in `names_to`, you must also supply one of `names_sep` or `names_pattern`."
    )
  }

  if (length(names_to) == 1L) {
    if (!is.null(names_sep)) {
      insight::format_error(
        "You can't use `names_sep` when `names_to` is of length 1."
      )
    }
    if (!is.null(names_pattern)) {
      insight::format_error(
        "You can't use `names_pattern` when `names_to` is of length 1."
      )
    }
  }

  # save custom attributes
  custom_attr <- attributes(data)

  # Remove tidyverse attributes, will add them back at the end
  if (inherits(data, "tbl_df")) {
    tbl_input <- TRUE
    data <- as.data.frame(data, stringsAsFactors = FALSE)
  } else {
    tbl_input <- FALSE
  }

  if (any(names_to %in% setdiff(names(data), cols))) {
    insight::format_error(
      "Some values of the columns specified in `names_to` are already present as column names.",
      paste0(
        "Either use another value in `names_to` or rename the following columns: ",
        text_concatenate(names_to[which(
          names_to %in% setdiff(names(data), cols)
        )])
      )
    )
  }

  not_selected <- setdiff(names(data), cols)

  # create a temp id so that we know how to rearrange the rows once the data is
  # stacked
  not_stacked <- data[, not_selected, drop = FALSE]
  not_stacked[["_Rows"]] <- coerce_to_numeric(row.names(data))

  # stack the selected columns
  stacked_data <- .stack(data[, cols, drop = FALSE])[, 2:1]

  # reorder the rows to have a repeated sequence when all vars are selected to
  # pivot
  #
  # See with following example:
  # wide_data <- data.frame(replicate(5, rnorm(10)))
  # data_to_long(wide_data)

  needs_to_rearrange <- length(not_selected) == 0L && is.null(rows_to)
  if (isTRUE(needs_to_rearrange)) {
    # https://stackoverflow.com/questions/73984957/efficient-way-to-reorder-rows-to-have-a-repeated-sequence
    stacked_data <- stacked_data[
      matrix(
        seq_len(nrow(stacked_data)),
        nrow = length(unique(stacked_data$ind)),
        byrow = TRUE
      ),
    ]

    row.names(stacked_data) <- NULL
  }

  stacked_data <- data_rename(stacked_data, "values", values_to)

  # split columns if several names in names_to or names_pattern is specified
  if (length(names_to) > 1L) {
    if (is.null(names_pattern)) {
      # faster than strsplit
      tmp <- utils::read.csv(
        text = stacked_data$ind,
        sep = names_sep,
        stringsAsFactors = FALSE,
        header = FALSE
      )
      names(tmp) <- paste0("V", seq_len(ncol(tmp)))
      tmp[tmp == ""] <- NA # nolint

      stacked_data$ind <- NULL
      stacked_data <- cbind(tmp, stacked_data)
    } else {
      tmp <- regmatches(
        unique(stacked_data$ind),
        regexec(names_pattern, unique(stacked_data$ind))
      )
      tmp <- as.data.frame(do.call(rbind, tmp), stringsAsFactors = FALSE)
      names(tmp) <- c("ind", names_to)
      # cbind + match is faster than merge
      # cbind doesn't remove identical columns so we need to manually remove "ind"
      # which is in both datasets
      stacked_data <- cbind(
        stacked_data,
        tmp[match(stacked_data[["ind"]], tmp[["ind"]]), -1]
      )
      stacked_data$ind <- NULL
    }
  }

  stacked_data <- data_relocate(stacked_data, select = values_to, after = -1)

  # if columns in data frame have attributes (e.g. labelled data), `cbind()`
  # won't work, so we need to remove them. We'll set them back later
  not_stacked[] <- lapply(not_stacked, function(i) {
    # we can't remove *all* attributes, this will convert factors into integers
    attr(i, "label") <- NULL
    attr(i, "labels") <- NULL
    attr(i, "format.spss") <- NULL
    class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr"))
    i
  })

  # reunite unselected data with stacked data
  out <- cbind(
    not_stacked,
    stats::setNames(stacked_data, c(names_to, values_to)),
    row.names = NULL
  )

  if (!is.null(names_prefix)) {
    if (length(names_to) > 1L) {
      insight::format_error(
        "`names_prefix` only works when `names_to` is of length 1."
      )
    }
    out[[names_to]] <- gsub(paste0("^", names_prefix), "", out[[names_to]])
  }

  # rearrange the rows with the temp id
  if (length(not_selected) > 0L) {
    out <- data_arrange(out, "_Rows")
  }

  # Remove or rename the row index
  if (is.null(rows_to)) {
    out[["_Rows"]] <- NULL
  } else {
    out <- data_rename(out, "_Rows", rows_to)
  }

  if (values_drop_na) {
    out <- out[!is.na(out[, values_to]), , drop = FALSE]
  }

  # add back attributes
  out <- .replace_attrs(out, custom_attr)

  # add back tidyverse attributes
  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }

  # reset row names
  if (!insight::object_has_rownames(data)) {
    row.names(out) <- NULL
  }

  # set back labels
  shared_columns <- intersect(colnames(out), colnames(original_data))
  for (i in shared_columns) {
    out[[i]] <- .set_back_labels(
      out[[i]],
      original_data[[i]],
      include_values = TRUE
    )
  }

  out
}


#' Code adapted from utils::stack (but largely modified)
#'
#' @noRd

.stack <- function(x) {
  ind <- rep(names(x), times = lengths(x))
  # use do.call("c", ...) instead of unlist to preserve the date format (but a
  # bit slower)
  # can't use do.call("c", ...) all the time because its behavior changed with
  # factors in 4.1.0
  values_are_dates <- all(
    vapply(x, .is_date, FUN.VALUE = logical(1L))
  )
  if (values_are_dates) {
    data.frame(values = do.call("c", unname(x)), ind, stringsAsFactors = FALSE)
  } else {
    data.frame(
      values = unlist(x, use.names = FALSE),
      ind,
      stringsAsFactors = FALSE
    )
  }
}

#' @rdname data_to_long
#' @export
reshape_longer <- data_to_long


================================================
FILE: R/data_to_wide.R
================================================
#' Reshape (pivot) data from long to wide
#'
#' This function "widens" data, increasing the number of columns and decreasing
#' the number of rows. This is a dependency-free base-R equivalent of
#' `tidyr::pivot_wider()`.
#'
#' @param data A data frame to convert to wide format, so that it has more
#' columns and fewer rows post-widening than pre-widening.
#' @param id_cols The name of the column that identifies the rows in the data
#' by which observations are grouped and the gathered data is spread into new
#' columns. Usually, this is a variable containing an ID for observations that
#' have been repeatedly measured. If `NULL`, it will use all remaining columns
#' that are not in `names_from` or `values_from` as ID columns. `id_cols` can
#' also be a character vector with more than one name of identifier columns. See
#' also 'Details' and 'Examples'.
#' @param names_from The name of the column in the original data whose values
#' will be used for naming the new columns created in the widened data. Each
#' unique value in this column will become the name of one of these new columns.
#' In case `names_prefix` is provided, column names will be concatenated with
#' the string given in `names_prefix`. If `values_from` specifies more than one
#' variable that should be widened, the new column names are a combination of
#' the old column names in `values_from` and the *values* from `names_from`, to
#' avoid duplicate column names.
#' @param names_prefix String added to the start of every variable name. This is
#'  particularly useful if `names_from` is a numeric vector and you want to create
#'  syntactic variable names.
#' @param names_sep If `names_from` or `values_from` contains multiple variables,
#' this will be used to join their values together into a single string to use
#' as a column name.
#' @param names_glue Instead of `names_sep` and `names_prefix`, you can supply a
#' [glue specification](https://glue.tidyverse.org/index.html) that uses the
#' `names_from` columns to create custom column names. Note that the only
#' delimiters supported by `names_glue` are curly brackets, `{` and `}`.
#' @param values_from The name of the columns in the original data that contains
#' the values used to fill the new columns created in the widened data. Can also
#' be one of the selection helpers (see argument `select` in [`data_select()`]).
#' @param values_fill Defunct argument, which has no function anymore. Will be
#' removed in future versions.
#' @param verbose Toggle warnings.
#' @param ... Not used for now.
#' @inheritParams data_select
#'
#' @return If a tibble was provided as input, `data_to_wide()` also returns a
#' tibble. Otherwise, it returns a data frame.
#'
#' @details
#' Reshaping data into wide format usually means that the input data frame is
#' in _long_ format, where multiple measurements taken on the same subject are
#' stored in multiple rows. The wide format stores the same information in a
#' single row, with each measurement stored in a separate column. Thus, the
#' necessary information for `data_to_wide()` is:
#'
#' - The name of the column(s) that identify the groups or repeated measurements
#'   (`id_cols`).
#' - The name of the column whose _values_ will become the new column names
#'   (`names_from`). Since these values may not necessarily reflect appropriate
#'   column names, you can use `names_prefix` to add a prefix to each newly
#'   created column name.
#' - The name of the column(s) that contain the values (`values_from`) for the
#'   new columns that are created by `names_from`.
#'
#' In other words: repeated measurements, as indicated by `id_cols`, that are
#' saved into the column `values_from` will be spread into new columns, which
#' will be named after the values in `names_from`. See also 'Examples'.
#'
#' @examplesIf requireNamespace("lme4", quietly = TRUE)
#' data_long <- read.table(header = TRUE, text = "
#'  subject sex condition measurement
#'        1   M   control         7.9
#'        1   M     cond1        12.3
#'        1   M     cond2        10.7
#'        2   F   control         6.3
#'        2   F     cond1        10.6
#'        2   F     cond2        11.1
#'        3   F   control         9.5
#'        3   F     cond1        13.1
#'        3   F     cond2        13.8
#'        4   M   control        11.5
#'        4   M     cond1        13.4
#'        4   M     cond2        12.9")
#'
#' # converting long data into wide format
#' data_to_wide(
#'   data_long,
#'   id_cols = "subject",
#'   names_from = "condition",
#'   values_from = "measurement"
#' )
#'
#' # converting long data into wide format with custom column names
#' data_to_wide(
#'   data_long,
#'   id_cols = "subject",
#'   names_from = "condition",
#'   values_from = "measurement",
#'   names_prefix = "Var.",
#'   names_sep = "."
#' )
#'
#' # converting long data into wide format, combining multiple columns
#' production <- expand.grid(
#'   product = c("A", "B"),
#'   country = c("AI", "EI"),
#'   year = 2000:2014
#' )
#' production <- data_filter(production, (product == "A" & country == "AI") | product == "B")
#' production$production <- rnorm(nrow(production))
#'
#' data_to_wide(
#'   production,
#'   names_from = c("product", "country"),
#'   values_from = "production",
#'   names_glue = "prod_{product}_{country}"
#' )
#'
#' # reshaping multiple long columns into wide format. to avoid duplicate
#' # column names, new names are a combination of the old column names in
#' # `values_from` and the values from `names_from`
#' data_long <- read.table(header = TRUE, text = "
#' subject_id time score anxiety test
#'          1    1    10       5   NA
#'          1    2    NA       7   NA
#'          2    1    15       6   NA
#'          2    2    12      NA   NA
#'          3    1    18       8   NA
#'          5    2    11       4   NA
#'          4    1    NA       5   NA
#'          4    2    14      NA   NA")
#'
#' data_to_wide(
#'   data_long,
#'   id_cols = "subject_id",
#'   names_from = "time",
#'   values_from = c("score", "anxiety", "test")
#' )
#'
#' # using the "sleepstudy" dataset
#' data(sleepstudy, package = "lme4")
#'
#' # the sleepstudy data contains repeated measurements of average reaction
#' # times for each subjects over multiple days, in a sleep deprivation study.
#' # It is in long-format, i.e. each row corresponds to a single measurement.
#' # The variable "Days" contains the timepoint of the measurement, and
#' # "Reaction" contains the measurement itself. Converting this data to wide
#' # format will create a new column for each day, with the reaction time as the
#' # value.
#' head(sleepstudy)
#'
#' data_to_wide(
#'   sleepstudy,
#'   id_cols = "Subject",
#'   names_from = "Days",
#'   values_from = "Reaction"
#' )
#'
#' # clearer column names
#' data_to_wide(
#'   sleepstudy,
#'   id_cols = "Subject",
#'   names_from = "Days",
#'   values_from = "Reaction",
#'   names_prefix = "Reaction_Day_"
#' )
#'
#' # For unequal group sizes, missing information is filled with NA
#' d <- subset(sleepstudy, Days %in% c(0, 1, 2, 3, 4))[c(1:9, 11:13, 16:17, 21), ]
#'
#' # long format, different number of "Subjects"
#' d
#'
#' data_to_wide(
#'   d,
#'   id_cols = "Subject",
#'   names_from = "Days",
#'   values_from = "Reaction",
#'   names_prefix = "Reaction_Day_"
#' )
#' @inherit data_rename seealso
#' @export
data_to_wide <- function(
  data,
  id_cols = NULL,
  values_from = "Value",
  names_from = "Name",
  names_sep = "_",
  names_prefix = "",
  names_glue = NULL,
  values_fill = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  ## TODO: remove in a future update (#645)
  if (!is.null(values_fill)) {
    insight::format_warning(
      "`values_fill` is defunct and has no function anymore. It will be removed in future versions.",
      "To handle missing values after widening, use `convert_na_to()` instead."
    )
  }

  if (is.null(names_from) || !all(names_from %in% colnames(data))) {
    insight::format_error(
      "`names_from` must be the name of an existing column in `data`."
    )
  }

  select <- substitute(values_from)
  values_from <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  if (is.null(values_from) || !length(values_from)) {
    insight::format_error(
      "No variable defined in `values_from` was found in the `data`."
    )
  }

  if (is.null(id_cols)) {
    id_cols <- setdiff(colnames(data), c(names_from, values_from))
  } else if (!all(id_cols %in% colnames(data))) {
    insight::format_error(
      "`id_cols` must be the names of existing columns in `data`."
    )
  }

  # save custom attributes
  custom_attr <- attributes(data)

  current_colnames <- names(data)

  # Preserve attributes
  if (inherits(data, "tbl_df")) {
    tbl_input <- TRUE
    data <- as.data.frame(data, stringsAsFactors = FALSE)
  } else {
    tbl_input <- FALSE
  }

  variable_attr <- lapply(data, attributes)

  not_unstacked <- data[, id_cols, drop = FALSE]
  not_unstacked <- unique(not_unstacked)

  # unstack doesn't create NAs for combinations that don't exist (contrary to
  # reshape), so we need to complete the dataset before unstacking.

  new_data <- data

  # create an id with all variables that are not in names_from or values_from
  # so that we can create missing combinations between this id and names_from
  if (length(id_cols) > 1L) {
    new_data$temporary_id <- do.call(
      paste,
      c(new_data[, id_cols, drop = FALSE], sep = "_")
    )
  } else if (length(id_cols) == 1L) {
    new_data$temporary_id <- new_data[[id_cols]]
  } else {
    new_data$temporary_id <- seq_len(nrow(new_data))
  }

  # check that all_groups have all possible values for names_from
  # If not, need to complete the dataset with NA for values_from where names_from
  # didn't exist
  n_rows_per_group <- table(new_data$temporary_id)
  n_values_per_group <- insight::n_unique(n_rows_per_group)

  not_all_cols_are_selected <- length(id_cols) > 0L

  incomplete_groups <-
    (n_values_per_group > 1L &&
      !all(
        unique(n_rows_per_group) %in% insight::n_unique(new_data[, names_from])
      )) ||
    (n_values_per_group == 1L &&
      unique(n_rows_per_group) < length(unique(new_data[, names_from])))

  # create missing combinations

  if (not_all_cols_are_selected && incomplete_groups) {
    expanded <- expand.grid(
      unique(new_data[["temporary_id"]]),
      unique(new_data[[names_from]])
    )
    names(expanded) <- c("temporary_id", names_from)
    new_data <- data_merge(
      new_data,
      expanded,
      join = "full",
      by = c("temporary_id", names_from),
      sort = FALSE
    )

    # need to make a second temporary id to keep arrange values *without*
    # rearranging the whole dataset
    # Ex:
    # "B"   1
    # "A"   3
    # "A"   NA
    # "B"   NA
    #
    # must be rearranged as "B" "B" "A" "A" and not "A" "A" "B" "B"
    lookup <- data.frame(
      temporary_id = unique(
        new_data[!is.na(new_data[values_from]), "temporary_id"]
      )
    )
    lookup$temporary_id_2 <- seq_len(nrow(lookup))
    new_data <- data_merge(
      new_data,
      lookup,
      by = "temporary_id",
      join = "left"
    )

    # creation of missing combinations was done with a temporary id, so need
    # to fill columns that are not selected in names_from or values_from
    new_data[, id_cols] <- lapply(id_cols, function(x) {
      data <- data_arrange(new_data, c("temporary_id_2", x))
      ind <- which(!is.na(data[[x]]))
      rep_times <- diff(c(ind, length(data[[x]]) + 1))
      rep(data[[x]][ind], times = rep_times)
    })

    new_data <- data_arrange(new_data, "temporary_id_2")
  }

  # don't need temporary ids anymore
  new_data$temporary_id <- NULL
  new_data$temporary_id_2 <- NULL

  # convert to wide format (returns the data and the order in which columns
  # should be ordered)
  unstacked <- .unstack(
    new_data,
    names_from,
    values_from,
    names_sep,
    names_prefix,
    names_glue
  )

  out <- unstacked$out

  if (length(values_from) > 1L) {
    unstacked$col_order <- unique(data[, names_from])
    unstacked$col_order <- as.vector(
      t(outer(values_from, unstacked$col_order, paste, sep = names_sep))
    )
  }

  # stop if some column names would be duplicated (follow tidyr workflow)
  if (any(unstacked$col_order %in% current_colnames)) {
    insight::format_error(
      "Some values of the columns specified in `names_from` are already present as column names.",
      paste0(
        "Either use `names_prefix` or rename the following columns: ",
        text_concatenate(current_colnames[which(
          current_colnames %in% unstacked$col_order
        )])
      )
    )
  }

  # reorder columns
  out <- out[, unstacked$col_order, drop = FALSE]

  # need to add the wide data to the original data
  if (!insight::is_empty_object(not_unstacked)) {
    out <- cbind(not_unstacked, out)
  }
  row.names(out) <- NULL

  # add back attributes where possible
  for (i in colnames(out)) {
    attributes(out[[i]]) <- variable_attr[[i]]
  }

  # convert back to date if original values were dates
  values_are_dates <- all(
    vapply(data[, values_from, drop = FALSE], .is_date, FUN.VALUE = logical(1L))
  )
  if (values_are_dates) {
    for (i in unstacked$col_order) {
      out[[i]] <- as.Date.numeric(out[[i]], origin = "1970-01-01")
    }
  }

  # add back attributes
  out <- .replace_attrs(out, custom_attr)

  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }

  out
}


#' Adapted from `utils::unstack` (but largely modified)
#'
#' @noRd

.unstack <- function(
  x,
  names_from,
  values_from,
  names_sep,
  names_prefix,
  names_glue = NULL
) {
  # get values from names_from (future colnames)

  if (is.null(names_glue)) {
    x$future_colnames <- do.call(
      paste,
      c(x[, names_from, drop = FALSE], sep = names_sep)
    )
  } else {
    vars <- regmatches(
      names_glue,
      gregexpr("\\{\\K[^{}]+(?=\\})", names_glue, perl = TRUE)
    )[[1]]
    tmp_data <- x[, vars]
    x$future_colnames <- .gluestick(names_glue, src = tmp_data)
  }

  x$future_colnames <- paste0(names_prefix, x$future_colnames)

  # expand the values for each variable in "values_from"
  res <- list()
  for (i in seq_along(values_from)) {
    res[[i]] <- tapply(x[[values_from[i]]], x$future_colnames, as.vector)
    if (length(values_from) > 1L) {
      names(res[[i]]) <- paste0(values_from[i], names_sep, names(res[[i]]))
    }
  }

  # if there's a single variable in "values_from" and this variable only has
  # one value, need to make it a dataframe

  if (length(res) == 1L && !is.list(res[[1]])) {
    res <- data.frame(
      matrix(
        res[[1]],
        nrow = 1,
        dimnames = list(NULL, names(res[[1]]))
      ),
      stringsAsFactors = FALSE,
      check.names = FALSE
    )
  } else {
    res <- unlist(res, recursive = FALSE)
  }

  # return the wide data and the order in which the new columns should be

  list(
    out = data.frame(res, stringsAsFactors = FALSE, check.names = FALSE),
    col_order = unique(x$future_colnames)
  )
}


#' @rdname data_to_wide
#' @export
reshape_wider <- data_to_wide


================================================
FILE: R/data_unique.R
================================================
#' @title Keep only one row from all with duplicated IDs
#'
#' @description From all rows with at least one duplicated ID,
#' keep only one. Methods for selecting the duplicated row are
#' either the first duplicate, the last duplicate, or the "best"
#' duplicate (default), based on the duplicate with the smallest
#' number of `NA`. In case of ties, it picks the first
#' duplicate, as it is the one most likely to be valid and
#' authentic, given practice effects.
#'
#' Contrarily to `dplyr::distinct()`, `data_unique()` keeps all columns.
#'
#' @param keep The method to be used for duplicate selection, either "best"
#'   (the default), "first", or "last".
#' @inheritParams extract_column_names
#'
#' @return A data frame, containing only the chosen duplicates.
#' @seealso [data_duplicated()]
#' @examples
#' df1 <- data.frame(
#'   id = c(1, 2, 3, 1, 3),
#'   item1 = c(NA, 1, 1, 2, 3),
#'   item2 = c(NA, 1, 1, 2, 3),
#'   item3 = c(NA, 1, 1, 2, 3)
#' )
#'
#' data_unique(df1, select = "id")
#' @export
data_unique <- function(
  data,
  select = NULL,
  keep = "best",
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  UseMethod("data_unique")
}


#' @export
data_unique.data.frame <- function(
  data,
  select = NULL,
  keep = "best",
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  select <- .select_nse(
    select,
    data,
    exclude = exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  # temporary_id <- paste0(sample(letters), collapse = "")
  data$temporary_id2 <- do.call(paste, c(data_select(data, select), sep = "_"))

  og.names <- names(data)
  dups <- data_duplicated(data, select)

  # if no duplicates, return the original data
  if (nrow(dups) == 0L) {
    data <- data_remove(data, "temporary_id2")
    return(data)
  }

  # count number of duplicates
  dups.n <- sum(duplicated(dups$temporary_id2))
  good.dups <- data_group(dups, "temporary_id2")

  # keep row that has the least duplicates
  if (keep == "best") {
    good.dups <- data_filter(good.dups, "count_na == min(count_na)")
  }

  good.dups <- good.dups[
    !duplicated(good.dups$temporary_id2, fromLast = keep == "last"),
  ]

  good.dups <- data_select(good.dups, og.names)
  out <- data[!duplicated(data$temporary_id2), , drop = FALSE]

  if (keep != "first") {
    match.index <- out$temporary_id2 %in% good.dups$temporary_id2
    out[match.index, ] <- good.dups
  }

  # id is not useful anymore
  out <- data_remove(out, "temporary_id2")

  if (verbose) {
    dup.msg <- sprintf(
      "(%s duplicates removed, with method '%s')",
      dups.n,
      keep
    )
    dup.msg <- paste0(dup.msg, ifelse(dups.n != 69, "", " 69... nice"))
    insight::format_alert(dup.msg)
  }

  out
}


#' @export
data_unique.grouped_df <- function(
  data,
  select = NULL,
  keep = "best",
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  select <- .select_nse(
    select,
    data,
    exclude = exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  grps <- attr(data, "groups", exact = TRUE)
  grps <- grps[[".rows"]]

  data2 <- as.data.frame(data_ungroup(data))

  out <- lapply(grps, function(x) {
    data_unique.data.frame(
      data2[x, ],
      select = select,
      keep = keep,
      verbose = verbose
    )
  })

  out <- do.call(rbind, out)

  if (!insight::object_has_rownames(data)) {
    rownames(out) <- NULL
  }

  class(out) <- class(data)
  attr(out, "groups") <- attr(data, "groups")

  out
}


================================================
FILE: R/data_unite.R
================================================
#' @title Unite ("merge") multiple variables
#' @name data_unite
#'
#' @description
#' Merge values of multiple variables per observation into one new variable.
#'
#' @param data A data frame.
#' @param new_column The name of the new column, as a string.
#' @param separator A character to use between values.
#' @param append Logical, if `FALSE` (default), removes original columns that
#' were united. If `TRUE`, all columns are preserved and the new column is
#' appended to the data frame.
#' @param remove_na Logical, if `TRUE`, missing values (`NA`) are not included
#' in the united values. If `FALSE`, missing values are represented as `"NA"`
#' in the united values.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#'
#' @seealso [`data_separate()`]
#'
#' @return `data`, with a newly created variable.
#'
#' @examples
#' d <- data.frame(
#'   x = 1:3,
#'   y = letters[1:3],
#'   z = 6:8
#' )
#' d
#' data_unite(d, new_column = "xyz")
#' data_unite(d, new_column = "xyz", remove = FALSE)
#' data_unite(d, new_column = "xyz", select = c("x", "z"))
#' data_unite(d, new_column = "xyz", select = c("x", "z"), append = TRUE)
#' @export
data_unite <- function(
  data,
  new_column = NULL,
  select = NULL,
  exclude = NULL,
  separator = "_",
  append = FALSE,
  remove_na = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
) {
  # we need a name for the new column
  if (is.null(new_column)) {
    insight::format_error(
      "No name for the new column was provided.",
      "Please use `new_column` to define a name for the newly created column."
    )
  }

  # only one column name
  if (length(new_column) > 1) {
    insight::format_error(
      "Please provide only a single string for `new_column`, no character vector with multiple values."
    )
  }

  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  if (is.null(select) || length(select) <= 1) {
    insight::format_error(
      "At least two columns in `select` are required for `data_unite()`."
    )
  }

  # unite
  out <- data.frame(
    new_col = do.call(paste, c(data[select], sep = separator)),
    stringsAsFactors = FALSE
  )
  colnames(out) <- new_column

  # remove missings
  if (remove_na) {
    # remove trailing and leading "NA_" and "_NA"
    out[[new_column]] <- gsub(paste0("^NA", separator), "", out[[new_column]])
    out[[new_column]] <- gsub(paste0(separator, "NA$"), "", out[[new_column]])
    # remove _NA_ inside string, add separator back. This ensure we match
    # whole-word NA and do not break strings like "COUNTRY_NATION"
    out[[new_column]] <- gsub(
      paste0(separator, "NA", separator),
      separator,
      out[[new_column]],
      fixed = TRUE
    )
  }

  # remove old columns
  if (!isTRUE(append)) {
    data[select] <- NULL
  }

  # overwrite?
  if (new_column %in% colnames(data) && verbose) {
    insight::format_alert(
      "The name for `new_column` already exists as variable name in the data.",
      "This variable will be replaced by `new_column`."
    )
  }

  # overwrite or append
  data[[new_column]] <- out[[new_column]]

  # fin
  data
}


================================================
FILE: R/data_write.R
================================================
#' @param data The data frame that should be written to a file.
#' @param delimiter For CSV-files, specifies the delimiter. Defaults to `","`,
#'   but in particular in European regions, `";"` might be a useful alternative,
#'   especially when exported CSV-files should be opened in Excel.
#' @param save_labels Only applies to CSV files. If `TRUE`, value and variable
#'   labels (if any) will be saved as additional CSV file. This file has the same
#'   file name as the exported CSV file, but includes a `"_labels"` suffix (i.e.
#'   when the file name is `"mydat.csv"`, the additional file with value and
#'   variable labels is named `"mydat_labels.csv"`).
#' @rdname data_read
#' @export
data_write <- function(
  data,
  path,
  delimiter = ",",
  convert_factors = FALSE,
  save_labels = FALSE,
  password = NULL,
  verbose = TRUE,
  ...
) {
  # check file type, so we know the target dta format
  file_type <- .file_ext(path)
  type <- switch(
    file_type,
    txt = ,
    csv = "csv",
    sav = ,
    por = "spss",
    zsav = "zspss",
    dta = "stata",
    xpt = "sas",
    rds = "rds",
    rda = ,
    rdata = "rda",
    parquet = "parquet",
    "unknown"
  )

  # no file type provided?
  if (!is.character(file_type) || file_type == "") {
    insight::format_error(
      "Could not detect file type. The `path` argument has no file extension.",
      "Please provide a file path including extension, like \"myfile.csv\" or \"c:/Users/Default/myfile.sav\"."
    )
  }

  if (type %in% c("csv", "unknown")) {
    .write_csv_or_unknown(
      data,
      path,
      type,
      delimiter,
      convert_factors,
      save_labels,
      password,
      verbose,
      ...
    )
  } else if (type == "rds") {
    .write_rds(data, path, password, verbose, ...)
  } else if (type == "rda") {
    .write_rda(data, path, password, verbose, ...)
  } else if (type == "parquet") {
    .write_parquet(data, path, password, verbose, ...)
  } else {
    .write_haven(data, path, password, verbose, type, ...)
  }
}


# base R formats -----

.write_rds <- function(data, path, password, verbose = TRUE, ...) {
  # encrypt data
  data <- .data_encryption(data, password)
  saveRDS(data, path, ...)
}

.write_rda <- function(data, path, password, verbose = TRUE, ...) {
  # encrypt data
  data <- .data_encryption(data, password)
  # save single data frame
  if (is.data.frame(data) || is.raw(data)) {
    save(data, file = path, ...)
  } else {
    # save list of data frames
    env <- as.environment(data)
    save(list = names(data), file = path, envir = env, ...)
  }
}


# nanoparquet -----

.write_parquet <- function(data, path, password, verbose = TRUE, ...) {
  # saving raw columns in data frames is not yet supported by parquet, thus,
  # we cannot save encrypted data right now.
  if (!is.null(password)) {
    insight::format_error(
      "Data encryption is not supported for parquet-files."
    )
  }

  # requires nanoparquet package
  insight::check_if_installed("nanoparquet")

  # save single data frame
  nanoparquet::write_parquet(x = data, file = path, ...)
}


# saving into CSV or unknown format -----

.write_csv_or_unknown <- function(
  data,
  path,
  type = "csv",
  delimiter = ",",
  convert_factors = FALSE,
  save_labels = FALSE,
  password = NULL,
  verbose = TRUE,
  ...
) {
  # data encryption requires a data frame *with attributes* to be saved, which
  # is not possible for text or raw formats - thus, no encryption here
  if (!is.null(password)) {
    insight::format_error(
      "Data encryption is not supported for CSV or text files."
    )
  }

  # save labels
  if (save_labels && type == "csv") {
    data <- .save_labels_to_file(data, path, delimiter, verbose)
  }

  # this might make sense when writing labelled data to CSV
  if (convert_factors) {
    data <- .pre_process_exported_data(data, convert_factors)
  }

  if (type == "csv") {
    insight::check_if_installed("readr")
    if (delimiter == ",") {
      readr::write_csv(x = data, file = path, ...)
    } else {
      readr::write_csv2(x = data, file = path, ...)
    }
  } else {
    insight::check_if_installed("rio")
    rio::export(x = data, file = path, ...)
  }
}


# saving into haven format -----

.write_haven <- function(
  data,
  path,
  password,
  verbose = TRUE,
  type = "spss",
  ...
) {
  # saving raw columns in data frames in not yet supported by haven, thus,
  # we cannot save encrypted data right now.
  if (!is.null(password)) {
    insight::format_error(
      "Data encryption is not supported for SPSS, SAS or Stata files."
    )
  }

  insight::check_if_installed("haven")

  # check if user provided "compress" argument for SPSS files,
  # else, default to compression
  dots <- list(...)
  if (!is.null(dots$compress)) {
    compress <- dots$compress
  } else if (identical(type, "zspss")) {
    compress <- "zsav"
  } else {
    compress <- "byte"
  }

  # labelled data needs "labelled" class attributes
  data <- .set_haven_class_attributes(data, verbose)

  # fix invalid column names
  data <- .fix_column_names(data, verbose)

  if (type %in% c("spss", "zspss")) {
    # write to SPSS
    haven::write_sav(data = data, path = path, compress = compress)
  } else if (type == "stata") {
    # write to Stata
    haven::write_dta(data = data, path = path, ...)
  } else {
    # write to SAS
    haven::write_xpt(data = data, path = path, ...)
  }
}


# helper -------------------------------

# make sure we have the "labelled" class for labelled data
.set_haven_class_attributes <- function(x, verbose = TRUE) {
  insight::check_if_installed("haven")

  if (verbose) {
    insight::format_alert("Preparing data file: converting variable types.")
  }
  x[] <- lapply(x, function(i) {
    # make sure we have labelled class for labelled data
    value_labels <- attr(i, "labels", exact = TRUE)
    variable_label <- attr(i, "label", exact = TRUE)
    # factor requires special preparation to save levels as labels
    # haven:::vec_cast_named requires "x" and "labels" to be of same type
    if (is.factor(i)) {
      haven::labelled(
        x = as.numeric(i),
        labels = stats::setNames(seq_along(levels(i)), levels(i)),
        label = variable_label
      )
    } else if (!is.null(value_labels) || !is.null(variable_label)) {
      # character requires special preparation to save value labels
      # haven:::vec_cast_named requires "x" and "labels" to be of same type
      if (is.character(i)) {
        # only prepare value labels when these are not NULL
        if (!is.null(value_labels)) {
          value_labels <- stats::setNames(
            as.character(value_labels),
            names(value_labels)
          )
        }
        haven::labelled(
          x = i,
          labels = value_labels,
          label = variable_label
        )
      } else {
        # this should work for the remaining types...
        haven::labelled(x = i, labels = value_labels, label = variable_label)
      }
    } else {
      # non labelled data can be saved "as is"
      i
    }
  })
  x
}


# packages like SPSS cannot deal with variable which names end with a dot
# fix column names here by added a "fix" suffix
.fix_column_names <- function(x, verbose = TRUE) {
  # check for correct column names
  dot_ends <- vapply(colnames(x), endsWith, FUN.VALUE = TRUE, suffix = ".")
  if (any(dot_ends)) {
    if (verbose) {
      insight::format_alert(
        "Found and fixed invalid column names so they can be read by other software packages."
      )
    }
    colnames(x)[dot_ends] <- paste0(colnames(x)[dot_ends], "fix")
  }
  x
}


# save value and variable labels as addtional file
.save_labels_to_file <- function(x, path, delimiter, verbose = TRUE) {
  insight::check_if_installed("readr")

  # get file path information
  fpath <- dirname(path)
  fname <- sub("\\.csv$", "", basename(path))
  path <- paste0(fpath, .Platform$file.sep, fname, "_labels.csv")

  if (verbose) {
    insight::format_alert(
      paste0("Saving variable and value labels to \"", path, "\".")
    )
  }

  # extract labels
  var_labs <- vapply(
    x,
    function(i) {
      l <- attr(i, "label", exact = TRUE)
      if (is.null(l)) {
        l <- ""
      }
      l
    },
    character(1)
  )

  # extract value labels
  value_labs <- vapply(
    x,
    function(i) {
      l <- attr(i, "labels", exact = TRUE)
      if (is.null(l)) {
        ""
      } else {
        paste0(l, "=", names(l), collapse = "; ")
      }
    },
    character(1)
  )

  out <- data.frame(
    variable = colnames(x),
    label = var_labs,
    labels = value_labs,
    stringsAsFactors = FALSE
  )

  if (delimiter == ",") {
    readr::write_csv(x = out, file = path)
  } else {
    readr::write_csv2(x = out, file = path)
  }
}


# process data for export, use factor levels as data values -------------------

.pre_process_exported_data <- function(x, convert_factors) {
  # user may decide whether we automatically detect variable type or not
  if (isTRUE(convert_factors)) {
    x[] <- lapply(x, function(i) {
      # only proceed if not all missing
      if (!all(is.na(i))) {
        # save labels
        value_labels <- attr(i, "labels", exact = TRUE)
        variable_labels <- attr(i, "label", exact = TRUE)

        # filter, so only matching value labels remain
        value_labels <- value_labels[value_labels %in% unique(i)]

        # guess variable type
        if (is.character(i)) {
          # we need this to drop haven-specific class attributes
          i <- as.character(i)
        } else if (
          !is.null(value_labels) && length(value_labels) == insight::n_unique(i)
        ) {
          # if all values are labelled, we assume factor. Use labels as levels
          if (is.numeric(i)) {
            i <- factor(i, labels = names(value_labels))
          } else {
            i <- factor(as.character(i), labels = names(value_labels))
          }
          i <- as.character(i)
        } else {
          # else, fall back to numeric
          i <- as.numeric(as.character(i))
        }
        # add back variable label
        attr(i, "label") <- variable_labels
      }
      i
    })
  } else {
    # drop haven class attributes
    x[] <- lapply(x, function(i) {
      # save labels
      class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr"))
      i
    })
  }

  class(x) <- "data.frame"
  x
}

# data encryption ---------------------------------

.data_encryption <- function(data, password = NULL) {
  # check if data should be encrypted
  if (!is.null(password)) {
    .validate_password(password)
    data <- .encrypt_data(data, password)
    # tell user to remember the password
    insight::format_warning(
      "Remember your `password`, else you will not be able to decrypt the data again!"
    )
  }
  data
}

.encrypt_data <- function(data, password = NULL) {
  insight::check_if_installed("openssl", "for data encryption")
  x <- serialize(data, NULL)
  # it is important to remember the phrase! else, you cannot decrypt the data
  passphrase <- charToRaw(password)
  key <- openssl::sha256(passphrase)
  # encrypt the data. we return the raw data here, which can be handled by
  # rds/rda/rdata, and users can then also decrypt using openssl directly
  # datawizard is not necessarily needed for decryption
  openssl::aes_gcm_encrypt(x, key = key)
}

.validate_password <- function(password) {
  # password needs to be a character string
  if (!is.character(password) || length(password) != 1L || !nzchar(password)) {
    insight::format_error(
      "The password must be a single non-empty character string."
    )
  }
}


================================================
FILE: R/data_xtabulate.R
================================================
# helper to compute crosstables --------------

.crosstable <- function(
  x,
  by,
  weights = NULL,
  remove_na = FALSE,
  proportions = NULL,
  obj_name = NULL,
  group_variable = NULL
) {
  if (!is.null(proportions)) {
    proportions <- match.arg(proportions, c("row", "column", "full"))
  }
  # frequency table
  if (is.null(weights)) {
    # we have a `.default` and a `.data.frame` method for `data_tabulate()`.
    # since this is the default, `x` can be an object which cannot be used
    # with `table()`, that's why we add `tryCatch()` here. Below we give an
    # informative error message for non-supported objects.
    if (remove_na) {
      x_table <- tryCatch(table(x, by), error = function(e) NULL)
    } else {
      x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
    }
  } else if (remove_na) {
    # weighted frequency table, excluding NA
    x_table <- tryCatch(
      stats::xtabs(
        weights ~ x + by,
        data = data.frame(weights = weights, x = x, by = by),
        na.action = stats::na.omit,
        addNA = FALSE
      ),
      error = function(e) NULL
    )
  } else {
    # weighted frequency table, including NA
    x_table <- tryCatch(
      stats::xtabs(
        weights ~ x + by,
        data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
        na.action = stats::na.pass,
        addNA = TRUE
      ),
      error = function(e) NULL
    )
  }

  if (is.null(x_table)) {
    insight::format_warning(paste0(
      "Can't compute cross tables for objects of class `",
      class(x)[1],
      "`."
    ))
    return(NULL)
  }

  out <- as.data.frame(stats::ftable(x_table))
  colnames(out) <- c("Value", "by", "N")
  total_n <- sum(out$N, na.rm = TRUE)

  # we want to round N for weighted frequencies
  if (!is.null(weights)) {
    out$N <- round(out$N)
    total_n <- round(total_n)
  }

  out <- data_to_wide(out, values_from = "N", names_from = "by")

  # use variable name as column name
  if (!is.null(obj_name)) {
    colnames(out)[1] <- obj_name
  }

  # for grouped data frames, add info about grouping variables
  if (!is.null(group_variable)) {
    var_info <- toString(lapply(colnames(group_variable), function(i) {
      sprintf("%s (%s)", i, group_variable[[i]])
    }))
    out <- cbind(
      out[1],
      data.frame(Group = var_info, stringsAsFactors = FALSE),
      out[-1]
    )
  }

  attr(out, "total_n") <- total_n
  attr(out, "weights") <- weights
  attr(out, "proportions") <- proportions
  attr(out, "varname") <- obj_name
  attr(out, "grouped_df") <- !is.null(group_variable)
  attr(out, "prop_table") <- .prop_table(out)

  class(out) <- c("datawizard_crosstab", "data.frame")

  out
}

# Helper function to calculate a table of proportions from a frequency table
.prop_table <- function(x) {
  # Extract the "proportions" attribute, which determines the type of calculation (row, column, or full)
  props <- attributes(x)$proportions
  out <- NULL

  # Proceed only if the "proportions" attribute is set
  if (!is.null(props)) {
    # Identify numeric columns, as proportions are only calculated for these
    numeric_columns <- vapply(x, is.numeric, logical(1))
    # Get the total count from the attributes, used for "full" proportions
    total_n <- attributes(x)$total_n

    # Use a switch to perform the calculation based on the "props" value
    out <- switch(
      props,
      # Calculate row-wise proportions
      row = lapply(seq_len(nrow(x)), function(i) {
        # Sum of the current row's numeric values
        row_sum <- sum(x[i, numeric_columns], na.rm = TRUE)
        # Avoid division by zero; if row sum is 0, return a row of zeros
        if (row_sum == 0) {
          tmp <- as.data.frame(as.list(rep(0, sum(numeric_columns))))
          # for later rbind, we need identical column names
          colnames(tmp) <- colnames(x)[numeric_columns]
          tmp
        } else {
          x[i, numeric_columns] / row_sum
        }
      }),
      # Calculate column-wise proportions
      column = lapply(seq_len(ncol(x))[numeric_columns], function(i) {
        # Sum of the current column's values
        col_sum <- sum(x[, i], na.rm = TRUE)
        # Avoid division by zero; if column sum is 0, return a vector of zeros
        if (col_sum == 0) {
          rep(0, nrow(x))
        } else {
          x[, i] / col_sum
        }
      }),
      # Calculate proportions relative to the total count of the entire table
      full = lapply(seq_len(ncol(x))[numeric_columns], function(i) {
        # Avoid division by zero; if total is 0, return a vector of zeros
        if (total_n == 0) {
          rep(0, nrow(x))
        } else {
          x[, i] / total_n
        }
      })
    )
  }

  # If a proportion table was calculated, format it into a data frame
  if (!is.null(out)) {
    # The output of the switch is a list. We need to bind it into a data frame.
    # For row proportions, we bind rows. For column/full, we bind columns.
    out <- switch(
      props,
      row = as.data.frame(do.call(rbind, out)),
      as.data.frame(do.call(cbind, out))
    )
    # Set the column names of the new proportion table
    colnames(out) <- colnames(x)[numeric_columns]

    # Check if the dimensions are consistent before setting row names
    if (nrow(out) == nrow(x)) {
      # If the first column of the original data is not numeric, it's likely a
      # label column. Use these labels as row names in the output for better
      # readability. This is useful for identifying rows, especially when NAs
      # are present.
      if (isFALSE(numeric_columns[1])) {
        r_names <- x[[1]]
        r_names <- as.character(r_names)
        # Replace NA in labels with the string "NA", else we cannot set rownames
        r_names[is.na(r_names)] <- "NA"
        rownames(out) <- r_names
      } else {
        # Otherwise, just use the original row names
        rownames(out) <- rownames(x)
      }
    }
  }

  out
}


# methods ---------------------

#' @export
format.datawizard_crosstab <- function(
  x,
  format = "text",
  digits = 1,
  big_mark = NULL,
  include_total_row = TRUE,
  ...
) {
  # convert to character manually, else, for large numbers,
  # format_table() returns scientific notation
  x <- as.data.frame(x)

  # find numeric columns, only for these we need row/column sums
  numeric_columns <- which(vapply(x, is.numeric, logical(1)))

  # compute total N for rows and columns
  total_n <- attributes(x)$total_n
  total_column <- rowSums(x[numeric_columns], na.rm = TRUE)
  total_row <- c(colSums(x[numeric_columns], na.rm = TRUE), total_n)

  # proportions?
  props <- attributes(x)$proportions
  prop_table <- attributes(x)$prop_table

  if (!is.null(props) && !is.null(prop_table)) {
    for (i in seq_len(ncol(prop_table))) {
      x[, numeric_columns[i]] <- paste(
        format(x[, numeric_columns[i]]),
        format(
          sprintf("(%.*f%%)", digits, 100 * prop_table[, i]),
          justify = "right"
        )
      )
    }
  }

  x[] <- lapply(x, as.character)

  # format data frame
  ftab <- insight::format_table(x, ...)
  # replace empty cells with NA
  ftab[] <- lapply(ftab, function(i) {
    i[i == ""] <- ifelse(identical(format, "text"), "<NA>", "(NA)") # nolint
    i
  })
  # Remove ".00" from numbers
  ftab$Total <- gsub("\\.00$", "", as.character(total_column))

  # add final total row to each sub-table. For multiple, collapsed table
  # (i.e. when length of `by` > 1), we don't want multiple total rows in the
  # table, so we would set include_total_row = FALSE for objects of class
  # `datawizard_crosstabs` (note plural s!)
  if (include_total_row) {
    # for text format, insert "empty row" before last total row
    if (identical(format, "text") || identical(format, "markdown")) {
      empty_row <- as.data.frame(t(data.frame(
        rep("", ncol(ftab)),
        c("Total", as.character(total_row)),
        stringsAsFactors = FALSE
      )))
    } else {
      empty_row <- as.data.frame(t(data.frame(
        c("Total", as.character(total_row)),
        stringsAsFactors = FALSE
      )))
    }
    colnames(empty_row) <- colnames(ftab)
    ftab <- rbind(ftab, empty_row)
    ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ])
  }

  # insert big marks?
  ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark)
  ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark)

  # also format NA column name
  colnames(ftab)[colnames(ftab) == "NA"] <- ifelse(
    identical(format, "text"),
    "<NA>",
    "(NA)"
  )

  ftab
}


# print, datawizard_crosstab ---------------------

#' @export
print.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  .print_text_table(x, big_mark, format = "text", ...)
  invisible(x)
}


#' @export
print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  .print_text_table(x, big_mark, format = "markdown", ...)
}


#' @export
print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  .print_text_table(x, big_mark, format = "html", ...)
}


.print_text_table <- function(x, big_mark = NULL, format = "text", ...) {
  # grouped data? if yes, add information on grouping factor
  if (is.null(x[["Group"]])) {
    caption <- NULL
  } else {
    caption <- paste0("Grouped by ", x[["Group"]][1])
    x$Group <- NULL
  }

  # this function is used by all four supported format, markdown, text, html
  # and tt (tinytable). For tt, we sometimes have format "html" and backend = "tt",
  # so we need to check for this special case
  backend <- switch(format, html = , tt = .check_format_backend(...), format)
  # prepare table arguments
  fun_args <- list(
    format(x, big_mark = big_mark, format = format, ...),
    caption = caption,
    format = backend
  )
  if (!format %in% c("html", "tt")) {
    fun_args$cross <- "+"
    fun_args$empty_line <- "-"
  }
  if (format == "text") {
    fun_args$missing <- "<NA>"
  } else {
    fun_args$missing <- "(NA)"
  }
  out <- do.call(insight::export_table, c(fun_args, list(...)))

  # print table
  if (identical(format, "text")) {
    cat(out)
  } else {
    out
  }
}


# print, datawizard_crosstabs ---------------------

#' @export
print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  .print_text_tables(x, big_mark, format = "text", ...)
  invisible(x)
}


#' @export
print_md.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  .print_text_tables(x, big_mark, format = "markdown", ...)
}


#' @export
print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  .print_text_tables(x, big_mark, format = "html", ...)
}


.print_text_tables <- function(x, big_mark = NULL, format = "text", ...) {
  if (length(x) == 1) {
    .print_text_table(x[[1]], big_mark = big_mark, format = format, ...)
  } else {
    x <- lapply(x, function(i) {
      # grouped data? if yes, add information on grouping factor
      if (!is.null(i[["Group"]])) {
        i$groups <- paste0("Grouped by ", i[["Group"]][1])
        i$Group <- NULL
      }
      # if we don't have the gt-grouping variable "groups" yet, we use it now
      # for grouping. Else, we use a new column named "Variable", to avoid
      # overwriting the groups-variable from grouped data frames
      if (is.null(i$groups) && identical(format, "html")) {
        grp_variable <- "groups"
      } else {
        grp_variable <- "Variable"
      }
      # first variable differs for each data frame, so we harmonize it here
      i[[grp_variable]] <- colnames(i)[1]
      colnames(i)[1] <- "Value"
      # move column to first position
      i <- data_relocate(i, select = grp_variable, before = 1)
      # format data frame
      format(
        i,
        format = format,
        big_mark = big_mark,
        include_total_row = FALSE,
        ...
      )
    })
    # now bind, but we need to check for equal number of columns
    if (all(lengths(x) == max(length(x)))) {
      out <- do.call(rbind, x)
    } else {
      # if not all tables have identical columns, we can use "data_merge()",
      # which safely row-binds all data frames. However, the column order can be
      # messed up, so we save column order here and restore it later
      col_order <- colnames(x[[which.max(lengths(x))]])
      out <- data_merge(x, join = "bind")[col_order]
    }

    # split tables for grouped data frames
    if (!is.null(out$groups)) {
      out <- split(out, out$groups)
      out <- lapply(out, function(subtable) {
        # for text and markdown, if we split tables, we remove the "groups"
        # variable. we need to keep it for HTML tables.
        if (!identical(format, "html")) {
          attr(subtable, "table_caption") <- c(unique(subtable$groups), "blue")
          subtable$groups <- NULL
        }
        # remove duplicated names
        for (grpvars in c("Variable", "Group")) {
          if (!is.null(subtable[[grpvars]])) {
            subtable[[grpvars]][duplicated(subtable[[grpvars]])] <- ""
          }
        }
        subtable
      })
      # no splitting of grouped data frames into list for HTML format,
      # because splitting is done by the `by` argument later
      if (identical(format, "html")) {
        out <- do.call(rbind, out)
      }
    }

    # prepare table arguments
    fun_args <- list(
      out,
      format = format
    )
    if (format != "html") {
      fun_args$cross <- "+"
      fun_args$empty_line <- "-"
    } else {
      fun_args$by <- "groups"
    }
    if (format == "text") {
      fun_args$missing <- "<NA>"
    } else {
      fun_args$missing <- "(NA)"
    }
    out <- do.call(insight::export_table, c(fun_args, list(...)))

    # print table
    if (identical(format, "text")) {
      cat(out)
    } else {
      out
    }
  }
}


# helper ---------------------

.validate_by <- function(by, x) {
  if (!is.null(by)) {
    if (is.character(by)) {
      # If "by" is a character string, must be of length 1
      if (length(by) > 1) {
        insight::format_error(
          "If `by` is a string indicating a variable name, `by` must be of length 1.",
          "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`."
        )
      }
      # if "by" is a character, "x" must be a data frame
      if (!is.data.frame(x)) {
        insight::format_error(
          "If `by` is a string indicating a variable name, `x` must be a data frame."
        )
      }
      # is "by" a column in "x"?
      if (!by %in% colnames(x)) {
        insight::format_error(sprintf(
          "The variable specified in `by` was not found in `x`. %s",
          .misspelled_string(names(x), by, "Possibly misspelled?")
        ))
      }
      by <- x[[by]]
    }
    # is "by" of same length as "x"?
    if (is.data.frame(x) && length(by) != nrow(x)) {
      insight::format_error(
        "Length of `by` must be equal to number of rows in `x`."
      ) # nolint
    }
    if (!is.data.frame(x) && length(by) != length(x)) {
      insight::format_error("Length of `by` must be equal to length of `x`.") # nolint
    }
    if (!is.factor(by)) {
      # coerce "by" to factor, including labels
      by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE)
    }
  }

  by
}


.validate_table_weights <- function(weights, x, weights_expression = NULL) {
  # exception: for vectors, if weighting variable not found, "weights" is NULL.
  # to check this, we further need to check whether a weights expression was
  # provided, e.g. "weights = iris$not_found" - all this is only relevant when
  # weights is NULL
  if (is.null(weights)) {
    # possibly misspelled weights-variables for default-method ----------------
    # -------------------------------------------------------------------------

    # do we have any value for weights_expression?
    if (
      !is.null(weights_expression) &&
        # due to deparse() and substitute, NULL becomes "NULL" - we need to check for this
        !identical(weights_expression, "NULL") &&
        # we should only run into this problem, when a variable from a data frame
        # is used in the data_tabulate() method for vectors - thus, we need to check
        # whether the weights_expression contains a "$" - `iris$not_found` is "NULL"
        # we need this check, because the default-method of data_tabulate() is called
        # from the data.frame method, where `weights = weights`, and then,
        # deparse(substitute(weights)) is "weights" (not "NULL" or "iris$not_found"),
        # leading to an error when actually all is OK (if "weights" is NULL)
        # Example:
        #> efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
        # Here, efc$wweight is NULL
        #> data_tabulate(efc$c172code, weights = efc$wweight)
        # Here, wweight errors anyway, because object "wweight" is not found
        #> data_tabulate(efc$c172code, weights = wweight)
        grepl("$", weights_expression, fixed = TRUE)
    ) {
      insight::format_error(
        "The variable specified in `weights` was not found. Possibly misspelled?"
      )
    }
  } else {
    # possibly misspecified weights-variables for data.frame-method -----------
    # -------------------------------------------------------------------------

    if (is.character(weights)) {
      # If "weights" is a character string, must be of length 1
      if (length(weights) > 1) {
        insight::format_error(
          "If `weights` is a string indicating a variable name, `weights` must be of length 1."
        )
      }
      # if "weights" is a character, "x" must be a data frame
      if (!is.data.frame(x)) {
        insight::format_error(
          "If `weights` is a string indicating a variable name, `x` must be a data frame."
        ) # nolint
      }
      # is "by" a column in "x"?
      if (!weights %in% colnames(x)) {
        insight::format_error(sprintf(
          "The variable specified in `weights` was not found in `x`. %s",
          .misspelled_string(names(x), weights, "Possibly misspelled?")
        ))
      }
      weights <- x[[weights]]
    }
    # is "by" of same length as "x"?
    if (is.data.frame(x) && length(weights) != nrow(x)) {
      insight::format_error(
        "Length of `weights` must be equal to number of rows in `x`."
      ) # nolint
    }
    if (!is.data.frame(x) && length(weights) != length(x)) {
      insight::format_error(
        "Length of `weights` must be equal to length of `x`."
      ) # nolint
    }
  }

  weights
}


================================================
FILE: R/datawizard-package.R
================================================
#' `datawizard`
#'
#' @title datawizard: Easy Data Wrangling and Statistical Transformations
#'
#' @description
#'
#' A lightweight package to assist in key steps involved in any data analysis
#' workflow:
#'
#'  - wrangling the raw data to get it in the needed form,
#'  - applying preprocessing steps and statistical transformations, and
#'  - compute statistical summaries of data properties and distributions.
#'
#' It is also the data wrangling backend for packages in 'easystats' ecosystem.
#' Reference: Patil et al. (2022) \doi{10.21105/joss.04684}.
#'
#' @docType package
#' @aliases datawizard datawizard-package
#' @name datawizard-package
#' @keywords internal
"_PACKAGE"


================================================
FILE: R/demean.R
================================================
#' Compute group-meaned and de-meaned variables
#'
#' @description
#'
#' `demean()` computes group- and de-meaned versions of a variable that can be
#' used in regression analysis to model the between- and within-subject effect
#' (person-mean centering or centering within clusters). `degroup()` is more
#' generic in terms of the centering-operation. While `demean()` always uses
#' mean-centering, `degroup()` can also use the mode or median for centering.
#'
#' @param x A data frame.
#' @param select Character vector (or formula) with names of variables to select
#'   that should be group- and de-meaned.
#' @param by Character vector (or formula) with the name of the variable that
#'   indicates the group- or cluster-ID. For cross-classified or nested designs,
#'   `by` can also identify two or more variables as group- or cluster-IDs. If
#'   the data is nested and should be treated as such, set `nested = TRUE`. Else,
#'   if `by` defines two or more variables and `nested = FALSE`, a cross-classified
#'   design is assumed. Note that `demean()` and `degroup()` can't handle a mix
#'   of nested and cross-classified designs in one model.
#'
#'   For nested designs, `by` can be:
#'   - a character vector with the name of the variable that indicates the
#'     levels, ordered from *highest* level to *lowest* (e.g.
#'     `by = c("L4", "L3", "L2")`.
#'   - a character vector with variable names in the format `by = "L4/L3/L2"`,
#'     where the levels are separated by `/`.
#'
#'   See also section _De-meaning for cross-classified designs_ and
#'   _De-meaning for nested designs_ below.
#' @param nested Logical, if `TRUE`, the data is treated as nested. If `FALSE`,
#'   the data is treated as cross-classified. Only applies if `by` contains more
#'   than one variable.
#' @param center Method for centering. `demean()` always performs
#'   mean-centering, while `degroup()` can use `center = "median"` or
#'   `center = "mode"` for median- or mode-centering, and also `"min"`
#'   or `"max"`.
#' @param suffix_demean,suffix_groupmean String value, will be appended to the
#'   names of the group-meaned and de-meaned variables of `x`. By default,
#'   de-meaned variables will be suffixed with `"_within"` and
#'   grouped-meaned variables with `"_between"`.
#' @param append Logical, if `TRUE` (default), the group- and de-meaned
#'   variables will be appended (column bind) to the original data `x`,
#'   thus returning both the original and the de-/group-meaned variables.
#' @param add_attributes Logical, if `TRUE`, the returned variables gain
#'   attributes to indicate the within- and between-effects. This is only
#'   relevant when printing `model_parameters()` - in such cases, the
#'   within- and between-effects are printed in separated blocks.
#' @inheritParams center
#'
#' @return
#' A data frame with the group-/de-meaned variables, which get the suffix
#' `"_between"` (for the group-meaned variable) and `"_within"` (for the
#' de-meaned variable) by default. For cross-classified or nested designs,
#' the name pattern of the group-meaned variables is the name of the centered
#' variable followed by the name of the variable that indicates the related
#' grouping level, e.g. `predictor_L3_between` and `predictor_L2_between`.
#'
#' @seealso If grand-mean centering (instead of centering within-clusters)
#'   is required, see [`center()`]. See [`performance::check_group_variation()`]
#'   to check for heterogeneity bias.
#'
#' @section Heterogeneity Bias:
#'
#' Mixed models include different levels of sources of variability, i.e.
#' error terms at each level. When macro-indicators (or level-2 predictors,
#' or higher-level units, or more general: *group-level predictors that
#' **vary** within and across groups*) are included as fixed effects (i.e.
#' treated as covariate at level-1), the variance that is left unaccounted for
#' this covariate will be absorbed into the error terms of level-1 and level-2
#' (_Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6._):
#' "Such covariates contain two parts: one that is specific to the higher-level
#' entity that does not vary between occasions, and one that represents the
#' difference between occasions, within higher-level entities" (_Bell et al. 2015_).
#' Hence, the error terms will be correlated with the covariate, which violates
#' one of the assumptions of mixed models (iid, independent and identically
#' distributed error terms). This bias is also called the *heterogeneity bias*
#' (_Bell et al. 2015_). To resolve this problem, level-2 predictors used as
#' (level-1) covariates should be separated into their "within" and "between"
#' effects by "de-meaning" and "group-meaning": After demeaning time-varying
#' predictors, "at the higher level, the mean term is no longer constrained by
#' Level 1 effects, so it is free to account for all the higher-level variance
#' associated with that variable" (_Bell et al. 2015_).
#'
#' @section Panel data and correlating fixed and group effects:
#'
#' `demean()` is intended to create group- and de-meaned variables for panel
#' regression models (fixed effects models), or for complex
#' random-effect-within-between models (see _Bell et al. 2015, 2018_), where
#' group-effects (random effects) and fixed effects correlate (see
#' _Bafumi and Gelman 2006_). This can happen, for instance, when analyzing
#' panel data, which can lead to *Heterogeneity Bias*. To control for correlating
#' predictors and group effects, it is recommended to include the group-meaned
#' and de-meaned version of *time-varying covariates* (and group-meaned version
#' of *time-invariant covariates* that are on a higher level, e.g. level-2
#' predictors) in the model. By this, one can fit complex multilevel models for
#' panel data, including time-varying predictors, time-invariant predictors and
#' random effects.
#'
#' @section Why mixed models are preferred over fixed effects models:
#'
#' A mixed models approach can model the causes of endogeneity explicitly
#' by including the (separated) within- and between-effects of time-varying
#' fixed effects and including time-constant fixed effects. Furthermore,
#' mixed models also include random effects, thus a mixed models approach
#' is superior to classic fixed-effects models, which lack information of
#' variation in the group-effects or between-subject effects. Furthermore,
#' fixed effects regression cannot include random slopes, which means that
#' fixed effects regressions are neglecting "cross-cluster differences in the
#' effects of lower-level controls (which) reduces the precision of estimated
#' context effects, resulting in unnecessarily wide confidence intervals and
#' low statistical power" (_Heisig et al. 2017_).
#'
#' @section Terminology:
#'
#' The group-meaned variable is simply the mean of an independent variable
#' within each group (or id-level or cluster) represented by `by`. It represents
#' the cluster-mean of an independent variable. The regression coefficient of a
#' group-meaned variable is the *between-subject-effect*. The de-meaned variable
#' is then the centered version of the group-meaned variable. De-meaning is
#' sometimes also called person-mean centering or centering within clusters.
#' The regression coefficient of a de-meaned variable represents the
#' *within-subject-effect*.
#'
#' @section De-meaning with continuous predictors:
#'
#' For continuous time-varying predictors, the recommendation is to include
#' both their de-meaned and group-meaned versions as fixed effects, but not
#' the raw (untransformed) time-varying predictors themselves. The de-meaned
#' predictor should also be included as random effect (random slope). In
#' regression models, the coefficient of the de-meaned predictors indicates
#' the within-subject effect, while the coefficient of the group-meaned
#' predictor indicates the between-subject effect.
#'
#' @section De-meaning with binary predictors:
#'
#' For binary time-varying predictors, there are two recommendations. First
#' is to include the raw (untransformed) binary predictor as fixed effect
#' only and the *de-meaned* variable as random effect (random slope).
#' The alternative would be to add the de-meaned version(s) of binary
#' time-varying covariates as additional fixed effect as well (instead of
#' adding it as random slope). Centering time-varying binary variables to
#' obtain within-effects (level 1) isn't necessary. They have a sensible
#' interpretation when left in the typical 0/1 format (_Hoffmann 2015,
#' chapter 8-2.I_). `demean()` will thus coerce categorical time-varying
#' predictors to numeric to compute the de- and group-meaned versions for
#' these variables, where the raw (untransformed) binary predictor and the
#' de-meaned version should be added to the model.
#'
#' @section De-meaning of factors with more than 2 levels:
#'
#' Factors with more than two levels are demeaned in two ways: first, these
#' are also converted to numeric and de-meaned; second, dummy variables
#' are created (binary, with 0/1 coding for each level) and these binary
#' dummy-variables are de-meaned in the same way (as described above).
#' Packages like **panelr** internally convert factors to dummies before
#' demeaning, so this behaviour can be mimicked here.
#'
#' @section De-meaning interaction terms:
#'
#' There are multiple ways to deal with interaction terms of within- and
#' between-effects.
#'
#' - A classical approach is to simply use the product term of the de-meaned
#'   variables (i.e. introducing the de-meaned variables as interaction term
#'   in the model formula, e.g. `y ~ x_within * time_within`). This approach,
#'   however, might be subject to bias (see _Giesselmann & Schmidt-Catran 2020_).
#'
#' - Another option is to first calculate the product term and then apply the
#'   de-meaning to it. This approach produces an estimator "that reflects
#'   unit-level differences of interacted variables whose moderators vary
#'   within units", which is desirable if *no* within interaction of
#'   two time-dependent variables is required. This is what `demean()` does
#'   internally when `select` contains interaction terms.
#'
#' - A third option, when the interaction should result in a genuine within
#'   estimator, is to "double de-mean" the interaction terms
#'   (_Giesselmann & Schmidt-Catran 2018_), however, this is currently
#'   not supported by `demean()`. If this is required, the `wmb()`
#'   function from the **panelr** package should be used.
#'
#' To de-mean interaction terms for within-between models, simply specify
#' the term as interaction for the `select`-argument, e.g. `select = "a*b"`
#' (see 'Examples').
#'
#' @section De-meaning for cross-classified designs:
#'
#' `demean()` can handle cross-classified designs, where the data has two or
#' more groups at the higher (i.e. second) level. In such cases, the
#' `by`-argument can identify two or more variables that represent the
#'  cross-classified group- or cluster-IDs. The de-meaned variables for
#' cross-classified designs are simply subtracting all group means from each
#' individual value, i.e. _fully cluster-mean-centering_ (see _Guo et al. 2024_
#' for details). Note that de-meaning for cross-classified designs is *not*
#' equivalent to de-meaning of nested data structures from models with three or
#' more levels. Set `nested = TRUE` to explicitly assume a nested design. For
#' cross-classified designs, de-meaning is supposed to work for models like
#' `y ~ x + (1|level3) + (1|level2)`, but *not* for models like
#' `y ~ x + (1|level3/level2)`. Note that `demean()` and `degroup()` can't
#' handle a mix of nested and cross-classified designs in one model.
#'
#' @section De-meaning for nested designs:
#'
#' _Brincks et al. (2017)_ have suggested an algorithm to center variables for
#' nested designs, which is implemented in `demean()`. For nested designs, set
#' `nested = TRUE` *and* specify the variables that indicate the different
#' levels in descending order in the `by` argument. E.g.,
#' `by = c("level4", "level3, "level2")` assumes a model like
#' `y ~ x + (1|level4/level3/level2)`. An alternative notation for the
#' `by`-argument would be `by = "level4/level3/level2"`, similar to the
#' formula notation.
#'
#' @section Analysing panel data with mixed models using lme4:
#'
#' A description of how to translate the formulas described in *Bell et al. 2018*
#' into R using `lmer()` from **lme4** can be found in
#' [this vignette](https://easystats.github.io/parameters/articles/demean.html).
#'
#' @references
#'
#'   - Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors
#'     and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the
#'     American Political Science Association.
#'
#'   - Bell A, Fairbrother M, Jones K. 2019. Fixed and Random Effects
#'     Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074
#'
#'   - Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects
#'     Modeling of Time-Series Cross-Sectional and Panel Data. Political Science
#'     Research and Methods, 3(1), 133–153.
#'
#'   - Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J.,
#'     Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in
#'     Three-Level Contextual Models. Multivariate Behavioral Research, 52(2),
#'     149–163. https://doi.org/10.1080/00273171.2016.1256753
#'
#'   - Gelman A, Hill J. 2007. Data Analysis Using Regression and
#'     Multilevel/Hierarchical Models. Analytical Methods for Social Research.
#'     Cambridge, New York: Cambridge University Press
#'
#'   - Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed
#'     effects regression models. Sociological Methods & Research, 1–28.
#'     https://doi.org/10.1177/0049124120914934
#'
#'   - Guo Y, Dhaliwal J, Rights JD. 2024. Disaggregating level-specific effects
#'     in cross-classified multilevel models. Behavior Research Methods, 56(4),
#'     3023–3057.
#'
#'   - Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity:
#'     Why Multilevel Models May Benefit from Accounting for Cross-Cluster
#'     Differences in the Effects of Controls. American Sociological Review 82
#'     (4): 796–827.
#'
#'   - Hoffman L. 2015. Longitudinal analysis: modeling within-person
#'     fluctuation and change. New York: Routledge
#'
#' @examples
#'
#' data(iris)
#' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
#' iris$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable
#'
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
#' head(x)
#'
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
#' head(x)
#'
#'
#' # demean interaction term x*y
#' dat <- data.frame(
#'   a = c(1, 2, 3, 4, 1, 2, 3, 4),
#'   x = c(4, 3, 3, 4, 1, 2, 1, 2),
#'   y = c(1, 2, 1, 2, 4, 3, 2, 1),
#'   ID = c(1, 2, 3, 1, 2, 3, 1, 2)
#' )
#' demean(dat, select = c("a", "x*y"), by = "ID")
#'
#' # or in formula-notation
#' demean(dat, select = ~ a + x * y, by = ~ID)
#'
#' @export
demean <- function(
  x,
  select,
  by,
  nested = FALSE,
  suffix_demean = "_within",
  suffix_groupmean = "_between",
  append = TRUE,
  add_attributes = TRUE,
  verbose = TRUE
) {
  degroup(
    x = x,
    select = select,
    by = by,
    nested = nested,
    center = "mean",
    suffix_demean = suffix_demean,
    suffix_groupmean = suffix_groupmean,
    append = append,
    add_attributes = add_attributes,
    verbose = verbose
  )
}


#' @rdname demean
#' @export
degroup <- function(
  x,
  select,
  by,
  nested = FALSE,
  center = "mean",
  suffix_demean = "_within",
  suffix_groupmean = "_between",
  append = TRUE,
  add_attributes = TRUE,
  verbose = TRUE
) {
  # ugly tibbles again... but save original data frame
  original_data <- x
  x <- .coerce_to_dataframe(x)

  center <- match.arg(
    tolower(center),
    choices = c("mean", "median", "mode", "min", "max")
  )

  if (inherits(select, "formula")) {
    # formula to character, remove "~", split at "+". We don't use `all.vars()`
    # here because we want to keep the interaction terms as they are
    select <- trimws(unlist(
      strsplit(
        gsub("~", "", insight::safe_deparse(select), fixed = TRUE),
        "+",
        fixed = TRUE
      ),
      use.names = FALSE
    ))
  }

  # handle different "by" options
  if (inherits(by, "formula")) {
    by <- all.vars(by)
  }

  # we also allow lme4-syntax here: if by = "L4/L3/L2", we assume a nested design
  if (length(by) == 1 && grepl("/", by, fixed = TRUE)) {
    by <- insight::trim_ws(unlist(
      strsplit(by, "/", fixed = TRUE),
      use.names = FALSE
    ))
    nested <- TRUE
  }

  # identify interaction terms
  interactions_no <- select[!grepl("(\\*|\\:)", select)]
  interactions_yes <- select[grepl("(\\*|\\:)", select)]

  # if we have interaction terms that should be de-meaned, calculate the product
  # of the terms first, then demean the product
  if (length(interactions_yes)) {
    interaction_terms <- lapply(
      strsplit(interactions_yes, "*", fixed = TRUE),
      trimws
    )
    product <- lapply(interaction_terms, function(i) do.call(`*`, x[, i]))
    new_dat <- as.data.frame(stats::setNames(
      product,
      gsub("\\s", "", gsub("*", "_", interactions_yes, fixed = TRUE))
    ))
    x <- cbind(x, new_dat)
    select <- c(interactions_no, colnames(new_dat))
  }

  # check if all variables are present
  not_found <- setdiff(c(select, by), colnames(x))

  if (length(not_found)) {
    insight::format_error(
      paste0(
        "Variable",
        ifelse(length(not_found) > 1, "s ", " "),
        text_concatenate(not_found, enclose = "\""),
        ifelse(length(not_found) > 1, " were", " was"),
        " not found in the dataset."
      ),
      .misspelled_string(
        colnames(x),
        not_found,
        "Possibly misspelled or not yet defined?"
      )
    )
  }

  # get data to demean...
  dat <- x[, c(select, by)]

  # find categorical predictors that are coded as factors
  categorical_predictors <- vapply(
    dat[select],
    is.factor,
    FUN.VALUE = logical(1L)
  )

  # convert binary predictors to numeric
  if (any(categorical_predictors)) {
    # convert categorical to numeric, and then demean
    dat[select[categorical_predictors]] <- lapply(
      dat[select[categorical_predictors]],
      function(i) as.numeric(i) - 1
    )
    # convert categorical to dummy, and demean each binary dummy
    for (i in select[categorical_predictors]) {
      if (nlevels(x[[i]]) > 2) {
        for (j in levels(x[[i]])) {
          # create vector with zeros
          f <- rep(0, nrow(x))
          # for each matching level, set dummy to 1
          f[x[[i]] == j] <- 1
          dummy <- data.frame(f)
          # colnames = variable name + factor level
          # also add new dummy variables to "select"
          colnames(dummy) <- sprintf("%s_%s", i, j)
          select <- c(select, sprintf("%s_%s", i, j))
          # add to data
          dat <- cbind(dat, dummy)
        }
      }
    }
    # tell user...
    if (isTRUE(verbose)) {
      insight::format_alert(
        paste0(
          "Categorical predictors (",
          toString(names(categorical_predictors)[categorical_predictors]),
          ") have been coerced to numeric values to compute de- and group-meaned variables.\n"
        )
      )
    }
  }

  # group variables, then calculate the mean-value
  # for variables within each group (the group means). assign
  # mean values to a vector of same length as the data

  gm_fun <- switch(
    center,
    mode = function(.gm) distribution_mode(stats::na.omit(.gm)),
    median = function(.gm) stats::median(.gm, na.rm = TRUE),
    min = function(.gm) min(.gm, na.rm = TRUE),
    max = function(.gm) max(.gm, na.rm = TRUE),
    function(.gm) mean(.gm, na.rm = TRUE)
  )

  # we allow disaggregating level-specific effects for cross-classified multilevel
  # models (see Guo et al. 2024). Two levels should work as proposed by the authors,
  # more levels also already work, but need to check the formula from the paper
  # and validate results

  if (length(by) == 1) {
    # simple case: one level
    group_means_list <- lapply(select, function(i) {
      stats::ave(dat[[i]], dat[[by]], FUN = gm_fun)
    })
    names(group_means_list) <- select
    # create de-meaned variables by subtracting the group mean from each individual value
    person_means_list <- lapply(select, function(i) {
      dat[[i]] - group_means_list[[i]]
    })
  } else if (nested) {
    # nested design: by > 1, nested is explicitly set to TRUE
    # We want:
    # L3_between = xbar(k)
    # L2_between = xbar(j,k) - xbar(k)
    # L1_within = x(ijk) - xbar(jk)
    # , where
    # x(ijk) is the individual value / variable that is measured on level 1
    # xbar(k) <- ave(x_ijk, L3, FUN = mean), the group mean of the variable at highest level
    # xbar(jk) <- ave(x_ijk, L3, L2, FUN = mean), the group mean of the variable at second level
    group_means_list <- lapply(select, function(i) {
      out <- lapply(seq_along(by), function(k) {
        stats::ave(dat[[i]], dat[, by[1:k], drop = FALSE], FUN = gm_fun)
      })
      # subtract mean of higher level from lower level
      for (j in 2:length(by)) {
        out[[j]] <- out[[j]] - out[[j - 1]]
      }
      names(out) <- paste0(i, "_", by)
      out
    })
    # create de-meaned variables by subtracting the group mean from each individual value
    person_means_list <- lapply(
      # seq_along(select),
      # function(i) dat[[select[i]]] - group_means_list[[i]][[length(by)]]
      select,
      function(i) {
        dat[[i]] - stats::ave(dat[[i]], dat[, by, drop = FALSE], FUN = gm_fun)
      }
    )
  } else {
    # cross-classified design: by > 1
    group_means_list <- lapply(by, function(j) {
      out <- lapply(select, function(i) {
        stats::ave(dat[[i]], dat[[j]], FUN = gm_fun)
      })
      names(out) <- paste0(select, "_", j)
      out
    })
    group_means_list <- unlist(group_means_list, recursive = FALSE)

    # de-meaned variables for cross-classified design is simply subtracting
    # all group means from each individual value
    person_means_list <- lapply(select, function(i) {
      sum_group_means <- Reduce("+", group_means_list[paste0(i, "_", by)])
      dat[[i]] - sum_group_means
    })
  }

  # preserve names
  names(person_means_list) <- select

  # convert to data frame and add suffix to column names

  group_means <- as.data.frame(group_means_list)
  person_means <- as.data.frame(person_means_list)

  colnames(person_means) <- sprintf(
    "%s%s",
    colnames(person_means),
    suffix_demean
  )
  colnames(group_means) <- sprintf(
    "%s%s",
    colnames(group_means),
    suffix_groupmean
  )

  if (isTRUE(add_attributes)) {
    person_means[] <- lapply(person_means, function(i) {
      attr(i, "within-effect") <- TRUE
      i
    })
    group_means[] <- lapply(group_means, function(i) {
      attr(i, "between-effect") <- TRUE
      i
    })
  }

  # between and within effects
  out <- cbind(group_means, person_means)

  # append to original data?
  if (isTRUE(append)) {
    # check for unique column names
    duplicated_columns <- intersect(colnames(out), colnames(original_data))
    if (length(duplicated_columns)) {
      insight::format_error(paste0(
        "One or more of the centered variables already exist in the orignal data frame: ", # nolint
        text_concatenate(duplicated_columns, enclose = "`"),
        ". Please rename the affected variable(s) in the original data, or use the arguments `suffix_demean` and `suffix_groupmean` to rename the centered variables." # nolint
      ))
    }
    out <- cbind(original_data, out)
  }

  out
}


#' @rdname demean
#' @export
detrend <- degroup


================================================
FILE: R/describe_distribution.R
================================================
#' Describe a distribution
#'
#' This function describes a distribution by a set of indices (e.g., measures of
#' centrality, dispersion, range, skewness, (excess) kurtosis).
#'
#' @param x A numeric vector, a character vector, a data frame, or a list. See
#' `Details`.
#' @param by Column names indicating how to split the data in various groups
#' before describing the distribution. `by` groups will be added to potentially
#' existing groups created by `data_group()`.
#' @param range Return the range (min and max).
#' @param quartiles Return the first and third quartiles (25th and 75th
#'   percentiles).
#' @param include_factors Logical, if `TRUE`, factors are included in the
#'   output, however, only columns for range (first and last factor levels) as
#'   well as n and missing will contain information.
#' @param ci Confidence Interval (CI) level. Default is `NULL`, i.e. no
#'   confidence intervals are computed. If not `NULL`, confidence intervals are
#'   based on bootstrap replicates (see `iterations`).
#' @param iterations The number of bootstrap replicates for computing confidence
#'   intervals. Only applies when `ci` is not `NULL`. Defaults to `100`. For
#'   more stable results, increase the number of `iterations`, but note that this
#'   can also increase the computation time significantly.
#' @param iqr Logical, if `TRUE`, the interquartile range is calculated (based
#'   on [stats::IQR()], using `type = 6`).
#' @param verbose Show or silence warnings and messages.
#' @inheritParams bayestestR::point_estimate
#' @inheritParams extract_column_names
#'
#' @details If `x` is a data frame, only numeric variables are kept and will be
#' displayed in the summary by default.
#'
#' If `x` is a list, the behavior is different whether `x` is a stored list. If
#' `x` is stored (for example, `describe_distribution(mylist)` where `mylist`
#' was created before), artificial variable names are used in the summary
#' (`Var_1`, `Var_2`, etc.). If `x` is an unstored list (for example,
#' `describe_distribution(list(mtcars$mpg))`), then `"mtcars$mpg"` is used as
#' variable name.
#'
#' @note There is also a
#'   [`plot()`-method](https://easystats.github.io/see/articles/parameters.html)
#'   implemented in the [**see**-package](https://easystats.github.io/see/).
#'
#' @seealso [kurtosis()] to compute kurtosis (recognized as excess kurtosis).
#'
#' @return A data frame with columns that describe the properties of the variables.
#'
#' @examplesIf require("bayestestR", quietly = TRUE)
#' describe_distribution(rnorm(100))
#'
#' data(iris)
#' describe_distribution(iris)
#' describe_distribution(iris, include_factors = TRUE, quartiles = TRUE)
#' describe_distribution(list(mtcars$mpg, mtcars$cyl))
#' @export
describe_distribution <- function(x, ...) {
  UseMethod("describe_distribution")
}


#' @export
describe_distribution.default <- function(x, verbose = TRUE, ...) {
  if (verbose) {
    insight::format_warning(
      paste0("Can't describe variables of class `", class(x)[1], "`.")
    )
  }
  NULL
}


#' @export
describe_distribution.list <- function(
  x,
  centrality = "mean",
  dispersion = TRUE,
  iqr = TRUE,
  range = TRUE,
  quartiles = FALSE,
  ci = NULL,
  include_factors = FALSE,
  iterations = 100,
  threshold = 0.1,
  verbose = TRUE,
  ...
) {
  factor_el <- which(vapply(x, is.factor, FUN.VALUE = logical(1L)))
  num_el <- which(vapply(x, is.numeric, FUN.VALUE = logical(1L)))

  # get elements names as is
  # ex: `list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl")`
  nm <- vapply(
    sys.call()[[2]],
    insight::safe_deparse,
    FUN.VALUE = character(1L)
  )[-1]

  if (isTRUE(include_factors)) {
    x <- x[c(num_el, factor_el)]
    if (length(nm) != 0) {
      nm <- nm[c(num_el, factor_el)]
    }
  } else {
    x <- x[num_el]
    if (length(nm) != 0) {
      nm <- nm[num_el]
    }
  }

  # Not possible to obtain elements names if they are stored in
  # an object
  if (length(nm) == 0) {
    nm <- paste0("Var_", seq_along(x))
  }

  # The function currently doesn't support descriptive summaries for character
  # or factor types.
  out <- do.call(
    rbind,
    lapply(x, function(i) {
      if (
        (include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))
      ) {
        describe_distribution(
          i,
          centrality = centrality,
          dispersion = dispersion,
          iqr = iqr,
          range = range,
          quartiles = quartiles,
          ci = ci,
          iterations = iterations,
          threshold = threshold,
          verbose = verbose
        )
      }
    })
  )

  if (is.null(names(x))) {
    new_names <- nm
  } else {
    empty_names <- which(!nzchar(names(x), keepNA = TRUE))
    new_names <- names(x)
    new_names[empty_names] <- nm[empty_names]
  }

  out$Variable <- new_names
  row.names(out) <- NULL
  out <- out[c("Variable", setdiff(colnames(out), "Variable"))]

  class(out) <- unique(c(
    "parameters_distribution",
    "see_parameters_distribution",
    class(out)
  ))
  attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500)
  attr(out, "ci") <- ci
  attr(out, "centrality") <- centrality
  attr(out, "threshold") <- threshold
  out
}


#' @rdname describe_distribution
#' @export
describe_distribution.numeric <- function(
  x,
  centrality = "mean",
  dispersion = TRUE,
  iqr = TRUE,
  range = TRUE,
  quartiles = FALSE,
  ci = NULL,
  iterations = 100,
  threshold = 0.1,
  verbose = TRUE,
  ...
) {
  insight::check_if_installed("bayestestR")
  out <- data.frame(.temp = 0)

  # Missing
  n_missing <- sum(is.na(x))
  x <- stats::na.omit(x)

  # Point estimates
  out <- cbind(
    out,
    bayestestR::point_estimate(
      x,
      centrality = centrality,
      dispersion = dispersion,
      threshold = threshold,
      verbose = verbose,
      ...
    )
  )

  # interquartile range, type same as minitab and SPSS
  if (iqr) {
    out$IQR <- stats::IQR(x, na.rm = TRUE, type = 6)
  }

  # Confidence Intervals
  if (!is.null(ci)) {
    insight::check_if_installed("boot")
    # tell user about bootstrapping and appropriate number of iterations.
    # "show_iterations_msg" is an undocumented argument that is only passed
    # internally to this function to avoid multiple repeated messages
    if (!isFALSE(list(...)$show_iterations_msg)) {
      .show_iterations_warning(verbose, iterations, ci)
    }
    # calculate CI for each centrality
    for (cntr in .centrality_options(centrality)) {
      results <- tryCatch(
        {
          boot::boot(
            data = x,
            statistic = .boot_distribution,
            R = iterations,
            centrality = cntr
          )
        },
        error = function(e) {
          msg <- conditionMessage(e)
          if (!is.null(msg) && msg == "sample is too sparse to find TD") {
            insight::format_warning(
              "When bootstrapping CIs, sample was too sparse to find TD. Returning NA for CIs."
            )
            list(t = c(NA_real_, NA_real_))
          }
        }
      )
      out_ci <- bayestestR::ci(results$t, ci = ci, verbose = FALSE)
      ci_data <- data.frame(out_ci$CI_low[1], out_ci$CI_high[1])
      colnames(ci_data) <- c(paste0("CI_low_", cntr), paste0("CI_high_", cntr))
      out <- cbind(out, ci_data)
    }
  }

  # Range
  if (range) {
    out <- cbind(
      out,
      data.frame(
        Min = min(x, na.rm = TRUE),
        Max = max(x, na.rm = TRUE)
      )
    )
  }

  # Quartiles
  if (quartiles) {
    out <- cbind(
      out,
      data.frame(
        Q1 = stats::quantile(x, probs = 0.25, na.rm = TRUE),
        Q3 = stats::quantile(x, probs = 0.75, na.rm = TRUE)
      )
    )
  }

  # Skewness
  out <- cbind(
    out,
    data.frame(
      Skewness = as.numeric(skewness(x, verbose = verbose)),
      Kurtosis = as.numeric(kurtosis(x, verbose = verbose))
    )
  )

  out$n <- length(x)
  out$n_Missing <- n_missing
  out$.temp <- NULL

  class(out) <- unique(c(
    "parameters_distribution",
    "see_parameters_distribution",
    class(out)
  ))
  attr(out, "data") <- x
  attr(out, "ci") <- ci
  attr(out, "centrality") <- centrality
  attr(out, "threshold") <- threshold
  out
}


#' @rdname describe_distribution
#' @export
describe_distribution.factor <- function(
  x,
  dispersion = TRUE,
  range = TRUE,
  verbose = TRUE,
  ...
) {
  # Missing
  n_missing <- sum(is.na(x))
  x <- stats::na.omit(x)

  out <- data.frame(
    Mean = NA,
    SD = NA,
    CI_low = NA,
    CI_high = NA,
    IQR = NA,
    Min = levels(x)[1],
    Max = levels(x)[nlevels(x)],
    Q1 = NA,
    Q3 = NA,
    Skewness = as.numeric(skewness(x, verbose = verbose)),
    Kurtosis = as.numeric(kurtosis(x, verbose = verbose)),
    n = length(x),
    n_Missing = n_missing,
    stringsAsFactors = FALSE
  )

  if (!dispersion) {
    out$SD <- NULL
  }

  dot_args <- list(...)

  if (is.null(dot_args[["ci"]])) {
    out$CI_low <- NULL
    out$CI_high <- NULL
  }

  if (is.null(dot_args[["iqr"]]) || isFALSE(dot_args[["iqr"]])) {
    out$IQR <- NULL
  }

  if (is.null(dot_args[["quartiles"]]) || isFALSE(dot_args[["quartiles"]])) {
    out$Q1 <- NULL
    out$Q3 <- NULL
  }

  if (!range) {
    out$Min <- NULL
    out$Max <- NULL
  }

  class(out) <- unique(c(
    "parameters_distribution",
    "see_parameters_distribution",
    class(out)
  ))
  attr(out, "data") <- x
  out
}


#' @export
describe_distribution.character <- function(
  x,
  dispersion = TRUE,
  range = TRUE,
  verbose = TRUE,
  ...
) {
  # Missing
  n_missing <- sum(is.na(x))
  x <- stats::na.omit(x)
  values <- unique(x)

  out <- data.frame(
    Mean = NA,
    SD = NA,
    IQR = NA,
    CI_low = NA,
    CI_high = NA,
    Min = values[1],
    Max = values[length(values)],
    Q1 = NA,
    Q3 = NA,
    Skewness = as.numeric(skewness(x, verbose = verbose)),
    Kurtosis = as.numeric(kurtosis(x, verbose = verbose)),
    n = length(x),
    n_Missing = n_missing,
    stringsAsFactors = FALSE
  )

  if (!dispersion) {
    out$SD <- NULL
  }

  dot_args <- list(...)
  if (is.null(dot_args[["ci"]])) {
    out$CI_low <- NULL
    out$CI_high <- NULL
  }

  if (is.null(dot_args[["iqr"]]) || isFALSE(dot_args[["iqr"]])) {
    out$IQR <- NULL
  }

  if (is.null(dot_args[["quartiles"]]) || isFALSE(dot_args[["quartiles"]])) {
    out$Q1 <- NULL
    out$Q3 <- NULL
  }

  if (!range) {
    out$Min <- NULL
    out$Max <- NULL
  }

  class(out) <- unique(c(
    "parameters_distribution",
    "see_parameters_distribution",
    class(out)
  ))
  attr(out, "data") <- x
  out
}


#' @rdname describe_distribution
#' @export
describe_distribution.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  centrality = "mean",
  dispersion = TRUE,
  iqr = TRUE,
  range = TRUE,
  quartiles = FALSE,
  include_factors = FALSE,
  ci = NULL,
  iterations = 100,
  threshold = 0.1,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  by = NULL,
  ...
) {
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # check for reserved variable names
  .check_for_reserved_names(select)

  # tell user about bootstrapping and appropriate number of iterations
  .show_iterations_warning(verbose, iterations, ci)

  if (!is.null(by)) {
    if (!is.character(by)) {
      insight::format_error("`by` must be a character vector.")
    }
    x <- data_group(x, by)
    out <- describe_distribution(
      x,
      select = select,
      exclude = exclude,
      centrality = centrality,
      dispersion = dispersion,
      iqr = iqr,
      range = range,
      quartiles = quartiles,
      include_factors = include_factors,
      ci = ci,
      iterations = iterations,
      threshold = threshold,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose
    )
    out <- data_ungroup(out)
    return(out)
  }

  # The function currently doesn't support descriptive summaries for character
  # or factor types.
  out <- do.call(
    rbind,
    lapply(x[select], function(i) {
      if (
        (include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))
      ) {
        describe_distribution(
          i,
          centrality = centrality,
          dispersion = dispersion,
          iqr = iqr,
          range = range,
          quartiles = quartiles,
          ci = ci,
          iterations = iterations,
          threshold = threshold,
          verbose = verbose,
          show_iterations_msg = FALSE
        )
      }
    })
  )

  if (is.null(out)) {
    return(NULL)
  }

  out$Variable <- row.names(out)
  row.names(out) <- NULL
  out <- out[c("Variable", setdiff(colnames(out), "Variable"))]

  class(out) <- unique(c(
    "parameters_distribution",
    "see_parameters_distribution",
    class(out)
  ))
  attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500)
  attr(out, "ci") <- ci
  attr(out, "centrality") <- centrality
  attr(out, "threshold") <- threshold
  out
}


#' @export
describe_distribution.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  centrality = "mean",
  dispersion = TRUE,
  iqr = TRUE,
  range = TRUE,
  quartiles = FALSE,
  include_factors = FALSE,
  ci = NULL,
  iterations = 100,
  threshold = 0.1,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  by = NULL,
  ...
) {
  if (!is.null(by)) {
    if (!is.character(by)) {
      insight::format_error("`by` must be a character vector.")
    }
    existing_grps <- setdiff(colnames(attributes(x)$groups), ".rows")
    x <- data_group(x, c(existing_grps, by))
  }
  group_vars <- setdiff(colnames(attributes(x)$groups), ".rows")
  group_data <- expand.grid(lapply(x[group_vars], function(i) unique(sort(i))))
  groups <- split(x, x[group_vars])
  groups <- Filter(function(x) nrow(x) > 0, groups)

  # check for reserved variable names
  .check_for_reserved_names(group_vars, type = "group_vars")

  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # tell user about bootstrapping and appropriate number of iterations
  .show_iterations_warning(verbose, iterations, ci)

  out <- do.call(
    rbind,
    lapply(seq_along(groups), function(i) {
      d <- describe_distribution.data.frame(
        groups[[i]][select],
        centrality = centrality,
        dispersion = dispersion,
        iqr = iqr,
        range = range,
        quartiles = quartiles,
        include_factors = include_factors,
        ci = ci,
        iterations = iterations,
        threshold = threshold,
        verbose = verbose,
        show_iterations_msg = FALSE,
        ...
      )

      for (grp in seq_along(group_vars)) {
        d[[group_vars[grp]]] <- group_data[i, grp]
      }
      d <- data_relocate(d, group_vars, before = 1)

      d
    })
  )

  class(out) <- unique(c(
    "parameters_distribution",
    "see_parameters_distribution",
    class(out)
  ))
  attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500)
  attr(out, "ci") <- ci
  attr(out, "centrality") <- centrality
  attr(out, "threshold") <- threshold
  out
}


# methods ------------------

#' @export
print.parameters_distribution <- function(x, digits = 2, ...) {
  formatted_table <- format(
    x,
    digits = digits,
    format = "text",
    ci_width = NULL,
    ci_brackets = TRUE,
    ...
  )
  cat(insight::export_table(
    formatted_table,
    format = "text",
    digits = digits,
    ...
  ))
  invisible(x)
}


#' @export
print_md.parameters_distribution <- function(
  x,
  digits = 2,
  ci_brackets = c("(", ")"),
  ...
) {
  formatted_table <- format(
    x = x,
    digits = digits,
    format = "markdown",
    ci_width = NULL,
    ci_brackets = ci_brackets,
    ...
  )

  insight::export_table(
    formatted_table,
    format = "markdown",
    align = "firstleft",
    ...
  )
}


#' @export
print_html.parameters_distribution <- function(
  x,
  digits = 2,
  ci_brackets = c("(", ")"),
  ...
) {
  formatted_table <- format(
    x = x,
    digits = digits,
    format = "html",
    ci_width = NULL,
    ci_brackets = ci_brackets,
    ...
  )

  # determine backend
  backend <- .check_format_backend(...)

  # pass arguments to export_table
  fun_args <- list(
    formatted_table,
    format = backend,
    ...
  )

  # no "align" for format "tt" - this currently gives an error. Not sure
  # if related to insight::export_table or tinytable
  if (identical(backend, "html")) {
    fun_args$align <- "firstleft"
  }

  do.call(insight::export_table, fun_args)
}


#' @export
display.parameters_distribution <- function(
  object,
  format = "markdown",
  digits = 2,
  ...
) {
  format <- .display_default_format(format)

  fun_args <- list(
    x = object,
    digits = digits,
    ...
  )

  # print table in HTML or markdown format
  if (format %in% c("html", "tt")) {
    fun_args$backend <- format
    do.call(print_html, fun_args)
  } else {
    do.call(print_md, fun_args)
  }
}


#' @export
plot.parameters_distribution <- function(x, ...) {
  insight::check_if_installed("see")
  NextMethod()
}


# bootstrapping CIs ----------------------------------

.boot_distribution <- function(data, indices, centrality) {
  out <- datawizard::describe_distribution(
    data[indices],
    centrality = centrality,
    dispersion = FALSE,
    iqr = FALSE,
    range = FALSE,
    quartiles = FALSE,
    ci = NULL,
    verbose = FALSE
  )
  out[[1]]
}


# check centrality options ----------------------------------------

.centrality_options <- function(centrality) {
  if (identical(centrality, "all")) {
    c("mean", "median", "MAP")
  } else {
    centrality
  }
}


# sanity check ----------------------------------------

.check_for_reserved_names <- function(x, type = "select") {
  reserved_names <- c(
    "Variable",
    "CI_low",
    "CI_high",
    "n_Missing",
    "Q1",
    "Q3",
    "Quartiles",
    "Min",
    "Max",
    "Range",
    "Trimmed_Mean",
    "Trimmed",
    "Mean",
    "SD",
    "IQR",
    "Skewness",
    "Kurtosis",
    "n",
    "Median",
    "MAD",
    "MAP",
    "IQR",
    "n_Missing"
  )
  invalid_names <- intersect(reserved_names, x)

  if (length(invalid_names) > 0) {
    # adapt message to show user whether wrong variables appear in grouping or select
    msg <- switch(
      type,
      select = "with `describe_distribution()`: ",
      "as grouping variables in `describe_distribution()`: "
    )
    insight::format_error(paste0(
      "Following variable names are reserved and cannot be used ",
      msg,
      text_concatenate(invalid_names, enclose = "`"),
      ". Please rename these variables in your data."
    ))
  }
}


.show_iterations_warning <- function(verbose, iterations = 100, ci = NULL) {
  if (verbose && !is.null(ci)) {
    msg <- paste(
      "Bootstrapping confidence intervals using",
      iterations,
      "iterations, please be patient..."
    )
    if (iterations < 200) {
      msg <- c(
        msg,
        "For more stable intervals, increase the number of `iterations`, but note that this can also increase the computation time significantly."
      ) # nolint
    }
    insight::format_alert(msg)
  }
}


================================================
FILE: R/descriptives.R
================================================
# distribution_mode ----------------------------------

#' Compute mode for a statistical distribution
#'
#' @param x An atomic vector, a list, or a data frame.
#'
#' @return
#'
#' The value that appears most frequently in the provided data.
#' The returned data structure will be the same as the entered one.
#'
#' @seealso For continuous variables, the
#'   **Highest Maximum a Posteriori probability estimate (MAP)** may be
#'   a more useful way to estimate the most commonly-observed value
#'   than the mode. See [bayestestR::map_estimate()].
#'
#' @examples
#'
#' distribution_mode(c(1, 2, 3, 3, 4, 5))
#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5))
#'
#' @export
distribution_mode <- function(x) {
  # TODO: Add support for weights, trim, binned (method)
  uniqv <- unique(x)
  tab <- tabulate(match(x, uniqv))
  idx <- which.max(tab)
  uniqv[idx]
}

#' Compute the coefficient of variation
#'
#' Compute the coefficient of variation (CV, ratio of the standard deviation to
#' the mean, \eqn{\sigma/\mu}) for a set of numeric values.
#'
#' @return The computed coefficient of variation for `x`.
#' @export
#'
#' @examples
#' coef_var(1:10)
#' coef_var(c(1:10, 100), method = "median_mad")
#' coef_var(c(1:10, 100), method = "qcd")
#' coef_var(mu = 10, sigma = 20)
#' coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30)
coef_var <- function(x, ...) {
  UseMethod("coef_var")
}
#' @name distribution_cv
#' @rdname coef_var
#' @export
distribution_coef_var <- coef_var

#' @export
coef_var.default <- function(x, verbose = TRUE, ...) {
  if (verbose) {
    insight::format_warning(
      paste0(
        "Can't compute the coefficient of variation objects of class `",
        class(x)[1],
        "`."
      )
    )
  }
  NULL
}

#' @param x A numeric vector of ratio scale (see details), or vector of values than can be coerced to one.
#' @param mu A numeric vector of mean values to use to compute the coefficient
#'   of variation. If supplied, `x` is not used to compute the mean.
#' @param sigma A numeric vector of standard deviation values to use to compute the coefficient
#'   of variation. If supplied, `x` is not used to compute the SD.
#' @param method Method to use to compute the CV. Can be `"standard"` to compute
#'   by dividing the standard deviation by the mean, `"unbiased"` for the
#'   unbiased estimator for normally distributed data, or one of two robust
#'   alternatives: `"median_mad"` to divide the median by the [stats::mad()],
#'   or `"qcd"` (quartile coefficient of dispersion, interquartile range divided
#'   by the sum of the quartiles \[twice the midhinge\]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}.
#' @param trim the fraction (0 to 0.5) of values to be trimmed from
#'   each end of `x` before the mean and standard deviation (or other measures)
#'   are computed. Values of `trim` outside the range of (0 to 0.5) are taken
#'   as the nearest endpoint.
#' @param remove_na Logical. Should `NA` values be removed before computing (`TRUE`)
#'   or not (`FALSE`, default)?
#' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not
#'   computed from `x`), what sample size to use to adjust the computed CV
#'   for small-sample bias?
#' @param ... Further arguments passed to computation functions.
#'
#' @details
#' CV is only applicable of values taken on a ratio scale: values that have a
#' *fixed* meaningfully defined 0 (which is either the lowest or highest
#' possible value), and that ratios between them are interpretable For example,
#' how many sandwiches have I eaten this week? 0 means "none" and 20 sandwiches
#' is 4 times more than 5 sandwiches. If I were to center the number of
#' sandwiches, it will no longer be on a ratio scale (0 is no "none" it is the
#' mean, and the ratio between 4 and -2 is not meaningful). Scaling a ratio
#' scale still results in a ratio scale. So I can re define "how many half
#' sandwiches did I eat this week ( = sandwiches * 0.5) and 0 would still mean
#' "none", and 20 half-sandwiches is still 4 times more than 5 half-sandwiches.
#'
#' This means that CV is **NOT** invariant to shifting, but it is to scaling:

#' ```{r}
#' sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7)
#' coef_var(sandwiches)
#'
#' coef_var(sandwiches / 2) # same
#'
#' coef_var(sandwiches + 4) # different! 0 is no longer meaningful!
#' ````
#'
#' @rdname coef_var
#'
#' @export
coef_var.numeric <- function(
  x,
  mu = NULL,
  sigma = NULL,
  method = c("standard", "unbiased", "median_mad", "qcd"),
  trim = 0,
  remove_na = FALSE,
  n = NULL,
  ...
) {
  # TODO: Support weights
  if (!missing(x) && all(c(-1, 1) %in% sign(x))) {
    insight::format_error(
      "Coefficient of variation only applicable for ratio scale variables."
    )
  }
  method <- match.arg(
    method,
    choices = c("standard", "unbiased", "median_mad", "qcd")
  )
  if (is.null(mu) || is.null(sigma)) {
    if (isTRUE(remove_na)) {
      x <- .drop_na(x)
    }
    n <- length(x)
    x <- .trim_values(x, trim = trim, n = n)
  }
  if (is.null(mu)) {
    mu <- switch(
      method,
      standard = ,
      unbiased = mean(x, ...),
      median_mad = stats::median(x, ...),
      qcd = unname(sum(stats::quantile(x, probs = c(0.25, 0.75), ...)))
    )
  }
  if (is.null(sigma)) {
    sigma <- switch(
      method,
      standard = ,
      unbiased = stats::sd(x, ...),
      median_mad = stats::mad(x, center = mu, ...),
      qcd = unname(diff(stats::quantile(x, probs = c(0.25, 0.75), ...)))
    )
  }
  out <- sigma / mu
  if (method == "unbiased") {
    if (is.null(n)) {
      insight::format_error(
        "A value for `n` must be provided when `method = \"unbiased\"` and both `mu` and `sigma` are provided."
      )
    }
    # from DescTools::CoefVar
    out <- out * (1 - 1 / (4 * (n - 1)) + 1 / n * out^2 + 1 / (2 * (n - 1)^2))
  }
  out
}


# descriptives helpers

.drop_na <- function(x) {
  x[!is.na(x)]
}

.trim_values <- function(x, trim = 0, n = NULL, weights = NULL) {
  # TODO: Support weights
  if (!is.numeric(trim) || length(trim) != 1L) {
    insight::format_error("`trim` must be a single numeric value.")
  }
  if (is.null(n)) {
    n <- length(x)
  }
  if (trim > 0 && n) {
    if (anyNA(x)) {
      return(NA_real_)
    }
    if (trim >= 0.5) {
      return(stats::median(x, na.rm = FALSE))
    }
    lo <- floor(n * trim) + 1
    hi <- n + 1 - lo
    x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi]
  }
  x
}


================================================
FILE: R/extract_column_names.R
================================================
#' @title Find or get columns in a data frame based on search patterns
#' @name extract_column_names
#'
#' @description `extract_column_names()` returns column names from a data set that
#' match a certain search pattern, while `data_select()` returns the found data.
#'
#' @param data A data frame.
#' @param select Variables that will be included when performing the required
#'   tasks. Can be either
#'
#'   - a variable specified as a literal variable name (e.g., `column_name`),
#'   - a string with the variable name (e.g., `"column_name"`), a character
#'     vector of variable names (e.g., `c("col1", "col2", "col3")`), or a
#'     character vector of variable names including ranges specified via `:`
#'     (e.g., `c("col1:col3", "col5")`),
#'   - for some functions, like `data_select()` or `data_rename()`, `select` can
#'     be a named character vector. In this case, the names are used to rename
#'     the columns in the output data frame. See 'Details' in the related
#'     functions to see where this option applies.
#'   - a formula with variable names (e.g., `~column_1 + column_2`),
#'   - a vector of positive integers, giving the positions counting from the left
#'     (e.g. `1` or `c(1, 3, 5)`),
#'   - a vector of negative integers, giving the positions counting from the
#'     right (e.g., `-1` or `-1:-3`),
#'   - one of the following select-helpers: `starts_with()`, `ends_with()`,
#'     `contains()`, a range using `:`, or `regex()`. `starts_with()`,
#'     `ends_with()`, and  `contains()` accept several patterns, e.g
#'     `starts_with("Sep", "Petal")`. `regex()` can be used to define regular
#'     expression patterns.
#'   - a function testing for logical conditions, e.g. `is.numeric()` (or
#'     `is.numeric`), or any user-defined function that selects the variables
#'     for which the function returns `TRUE` (like: `foo <- function(x) mean(x) > 3`),
#'   - ranges specified via literal variable names, select-helpers (except
#'     `regex()`) and (user-defined) functions can be negated, i.e. return
#'     non-matching elements, when prefixed with a `-`, e.g. `-ends_with()`,
#'     `-is.numeric` or `-(Sepal.Width:Petal.Length)`. **Note:** Negation means
#'     that matches are _excluded_, and thus, the `exclude` argument can be
#'     used alternatively. For instance, `select=-ends_with("Length")` (with
#'     `-`) is equivalent to `exclude=ends_with("Length")` (no `-`). In case
#'     negation should not work as expected, use the `exclude` argument instead.
#'
#'   If `NULL`, selects all columns. Patterns that found no matches are silently
#'   ignored, e.g. `extract_column_names(iris, select = c("Species", "Test"))`
#'   will just return `"Species"`.
#' @param exclude See `select`, however, column names matched by the pattern
#'   from `exclude` will be excluded instead of selected. If `NULL` (the default),
#'   excludes no columns.
#' @param ignore_case Logical, if `TRUE` and when one of the select-helpers or
#'   a regular expression is used in `select`, ignores lower/upper case in the
#'   search pattern when matching against variable names.
#' @param regex Logical, if `TRUE`, the search pattern from `select` will be
#'   treated as regular expression. When `regex = TRUE`, select *must* be a
#'   character string (or a variable containing a character string) and is not
#'   allowed to be one of the supported select-helpers or a character vector
#'   of length > 1. `regex = TRUE` is comparable to using one of the two
#'   select-helpers, `select = contains()` or `select = regex()`, however,
#'   since the select-helpers may not work when called from inside other
#'   functions (see 'Details'), this argument may be used as workaround.
#' @param verbose Toggle warnings.
#' @param ... Arguments passed down to other functions. Mostly not used yet.
#'
#' @inherit data_rename seealso
#'
#' @return
#'
#' `extract_column_names()` returns a character vector with column names that
#' matched the pattern in `select` and `exclude`, or `NULL` if no matching
#' column name was found. `data_select()` returns a data frame with matching
#' columns.
#'
#' @details
#'
#' Specifically for `data_select()`, `select` can also be a named character
#' vector. In this case, the names are used to rename the columns in the
#' output data frame. See 'Examples'.
#'
#' Note that it is possible to either pass an entire select helper or only the
#' pattern inside a select helper as a function argument:
#'
#' ```r
#' foo <- function(data, pattern) {
#'   extract_column_names(data, select = starts_with(pattern))
#' }
#' foo(iris, pattern = "Sep")
#'
#' foo2 <- function(data, pattern) {
#'   extract_column_names(data, select = pattern)
#' }
#' foo2(iris, pattern = starts_with("Sep"))
#' ```
#'
#' This means that it is also possible to use loop values as arguments or patterns:
#'
#' ```r
#' for (i in c("Sepal", "Sp")) {
#'   head(iris) |>
#'     extract_column_names(select = starts_with(i)) |>
#'     print()
#' }
#' ```
#'
#' However, this behavior is limited to a "single-level function". It will not
#' work in nested functions, like below:
#'
#' ```r
#' inner <- function(data, arg) {
#'   extract_column_names(data, select = arg)
#' }
#' outer <- function(data, arg) {
#'   inner(data, starts_with(arg))
#' }
#' outer(iris, "Sep")
#' ```
#'
#' In this case, it is better to pass the whole select helper as the argument of
#' `outer()`:
#'
#' ```r
#' outer <- function(data, arg) {
#'   inner(data, arg)
#' }
#' outer(iris, starts_with("Sep"))
#' ```
#'
#' @examples
#' # Find column names by pattern
#' extract_column_names(iris, starts_with("Sepal"))
#' extract_column_names(iris, ends_with("Width"))
#' extract_column_names(iris, regex("\\."))
#' extract_column_names(iris, c("Petal.Width", "Sepal.Length"))
#'
#' # starts with "Sepal", but not allowed to end with "width"
#' extract_column_names(iris, starts_with("Sepal"), exclude = contains("Width"))
#'
#' # find numeric with mean > 3.5
#' numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5
#' extract_column_names(iris, numeric_mean_35)
#'
#' # find column names, using range
#' extract_column_names(mtcars, c(cyl:hp, wt))
#'
#' # find range of column names by range, using character vector
#' extract_column_names(mtcars, c("cyl:hp", "wt"))
#'
#' # rename returned columns for "data_select()"
#' head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl")))
#' @export
extract_column_names <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  columns <- .select_nse(
    select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = FALSE
  )

  if (!length(columns) || is.null(columns)) {
    columns <- NULL
    if (isTRUE(verbose)) {
      insight::format_warning(
        "No column names that matched the required search pattern were found."
      )
    }
  }

  columns
}

#' @rdname extract_column_names
#' @export
find_columns <- extract_column_names


================================================
FILE: R/format.R
================================================
# distribution ---------------------------------

#' @export
format.parameters_distribution <- function(
  x,
  digits = 2,
  format = NULL,
  ci_width = "auto",
  ci_brackets = TRUE,
  ...
) {
  # save information
  att <- attributes(x)

  if (all(c("Min", "Max") %in% names(x))) {
    x$Min <- insight::format_ci(
      x$Min,
      x$Max,
      ci = NULL,
      digits = digits,
      width = ci_width,
      brackets = ci_brackets
    )
    x$Max <- NULL
    colnames(x)[which(colnames(x) == "Min")] <- "Range"
  }

  if (all(c("Q1", "Q3") %in% names(x))) {
    x$Q1 <- insight::format_ci(
      x$Q1,
      x$Q3,
      ci = NULL,
      digits = digits,
      width = ci_width,
      brackets = FALSE
    )
    x$Q3 <- NULL
    colnames(x)[which(colnames(x) == "Q1")] <- "Quartiles"
  }

  # find CI columns. We might have multiple columns for different centralities
  ci_columns <- grepl("^(CI_low|CI_high)", colnames(x))
  # make sure we have matches
  if (any(ci_columns)) {
    # iterate all centrality options
    centrality <- .centrality_options(att$centrality)
    for (ce in centrality) {
      # this is the original column name
      ci_columns <- c(paste0("CI_low_", ce), paste0("CI_high_", ce))
      # we format CI column, merge it into one column
      x[[ci_columns[1]]] <- insight::format_ci(
        x[[ci_columns[1]]],
        x[[ci_columns[2]]],
        ci = NULL,
        digits = digits,
        width = ci_width,
        brackets = ci_brackets
      )
      # ... and remove the no longer needed CI_high column
      x[[ci_columns[2]]] <- NULL
      ci_lvl <- attributes(x)$ci

      # find position of CI column
      ci_columm_pos <- which(colnames(x) == ci_columns[1])

      # rename
      if (is.null(ci_lvl)) {
        colnames(x)[ci_columm_pos] <- sprintf(
          "CI (%s)",
          insight::format_capitalize(ce)
        )
      } else {
        colnames(x)[ci_columm_pos] <- sprintf(
          "%i%% CI (%s)",
          round(100 * ci_lvl),
          insight::format_capitalize(ce)
        )
      }

      # make sure we have the correct column name of the centrality
      centr_name <- switch(
        tolower(ce),
        mean = "Mean",
        median = "Median",
        map = "MAP"
      )

      # reorder CI column, move it to related centrality index
      centr_pos <- which(colnames(x) == centr_name)
      if (length(centr_pos)) {
        x <- data_relocate(x, select = ci_columm_pos, after = centr_pos)
      }
    }
  }

  if ("Trimmed_Mean" %in% colnames(x)) {
    threshold <- attributes(x)$threshold
    if (is.null(threshold)) {
      trim_name <- "Trimmed"
    } else {
      trim_name <- sprintf("Trimmed (%g%%)", round(100 * threshold))
    }
    colnames(x)[which(colnames(x) == "Trimmed_Mean")] <- trim_name
  }

  x
}


================================================
FILE: R/labels_to_levels.R
================================================
#' @title Convert value labels into factor levels
#' @name labels_to_levels
#'
#' @details
#' `labels_to_levels()` allows to use value labels of factors as their levels.
#'
#' @param x A data frame or factor. Other variable types (e.g. numerics) are not
#' allowed.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#' @inheritParams categorize
#'
#' @return `x`, where for all factors former levels are replaced by their value
#' labels.
#'
#' @examples
#' data(efc)
#' # create factor
#' x <- as.factor(efc$c172code)
#' # add value labels - these are not factor levels yet
#' x <- assign_labels(x, values = c(`1` = "low", `2` = "mid", `3` = "high"))
#' levels(x)
#' data_tabulate(x)
#'
#' x <- labels_to_levels(x)
#' levels(x)
#' data_tabulate(x)
#' @export
labels_to_levels <- function(x, ...) {
  UseMethod("labels_to_levels")
}


#' @export
labels_to_levels.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert("`labels_to_levels()` only works for factors.")
  }
  x
}

#' @rdname labels_to_levels
#' @export
labels_to_levels.factor <- function(x, verbose = TRUE, ...) {
  if (is.null(attr(x, "labels", exact = TRUE))) {
    insight::format_error(
      "Could not change factor levels. Variable had no value labels."
    )
  }
  .value_labels_to_levels(x, verbose = verbose)
}

#' @rdname labels_to_levels
#' @export
labels_to_levels.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  append = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # validation check, return as is for complete factor
  if (all(vapply(x, is.factor, TRUE))) {
    return(x)
  }

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # keep only factors
  select <- colnames(x[select])[vapply(x[select], is.factor, TRUE)]

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    arguments <- .process_append(
      x,
      select,
      append,
      append_suffix = "_l",
      preserve_value_labels = TRUE,
      keep_factors = TRUE,
      keep_character = FALSE
    )
    # update processed arguments
    x <- arguments$x
    select <- arguments$select
  }

  x[select] <- lapply(
    x[select],
    labels_to_levels,
    verbose = verbose,
    ...
  )
  x
}


================================================
FILE: R/makepredictcall.R
================================================
#' Utility Function for Safe Prediction with `datawizard` transformers
#'
#' This function allows for the use of (some of) `datawizard`'s transformers
#' inside a model formula. See examples below.
#' \cr\cr
#' Currently, [center()], [standardize()], [normalize()], & [rescale()] are
#' supported.
#'
#' @inheritParams stats::makepredictcall
#'
#' @inherit stats::makepredictcall return
#' @importFrom stats makepredictcall
#'
#' @seealso [stats::makepredictcall()]
#' @family datawizard-transformers
#'
#' @examples
#'
#' data("mtcars")
#' train <- mtcars[1:30, ]
#' test <- mtcars[31:32, ]
#'
#' m1 <- lm(mpg ~ center(hp), data = train)
#' predict(m1, newdata = test) # Data is "centered" before the prediction is made,
#' # according to the center of the old data
#'
#' m2 <- lm(mpg ~ standardize(hp), data = train)
#' m3 <- lm(mpg ~ scale(hp), data = train) # same as above
#' predict(m2, newdata = test) # Data is "standardized" before the prediction is made.
#' predict(m3, newdata = test) # Data is "standardized" before the prediction is made.
#'
#'
#' m4 <- lm(mpg ~ normalize(hp), data = mtcars)
#' m5 <- lm(mpg ~ rescale(hp, to = c(-3, 3)), data = mtcars)
#'
#' (newdata <- data.frame(hp = c(range(mtcars$hp), 400))) # 400 is outside original range!
#'
#' model.frame(delete.response(terms(m4)), data = newdata)
#' model.frame(delete.response(terms(m5)), data = newdata)
#'
#' @export
makepredictcall.dw_transformer <- function(var, call) {
  if (is.matrix(var) || is.array(var)) {
    insight::format_error(
      "datawizard scalers in model formulas are not supported for matrices."
    )
  }

  switch(
    as.character(call)[1L],
    centre = ,
    center = {
      call$center <- attr(var, "center")
    },
    standardise = ,
    standardize = {
      call$center <- attr(var, "center")
      call$scale <- attr(var, "scale")
    },
    normalize = ,
    normalise = {
      call$min_value <- attr(var, "min_value")
      call$range_difference <- attr(var, "range_difference")
      call$vector_length <- attr(var, "vector_length")
      call$include_bounds <- attr(var, "include_bounds")
      call$flag_bounds <- attr(var, "flag_bounds")
    },
    rescale = {
      call$min_value <- attr(var, "min_value")
      call$max_value <- attr(var, "max_value")
      call$new_min <- attr(var, "new_min")
      call$new_max <- attr(var, "new_max")
    },

    # ELSE:
    {
      return(call)
    }
  )

  call$verbose <- FALSE
  call
}


================================================
FILE: R/mean_sd.R
================================================
#' Summary Helpers
#'
#' @param x A numeric vector (or one that can be coerced to one via
#'   `as.numeric()`) to be summarized.
#' @param named Should the vector be named?
#'   (E.g., `c("-SD" = -1, Mean = 1, "+SD" = 2)`.)
#' @param times How many SDs above and below the Mean (or MADs around the Median)
#' @param ... Not used.
#' @inheritParams coef_var
#' @inheritParams stats::mad
#'
#' @return A (possibly named) numeric vector of length `2*times + 1` of SDs
#'   below the mean, the mean, and SDs above the mean (or median and MAD).
#'
#' @examples
#' mean_sd(mtcars$mpg)
#'
#' mean_sd(mtcars$mpg, times = 2L)
#'
#' median_mad(mtcars$mpg)
#'
#' @export
mean_sd <- function(x, times = 1L, remove_na = TRUE, named = TRUE, ...) {
  .centrality_dispersion(
    x,
    type = "mean",
    times = times,
    remove_na = remove_na,
    named = named
  )
}

#' @export
#' @rdname mean_sd
median_mad <- function(
  x,
  times = 1L,
  remove_na = TRUE,
  constant = 1.4826,
  named = TRUE,
  ...
) {
  .centrality_dispersion(
    x,
    type = "median",
    times = times,
    remove_na = remove_na,
    constant = constant,
    named = named
  )
}

#' @keywords Internal
.centrality_dispersion <- function(
  x,
  type = "mean",
  remove_na = TRUE,
  times = 1L,
  constant = 1.4826,
  named = TRUE,
  ...
) {
  x <- as.numeric(x)
  times <- as.integer(times)
  type <- match.arg(type, choices = c("mean", "median"))

  # centrality
  M <- switch(
    type,
    median = stats::median(x, na.rm = remove_na),
    mean(x, na.rm = remove_na)
  )

  S <- switch(
    type,
    median = stats::mad(x, na.rm = remove_na, constant = constant),
    stats::sd(x, na.rm = remove_na)
  )

  v <- M + c(-rev(seq_len(times)), 0, seq_len(times)) * S

  if (isTRUE(named)) {
    string_cs <- switch(type, median = c("Median", "MAD"), c("Mean", "SD"))
    if (times == 1L) {
      times <- ""
    } else {
      times <- paste0(seq_len(times), " ")
    }
    names(v) <- c(
      paste0("-", rev(times), string_cs[2]),
      string_cs[1],
      paste0("+", times, string_cs[2])
    )
  }
  v
}


================================================
FILE: R/means_by_group.R
================================================
#' @title Summary of mean values by group
#' @name means_by_group
#'
#' @description Computes summary table of means by groups.
#'
#' @param x A vector or a data frame.
#' @param by If `x` is a numeric vector, `by` should be a factor that
#' indicates the group-classifying categories. If `x` is a data frame, `by`
#' should be a character string, naming the variable in `x` that is used for
#' grouping. Numeric vectors are coerced to factors. Not that `by` should
#' only refer to a single variable.
#' @param ci Level of confidence interval for mean estimates. Default is `0.95`.
#' Use `ci = NA` to suppress confidence intervals.
#' @param weights If `x` is a numeric vector, `weights` should be a vector of
#' weights that will be applied to weight all observations. If `x` is a data
#' frame, `weights` can also be a character string indicating the name of the
#' variable in `x` that should be used for weighting. Default is `NULL`, so no
#' weights are used.
#' @param digits Optional scalar, indicating the amount of digits after decimal
#' point when rounding estimates and values.
#' @param ... Currently not used
#' @inheritParams find_columns
#'
#' @return A data frame with information on mean and further summary statistics
#' for each sub-group.
#'
#' @details This function is comparable to `aggregate(x, by, mean)`, but provides
#' some further information, including summary statistics from a One-Way-ANOVA
#' using `x` as dependent and `by` as independent variable. [`emmeans::contrast()`]
#' is used to get p-values for each sub-group. P-values indicate whether each
#' group-mean is significantly different from the total mean.
#'
#' @examples
#' data(efc)
#' means_by_group(efc, "c12hour", "e42dep")
#'
#' data(iris)
#' means_by_group(iris, "Sepal.Width", "Species")
#'
#' # weighting
#' efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
#' means_by_group(efc, "c12hour", "e42dep", weights = "weight")
#' @export
means_by_group <- function(x, ...) {
  UseMethod("means_by_group")
}


#' @export
means_by_group.default <- function(x, ...) {
  insight::format_error(
    "`means_by_group()` does not work for objects of class `",
    class(x)[1],
    "`."
  )
}


#' @rdname means_by_group
#' @export
means_by_group.numeric <- function(
  x,
  by = NULL,
  ci = 0.95,
  weights = NULL,
  digits = NULL,
  ...
) {
  # validation check for arguments

  # "by" must be provided
  if (is.null(by)) {
    insight::format_error("Argument `by` is missing.")
  }

  # by must be of same length as x
  if (length(by) != length(x)) {
    insight::format_error("Argument `by` must be of same length as `x`.")
  }

  # if weights are provided, must be of same length as x
  if (!is.null(weights) && length(weights) != length(x)) {
    insight::format_error("Argument `weights` must be of same length as `x`.")
  }

  # if weights are NULL, set weights to 1
  if (is.null(weights)) {
    weights <- rep(1, length(x))
  }

  # retrieve labels
  var_mean_label <- attr(x, "label", exact = TRUE)
  var_grp_label <- attr(by, "label", exact = TRUE)

  # if no labels present, use variable names directly
  if (is.null(var_mean_label)) {
    var_mean_label <- deparse(substitute(x))
  }
  if (is.null(var_grp_label)) {
    var_grp_label <- deparse(substitute(by))
  }

  # coerce group to factor if numeric, or convert labels to levels, if factor
  if (is.factor(by)) {
    by <- tryCatch(labels_to_levels(by, verbose = FALSE), error = function(e) {
      by
    })
  } else {
    by <- to_factor(by)
  }

  my_data <- stats::na.omit(data.frame(
    x = x,
    group = by,
    weights = weights,
    stringsAsFactors = FALSE
  ))

  # get grouped means table
  out <- .means_by_group(my_data, ci = ci)

  # attributes
  attr(out, "var_mean_label") <- var_mean_label
  attr(out, "var_grp_label") <- var_grp_label
  attr(out, "digits") <- digits

  class(out) <- c("dw_groupmeans", "data.frame")
  out
}


#' @rdname means_by_group
#' @export
means_by_group.data.frame <- function(
  x,
  select = NULL,
  by = NULL,
  ci = 0.95,
  weights = NULL,
  digits = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  if (is.null(weights)) {
    w <- NULL
  } else if (is.character(weights)) {
    w <- x[[weights]]
  } else {
    w <- weights
  }

  out <- lapply(select, function(i) {
    # if no labels present, use variable names directy
    if (is.null(attr(x[[i]], "label", exact = TRUE))) {
      attr(x[[i]], "label") <- i
    }
    if (is.null(attr(x[[by]], "label", exact = TRUE))) {
      attr(x[[by]], "label") <- by
    }
    # compute means table
    means_by_group(
      x[[i]],
      by = x[[by]],
      ci = ci,
      weights = w,
      digits = digits,
      ...
    )
  })

  class(out) <- c("dw_groupmeans_list", "list")
  out
}


#' @keywords internal
.means_by_group <- function(data, ci = 0.95) {
  # compute anova statistics for mean table
  if (is.null(data$weights) || all(data$weights == 1)) {
    fit <- stats::lm(x ~ group, data = data)
  } else {
    fit <- stats::lm(x ~ group, weights = data$weights, data = data)
  }

  # summary table data
  groups <- split(data$x, data$group)
  group_weights <- split(data$weights, data$group)
  out <- do.call(
    rbind,
    Map(
      function(x, w) {
        data.frame(
          Mean = weighted_mean(x, weights = w),
          SD = weighted_sd(x, weights = w),
          N = round(sum(w)),
          stringsAsFactors = FALSE
        )
      },
      groups,
      group_weights
    )
  )

  # add group names
  out$Category <- levels(data$group)
  out$p <- out$CI_high <- out$CI_low <- NA

  # p-values of contrast-means
  if (insight::check_if_installed("emmeans", quietly = TRUE)) {
    # create summary table of contrasts, for p-values and confidence intervals
    predicted <- emmeans::emmeans(fit, specs = "group", level = ci)
    emm_contrasts <- emmeans::contrast(predicted, method = "eff")
    # add p-values and confidence intervals to "out"
    if (!is.null(ci) && !is.na(ci)) {
      summary_table <- as.data.frame(predicted)
      out$CI_low <- summary_table$lower.CL
      out$CI_high <- summary_table$upper.CL
    }
    summary_table <- as.data.frame(emm_contrasts)
    out$p <- summary_table$p.value
  }

  # reorder columns
  out <- out[c("Category", "Mean", "N", "SD", "CI_low", "CI_high", "p")]

  # finally, add total-row
  out <- rbind(
    out,
    data.frame(
      Category = "Total",
      Mean = weighted_mean(data$x, weights = data$weights),
      N = nrow(data),
      SD = weighted_sd(data$x, weights = data$weights),
      CI_low = NA,
      CI_high = NA,
      p = NA,
      stringsAsFactors = FALSE
    )
  )

  # get anova statistics for mean table
  sum.fit <- summary(fit)

  # r-squared values
  r2 <- sum.fit$r.squared
  r2.adj <- sum.fit$adj.r.squared

  # F-statistics
  fstat <- sum.fit$fstatistic
  pval <- stats::pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)

  # copy as attributes
  attr(out, "r2") <- r2
  attr(out, "ci") <- ci
  attr(out, "adj.r2") <- r2.adj
  attr(out, "fstat") <- fstat[1]
  attr(out, "p.value") <- pval

  out
}


# methods -----------------

#' @export
format.dw_groupmeans <- function(x, digits = NULL, ...) {
  if (is.null(digits)) {
    digits <- attr(x, "digits", exact = TRUE)
  }
  if (is.null(digits)) {
    digits <- 2
  }
  x$N <- insight::format_value(x$N, digits = 0)
  insight::format_table(remove_empty_columns(x), digits = digits, ...)
}

#' @export
print.dw_groupmeans <- function(x, digits = NULL, ...) {
  out <- format(x, digits = digits, ...)

  # caption
  l1 <- attributes(x)$var_mean_label
  l2 <- attributes(x)$var_grp_label
  if (!is.null(l1) && !is.null(l2)) {
    caption <- c(paste0("# Mean of ", l1, " by ", l2), "blue")
  } else {
    caption <- NULL
  }

  # footer
  footer <- paste0(
    "\nAnova: R2=",
    insight::format_value(attributes(x)$r2, digits = 3),
    "; adj.R2=",
    insight::format_value(attributes(x)$adj.r2, digits = 3),
    "; F=",
    insight::format_value(attributes(x)$fstat, digits = 3),
    "; ",
    insight::format_p(attributes(x)$p.value, whitespace = FALSE),
    "\n"
  )

  cat(insight::export_table(out, caption = caption, footer = footer, ...))
}

#' @export
print.dw_groupmeans_list <- function(x, digits = NULL, ...) {
  for (i in seq_along(x)) {
    if (i > 1) {
      cat("\n")
    }
    print(x[[i]], digits = digits, ...)
  }
}


================================================
FILE: R/normalize.R
================================================
#' Normalize numeric variable to 0-1 range
#'
#' Performs a normalization of data, i.e., it scales variables in the range
#' 0 - 1. This is a special case of [rescale()]. `unnormalize()` is the
#' counterpart, but only works for variables that have been normalized with
#' `normalize()`.
#'
#' @param x A numeric vector, (grouped) data frame, or matrix. See 'Details'.
#' @param include_bounds Numeric or logical. Using this can be useful in case of
#'   beta-regression, where the response variable is not allowed to include
#'   zeros and ones. If `TRUE`, the input is normalized to a range that includes
#'   zero and one. If `FALSE`, the return value is compressed, using
#'   Smithson and Verkuilen's (2006) formula `(x * (n - 1) + 0.5) / n`, to avoid
#'   zeros and ones in the normalized variables. Else, if numeric (e.g., `0.001`),
#'   `include_bounds` defines the "distance" to the lower and upper bound, i.e.
#'   the normalized vectors are rescaled to a range from `0 + include_bounds` to
#'   `1 - include_bounds`.
#' @param ... Arguments passed to or from other methods.
#' @inheritParams standardize.data.frame
#' @inheritParams extract_column_names
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @details
#'
#' - If `x` is a matrix, normalization is performed across all values (not
#'   column- or row-wise). For column-wise normalization, convert the matrix to a
#'   data.frame.
#' - If `x` is a grouped data frame (`grouped_df`), normalization is performed
#'   separately for each group.
#'
#' @seealso See [makepredictcall.dw_transformer()] for use in model formulas.
#'
#' @examples
#'
#' normalize(c(0, 1, 5, -5, -2))
#' normalize(c(0, 1, 5, -5, -2), include_bounds = FALSE)
#' # use a value defining the bounds
#' normalize(c(0, 1, 5, -5, -2), include_bounds = 0.001)
#'
#' head(normalize(trees))
#'
#' @references
#'
#' Smithson M, Verkuilen J (2006). A Better Lemon Squeezer? Maximum-Likelihood
#' Regression with Beta-Distributed Dependent Variables. Psychological Methods,
#' 11(1), 54–71.
#'
#' @family transform utilities
#'
#' @return A normalized object.
#'
#' @export
normalize <- function(x, ...) {
  UseMethod("normalize")
}


#' @rdname normalize
#' @export
normalize.numeric <- function(x, include_bounds = TRUE, verbose = TRUE, ...) {
  # Warning if all NaNs or infinite
  if (all(is.infinite(x) | is.na(x))) {
    return(x)
  }

  # safe name, for later use
  if (is.null(names(x))) {
    name <- insight::safe_deparse(substitute(x))
  } else {
    name <- names(x)
  }

  # Get infinite and replace by NA (so that the normalization doesn't fail)
  infinite_idx <- is.infinite(x)
  infinite_vals <- x[infinite_idx]
  x[infinite_idx] <- NA

  # called from "makepredictcal()"? Then we have additional arguments
  dot_args <- list(...)
  flag_predict <- FALSE
  required_dot_args <- c(
    "range_difference",
    "min_value",
    "vector_length",
    "flag_bounds"
  )

  if (all(required_dot_args %in% names(dot_args))) {
    # we gather informatiom about the original data, which is needed
    # for "predict()" to work properly when "normalize()" is called
    # in formulas on-the-fly, e.g. "lm(mpg ~ normalize(hp), data = mtcars)"
    range_difference <- dot_args$range_difference
    min_value <- dot_args$min_value
    vector_length <- dot_args$vector_length
    flag_bounds <- dot_args$flag_bounds
    flag_predict <- TRUE
  } else {
    range_difference <- diff(range(x, na.rm = TRUE))
    min_value <- min(x, na.rm = TRUE)
    vector_length <- length(x)
    flag_bounds <- NULL
  }

  # Warning if only one value
  if (!flag_predict && insight::has_single_value(x)) {
    if (verbose) {
      insight::format_warning(
        paste0(
          "Variable `",
          name,
          "` contains only one unique value and will not be normalized."
        )
      )
    }
    return(x)
  }

  # Warning if logical vector
  if (insight::n_unique(x) == 2 && verbose) {
    insight::format_warning(
      paste0(
        "Variable `",
        name,
        "` contains only two unique values. Consider converting it to a factor."
      )
    )
  }

  # rescale
  out <- as.vector((x - min_value) / range_difference)

  # if we don't have information on whether bounds are included or not,
  # get this information here.
  if (is.null(flag_bounds)) {
    flag_bounds <- (any(out == 0) || any(out == 1))
  }

  if (!isTRUE(include_bounds) && flag_bounds) {
    if (isFALSE(include_bounds)) {
      out <- (out * (vector_length - 1) + 0.5) / vector_length
    } else if (
      is.numeric(include_bounds) && include_bounds > 0 && include_bounds < 1
    ) {
      out <- rescale(out, to = c(0 + include_bounds, 1 - include_bounds))
    } else if (verbose) {
      insight::format_warning(
        "`include_bounds` must be either logical or numeric (between 0 and 1).",
        "Bounds (zeros and ones) are included in the returned value."
      )
    }
  }

  # Re-insert infinite values
  out[infinite_idx] <- infinite_vals

  attr(out, "include_bounds") <- include_bounds
  attr(out, "flag_bounds") <- isTRUE(flag_bounds)
  attr(out, "min_value") <- min_value
  attr(out, "vector_length") <- vector_length
  attr(out, "range_difference") <- range_difference
  # don't add attribute when we call data frame methods
  if (!isFALSE(dot_args$add_transform_class)) {
    class(out) <- c("dw_transformer", class(out))
  }

  out
}


#' @export
normalize.factor <- function(x, ...) {
  x
}


#' @export
normalize.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  include_bounds = TRUE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    remove_group_var = TRUE,
    verbose = verbose
  )

  info <- attributes(x)

  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_n"
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x <- as.data.frame(x)

  # create column(s) to store dw_transformer attributes
  for (i in select) {
    info$groups[[paste0("attr_", i)]] <- rep(NA, length(grps))
  }

  for (rows in seq_along(grps)) {
    tmp <- normalize(
      x[grps[[rows]], , drop = FALSE],
      select = select,
      exclude = exclude,
      include_bounds = include_bounds,
      verbose = verbose,
      append = FALSE, # need to set to FALSE here, else variable will be doubled
      add_transform_class = FALSE,
      ...
    )

    # store dw_transformer_attributes
    for (i in select) {
      info$groups[rows, paste0("attr_", i)][[1]] <- list(unlist(attributes(tmp[[
        i
      ]])))
    }

    x[grps[[rows]], ] <- tmp
  }

  # last column of "groups" attributes must be called ".rows"
  info$groups <- data_relocate(info$groups, ".rows", after = -1)

  # set back class, so data frame still works with dplyr
  attributes(x) <- utils::modifyList(info, attributes(x))
  class(x) <- c("grouped_df", class(x))
  x
}


#' @rdname normalize
#' @export
normalize.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  include_bounds = TRUE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_n"
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x[select] <- lapply(
    x[select],
    normalize,
    include_bounds = include_bounds,
    verbose = verbose,
    add_transform_class = FALSE
  )

  x
}


#' @export
normalize.matrix <- function(x, ...) {
  matrix(normalize(as.numeric(x), ...), nrow = nrow(x))
}


================================================
FILE: R/ranktransform.R
================================================
#' (Signed) rank transformation
#'
#' Transform numeric values with the integers of their rank (i.e., 1st smallest,
#' 2nd smallest, 3rd smallest, etc.). Setting the `sign` argument to `TRUE` will
#' give you signed ranks, where the ranking is done according to absolute size
#' but where the sign is preserved (i.e., 2, 1, -3, 4).
#'
#' @param x Object.
#' @param sign Logical, if `TRUE`, return signed ranks.
#' @param method Treatment of ties. Can be one of `"average"` (default),
#'   `"first"`, `"last"`, `"random"`, `"max"` or `"min"`. See [rank()] for
#'   details.
#' @param zeros How to handle zeros. If `"na"` (default), they are marked as
#' `NA`. If `"signrank"`, they are kept during the ranking and marked as zeros.
#' This is only used when `sign = TRUE`.
#' @param ... Arguments passed to or from other methods.
#' @inheritParams extract_column_names
#' @inheritParams standardize.data.frame
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @examples
#' ranktransform(c(0, 1, 5, -5, -2))
#'
#' # By default, zeros are converted to NA
#' suppressWarnings(
#'   ranktransform(c(0, 1, 5, -5, -2), sign = TRUE)
#' )
#' ranktransform(c(0, 1, 5, -5, -2), sign = TRUE, zeros = "signrank")
#'
#' head(ranktransform(trees))
#' @return A rank-transformed object.
#'
#' @family transform utilities
#'
#' @export
ranktransform <- function(x, ...) {
  UseMethod("ranktransform")
}


#' @rdname ranktransform
#' @export
ranktransform.numeric <- function(
  x,
  sign = FALSE,
  method = "average",
  zeros = "na",
  verbose = TRUE,
  ...
) {
  # no change if all values are `NA`s
  if (all(is.na(x))) {
    return(x)
  }

  zeros <- match.arg(zeros, c("na", "signrank"))
  method <- match.arg(
    method,
    c("average", "first", "last", "random", "max", "min")
  )

  # Warning if only one value and return early
  if (insight::has_single_value(x)) {
    if (is.null(names(x))) {
      name <- deparse(substitute(x))
    } else {
      name <- names(x)
    }

    if (verbose) {
      insight::format_warning(
        paste0(
          "Variable `",
          name,
          "` contains only one unique value and will not be normalized."
        )
      )
    }

    return(x)
  }

  # Warning if only two values present but don't return early
  if (length(unique(x)) == 2L) {
    if (is.null(names(x))) {
      name <- deparse(substitute(x))
    } else {
      name <- names(x)
    }

    if (verbose) {
      # nolint
      insight::format_warning(
        paste0(
          "Variable `",
          name,
          "` contains only two different values. Consider converting it to a factor."
        )
      )
    }
  }

  if (sign) {
    if (zeros == "na") {
      out <- rep(NA, length(x))
      ZEROES <- x == 0
      if (any(ZEROES) && verbose) {
        insight::format_warning(
          "Zeros detected. These cannot be sign-rank transformed."
        ) # nolint
      }
      out[!ZEROES] <- sign(x[!ZEROES]) *
        rank(abs(x[!ZEROES]), ties.method = method, na.last = "keep")
    } else if (zeros == "signrank") {
      out <- sign(x) * rank(abs(x), ties.method = method, na.last = "keep")
    }
  } else {
    out <- rank(x, ties.method = method, na.last = "keep")
  }

  out
}


#' @export
ranktransform.factor <- function(x, ...) {
  x
}


#' @export
ranktransform.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  sign = FALSE,
  method = "average",
  ignore_case = FALSE,
  regex = FALSE,
  zeros = "na",
  verbose = TRUE,
  ...
) {
  info <- attributes(x)

  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  x <- as.data.frame(x)
  for (rows in grps) {
    x[rows, ] <- ranktransform(
      x[rows, , drop = FALSE],
      select = select,
      exclude = exclude,
      sign = sign,
      method = method,
      ...
    )
  }
  # set back class, so data frame still works with dplyr
  attributes(x) <- info
  x
}


#' @rdname ranktransform
#' @export
ranktransform.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  sign = FALSE,
  method = "average",
  ignore_case = FALSE,
  regex = FALSE,
  zeros = "na",
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  x[select] <- lapply(x[select], ranktransform, sign = sign, method = method)
  x
}


================================================
FILE: R/recode_into.R
================================================
#' @title Recode values from one or more variables into a new variable
#' @name recode_into
#'
#' @description
#' This functions recodes values from one or more variables into a new variable.
#' It is a convenient function to avoid nested [`ifelse()`] statements, which
#' is similar to `dplyr::case_when()`.
#'
#' @param ... A sequence of two-sided formulas, where the left hand side (LHS)
#' is a logical matching condition that determines which values match this case.
#' The LHS of this formula is also called "recode pattern" (e.g., in messages).
#' The right hand side (RHS) indicates the replacement value.
#' @param data Optional, name of a data frame. This can be used to avoid writing
#' the data name multiple times in `...`. See 'Examples'.
#' @param default Indicates the default value that is chosen when no match in
#' the formulas in `...` is found. If not provided, `NA` is used as default
#' value.
#' @param overwrite Logical, if `TRUE` (default) and more than one recode pattern
#' apply to the same case, already recoded values will be overwritten by subsequent
#' recode patterns. If `FALSE`, former recoded cases will not be altered by later
#' recode patterns that would apply to those cases again. A warning message is
#' printed to alert such situations and to avoid unintentional recodings.
#' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing
#' values in the original variable will be set back to `NA` in the recoded
#' variable (unless overwritten by other recode patterns). If `FALSE`, missing
#' values in the original variable will be recoded to `default`. Setting
#' `preserve_na = TRUE` prevents unintentional overwriting of missing values
#' with `default`, which means that you won't find valid values where the
#' original data only had missing values. See 'Examples'.
#' @param verbose Toggle warnings.
#'
#' @return A vector with recoded values.
#'
#' @examples
#' x <- 1:30
#' recode_into(
#'   x > 15 ~ "a",
#'   x > 10 & x <= 15 ~ "b",
#'   default = "c"
#' )
#'
#' x <- 1:10
#' # default behaviour: second recode pattern "x > 5" overwrites
#' # some of the formerly recoded cases from pattern "x >= 3 & x <= 7"
#' recode_into(
#'   x >= 3 & x <= 7 ~ 1,
#'   x > 5 ~ 2,
#'   default = 0,
#'   verbose = FALSE
#' )
#'
#' # setting "overwrite = FALSE" will not alter formerly recoded cases
#' recode_into(
#'   x >= 3 & x <= 7 ~ 1,
#'   x > 5 ~ 2,
#'   default = 0,
#'   overwrite = FALSE,
#'   verbose = FALSE
#' )
#'
#' set.seed(123)
#' d <- data.frame(
#'   x = sample(1:5, 30, TRUE),
#'   y = sample(letters[1:5], 30, TRUE),
#'   stringsAsFactors = FALSE
#' )
#'
#' # from different variables into new vector
#' recode_into(
#'   d$x %in% 1:3 & d$y %in% c("a", "b") ~ 1,
#'   d$x > 3 ~ 2,
#'   default = 0
#' )
#'
#' # no need to write name of data frame each time
#' recode_into(
#'   x %in% 1:3 & y %in% c("a", "b") ~ 1,
#'   x > 3 ~ 2,
#'   data = d,
#'   default = 0
#' )
#'
#' # handling of missing values
#' d <- data.frame(
#'   x = c(1, NA, 2, NA, 3, 4),
#'   y = c(1, 11, 3, NA, 5, 6)
#' )
#' # first NA in x is overwritten by valid value from y
#' # we have no known value for second NA in x and y,
#' # thus we get one NA in the result
#' recode_into(
#'   x <= 3 ~ 1,
#'   y > 5 ~ 2,
#'   data = d,
#'   default = 0,
#'   preserve_na = TRUE
#' )
#' # first NA in x is overwritten by valid value from y
#' # default value is used for second NA
#' recode_into(
#'   x <= 3 ~ 1,
#'   y > 5 ~ 2,
#'   data = d,
#'   default = 0,
#'   preserve_na = FALSE
#' )
#' @export
recode_into <- function(
  ...,
  data = NULL,
  default = NA,
  overwrite = TRUE,
  preserve_na = FALSE,
  verbose = TRUE
) {
  dots <- list(...)

  # get length of vector, so we know the length of the output vector
  len <- if (is.null(data)) {
    length(.dynEval(dots[[1]][[2]], ifnotfound = NULL))
  } else {
    length(with(data, eval(dots[[1]][[2]])))
  }

  # how many expressions (recode-formulas) do we have?
  n_params <- length(dots)

  # last expression should always be the default value
  if (is.null(default)) {
    default <- NA
    if (verbose) {
      insight::format_warning(
        "Default value can't be `NULL`, setting to `NA` now."
      )
    }
  }

  # create default output vector
  out <- rep(default, times = len)

  all_recodes <- NULL
  all_same_length <- NULL
  new_values <- NULL
  # check recode values
  for (i in seq_len(n_params)) {
    # get type of all recode values
    if (is.null(data)) {
      value_type <- .dynEval(dots[[i]][[3]], ifnotfound = NULL)
      value_length <- .dynEval(dots[[i]][[2]], ifnotfound = NULL)
    } else {
      value_type <- with(data, eval(dots[[i]][[3]]))
      value_length <- with(data, eval(dots[[i]][[2]]))
    }
    # if we have "NA", we don't want to check the type. Else, you cannot use
    # "NA" for numeric recodes, but rather need to use "NA_real_", which is not
    # user-friendly
    if (is.na(value_type)) {
      type <- NULL
    } else {
      type <- typeof(value_type)
    }
    len_matches <- length(value_length)
    # save type and length of recode values
    all_recodes <- c(all_recodes, type)
    all_same_length <- c(all_same_length, len_matches)
    new_values <- c(new_values, value_type)
  }
  # if we have mixed types, warn user
  if (!is.null(all_recodes) && !all(all_recodes == all_recodes[1])) {
    wrong_type <- which(all_recodes != all_recodes[1])
    insight::format_error(
      paste(
        "Recoding not carried out. Not all recode values are of the same type.",
        sprintf(
          "For instance, the new value of the first pattern, `%s`, is of type `%s`. The new value of the %s recode pattern, `%s`, is of type `%s`.", # nolint
          insight::color_text(new_values[1], "cyan"),
          insight::color_text(all_recodes[1], "cyan"),
          .number_to_text(wrong_type[1]),
          insight::color_text(new_values[wrong_type[1]], "cyan"),
          insight::color_text(all_recodes[wrong_type[1]], "cyan")
        )
      )
    )
  }
  # all inputs of correct length?
  if (
    !is.null(all_same_length) && !all(all_same_length == all_same_length[1])
  ) {
    wrong_length <- which(all_same_length != all_same_length[1])
    insight::format_error(
      "The matching conditions return vectors of different length.",
      paste(
        "Please check if all variables in your recode patterns are of the same length.",
        sprintf(
          "For instance, the first and the %s recode pattern return vectors of different length.",
          .number_to_text(wrong_length[1])
        )
      )
    )
  }

  # indicator to show message when replacing NA by default
  # needed to show message only once
  overwrite_NA_msg <- TRUE

  # iterate all expressions
  for (i in seq_len(n_params)) {
    # grep index of observations with replacements and replacement value
    if (is.null(data)) {
      index <- .dynEval(dots[[i]][[2]], ifnotfound = NULL)
      value <- .dynEval(dots[[i]][[3]], ifnotfound = NULL)
    } else {
      index <- with(data, eval(dots[[i]][[2]]))
      value <- with(data, eval(dots[[i]][[3]]))
    }
    # remember missing values, so we can add back later
    missing_index <- is.na(index)
    # make sure index has no missing values. when we have missing values in
    # original expression, these are considered as "no match" and set to FALSE
    # we handle NA value later and thus want to remove them from "index" now
    index[is.na(index)] <- FALSE
    # overwriting values? do more recode-patterns match the same case?
    if (is.na(default)) {
      already_exists <- !is.na(out[index])
    } else {
      already_exists <- out[index] != default
    }
    # save indices of overwritten cases
    overwritten_cases <- which(index)[already_exists]
    # tell user...
    if (any(already_exists, na.rm = TRUE) && verbose) {
      if (overwrite) {
        msg <- paste(
          "Several recode patterns apply to the same cases.",
          "Some of the already recoded cases will be overwritten with new values again",
          sprintf(
            "(e.g. pattern %i overwrites the former recode of case %i).",
            i,
            overwritten_cases[1]
          )
        )
      } else {
        msg <- paste(
          "Several recode patterns apply to the same cases.",
          "Some of the already recoded cases will not be altered by later recode patterns.",
          sprintf(
            "(e.g. pattern %i also matches the former recode of case %i).",
            i,
            overwritten_cases[1]
          )
        )
      }
      insight::format_warning(msg, "Please check if this is intentional!")
    }
    # if user doesn't want to overwrite, remove already recoded indices
    if (!overwrite) {
      index[overwritten_cases] <- FALSE
    }
    # write new values into output vector
    out[index] <- value
    # set back missing values
    if (any(missing_index) && !is.na(default)) {
      if (preserve_na) {
        # but only where we still have default values
        # we don't want to overwrite already recoded values with NA
        out[missing_index & out == default] <- NA
      } else if (overwrite_NA_msg && verbose) {
        # don't show msg again
        overwrite_NA_msg <- FALSE
        insight::format_alert(
          "Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`." # nolint
        )
      }
    }
  }

  out
}

.number_to_text <- function(x) {
  if (is.null(x) || is.na(x)) {
    return("")
  }
  if (x == 1) {
    "first"
  } else if (x == 2) {
    "second"
  } else if (x == 3) {
    "third"
  } else if (x == 4) {
    "fourth"
  } else if (x == 5) {
    "fifth"
  } else if (x == 21) {
    "twenty-first"
  } else if (x == 22) {
    "twenty-second"
  } else if (x == 23) {
    "twenty-third"
  } else {
    paste0(x, "th")
  }
}


================================================
FILE: R/recode_values.R
================================================
#' @title Recode old values of variables into new values
#' @name recode_values
#'
#' @description
#' This functions recodes old values into new values and can be used to to
#' recode numeric or character vectors, or factors.
#'
#' @param x A data frame, numeric or character vector, or factor.
#' @param recode A list of named vectors, which indicate the recode pairs.
#'   The _names_ of the list-elements (i.e. the left-hand side) represent the
#'   _new_ values, while the values of the list-elements indicate the original
#'   (old) values that should be replaced. When recoding numeric vectors,
#'   element names have to be surrounded in backticks. For example,
#'   ``recode=list(`0`=1)`` would recode all `1` into `0` in a numeric
#'   vector. See also 'Examples' and 'Details'.
#' @param default Defines the default value for all values that have no match in
#'   the recode-pairs. If `NULL`, original values will be preserved when there
#'   is no match. Note that, if `preserve_na=FALSE`, missing values (`NA`) are
#'   also captured by the `default` argument, and thus will also be recoded into
#'   the specified value. See 'Examples' and 'Details'.
#' @param preserve_na Logical, if `TRUE`, `NA` (missing values) are preserved.
#'   This overrides any other arguments, including `default`. Hence, if
#'   `preserve_na=TRUE`, `default` will no longer convert `NA` into the specified
#'   default value.
#' @param ... not used.
#' @inheritParams extract_column_names
#' @inheritParams categorize
#'
#' @return `x`, where old values are replaced by new values.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @inherit data_rename seealso
#'
#' @note You can use `options(data_recode_pattern = "old=new")` to switch the
#' behaviour of the `recode`-argument, i.e. recode-pairs are now following the
#' pattern `old values = new values`, e.g. if `getOption("data_recode_pattern")`
#' is set to `"old=new"`, then ``recode(`1`=0)`` would recode all 1 into 0.
#' The default for ``recode(`1`=0)`` is to recode all 0 into 1.
#'
#' @details
#' This section describes the pattern of the `recode` arguments, which also
#' provides some shortcuts, in particular when recoding numeric values.
#'
#' - Single values
#'
#'   Single values either need to be wrapped in backticks (in case of numeric
#'   values) or "as is" (for character or factor levels). Example:
#'   ``recode=list(`0`=1,`1`=2)`` would recode 1 into 0, and 2 into 1.
#'   For factors or character vectors, an example is:
#'   `recode=list(x="a",y="b")` (recode "a" into "x" and "b" into "y").
#'
#' - Multiple values
#'
#'   Multiple values that should be recoded into a new value can be separated
#'   with comma. Example: ``recode=list(`1`=c(1,4),`2`=c(2,3))`` would recode the
#'   values 1 and 4 into 1, and 2 and 3 into 2. It is also possible to define  the
#'   old values as a character string, like:  ``recode=list(`1`="1,4",`2`="2,3")``
#'   For factors or character vectors, an example is:
#'   ``recode=list(x=c("a","b"),y=c("c","d"))``.
#'
#' - Value range
#'
#'   Numeric value ranges can be defined using the `:`. Example:
#'   ``recode=list(`1`=1:3,`2`=4:6)`` would recode all values from 1 to 3 into
#'   1, and 4 to 6 into 2.
#'
#' - `min` and `max`
#'
#'   placeholder to use the minimum or maximum value of the
#'   (numeric) variable. Useful, e.g., when recoding ranges of values.
#'   Example: ``recode=list(`1`="min:10",`2`="11:max")``.
#'
#' - `default` values
#'
#'   The `default` argument defines the default value for all values that have
#'   no match in the recode-pairs. For example,
#'   ``recode=list(`1`=c(1,2),`2`=c(3,4)), default=9`` would
#'   recode values 1 and 2 into 1, 3 and 4 into 2, and all other values into 9.
#'   If `preserve_na` is set to `FALSE`, `NA` (missing values) will also be
#'   recoded into the specified default value.
#'
#' - Reversing and rescaling
#'
#'   See [reverse()] and [rescale()].
#'
#' @examples
#' # numeric ----------
#' set.seed(123)
#' x <- sample(c(1:4, NA), 15, TRUE)
#' table(x, useNA = "always")
#'
#' out <- recode_values(x, list(`0` = 1, `1` = 2:3, `2` = 4))
#' out
#' table(out, useNA = "always")
#'
#' # to recode NA values, set preserve_na to FALSE
#' out <- recode_values(
#'   x,
#'   list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA),
#'   preserve_na = FALSE
#' )
#' out
#' table(out, useNA = "always")
#'
#' # preserve na ----------
#' out <- recode_values(x, list(`0` = 1, `1` = 2:3), default = 77)
#' out
#' table(out, useNA = "always")
#'
#' # recode na into default ----------
#' out <- recode_values(
#'   x,
#'   list(`0` = 1, `1` = 2:3),
#'   default = 77,
#'   preserve_na = FALSE
#' )
#' out
#' table(out, useNA = "always")
#'
#'
#' # factors (character vectors are similar) ----------
#' set.seed(123)
#' x <- as.factor(sample(c("a", "b", "c"), 15, TRUE))
#' table(x)
#'
#' out <- recode_values(x, list(x = "a", y = c("b", "c")))
#' out
#' table(out)
#'
#' out <- recode_values(x, list(x = "a", y = "b", z = "c"))
#' out
#' table(out)
#'
#' out <- recode_values(x, list(y = "b,c"), default = 77)
#' # same as
#' # recode_values(x, list(y = c("b", "c")), default = 77)
#' out
#' table(out)
#'
#'
#' # data frames ----------
#' set.seed(123)
#' d <- data.frame(
#'   x = sample(c(1:4, NA), 12, TRUE),
#'   y = as.factor(sample(c("a", "b", "c"), 12, TRUE)),
#'   stringsAsFactors = FALSE
#' )
#'
#' recode_values(
#'   d,
#'   recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c")),
#'   append = TRUE
#' )
#'
#'
#' # switch recode pattern to "old=new" ----------
#' options(data_recode_pattern = "old=new")
#'
#' # numeric
#' set.seed(123)
#' x <- sample(c(1:4, NA), 15, TRUE)
#' table(x, useNA = "always")
#'
#' out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2))
#' table(out, useNA = "always")
#'
#' # factors (character vectors are similar)
#' set.seed(123)
#' x <- as.factor(sample(c("a", "b", "c"), 15, TRUE))
#' table(x)
#'
#' out <- recode_values(x, list(a = "x", `b, c` = "y"))
#' table(out)
#'
#' # reset options
#' options(data_recode_pattern = NULL)
#' @export
recode_values <- function(x, ...) {
  UseMethod("recode_values")
}


#' @export
recode_values.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      paste0(
        "Variables of class `",
        class(x)[1],
        "` can't be recoded and remain unchanged."
      )
    )
  }
  x
}


#' @rdname recode_values
#' @export
recode_values.numeric <- function(
  x,
  recode = NULL,
  default = NULL,
  preserve_na = TRUE,
  verbose = TRUE,
  ...
) {
  # save
  original_x <- x

  # check arguments
  if (!.recode_args_ok(x, recode, verbose)) {
    return(x)
  }

  # recode-pattern option
  pattern <- getOption("data_recode_pattern")

  # make sure NAs are preserved after recoding
  missing_values <- NULL
  if (preserve_na) {
    missing_values <- is.na(x)
  }

  # check for "default" token
  if (!is.null(default)) {
    # set the default value for all values that have no match
    # (i.e. that should not be recoded)
    x <- rep(as.numeric(default), length = length(x))
  }

  for (i in names(recode)) {
    # based on option-settings, the recode-argument can either follow the
    # pattern "new=old", or "old=new"

    if (identical(pattern, "old=new")) {
      # pattern: old = new, name of list element is old value
      old_values <- i
      new_values <- recode[[i]]
    } else {
      # pattern: new = old, name of list element is new value
      old_values <- recode[[i]]
      new_values <- i
    }

    if (is.character(old_values)) {
      # replace placeholder
      old_values <- gsub("min", min(x, na.rm = TRUE), old_values, fixed = TRUE)
      old_values <- gsub("max", max(x, na.rm = TRUE), old_values, fixed = TRUE)

      # mimic vector
      if (length(old_values) == 1 && !grepl("c(", old_values, fixed = TRUE)) {
        old_values <- paste0("c(", old_values, ")")
      }

      # parse old values, which are strings (names of element), but which should
      # contain values, like "1:10" or "1, 2, 3, 4". These should now be in the
      # format "c(1, 2, 3, 4)" or "c(1:10)", and it should be possible to parse
      # and evaluate these strings into a numeric vector
      old_values <- tryCatch(
        eval(parse(text = old_values)),
        error = function(e) NULL
      )
    }

    if (!is.null(old_values) && (is.numeric(old_values) || is.na(old_values))) {
      x[which(original_x %in% old_values)] <- as.numeric(new_values)
    }
  }

  # set back variable labels, remove value labels
  # (these are most likely not matching anymore)
  attr(x, "label") <- attr(original_x, "label", exact = TRUE)
  attr(x, "labels") <- NULL

  # set back missing values
  if (!is.null(missing_values)) {
    x[missing_values] <- NA
  }

  x
}


#' @export
recode_values.factor <- function(
  x,
  recode = NULL,
  default = NULL,
  preserve_na = TRUE,
  verbose = TRUE,
  ...
) {
  # save
  original_x <- x

  # check arguments
  if (!.recode_args_ok(x, recode, verbose)) {
    return(x)
  }

  # recode-pattern option
  pattern <- getOption("data_recode_pattern")

  # make sure NAs are preserved after recoding
  missing_values <- NULL
  if (preserve_na) {
    missing_values <- is.na(x)
  }

  # as character, so recoding works
  x <- as.character(x)

  # check for "default" token
  if (!is.null(default)) {
    # set the default value for all values that have no match
    # (i.e. that should not be recoded)
    x <- rep(as.character(default), length = length(x))
  }

  for (i in names(recode)) {
    # based on option-settings, the recode-argument can either follow the
    # pattern "new=old", or "old=new"

    if (identical(pattern, "old=new")) {
      # pattern: old = new
      # name of list element is old value

      old_values <- paste(
        deparse(insight::trim_ws(unlist(
          strsplit(i, ",", fixed = TRUE),
          use.names = FALSE
        ))),
        collapse = ","
      )

      # parse old values, which are strings (names of element), but which should
      # contain values, like "a" or "a, b, c". These should now be in the
      # format "c("a", "b", "c")" and it should be possible to parse
      # and evaluate these strings into a numeric vector
      old_values <- tryCatch(
        eval(parse(text = old_values)),
        error = function(e) NULL
      )

      # recode
      x[which(original_x %in% old_values)] <- recode[[i]]
    } else {
      # pattern: new = old
      # name of list element is new value

      old_values <- as.character(recode[[i]])
      # check input style: "a, b, c"
      if (length(old_values) == 1 && grepl(",", old_values, fixed = TRUE)) {
        # split and make character vector
        old_values <- insight::trim_ws(unlist(
          strsplit(old_values, ",", fixed = TRUE),
          use.names = FALSE
        ))
      }
      # recode
      if (identical(i, "NA")) {
        x[which(original_x %in% old_values)] <- NA_character_
      } else {
        x[which(original_x %in% old_values)] <- as.character(i)
      }
    }
  }

  # set back missing values
  if (!is.null(missing_values)) {
    x[missing_values] <- NA_character_
  }

  # make sure we have correct new levels
  x <- droplevels(as.factor(x))

  # set back variable labels, remove value labels
  # (these are most likely not matching anymore)
  attr(x, "label") <- attr(original_x, "label", exact = TRUE)
  attr(x, "labels") <- NULL

  x
}


#' @export
recode_values.character <- function(
  x,
  recode = NULL,
  default = NULL,
  preserve_na = TRUE,
  verbose = TRUE,
  ...
) {
  # save
  original_x <- x

  # check arguments
  if (!.recode_args_ok(x, recode, verbose)) {
    return(x)
  }

  # recode-pattern option
  pattern <- getOption("data_recode_pattern")

  # make sure NAs are preserved after recoding
  missing_values <- NULL
  if (preserve_na) {
    missing_values <- is.na(x)
  }

  # check for "default" token
  if (!is.null(default)) {
    # set the default value for all values that have no match
    # (i.e. that should not be recoded)
    x <- rep(as.character(default), length = length(x))
  }

  for (i in names(recode)) {
    # based on option-settings, the recode-argument can either follow the
    # pattern "new=old", or "old=new"

    if (identical(pattern, "old=new")) {
      # pattern: old = new
      # name of list element is old value

      # name of list element is old value
      value_string <- paste(
        deparse(insight::trim_ws(unlist(
          strsplit(i, ",", fixed = TRUE),
          use.names = FALSE
        ))),
        collapse = ","
      )

      # parse old values, which are strings (names of element), but which should
      # contain values, like "a" or "a, b, c". These should now be in the
      # format "c("a", "b", "c")" and it should be possible to parse
      # and evaluate these strings into a numeric vector
      old_values <- tryCatch(
        eval(parse(text = value_string)),
        error = function(e) NULL
      )

      # recode
      x[which(original_x %in% old_values)] <- recode[[i]]
    } else {
      # pattern: new = old
      # name of list element is new value

      old_values <- as.character(recode[[i]])
      # check input style: "a, b, c"
      if (length(old_values) == 1 && grepl(",", old_values, fixed = TRUE)) {
        # split and make character vector
        old_values <- insight::trim_ws(unlist(
          strsplit(old_values, ",", fixed = TRUE),
          use.names = FALSE
        ))
      }
      # recode
      if (identical(i, "NA")) {
        x[which(original_x %in% old_values)] <- NA_character_
      } else {
        x[which(original_x %in% old_values)] <- as.character(i)
      }
    }
  }

  # set back variable labels, remove value labels
  # (these are most likely not matching anymore)
  attr(x, "label") <- attr(original_x, "label", exact = TRUE)
  attr(x, "labels") <- NULL

  # set back missing values
  if (!is.null(missing_values)) {
    x[missing_values] <- NA_character_
  }

  x
}


#' @rdname recode_values
#' @export
recode_values.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  recode = NULL,
  default = NULL,
  preserve_na = TRUE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r",
      preserve_value_labels = TRUE
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x[select] <- lapply(
    x[select],
    recode_values,
    recode = recode,
    default = default,
    preserve_na = preserve_na,
    verbose = verbose,
    ...
  )

  x
}


# utils --------------------------

.recode_args_ok <- function(x, recode, verbose) {
  ok <- TRUE
  # no missings
  valid <- stats::na.omit(x)

  # skip if all NA
  if (!length(valid)) {
    if (isTRUE(verbose)) {
      insight::format_warning(
        "Variable contains only missing values. No recoding carried out."
      )
    }
    ok <- FALSE
  }

  # warn if not a list
  if (!is.list(recode) || is.null(names(recode))) {
    if (isTRUE(verbose)) {
      insight::format_warning(
        "`recode` needs to be a (named) list. No recoding carried out."
      )
    }
    ok <- FALSE
  }

  ok
}


================================================
FILE: R/remove_empty.R
================================================
#' @title Return or remove variables or observations that are completely missing
#' @name remove_empty
#' @rdname remove_empty
#'
#' @description
#'
#' These functions check which rows or columns of a data frame completely
#' contain missing values, i.e. which observations or variables completely have
#' missing values, and either (1) returns their indices; or (2) removes them
#' from the data frame.
#'
#' @param x A data frame.
#'
#' @return
#'
#' - For `empty_columns()` and `empty_rows()`, a numeric (named) vector with row
#' or column indices of those variables that completely have missing values.
#'
#' - For `remove_empty_columns()` and `remove_empty_rows()`, a data frame with
#' "empty" columns or rows removed, respectively.
#'
#' - For `remove_empty()`, **both** empty rows and columns will be removed.
#'
#' @details For character vectors, empty string values (i.e. `""`) are also
#' considered as missing value. Thus, if a character vector only contains `NA`
#' and `""`, it is considered as empty variable and will be removed. Same
#' applies to observations (rows) that only contain `NA` or `""`.
#'
#' @examples
#' tmp <- data.frame(
#'   a = c(1, 2, 3, NA, 5),
#'   b = c(1, NA, 3, NA, 5),
#'   c = c(NA, NA, NA, NA, NA),
#'   d = c(1, NA, 3, NA, 5)
#' )
#'
#' tmp
#'
#' # indices of empty columns or rows
#' empty_columns(tmp)
#' empty_rows(tmp)
#'
#' # remove empty columns or rows
#' remove_empty_columns(tmp)
#' remove_empty_rows(tmp)
#'
#' # remove empty columns and rows
#' remove_empty(tmp)
#'
#' # also remove "empty" character vectors
#' tmp <- data.frame(
#'   a = c(1, 2, 3, NA, 5),
#'   b = c(1, NA, 3, NA, 5),
#'   c = c("", "", "", "", ""),
#'   stringsAsFactors = FALSE
#' )
#' empty_columns(tmp)
#'
#' @export
empty_columns <- function(x) {
  if ((!is.matrix(x) && !is.data.frame(x)) || ncol(x) < 2) {
    vector("numeric")
  } else {
    all_na <- colSums(is.na(x)) == nrow(x)
    all_empty <- vapply(
      x,
      function(i) {
        (is.character(i) || is.factor(i)) &&
          !any(nzchar(as.character(i[!is.na(i)])))
      },
      FUN.VALUE = logical(1L)
    )

    which(all_na | all_empty)
  }
}


#' @rdname remove_empty
#' @export
empty_rows <- function(x) {
  if ((!is.matrix(x) && !is.data.frame(x)) || nrow(x) < 2) {
    vector("numeric")
  } else {
    which(rowSums((is.na(x) | x == "")) == ncol(x)) # nolint
  }
}


#' @rdname remove_empty
#' @export
remove_empty_columns <- function(x) {
  # check if we have any empty columns at all
  ec <- empty_columns(x)

  # if yes, removing works, else an empty df would be returned
  if (length(ec)) {
    x <- x[-ec]
  }

  x
}


#' @rdname remove_empty
#' @export
remove_empty_rows <- function(x) {
  # check if we have any empty rows at all
  er <- empty_rows(x)

  # if yes, removing works, else an empty df would be returned
  if (length(er)) {
    attr_data <- attributes(x)
    x <- x[-er, ]
    x <- .replace_attrs(x, attr_data)
  }

  x
}

#' @rdname remove_empty
#' @export
remove_empty <- function(x) {
  x <- remove_empty_rows(x)
  x <- remove_empty_columns(x)
  x
}


================================================
FILE: R/replace_nan_inf.R
================================================
#' @title Convert infinite or `NaN` values into `NA`
#' @name replace_nan_inf
#'
#' @description
#' Replaces all infinite (`Inf` and `-Inf`) or `NaN` values with `NA`.
#'
#' @param x A vector or a dataframe
#' @param ... Currently not used.
#'
#' @return
#' Data with `Inf`, `-Inf`, and `NaN` converted to `NA`.
#'
#' @examples
#' # a vector
#' x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7)
#' replace_nan_inf(x)
#'
#' # a data frame
#' df <- data.frame(
#'   x = c(1, NA, 5, Inf, 2, NA),
#'   y = c(3, NaN, 4, -Inf, 6, 7),
#'   stringsAsFactors = FALSE
#' )
#' replace_nan_inf(df)
#' @export

replace_nan_inf <- function(x, ...) {
  UseMethod("replace_nan_inf")
}

#' @export
replace_nan_inf.default <- function(x, ...) {
  x[is.nan(x) | is.infinite(x)] <- NA
  x
}

#' @inheritParams extract_column_names
#' @export
replace_nan_inf.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # Select and deselect
  cols <- .select_nse(
    select,
    x,
    exclude = exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  for (i in cols) {
    x[[i]] <- replace_nan_inf(x[[i]])
  }

  x
}


================================================
FILE: R/rescale_weights.R
================================================
#' @title Rescale design weights for multilevel analysis
#' @name rescale_weights
#'
#' @description Most functions to fit multilevel and mixed effects models only
#' allow the user to specify frequency weights, but not design (i.e., sampling
#' or probability) weights, which should be used when analyzing complex samples
#' (e.g., probability samples). `rescale_weights()` implements two algorithms,
#' one proposed by \cite{Asparouhov (2006)} and \cite{Carle (2009)}, to rescale
#' design weights in survey data to account for the grouping structure of
#' multilevel models, and one based on the design effect proposed by
#' \cite{Kish (1965)}, to rescale weights by the design effect to account for
#' additional sampling error introduced by weighting.
#' @param data A data frame.
#' @param by Variable names (as character vector, or as formula), indicating
#' the grouping structure (strata) of the survey data (level-2-cluster
#' variable). It is also possible to create weights for multiple group
#' variables; in such cases, each created weighting variable will be suffixed
#' by the name of the group variable. This argument is required for
#' `method = "carle"`, but optional for `method = "kish"`.
#' @param probability_weights Variable indicating the probability (design or
#' sampling) weights of the survey data (level-1-weight), provided as character
#' string or formula.
#' @param nest Logical, if `TRUE` and `by` indicates at least two group
#' variables, then groups are "nested", i.e. groups are now a combination from
#' each group level of the variables in `by`. This argument is not used when
#' `method = "kish"`.
#' @param method String, indicating which rescale-method is used for rescaling
#' weights. Can be either `"carle"` (default) or `"kish"`. See 'Details'. If
#' `method = "carle"`, the `by` argument is required.
#'
#' @return
#' `data`, including the new weighting variable(s). For `method = "carle"`, new
#' columns `rescaled_weights_a` and `rescaled_weights_b` are returned, and for
#' `method = "kish"`, the returned data contains a column `rescaled_weights`.
#' These represent the rescaled design weights to use in multilevel models (use
#' these variables for the `weights` argument).
#'
#' @details
#' - `method = "carle"`
#'
#'   Rescaling is based on two methods: For `rescaled_weights_a`, the sample
#'   weights `probability_weights` are adjusted by a factor that represents the
#'   proportion of group size divided by the sum of sampling weights within each
#'   group. The adjustment factor for `rescaled_weights_b` is the sum of sample
#'   weights within each group divided by the sum of squared sample weights
#'   within each group (see Carle (2009), Appendix B). In other words,
#'   `rescaled_weights_a` "scales the weights so that the new weights sum to the
#'   cluster sample size" while `rescaled_weights_b` "scales the weights so that
#'   the new weights sum to the effective cluster size".
#'
#'   Regarding the choice between scaling methods A and B, Carle suggests that
#'   "analysts who wish to discuss point estimates should report results based
#'   on weighting method A. For analysts more interested in residual
#'   between-group variance, method B may generally provide the least biased
#'   estimates". In general, it is recommended to fit a non-weighted model and
#'   weighted models with both scaling methods and when comparing the models,
#'   see whether the "inferential decisions converge", to gain confidence in the
#'   results.
#'
#'   Though the bias of scaled weights decreases with increasing group size,
#'   method A is preferred when insufficient or low group size is a concern.
#'
#'   The group ID and probably PSU may be used as random effects (e.g. nested
#'   design, or group and PSU as varying intercepts), depending on the survey
#'   design that should be mimicked.
#'
#' - `method = "kish"`
#'
#'   Rescaling is based on scaling the sample weights so the mean value is 1,
#'   which means the sum of all weights equals the sample size. Next, the design
#'   effect (_Kish 1965_) is calculated, which is the mean of the squared
#'   weights divided by the squared mean of the weights. The scaled sample
#'   weights are then divided by the design effect. This method is most
#'   appropriate when weights are based on additional variables beyond the
#'   grouping variables in the model (e.g., other demographic characteristics),
#'   but may also be useful in other contexts.
#'
#'   Some tests on real-world survey-data suggest that, in comparison to the
#'   Carle-method, the Kish-method comes closer to estimates from a regular
#'   survey-design using the **survey** package. Note that these tests are not
#'   representative and it is recommended to check your results against a
#'   standard survey-design.
#'
#' @references
#'   - Asparouhov T. (2006). General Multi-Level Modeling with Sampling
#'   Weights. Communications in Statistics - Theory and Methods 35: 439-460
#'
#'   - Carle A.C. (2009). Fitting multilevel models in complex survey data
#'   with design weights: Recommendations. BMC Medical Research Methodology
#'   9(49): 1-13
#'
#'   - Kish, L. (1965) Survey Sampling. London: Wiley.
#'
#' @examplesIf all(insight::check_if_installed(c("lme4", "parameters"), quietly = TRUE))
#' data(nhanes_sample)
#' head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA"))
#'
#' # also works with multiple group-variables
#' head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU")))
#'
#' # or nested structures.
#' x <- rescale_weights(
#'   data = nhanes_sample,
#'   probability_weights = "WTINT2YR",
#'   by = c("SDMVSTRA", "SDMVPSU"),
#'   nest = TRUE
#' )
#' head(x)
#'
#' \donttest{
#' # compare different methods, using multilevel-Poisson regression
#'
#' d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")
#' result1 <- lme4::glmer(
#'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
#'   family = poisson(),
#'   data = d,
#'   weights = rescaled_weights_a
#' )
#' result2 <- lme4::glmer(
#'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
#'   family = poisson(),
#'   data = d,
#'   weights = rescaled_weights_b
#' )
#'
#' d <- rescale_weights(
#'   nhanes_sample,
#'   "WTINT2YR",
#'   method = "kish"
#' )
#' result3 <- lme4::glmer(
#'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
#'   family = poisson(),
#'   data = d,
#'   weights = rescaled_weights
#' )
#' d <- rescale_weights(
#'   nhanes_sample,
#'   "WTINT2YR",
#'   "SDMVSTRA",
#'   method = "kish"
#' )
#' result4 <- lme4::glmer(
#'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
#'   family = poisson(),
#'   data = d,
#'   weights = rescaled_weights
#' )
#' parameters::compare_parameters(
#'   list(result1, result2, result3, result4),
#'   exponentiate = TRUE,
#'   column_names = c("Carle (A)", "Carle (B)", "Kish", "Kish (grouped)")
#' )
#' }
#' @export
rescale_weights <- function(
  data,
  probability_weights = NULL,
  by = NULL,
  nest = FALSE,
  method = "carle"
) {
  method <- insight::validate_argument(method, c("carle", "kish"))

  # convert formulas to strings
  if (inherits(by, "formula")) {
    by <- all.vars(by)
  }

  if (inherits(probability_weights, "formula")) {
    probability_weights <- all.vars(probability_weights)
  }

  # check for existing variable names
  if (
    (method == "carle" &&
      any(c("rescaled_weights_a", "rescaled_weights_b") %in% colnames(data))) ||
      (method == "kish" && "rescaled_weights" %in% colnames(data))
  ) {
    insight::format_warning(
      "The variable name for the rescaled weights already exists in the data. Returned columns will be renamed into unique names."
    ) # nolint
  }

  # need probability_weights
  if (is.null(probability_weights)) {
    insight::format_error(
      "The argument `probability_weights` is missing, but required to rescale weights."
    )
  }

  # check if weight has missings. we need to remove them first,
  # and add back weights to correct cases later

  weight_missings <- which(is.na(data[[probability_weights]]))
  weight_non_na <- which(!is.na(data[[probability_weights]]))

  if (length(weight_missings) > 0) {
    data_tmp <- data[weight_non_na, ]
  } else {
    data_tmp <- data
  }

  fun_args <- list(
    nest = nest,
    probability_weights = probability_weights,
    data_tmp = data_tmp,
    data = data,
    by = by,
    weight_non_na = weight_non_na
  )

  switch(
    method,
    carle = do.call(.rescale_weights_carle, fun_args),
    do.call(.rescale_weights_kish, fun_args)
  )
}


# rescale weights, method Kish ----------------------------

.rescale_weights_kish <- function(
  nest,
  probability_weights,
  data_tmp,
  data,
  by,
  weight_non_na
) {
  # sort id
  data_tmp$.bamboozled <- seq_len(nrow(data_tmp))

  # `nest` is currently ignored
  if (isTRUE(nest)) {
    insight::format_warning(
      "Argument `nest` is ignored for `method = \"kish\"`."
    )
  }

  # check by argument
  if (!is.null(by) && !all(by %in% colnames(data_tmp))) {
    dont_exist <- setdiff(by, colnames(data_tmp))
    insight::format_error(
      paste0(
        "The following variable(s) specified in `by` don't exist in the dataset: ",
        text_concatenate(dont_exist),
        "."
      ),
      .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?")
    )
  } else if (is.null(by)) {
    # if `by` = NULL, we create a dummy group
    by <- "tmp_kish_by"
    data_tmp[[by]] <- 1
  }

  # split into groups, and calculate weights
  out <- lapply(split(data_tmp, data_tmp[by]), function(group_data) {
    p_weights <- group_data[[probability_weights]]
    # design effect according to Kish
    deff <- mean(p_weights^2) / (mean(p_weights)^2)
    # rescale weights, so their mean is 1
    z_weights <- p_weights * (1 / mean(p_weights))
    # divide weights by design effect
    group_data$rescaled_weights <- z_weights / deff
    group_data
  })

  # bind data
  result <- do.call(rbind, out)

  # restore original order
  result <- result[order(result$.bamboozled), ]

  # add back rescaled weights to original data, but account for missing observations
  data$rescaled_weights <- NA_real_
  data$rescaled_weights[weight_non_na] <- result$rescaled_weights
  # return result
  data
}


# rescale weights, method Carle ----------------------------

.rescale_weights_carle <- function(
  nest,
  probability_weights,
  data_tmp,
  data,
  by,
  weight_non_na
) {
  # sort id
  data_tmp$.bamboozled <- seq_len(nrow(data_tmp))

  if (is.null(by)) {
    insight::format_error(
      "Argument `by` must be specified. Please provide one or more variable names in `by` that indicate the grouping structure (strata) of the survey data (level-2-cluster variable)."
    ) # nolint
  }

  if (!all(by %in% colnames(data_tmp))) {
    dont_exist <- setdiff(by, colnames(data_tmp))
    insight::format_error(
      paste0(
        "The following variable(s) specified in `by` don't exist in the dataset: ",
        text_concatenate(dont_exist),
        "."
      ),
      .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?")
    )
  }

  if (nest && length(by) < 2) {
    insight::format_warning(
      sprintf(
        "Only one group variable selected in `by`, no nested structure possible. Rescaling weights for grout '%s' now.",
        by
      )
    )
    nest <- FALSE
  }

  if (nest) {
    out <- .rescale_weights_nested(
      data_tmp,
      group = by,
      probability_weights,
      nrow(data),
      weight_non_na
    )
  } else {
    out <- lapply(by, function(i) {
      x <- .rescale_weights(
        data_tmp,
        i,
        probability_weights,
        nrow(data),
        weight_non_na
      )
      if (length(by) > 1) {
        colnames(x) <- sprintf(c("pweight_a_%s", "pweight_b_%s"), i)
      }
      x
    })
  }

  make_unique_names <- any(vapply(
    out,
    function(i) any(colnames(i) %in% colnames(data)),
    logical(1)
  ))
  # add weights to data frame
  out <- do.call(cbind, list(data, out))
  # check if we have to rename columns
  if (make_unique_names) {
    colnames(out) <- make.unique(colnames(out), sep = "_")
  }

  out
}


# rescale weights, for one or more group variables ----------------------------

.rescale_weights <- function(x, group, probability_weights, n, weight_non_na) {
  # compute sum of weights per group
  design_weights <- .data_frame(
    group = sort(unique(x[[group]])),
    sum_weights_by_group = tapply(
      x[[probability_weights]],
      as.factor(x[[group]]),
      sum
    ),
    sum_squared_weights_by_group = tapply(
      x[[probability_weights]]^2,
      as.factor(x[[group]]),
      sum
    ),
    n_per_group = as.vector(table(x[[group]]))
  )

  colnames(design_weights)[1] <- group
  x <- merge(x, design_weights, by = group, sort = FALSE)

  # restore original order
  x <- x[order(x$.bamboozled), ]
  x$.bamboozled <- NULL

  # multiply the original weight by the fraction of the
  # sampling unit total population based on Carle 2009

  w_a <- x[[probability_weights]] * x$n_per_group / x$sum_weights_by_group
  w_b <- x[[probability_weights]] *
    x$sum_weights_by_group /
    x$sum_squared_weights_by_group

  out <- data.frame(
    rescaled_weights_a = rep(NA_real_, times = n),
    rescaled_weights_b = rep(NA_real_, times = n)
  )

  out$rescaled_weights_a[weight_non_na] <- w_a
  out$rescaled_weights_b[weight_non_na] <- w_b

  out
}


# rescale weights, for nested groups ----------------------------

.rescale_weights_nested <- function(
  x,
  group,
  probability_weights,
  n,
  weight_non_na
) {
  groups <- expand.grid(lapply(group, function(i) sort(unique(x[[i]]))))
  colnames(groups) <- group

  # compute sum of weights per group
  design_weights <- cbind(
    groups,
    .data_frame(
      sum_weights_by_group = unlist(
        as.list(tapply(
          x[[probability_weights]],
          lapply(group, function(i) {
            as.factor(x[[i]])
          }),
          sum
        )),
        use.names = FALSE
      ),
      sum_squared_weights_by_group = unlist(
        as.list(tapply(
          x[[probability_weights]]^2,
          lapply(group, function(i) {
            as.factor(x[[i]])
          }),
          sum
        )),
        use.names = FALSE
      ),
      n_per_group = unlist(as.list(table(x[, group])), use.names = FALSE)
    )
  )

  x <- merge(x, design_weights, by = group, sort = FALSE)

  # restore original order
  x <- x[order(x$.bamboozled), ]
  x$.bamboozled <- NULL

  # multiply the original weight by the fraction of the
  # sampling unit total population based on Carle 2009

  w_a <- x[[probability_weights]] * x$n_per_group / x$sum_weights_by_group
  w_b <- x[[probability_weights]] *
    x$sum_weights_by_group /
    x$sum_squared_weights_by_group

  out <- data.frame(
    rescaled_weights_a = rep(NA_real_, times = n),
    rescaled_weights_b = rep(NA_real_, times = n)
  )

  out$rescaled_weights_a[weight_non_na] <- w_a
  out$rescaled_weights_b[weight_non_na] <- w_b

  out
}


================================================
FILE: R/reshape_ci.R
================================================
#' Reshape CI between wide/long formats
#'
#' Reshape CI between wide/long formats.
#'
#' @param x A data frame containing columns named `CI_low` and `CI_high` (or
#'   similar, see `ci_type`).
#' @param ci_type String indicating the "type" (i.e. prefix) of the interval
#'   columns. Per *easystats* convention, confidence or credible intervals are
#'   named `CI_low` and `CI_high`, and the related `ci_type` would be `"CI"`.
#'   If column names for other intervals differ, `ci_type` can be used to
#'   indicate the name, e.g. `ci_type = "SI"` can be used for support intervals,
#'   where the column names in the data frame would be `SI_low` and `SI_high`.
#'
#' @return
#'
#' A data frame with columns corresponding to confidence intervals reshaped
#' either to wide or long format.
#'
#' @examples
#' x <- data.frame(
#'   Parameter = c("Term 1", "Term 2", "Term 1", "Term 2"),
#'   CI = c(0.8, 0.8, 0.9, 0.9),
#'   CI_low = c(0.2, 0.3, 0.1, 0.15),
#'   CI_high = c(0.5, 0.6, 0.8, 0.85),
#'   stringsAsFactors = FALSE
#' )
#'
#' reshape_ci(x)
#' reshape_ci(reshape_ci(x))
#' @export

reshape_ci <- function(x, ci_type = "CI") {
  # define interval type
  ci_type <- match.arg(ci_type, choices = c("CI", "SI", "HDI", "ETI"))

  ci_low <- paste0(ci_type, "_low")
  ci_high <- paste0(ci_type, "_high")

  # Long to wide ----------------
  if (ci_low %in% names(x) && ci_high %in% names(x) && "CI" %in% names(x)) {
    ci_position <- which(names(x) == "CI")

    # Reshape
    if (length(unique(x$CI)) > 1) {
      if ("Parameter" %in% names(x)) {
        idvar <- "Parameter"
        remove_parameter <- FALSE
      } else if (is.null(attr(x, "idvars"))) {
        idvar <- "Parameter"
        x$Parameter <- NA
        remove_parameter <- TRUE
      } else {
        idvar <- attr(x, "idvars")
        remove_parameter <- FALSE
      }

      x <- stats::reshape(
        x,
        idvar = idvar,
        timevar = "CI",
        direction = "wide",
        v.names = c(ci_low, ci_high),
        sep = "_"
      )
      row.names(x) <- NULL
      if (remove_parameter) x$Parameter <- NULL
    }

    # Replace at the right place
    ci_colname <- names(x)[
      grepl(paste0(ci_low, "_*"), names(x)) |
        grepl(paste0(ci_high, "_*"), names(x))
    ]
    colnames_1 <- names(x)[0:(ci_position - 1)][
      !names(x)[0:(ci_position - 1)] %in% ci_colname
    ]
    colnames_2 <- names(x)[!names(x) %in% c(ci_colname, colnames_1)]
    x <- x[c(colnames_1, ci_colname, colnames_2)]

    # Wide to long --------------
  } else {
    if ("Parameter" %in% names(x)) {
      remove_parameter <- FALSE
    } else {
      x$Parameter <- seq_len(nrow(x))
      remove_parameter <- TRUE
    }

    lows <- grepl(paste0(ci_low, "_*"), names(x))
    highs <- grepl(paste0(ci_high, "_*"), names(x))
    ci <- as.numeric(gsub(paste0(ci_low, "_"), "", names(x)[lows]))
    if (
      paste(ci, collapse = "-") !=
        paste(gsub(paste0(ci_high, "_"), "", names(x)[highs]), collapse = "-")
    ) {
      insight::format_error("Something went wrong in the CIs reshaping.")
      return(x)
    }
    if (sum(lows) > 1 && sum(highs) > 1) {
      low <- stats::reshape(
        x[!highs],
        direction = "long",
        varying = list(names(x)[lows]),
        sep = "_",
        timevar = "CI",
        v.names = ci_low,
        times = ci
      )
      high <- stats::reshape(
        x[!lows],
        direction = "long",
        varying = list(names(x)[highs]),
        sep = "_",
        timevar = "CI",
        v.names = ci_high,
        times = ci
      )
      x <- merge(low, high)
      x$id <- NULL
      x <- x[order(x$Parameter), ]
      row.names(x) <- NULL
      if (remove_parameter) x$Parameter <- NULL
    }

    # Replace at the right place
    ci_position <- which(lows)[1]
    ci_colname <- c("CI", ci_low, ci_high)
    colnames_1 <- names(x)[0:(ci_position - 1)][
      !names(x)[0:(ci_position - 1)] %in% ci_colname
    ]
    colnames_2 <- names(x)[!names(x) %in% c(ci_colname, colnames_1)]
    x <- x[c(colnames_1, ci_colname, colnames_2)]
  }

  class(x) <- intersect(c("data.frame", "numeric"), class(x))
  x
}


================================================
FILE: R/row_count.R
================================================
#' @title Count specific values row-wise
#' @name row_count
#' @description `row_count()` mimics base R's `rowSums()`, with sums for a
#' specific value indicated by `count`. Hence, it is similar to
#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including
#' strict comparisons. Comparisons using `==` coerce values to atomic vectors,
#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also
#' possible to make "type safe" comparisons using the `allow_coercion` argument,
#' where `"2" == 2` is not true.
#'
#' @param data A data frame with at least two columns, where number of specific
#' values are counted row-wise.
#' @param count The value for which the row sum should be computed. May be a
#' numeric value, a character string (for factors or character vectors), `NA` or
#' `Inf`.
#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same
#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa).
#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In
#' order to count factor levels in the data, use `count = factor("level")`. See
#' 'Examples'.
#'
#' @inheritParams extract_column_names
#' @inheritParams row_means
#'
#' @return A vector with row-wise counts of values specified in `count`.
#'
#' @examples
#' dat <- data.frame(
#'   c1 = c(1, 2, NA, 4),
#'   c2 = c(NA, 2, NA, 5),
#'   c3 = c(NA, 4, NA, NA),
#'   c4 = c(2, 3, 7, 8)
#' )
#'
#' # count all 4s per row
#' row_count(dat, count = 4)
#' # count all missing values per row
#' row_count(dat, count = NA)
#'
#' dat <- data.frame(
#'   c1 = c("1", "2", NA, "3"),
#'   c2 = c(NA, "2", NA, "3"),
#'   c3 = c(NA, 4, NA, NA),
#'   c4 = c(2, 3, 7, Inf)
#' )
#' # count all 2s and "2"s per row
#' row_count(dat, count = 2)
#' # only count 2s, but not "2"s
#' row_count(dat, count = 2, allow_coercion = FALSE)
#'
#' dat <- data.frame(
#'   c1 = factor(c("1", "2", NA, "3")),
#'   c2 = c("2", "1", NA, "3"),
#'   c3 = c(NA, 4, NA, NA),
#'   c4 = c(2, 3, 7, Inf)
#' )
#' # find only character "2"s
#' row_count(dat, count = "2", allow_coercion = FALSE)
#' # find only factor level "2"s
#' row_count(dat, count = factor("2"), allow_coercion = FALSE)
#'
#' @export
row_count <- function(
  data,
  select = NULL,
  exclude = NULL,
  count = NULL,
  allow_coercion = TRUE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  if (is.null(count)) {
    insight::format_error(
      "`count` must be a valid value (including `NA` or `Inf`), but not `NULL`."
    )
  }

  if (is.null(select) || length(select) == 0) {
    insight::format_error("No columns selected.")
  }

  data <- .coerce_to_dataframe(data[select])

  # check if we have a data framme with at least two columns
  if (nrow(data) < 1) {
    insight::format_error("`data` must be a data frame with at least one row.")
  }

  # check if we have a data framme with at least two columns
  if (ncol(data) < 2) {
    insight::format_error(
      "`data` must be a data frame with at least two numeric columns."
    )
  }
  # special case: count missing
  if (is.na(count)) {
    rowSums(is.na(data))
  } else {
    # comparisons in R using == coerce values into a atomic vector, i.e.
    # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or
    # "2" == "2" (i.e. we want exact types to be compared only)
    if (isFALSE(allow_coercion)) {
      # we need the "type" of the count-value - we use class() instead of typeof(),
      # because the latter sometimes returns unsuitable classes/types. compare
      # typeof(as.Date("2020-01-01")), which returns "double".
      count_type <- class(count)[1]
      valid_columns <- vapply(data, inherits, TRUE, what = count_type)
      # check if any columns left?
      if (!any(valid_columns)) {
        insight::format_error(
          "No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`."
        ) # nolint
      }
      data <- data[valid_columns]
    }
    # coerce - we have only valid columns anyway, and we need to coerce factors
    # to vectors, else comparison with `==` errors.
    count <- as.vector(count)
    # finally, count
    rowSums(data == count, na.rm = TRUE)
  }
}


================================================
FILE: R/row_means.R
================================================
#' @title Row means or sums (optionally with minimum amount of valid values)
#' @name row_means
#' @description This function is similar to the SPSS `MEAN.n` or `SUM.n`
#' function and computes row means or row sums from a data frame or matrix if at
#' least `min_valid` values of a row are valid (and not `NA`).
#'
#' @param data A data frame with at least two columns, where row means or row
#' sums are applied.
#' @param min_valid Optional, a numeric value of length 1. May either be
#' - a numeric value that indicates the amount of valid values per row to
#'   calculate the row mean or row sum;
#' - or a value between `0` and `1`, indicating a proportion of valid values per
#'   row to calculate the row mean or row sum (see 'Details').
#' - `NULL` (default), in which all cases are considered.
#'
#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding mean values. Negative values are allowed (see 'Details').
#' By default, `digits = NULL` and no rounding is used.
#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
#' before calculating row means or row sums. Only applies if `min_valid` is not
#' specified.
#' @param verbose Toggle warnings.
#' @inheritParams extract_column_names
#'
#' @return A vector with row means (for `row_means()`) or row sums (for
#' `row_sums()`) for those rows with at least `n` valid values.
#'
#' @details Rounding to a negative number of `digits` means rounding to a power
#' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest
#' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value
#' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid`
#' non-missing values, the row mean or row sum is returned. If `min_valid` is a
#' non-integer value from 0 to 1, `min_valid` is considered to indicate the
#' proportion of required non-missing values per row. E.g., if
#' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid`
#' non-missing values for the row mean or row sum to be calculated. See
#' 'Examples'.
#'
#' @examples
#' dat <- data.frame(
#'   c1 = c(1, 2, NA, 4),
#'   c2 = c(NA, 2, NA, 5),
#'   c3 = c(NA, 4, NA, NA),
#'   c4 = c(2, 3, 7, 8)
#' )
#'
#' # default, all means are shown, if no NA values are present
#' row_means(dat)
#'
#' # remove all NA before computing row means
#' row_means(dat, remove_na = TRUE)
#'
#' # needs at least 4 non-missing values per row
#' row_means(dat, min_valid = 4) # 1 valid return value
#' row_sums(dat, min_valid = 4) # 1 valid return value
#'
#' # needs at least 3 non-missing values per row
#' row_means(dat, min_valid = 3) # 2 valid return values
#'
#' # needs at least 2 non-missing values per row
#' row_means(dat, min_valid = 2)
#'
#' # needs at least 1 non-missing value per row, for two selected variables
#' row_means(dat, select = c("c1", "c3"), min_valid = 1)
#'
#' # needs at least 50% of non-missing values per row
#' row_means(dat, min_valid = 0.5) # 3 valid return values
#' row_sums(dat, min_valid = 0.5)
#'
#' # needs at least 75% of non-missing values per row
#' row_means(dat, min_valid = 0.75) # 2 valid return values
#'
#' @export
row_means <- function(
  data,
  select = NULL,
  exclude = NULL,
  min_valid = NULL,
  digits = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  remove_na = FALSE,
  verbose = TRUE
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  # prepare data, sanity checks
  data <- .prepare_row_data(data, select, min_valid, verbose)

  # calculate row means
  .row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean")
}


#' @rdname row_means
#' @export
row_sums <- function(
  data,
  select = NULL,
  exclude = NULL,
  min_valid = NULL,
  digits = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  remove_na = FALSE,
  verbose = TRUE
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  # prepare data, sanity checks
  data <- .prepare_row_data(data, select, min_valid, verbose)

  # calculate row sums
  .row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum")
}


# helper ------------------------

# calculate row means or sums
.row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) {
  if (is.null(min_valid)) {
    # calculate row means or sums for complete data
    out <- switch(
      fun,
      mean = rowMeans(data, na.rm = remove_na),
      rowSums(data, na.rm = remove_na)
    )
  } else {
    # is 'min_valid' indicating a proportion?
    decimals <- min_valid %% 1
    if (decimals != 0) {
      min_valid <- round(ncol(data) * decimals)
    }

    # min_valid may not be larger as df's amount of columns
    if (ncol(data) < min_valid) {
      insight::format_error(
        "`min_valid` must be smaller or equal to number of columns in data frame."
      )
    }

    # row means or sums
    to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
    out <- switch(
      fun,
      mean = rowMeans(data, na.rm = TRUE),
      rowSums(data, na.rm = TRUE)
    )
    out[to_na] <- NA
  }

  # round, if requested
  if (!is.null(digits) && !all(is.na(digits))) {
    out <- round(out, digits = digits)
  }

  out
}


# check that data is in shape for row means or row sums
.prepare_row_data <- function(data, select, min_valid, verbose) {
  if (is.null(select) || length(select) == 0) {
    insight::format_error("No columns selected.")
  }

  data <- .coerce_to_dataframe(data[select])

  # n must be a numeric, non-missing value
  if (
    !is.null(min_valid) &&
      (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)
  ) {
    insight::format_error("`min_valid` must be a numeric value of length 1.")
  }

  # make sure we only have numeric values
  numeric_columns <- vapply(data, is.numeric, TRUE)
  if (!all(numeric_columns)) {
    if (verbose) {
      insight::format_alert(
        "Only numeric columns are considered for calculation."
      )
    }
    data <- data[numeric_columns]
  }

  # check if we have a data framme with at least two columns
  if (ncol(data) < 2) {
    insight::format_error(
      "`data` must be a data frame with at least two numeric columns."
    )
  }

  data
}


================================================
FILE: R/select_nse.R
================================================
# Code adapted from {poorman} by Nathan Eastwood [License: MIT]
# https://github.com/nathaneastwood/poorman/blob/master/R/select_positions.R

.select_nse <- function(
  select,
  data,
  exclude,
  ignore_case,
  regex = FALSE,
  remove_group_var = FALSE,
  allow_rename = FALSE,
  verbose = FALSE,
  ifnotfound = "warn"
) {
  .check_data(data)
  columns <- colnames(data)

  # avoid conflicts
  conflicting_packages <- .conflicting_packages("poorman")
  on.exit(.attach_packages(conflicting_packages))

  expr_select <- substitute(select, env = parent.frame(1L))
  expr_exclude <- substitute(exclude, env = parent.frame(1L))

  # when exclude is not an argument called from the function (e.g data_to_long),
  # do not consider "exclude" as a symbol
  if (deparse(expr_exclude) == "exclude" && is.null(substitute(exclude))) {
    expr_exclude <- NULL
  }

  # for grouped data frames, we can decide to remove group variable from selection
  grp_vars <- setdiff(colnames(attr(data, "groups", exact = TRUE)), ".rows")

  # directly return all names if select == exclude == NULL
  if (is.null(expr_select) && is.null(expr_exclude)) {
    # don't include grouping variables
    if (remove_group_var) {
      columns <- setdiff(columns, grp_vars)
    }
    return(columns)
  }

  # get the position of columns that are selected or excluded
  selected <- .eval_expr(
    expr_select,
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
  excluded <- .eval_expr(
    expr_exclude,
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )

  selected_has_mix_idx <- any(selected < 0L) && any(selected > 0L)
  excluded_has_mix_idx <- any(excluded < 0L) && any(excluded > 0L)

  if (selected_has_mix_idx || excluded_has_mix_idx) {
    insight::format_error(
      "You can't mix negative and positive indices in `select` or `exclude`."
    )
  }

  # variable positions -> variable names
  selected <- columns[selected]
  excluded <- columns[excluded]

  if (length(selected) == 0L) {
    if (length(excluded) == 0L) {
      out <- character(0L)
    } else {
      out <- setdiff(columns, excluded)
    }
  } else {
    out <- setdiff(selected, excluded)
  }

  # don't include grouping variables
  if (remove_group_var && length(out)) {
    out <- setdiff(out, grp_vars)
  }

  # for named character vectors, we offer the service to rename the columns
  if (allow_rename && typeof(expr_select) == "language") {
    # safe evaluation of the expression, to get the named vector from "select"
    new_names <- tryCatch(eval(expr_select), error = function(e) NULL)
    # check if we really have a named vector
    if (!is.null(new_names) && !is.null(names(new_names))) {
      # if so, copy names
      all_names <- names(new_names)
      # if some of the elements don't have a name, we set the value as name
      names(new_names)[!nzchar(all_names)] <- new_names[!nzchar(all_names)]
      # after inclusion and exclusion, the original values in "select"
      # may have changed, so we check that we only add names of valid values
      out <- stats::setNames(out, names(new_names)[new_names %in% out])
      # check if we have any duplicated names, and if so, give an error
      if (anyDuplicated(names(out)) > 0) {
        insight::format_error(paste0(
          "Following names are duplicated after renaming: ",
          text_concatenate(names(out)[duplicated(names(out))], enclose = "`"),
          ". Using duplicated names is no good practice and therefore discouraged. Please provide unique names."
        ))
      }
    }
  }

  out
}


# This is where we dispatch the expression to several helper functions.
# This function is called multiple times for expressions that are composed
# of several symbols/language.
#
# Ex:
# * "cyl" -> will go to .select_char() and will return directly
# * cyl:gear -> function (`:`) so find which function it is, then get the
#   position for each variable, then evaluate the function with the positions

.eval_expr <- function(x, data, ignore_case, regex, verbose, ifnotfound) {
  if (is.null(x)) {
    return(NULL)
  }

  type <- typeof(x)

  out <- switch(
    type,
    integer = x,
    double = as.integer(x),
    character = .select_char(
      data,
      x,
      ignore_case,
      regex = regex,
      verbose,
      ifnotfound
    ),
    symbol = .select_symbol(
      data,
      x,
      ignore_case,
      regex = regex,
      verbose,
      ifnotfound
    ),
    language = .eval_call(
      data,
      x,
      ignore_case,
      regex = regex,
      verbose,
      ifnotfound
    ),
    insight::format_error(paste0(
      "Expressions of type <",
      typeof(x),
      "> cannot be evaluated for use when subsetting."
    ))
  )

  out
}


# Possibilities:
# - quoted variable name
# - quoted variable name with ignore case
# - quoted variable name with colon, to indicate range
# - character that should be regex-ed on variable names
# - special word "all" to return all vars

.select_char <- function(data, x, ignore_case, regex, verbose, ifnotfound) {
  # use colnames because names() doesn't work for matrices
  columns <- colnames(data)
  if (isTRUE(regex)) {
    # string is a regular expression
    grep(x, columns)
  } else if (length(x) == 1L && x == "all") {
    # string is "all" - select all columns
    seq_along(data)
  } else if (any(grepl(":", x, fixed = TRUE))) {
    # special pattern, as string (e.g.select = c("cyl:hp", "am")). However,
    # this will first go into `.eval_call()` and thus only single elements
    # are passed in `x` - we have never a character *vector* here
    # check for valid names
    colon_vars <- unlist(strsplit(x, ":", fixed = TRUE))
    colon_match <- match(colon_vars, columns)
    if (anyNA(colon_match)) {
      .action_if_not_found(
        colon_vars,
        columns,
        colon_match,
        verbose,
        ifnotfound
      )
      matches <- NA
    } else {
      start_pos <- match(colon_vars[1], columns)
      end_pos <- match(colon_vars[2], columns)
      if (!is.na(start_pos) && !is.na(end_pos)) {
        matches <- start_pos:end_pos
      } else {
        matches <- NA
      }
    }
    matches[!is.na(matches)]
  } else if (isTRUE(ignore_case)) {
    # find columns, case insensitive
    matches <- match(toupper(x), toupper(columns))
    matches[!is.na(matches)]
  } else {
    # find columns, case sensitive
    matches <- match(x, columns)
    if (anyNA(matches)) {
      .action_if_not_found(x, columns, matches, verbose, ifnotfound)
    }
    matches[!is.na(matches)]
  }
}

# small helper, to avoid duplicated code

.action_if_not_found <- function(x, columns, matches, verbose, ifnotfound) {
  msg <- paste0(
    "Following variable(s) were not found: ",
    toString(x[is.na(matches)])
  )
  msg2 <- .misspelled_string(
    columns,
    x[is.na(matches)],
    default_message = "Possibly misspelled?"
  )
  if (ifnotfound == "error") {
    insight::format_error(msg, msg2)
  }
  if (ifnotfound == "warn" && verbose) {
    insight::format_warning(msg, msg2)
  }
}


# 3 types of symbols:
# - unquoted variables
# - objects that need to be evaluated, e.g data_find(iris, i) where
#   i is a function arg or is defined before. This can also be a
#   vector of names or positions.
# - functions (without parenthesis)

# The first case is easy to deal with.
# For the 2nd one, we try to get the value of the object at each environment
# (starting from the lower one) until the global environment. If we get its
# value but it errors because the function doesn't exist then it means that
# it is a select helper that we grab from the error message.

.select_symbol <- function(data, x, ignore_case, regex, verbose, ifnotfound) {
  # We use `tryCatch()` instead of `try()` here, because for grouped data frame
  # methods, `.dynEval()` can be called many times. Since `tryCatch()` is minimal
  # faster than `try()`, we get a performance "boost" of some seconds for large
  # data frames with many groups (see https://github.com/easystats/datawizard/pull/657/)
  try_eval <- tryCatch(eval(x), error = function(e) NULL)
  x_dep <- insight::safe_deparse(x)
  is_select_helper <- FALSE
  out <- NULL

  if (x_dep %in% colnames(data)) {
    matches <- match(x_dep, colnames(data))
    out <- matches[!is.na(matches)]
  } else if (isTRUE(ignore_case)) {
    matches <- match(toupper(x_dep), toupper(colnames(data)))
    out <- matches[!is.na(matches)]
  } else {
    new_expr <- tryCatch(
      .dynGet(x, inherits = FALSE, minframe = 0L),
      error = function(e) {
        # if starts_with() et al. don't exist
        fn <- insight::safe_deparse(e$call)

        # if starts_with() et al. come from tidyselect but need to be used in
        # a select environment, then the error doesn't have the same structure.
        if (
          is.null(fn) && grepl("must be used within a", e$message, fixed = TRUE)
        ) {
          call_trace <- lapply(e$trace$call, function(x) {
            tmp <- insight::safe_deparse(x)
            if (grepl(paste0("^", .regex_select_helper()), tmp)) {
              tmp
            }
          })
          fn <- Filter(Negate(is.null), call_trace)[1]
        }
        # if we actually obtain the select helper call, return it, else return
        # what we already had
        if (length(fn) > 0L && grepl(.regex_select_helper(), fn)) {
          is_select_helper <<- TRUE
          return(fn)
        }
        NULL
      }
    )

    # when "x" is a function arg which is itself a function call to evaluate,
    # .dynGet can return "x" infinitely so we try to evaluate this arg
    # see #414
    if (!is.null(new_expr) && insight::safe_deparse(new_expr) == "x") {
      new_expr <- .dynEval(
        x,
        inherits = FALSE,
        minframe = 0L,
        remove_n_top_env = 4
      )
    }

    if (is_select_helper) {
      new_expr <- str2lang(unlist(new_expr, use.names = FALSE))
      out <- .eval_expr(
        new_expr,
        data = data,
        ignore_case = ignore_case,
        regex = regex,
        verbose = verbose,
        ifnotfound = ifnotfound
      )
    } else if (length(new_expr) == 1L && is.function(new_expr)) {
      out <- which(vapply(data, new_expr, FUN.VALUE = logical(1L)))
    } else {
      out <- unlist(
        lapply(
          new_expr,
          .eval_expr,
          data = data,
          ignore_case = ignore_case,
          regex = regex,
          verbose = verbose,
          ifnotfound = ifnotfound
        ),
        use.names = FALSE
      )
    }
  }

  # sometimes an object that needs to be evaluated has the same name as a
  # function (e.g `colnames`). Vector of names have the priority on functions
  # so function evaluation is delayed at the max.
  if (is.null(out) && is.function(try_eval)) {
    cols <- names(data)
    out <- which(vapply(data, x, FUN.VALUE = logical(1L)))
  }

  out
}

# Dispatch expressions to various select helpers according to the function call.

.eval_call <- function(data, x, ignore_case, regex, verbose, ifnotfound) {
  type <- insight::safe_deparse(x[[1]])
  switch(
    type,
    `:` = .select_seq(x, data, ignore_case, regex, verbose, ifnotfound),
    `-` = .select_minus(x, data, ignore_case, regex, verbose, ifnotfound),
    `c` = .select_c(x, data, ignore_case, regex, verbose, ifnotfound), # nolint
    `(` = .select_bracket(x, data, ignore_case, regex, verbose, ifnotfound),
    `[` = .select_square_bracket(
      x,
      data,
      ignore_case,
      regex,
      verbose,
      ifnotfound
    ),
    `$` = .select_dollar(x, data, ignore_case, regex, verbose, ifnotfound),
    `~` = .select_tilde(x, data, ignore_case, regex, verbose, ifnotfound),
    list = .select_list(x, data, ignore_case, regex, verbose, ifnotfound),
    names = .select_names(x, data, ignore_case, regex, verbose, ifnotfound),
    starts_with = ,
    ends_with = ,
    matches = ,
    contains = ,
    regex = .select_helper(x, data, ignore_case, regex, verbose, ifnotfound),
    .select_context(x, data, ignore_case, regex, verbose, ifnotfound)
  )
}

# e.g 1:3, or gear:cyl
.select_seq <- function(expr, data, ignore_case, regex, verbose, ifnotfound) {
  x <- .eval_expr(
    expr[[2]],
    data = data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
  y <- .eval_expr(
    expr[[3]],
    data = data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
  x:y
}

# e.g -cyl
.select_minus <- function(expr, data, ignore_case, regex, verbose, ifnotfound) {
  x <- .eval_expr(
    expr[[2]],
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
  if (length(x) == 0L) {
    seq_along(data)
  } else {
    x * -1L
  }
}

# e.g c("gear", "cyl")
.select_c <- function(expr, data, ignore_case, regex, verbose, ifnotfound) {
  lst_expr <- as.list(expr)
  lst_expr[[1]] <- NULL
  unlist(
    lapply(
      lst_expr,
      .eval_expr,
      data,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      ifnotfound = ifnotfound
    ),
    use.names = FALSE
  )
}

# e.g -(gear:cyl)
.select_bracket <- function(
  expr,
  data,
  ignore_case,
  regex,
  verbose,
  ifnotfound
) {
  .eval_expr(
    expr[[2]],
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
}

# e.g myvector[3]
.select_square_bracket <- function(
  expr,
  data,
  ignore_case,
  regex,
  verbose,
  ifnotfound
) {
  first_obj <- .eval_expr(
    expr[[2]],
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
  .eval_expr(
    first_obj[eval(expr[[3]])],
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
}

.select_names <- function(expr, data, ignore_case, regex, verbose, ifnotfound) {
  first_obj <- .dynEval(expr, inherits = FALSE, minframe = 0L)
  .eval_expr(
    first_obj,
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = FALSE,
    ifnotfound = ifnotfound
  )
}

# e.g starts_with("Sep")
.select_helper <- function(
  expr,
  data,
  ignore_case,
  regex,
  verbose,
  ifnotfound
) {
  lst_expr <- as.list(expr)

  # need this if condition to distinguish between starts_with("Sep") (that we
  # can use directly) and starts_with(i) (where we need to get i)
  if (length(lst_expr) == 2L && typeof(lst_expr[[2]]) == "symbol") {
    collapsed_patterns <- .dynGet(
      lst_expr[[2]],
      inherits = FALSE,
      minframe = 0L
    )
  } else {
    collapsed_patterns <- paste(
      unlist(lst_expr[2:length(lst_expr)]),
      collapse = "|"
    )
  }

  helper <- insight::safe_deparse(lst_expr[[1]])

  rgx <- switch(
    helper,
    starts_with = paste0("^(", collapsed_patterns, ")"),
    ends_with = paste0("(", collapsed_patterns, ")$"),
    contains = paste0("(", collapsed_patterns, ")"),
    regex = collapsed_patterns,
    insight::format_error("There is no select helper called '", helper, "'.")
  )
  # starting in R 4.5, grep() errors if some logical args have NULL/NA
  if (is.null(ignore_case)) {
    ignore_case <- FALSE
  }
  grep(rgx, colnames(data), ignore.case = ignore_case)
}

# e.g args$select (happens when we use grouped_data (see center.grouped_df()))
.select_dollar <- function(
  expr,
  data,
  ignore_case,
  regex,
  verbose,
  ifnotfound
) {
  first_obj <- .dynGet(
    expr[[2]],
    ifnotfound = NULL,
    inherits = FALSE,
    minframe = 0L
  )
  if (is.null(first_obj)) {
    first_obj <- .dynEval(expr[[2]], inherits = FALSE, minframe = 0L)
  }
  .eval_expr(
    first_obj[[deparse(expr[[3]])]],
    data,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose,
    ifnotfound = ifnotfound
  )
}

# e.g ~ gear + cyl
.select_tilde <- function(expr, data, ignore_case, regex, verbose, ifnotfound) {
  vars <- all.vars(expr)
  unlist(
    lapply(
      vars,
      .eval_expr,
      data = data,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      ifnotfound = ifnotfound
    ),
    use.names = FALSE
  )
}

# e.g list(gear = 4, cyl = 5)
.select_list <- function(expr, data, ignore_case, regex, verbose, ifnotfound) {
  vars <- names(.dynEval(expr, inherits = FALSE, minframe = 0L))
  unlist(
    lapply(
      vars,
      .eval_expr,
      data = data,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      ifnotfound = ifnotfound
    ),
    use.names = FALSE
  )
}

# e.g is.numeric()
.select_context <- function(
  expr,
  data,
  ignore_case,
  regex,
  verbose,
  ifnotfound
) {
  x_dep <- insight::safe_deparse(expr)
  if (endsWith(x_dep, "()")) {
    new_expr <- gsub("\\(\\)$", "", x_dep)
    new_expr <- str2lang(new_expr)
    .eval_expr(
      new_expr,
      data = data,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      ifnotfound = ifnotfound
    )
  } else {
    out <- .dynEval(expr, inherits = FALSE, minframe = 0L)
    .eval_expr(
      out,
      data = data,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      ifnotfound = ifnotfound
    )
  }
}

# -------------------------------------

.check_data <- function(data) {
  if (is.null(data)) {
    insight::format_error("The `data` argument must be provided.")
  }
  .coerce_to_dataframe(data)
}

.regex_select_helper <- function() {
  "(starts\\_with|ends\\_with|col\\_ends\\_with|contains|regex)"
}

.conflicting_packages <- function(packages = NULL) {
  if (is.null(packages)) {
    packages <- "poorman"
  }

  namespace <- vapply(packages, isNamespaceLoaded, FUN.VALUE = logical(1L))
  attached <- paste0("package:", packages) %in% search()
  attached <- stats::setNames(attached, packages)

  for (i in packages) {
    unloadNamespace(i)
  }

  list(package = packages, namespace = namespace, attached = attached)
}

.attach_packages <- function(packages = NULL) {
  if (!is.null(packages)) {
    pkg <- packages$package
    for (i in seq_along(pkg)) {
      if (isTRUE(packages$namespace[i])) {
        loadNamespace(pkg[i])
      }
      if (isTRUE(packages$attached[i])) {
        suppressPackageStartupMessages(
          suppressWarnings(
            require(pkg[i], quietly = TRUE, character.only = TRUE)
          )
        )
      }
    }
  }
}

# Almost identical to dynGet(). The difference is that we deparse the expression
# because get0() allows symbol only since R 4.1.0
.dynGet <- function(
  x,
  ifnotfound = stop(
    gettextf("%s not found", sQuote(x)),
    domain = NA,
    call. = FALSE
  ),
  minframe = 1L,
  inherits = FALSE
) {
  x <- insight::safe_deparse(x)
  n <- sys.nframe()
  myObj <- structure(list(.b = as.raw(7)), foo = 47L)
  while (n > minframe) {
    n <- n - 1L
    env <- sys.frame(n)
    r <- get0(x, envir = env, inherits = inherits, ifnotfound = myObj)
    if (!identical(r, myObj)) {
      return(r)
    }
  }
  ifnotfound
}

# Similar to .dynGet() but instead of getting an object from the environment,
# we try to evaluate an expression. It stops as soon as the evaluation doesn't
# error. Returns NULL if can never be evaluated.
#
# Custom arg "remove_n_top_env" to remove the first environments which are
# ".select_nse()" and the other custom functions.
#
# Arg "data" is here if we want to start searching in the data instead of the
# lowest environment.
.dynEval <- function(
  x,
  ifnotfound = stop(
    gettextf("%s not found", sQuote(x)),
    domain = NA,
    call. = FALSE
  ),
  minframe = 1L,
  inherits = FALSE,
  remove_n_top_env = 0,
  data = NULL
) {
  iter <- 0
  n <- sys.nframe() - remove_n_top_env
  x <- insight::safe_deparse(x)
  while (n > minframe) {
    if (iter == 0 && !is.null(data)) {
      env <- data
      iter <- iter + 1
    } else {
      n <- n - 1L
      env <- sys.frame(n)
    }
    # We use `tryCatch()` instead of `try()` here, because for grouped data frame
    # methods, `.dynEval()` can be called many times. Since `tryCatch()` is minimal
    # faster than `try()`, we get a performance "boost" of some seconds for large
    # data frames with many groups (see https://github.com/easystats/datawizard/pull/657/)
    r <- tryCatch(eval(str2lang(x), envir = env), error = function(e) NULL)
    if (!is.null(r)) {
      return(r)
    }
  }
  ifnotfound
}


================================================
FILE: R/skewness_kurtosis.R
================================================
#' Compute Skewness and (Excess) Kurtosis
#'
#' @param x A numeric vector or data.frame.
#' @param type Type of algorithm for computing skewness. May be one of `1`
#'   (or `"1"`, `"I"` or `"classic"`), `2` (or `"2"`,
#'   `"II"` or `"SPSS"` or `"SAS"`) or `3` (or  `"3"`,
#'   `"III"` or `"Minitab"`). See 'Details'.
#' @param iterations The number of bootstrap replicates for computing standard
#'   errors. If `NULL` (default), parametric standard errors are computed.
#' @param test Logical, if `TRUE`, tests if skewness or kurtosis is
#'   significantly different from zero.
#' @param digits Number of decimal places.
#' @param object An object returned by `skewness()` or `kurtosis()`.
#' @param verbose Toggle warnings and messages.
#' @param ... Arguments passed to or from other methods.
#' @inheritParams coef_var
#'
#' @details
#'
#' \subsection{Skewness}{
#' Symmetric distributions have a `skewness` around zero, while
#' a negative skewness values indicates a "left-skewed" distribution, and a
#' positive skewness values indicates a "right-skewed" distribution. Examples
#' for the relationship of skewness and distributions are:
#'
#'   - Normal distribution (and other symmetric distribution) has a skewness
#'   of 0
#'   - Half-normal distribution has a skewness just below 1
#'   - Exponential distribution has a skewness of 2
#'   - Lognormal distribution can have a skewness of any positive value,
#'   depending on its parameters
#'
#' (\cite{https://en.wikipedia.org/wiki/Skewness})
#' }
#'
#' \subsection{Types of Skewness}{
#' `skewness()` supports three different methods for estimating skewness,
#' as discussed in \cite{Joanes and Gill (1988)}:
#'
#' - Type "1" is the "classical" method, which is `g1 = (sum((x -
#' mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5`
#'
#' - Type "2" first calculates the type-1 skewness, then adjusts the result:
#' `G1 = g1 * sqrt(n * (n - 1)) / (n - 2)`. This is what SAS and SPSS
#' usually return.
#'
#' - Type "3" first calculates the type-1 skewness, then adjusts the result:
#' `b1 = g1 * ((1 - 1 / n))^1.5`. This is what Minitab usually returns.
#' }
#'
#' \subsection{Kurtosis}{
#' The `kurtosis` is a measure of "tailedness" of a distribution. A
#' distribution with a kurtosis values of about zero is called "mesokurtic". A
#' kurtosis value larger than zero indicates a "leptokurtic" distribution with
#' *fatter* tails. A kurtosis value below zero indicates a "platykurtic"
#' distribution with *thinner* tails
#' (\cite{https://en.wikipedia.org/wiki/Kurtosis}).
#' }
#'
#' \subsection{Types of Kurtosis}{
#' `kurtosis()` supports three different methods for estimating kurtosis,
#' as discussed in \cite{Joanes and Gill (1988)}:
#'
#' - Type "1" is the "classical" method, which is `g2 = n * sum((x -
#' mean(x))^4) / (sum((x - mean(x))^2)^2) - 3`.
#'
#' - Type "2" first calculates the type-1 kurtosis, then adjusts the result:
#' `G2 = ((n + 1) * g2 + 6) * (n - 1)/((n - 2) * (n - 3))`. This is what
#' SAS and SPSS usually return
#'
#' - Type "3" first calculates the type-1 kurtosis, then adjusts the result:
#' `b2 = (g2 + 3) * (1 - 1 / n)^2 - 3`. This is what Minitab usually
#' returns.
#'
#' }
#'
#' \subsection{Standard Errors}{
#' It is recommended to compute empirical (bootstrapped) standard errors (via
#' the `iterations` argument) than relying on analytic standard errors
#' (\cite{Wright & Herrington, 2011}).
#' }
#'
#' @references
#'
#' - D. N. Joanes and C. A. Gill (1998). Comparing measures of sample
#'   skewness and kurtosis. The Statistician, 47, 183–189.
#'
#' - Wright, D. B., & Herrington, J. A. (2011). Problematic standard
#'   errors and confidence intervals for skewness and kurtosis. Behavior
#'   research methods, 43(1), 8-17.
#'
#' @return Values of skewness or kurtosis.
#'
#' @examples
#' skewness(rnorm(1000))
#' kurtosis(rnorm(1000))
#' @export
skewness <- function(x, ...) {
  UseMethod("skewness")
}


# skewness -----------------------------------------

#' @rdname skewness
#' @export
skewness.numeric <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  verbose = TRUE,
  ...
) {
  if (remove_na) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  out <- (sum((x - mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5

  type <- .check_skewness_type(type)

  if (type == "2" && n < 3) {
    if (verbose) {
      insight::format_warning(
        "Need at least 3 complete observations for type-2-skewness. Using 'type=\"1\"' now."
      )
    }
    type <- "1"
  }

  .skewness <- switch(
    type,
    "1" = out,
    "2" = out * sqrt(n * (n - 1)) / (n - 2),
    "3" = out * ((1 - 1 / n))^1.5
  )

  out_se <- sqrt((6 * (n - 2)) / ((n + 1) * (n + 3)))

  .skewness_se <- switch(
    type,
    "1" = out_se,
    "2" = out_se * ((sqrt(n * (n - 1))) / (n - 2)),
    "3" = out_se * (((n - 1) / n)^1.5),
  )

  if (!is.null(iterations)) {
    if (requireNamespace("boot", quietly = TRUE)) {
      results <- boot::boot(
        data = x,
        statistic = .boot_skewness,
        R = iterations,
        remove_na = remove_na,
        type = type
      )
      out_se <- stats::sd(results$t, na.rm = TRUE)
    } else {
      insight::format_warning("Package 'boot' needed for bootstrapping SEs.")
    }
  }

  .skewness <- data.frame(
    Skewness = .skewness,
    SE = out_se
  )
  class(.skewness) <- unique(c("parameters_skewness", class(.skewness)))
  .skewness
}


#' @export
skewness.matrix <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  ...
) {
  .skewness <- apply(
    x,
    2,
    skewness,
    remove_na = remove_na,
    type = type,
    iterations = iterations
  )

  .names <- colnames(x)

  if (length(.names) == 0) {
    .names <- paste0("X", seq_len(ncol(x)))
  }

  .skewness <- cbind(Parameter = .names, do.call(rbind, .skewness))

  class(.skewness) <- unique(c("parameters_skewness", class(.skewness)))
  .skewness
}


#' @export
skewness.data.frame <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  ...
) {
  .skewness <- lapply(
    x,
    skewness,
    remove_na = remove_na,
    type = type,
    iterations = iterations
  )

  .skewness <- cbind(Parameter = names(.skewness), do.call(rbind, .skewness))

  class(.skewness) <- unique(c("parameters_skewness", class(.skewness)))
  .skewness
}


#' @export
skewness.default <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  ...
) {
  skewness(
    .factor_to_numeric(x),
    remove_na = remove_na,
    type = type,
    iterations = iterations
  )
}


# Kurtosis -----------------------------------

#' @rdname skewness
#' @export
kurtosis <- function(x, ...) {
  UseMethod("kurtosis")
}


#' @rdname skewness
#' @export
kurtosis.numeric <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  verbose = TRUE,
  ...
) {
  if (remove_na) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  out <- n * sum((x - mean(x))^4) / (sum((x - mean(x))^2)^2)

  type <- .check_skewness_type(type)

  if (type == "2" && n < 4) {
    if (verbose) {
      insight::format_warning(
        "Need at least 4 complete observations for type-2-kurtosis Using 'type=\"1\"' now."
      )
    }
    type <- "1"
  }

  .kurtosis <- switch(
    type,
    "1" = out - 3,
    "2" = ((n + 1) * (out - 3) + 6) * (n - 1) / ((n - 2) * (n - 3)),
    "3" = out * (1 - 1 / n)^2 - 3
  )

  out_se <- sqrt(
    (24 * n * (n - 2) * (n - 3)) / (((n + 1)^2) * (n + 3) * (n + 5))
  )

  .kurtosis_se <- switch(
    type,
    "1" = out_se,
    "2" = out_se * (((n - 1) * (n + 1)) / ((n - 2) * (n - 3))),
    "3" = out_se * ((n - 1) / n)^2
  )

  if (!is.null(iterations)) {
    insight::check_if_installed("boot")

    results <- boot::boot(
      data = x,
      statistic = .boot_kurtosis,
      R = iterations,
      remove_na = remove_na,
      type = type
    )
    out_se <- stats::sd(results$t, na.rm = TRUE)
  }

  .kurtosis <- data.frame(
    Kurtosis = .kurtosis,
    SE = out_se
  )
  class(.kurtosis) <- unique(c("parameters_kurtosis", class(.kurtosis)))
  .kurtosis
}


#' @export
kurtosis.matrix <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  ...
) {
  .kurtosis <- apply(
    x,
    2,
    kurtosis,
    remove_na = remove_na,
    type = type,
    iterations = iterations
  )
  .names <- colnames(x)
  if (length(.names) == 0) {
    .names <- paste0("X", seq_len(ncol(x)))
  }
  .kurtosis <- cbind(Parameter = .names, do.call(rbind, .kurtosis))
  class(.kurtosis) <- unique(c("parameters_kurtosis", class(.kurtosis)))
  .kurtosis
}


#' @export
kurtosis.data.frame <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  ...
) {
  .kurtosis <- lapply(
    x,
    kurtosis,
    remove_na = remove_na,
    type = type,
    iterations = iterations
  )
  .kurtosis <- cbind(Parameter = names(.kurtosis), do.call(rbind, .kurtosis))
  class(.kurtosis) <- unique(c("parameters_kurtosis", class(.kurtosis)))
  .kurtosis
}


#' @export
kurtosis.default <- function(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  ...
) {
  kurtosis(
    .factor_to_numeric(x),
    remove_na = remove_na,
    type = type,
    iterations = iterations
  )
}


# methods -----------------------------------------

#' @export
as.numeric.parameters_kurtosis <- function(x, ...) {
  x$Kurtosis
}

#' @export
as.numeric.parameters_skewness <- function(x, ...) {
  x$Skewness
}

#' @export
as.double.parameters_kurtosis <- as.numeric.parameters_kurtosis

#' @export
as.double.parameters_skewness <- as.numeric.parameters_skewness

#' @rdname skewness
#' @export
print.parameters_kurtosis <- function(x, digits = 3, test = FALSE, ...) {
  out <- summary(x, test = test)
  cat(insight::export_table(out, digits = digits))
  invisible(x)
}

#' @rdname skewness
#' @export
print.parameters_skewness <- print.parameters_kurtosis

#' @rdname skewness
#' @export
summary.parameters_skewness <- function(object, test = FALSE, ...) {
  if (test) {
    object$z <- object$Skewness / object$SE
    object$p <- 2 * (1 - stats::pnorm(abs(object$z)))
  }
  object
}

#' @rdname skewness
#' @export
summary.parameters_kurtosis <- function(object, test = FALSE, ...) {
  if (test) {
    object$z <- object$Kurtosis / object$SE
    object$p <- 2 * (1 - stats::pnorm(abs(object$z)))
  }
  object
}

# helper ------------------------------------------

.check_skewness_type <- function(type) {
  # convenience
  if (is.numeric(type)) {
    type <- as.character(type)
  }
  skewness_types <- c(
    "1",
    "2",
    "3",
    "I",
    "II",
    "III",
    "classic",
    "SPSS",
    "SAS",
    "Minitab"
  )
  is_skewness_type_invalid <- is.null(type) ||
    is.na(type) ||
    !(type %in% skewness_types)

  if (is_skewness_type_invalid) {
    insight::format_warning(
      "'type' must be a character value from \"1\" to \"3\". Using 'type=\"2\"' now."
    )
    type <- "2"
  }

  switch(
    type,
    `1` = ,
    I = ,
    classic = "1",
    `2` = ,
    II = ,
    SPSS = ,
    SAS = "2",
    `3` = ,
    III = ,
    Minitab = "3"
  )
}


# bootstrapping -----------------------------------

.boot_skewness <- function(data, indices, remove_na, type) {
  datawizard::skewness(
    data[indices],
    remove_na = remove_na,
    type = type,
    iterations = NULL
  )$Skewness
}


.boot_kurtosis <- function(data, indices, remove_na, type) {
  datawizard::kurtosis(
    data[indices],
    remove_na = remove_na,
    type = type,
    iterations = NULL
  )$Kurtosis
}


================================================
FILE: R/slide.R
================================================
#' @title Shift numeric value range
#' @name slide
#'
#' @description
#' This functions shifts the value range of a numeric variable, so that the
#' new range starts at a given value.
#'
#' @param x A data frame or numeric vector.
#' @param verbose Toggle warnings.
#' @param ... not used.
#' @inheritParams to_numeric
#'
#' @return `x`, where the range of numeric variables starts at a new value.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @inherit data_rename seealso
#'
#' @examples
#' # numeric
#' head(mtcars$gear)
#' head(slide(mtcars$gear))
#' head(slide(mtcars$gear, lowest = 10))
#'
#' # data frame
#' sapply(slide(mtcars, lowest = 1), min)
#' sapply(mtcars, min)
#' @export
slide <- function(x, ...) {
  UseMethod("slide")
}


#' @export
slide.default <- function(x, lowest = 0, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      "Shifting non-numeric variables is not possible.",
      "Try using 'to_numeric()' and specify the 'lowest' argument."
    )
  }
  x
}


#' @rdname slide
#' @export
slide.numeric <- function(x, lowest = 0, ...) {
  original_x <- x
  minval <- min(x, na.rm = TRUE)
  difference <- minval - lowest
  x <- x - difference
  .set_back_labels(x, original_x, include_values = FALSE)
}


#' @rdname slide
#' @export
slide.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  lowest = 0,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_s",
      keep_factors = FALSE
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x[select] <- lapply(
    x[select],
    slide,
    lowest = lowest,
    verbose = verbose,
    ...
  )

  x
}


================================================
FILE: R/smoothness.R
================================================
#' Quantify the smoothness of a vector
#'
#' @param x Numeric vector (similar to a time series).
#' @param method Can be `"diff"` (the standard deviation of the standardized
#'   differences) or `"cor"` (default, lag-one autocorrelation).
#' @param lag An integer indicating which lag to use. If less than `1`, will be
#'   interpreted as expressed in percentage of the length of the vector.
#' @inheritParams skewness
#'
#' @examples
#' x <- (-10:10)^3 + rnorm(21, 0, 100)
#' plot(x)
#' smoothness(x, method = "cor")
#' smoothness(x, method = "diff")
#' @return Value of smoothness.
#' @references https://stats.stackexchange.com/questions/24607/how-to-measure-smoothness-of-a-time-series-in-r
#'
#' @export
smoothness <- function(x, method = "cor", lag = 1, iterations = NULL, ...) {
  UseMethod("smoothness")
}


#' @export
smoothness.numeric <- function(
  x,
  method = "cor",
  lag = 1,
  iterations = NULL,
  ...
) {
  if (lag < 1) {
    lag <- round(lag * length(x))
  }
  if (lag <= 0) {
    insight::format_error("'lag' cannot be that small.")
  }

  if (method == "cor") {
    smooth_data <- stats::cor(
      utils::head(x, length(x) - lag),
      utils::tail(x, length(x) - lag)
    )
  } else {
    smooth_data <- stats::sd(diff(x, lag = lag)) / abs(mean(diff(x, lag = lag)))
  }

  if (!is.null(iterations)) {
    if (requireNamespace("boot", quietly = TRUE)) {
      results <- boot::boot(
        data = x,
        statistic = .boot_smoothness,
        R = iterations,
        method = method,
        lag = lag
      )
      out_se <- stats::sd(results$t, na.rm = TRUE)
      smooth_data <- data.frame(Smoothness = smooth_data, SE = out_se)
    } else {
      insight::format_warning("Package 'boot' needed for bootstrapping SEs.")
    }
  }

  class(smooth_data) <- unique(c("parameters_smoothness", class(smooth_data)))
  smooth_data
}


#' @export
smoothness.data.frame <- function(
  x,
  method = "cor",
  lag = 1,
  iterations = NULL,
  ...
) {
  .smoothness <-
    lapply(
      x,
      smoothness,
      method = method,
      lag = lag,
      iterations = iterations
    )
  .smoothness <- cbind(
    Parameter = names(.smoothness),
    do.call(rbind, .smoothness)
  )
  class(.smoothness) <- unique(c("parameters_smoothness", class(.smoothness)))
  .smoothness
}


#' @export
smoothness.default <- function(
  x,
  method = "cor",
  lag = 1,
  iterations = NULL,
  ...
) {
  smoothness(
    .factor_to_numeric(x),
    method = method,
    lag = lag,
    iterations = iterations
  )
}


# bootstrapping -----------------------------------

.boot_smoothness <- function(data, indices, method, lag) {
  datawizard::smoothness(
    x = data[indices],
    method = method,
    lag = lag,
    iterations = NULL
  )
}


# methods -----------------------------------------

#' @export
as.numeric.parameters_smoothness <- function(x, ...) {
  if (is.data.frame(x)) {
    x$Smoothness
  } else {
    as.vector(x)
  }
}

#' @export
as.double.parameters_smoothness <- as.numeric.parameters_smoothness


================================================
FILE: R/standardize.R
================================================
#' Standardization (Z-scoring)
#'
#' Performs a standardization of data (z-scoring), i.e., centering and scaling,
#' so that the data is expressed in terms of standard deviation (i.e., mean = 0,
#' SD = 1) or Median Absolute Deviance (median = 0, MAD = 1). When applied to a
#' statistical model, this function extracts the dataset, standardizes it, and
#' refits the model with this standardized version of the dataset. The
#' [normalize()] function can also be used to scale all numeric variables within
#' the 0 - 1 range.
#' \cr\cr
#' For model standardization, see [`standardize.default()`].
#'
#' @param x A (grouped) data frame, a vector or a statistical model (for
#'   `unstandardize()` cannot be a model).
#' @param robust Logical, if `TRUE`, centering is done by subtracting the
#'   median from the variables and dividing it by the median absolute deviation
#'   (MAD). If `FALSE`, variables are standardized by subtracting the
#'   mean and dividing it by the standard deviation (SD).
#' @param two_sd If `TRUE`, the variables are scaled by two times the deviation
#'   (SD or MAD depending on `robust`). This method can be useful to obtain
#'   model coefficients of continuous parameters comparable to coefficients
#'   related to binary predictors, when applied to **the predictors** (not the
#'   outcome) (Gelman, 2008).
#' @param weights Can be `NULL` (for no weighting), or:
#' - For model: if `TRUE` (default), a weighted-standardization is carried out.
#' - For `data.frame`s: a numeric vector of weights, or a character of the
#'   name of a column in the `data.frame` that contains the weights.
#' - For numeric vectors: a numeric vector of weights.
#' @param verbose Toggle warnings and messages on or off.
#' @param remove_na How should missing values (`NA`) be treated: if `"none"`
#'   (default): each column's standardization is done separately, ignoring
#'   `NA`s. Else, rows with `NA` in the columns selected with `select` /
#'   `exclude` (`"selected"`) or in all columns (`"all"`) are dropped before
#'   standardization, and the resulting data frame does not include these cases.
#' @param force Logical, if `TRUE`, forces standardization of factors and dates
#'   as well. Factors are converted to numerical values, with the lowest level
#'   being the value `1` (unless the factor has numeric levels, which are
#'   converted to the corresponding numeric value).
#' @param append Logical or string. If `TRUE`, standardized variables get new
#'   column names (with the suffix `"_z"`) and are appended (column bind) to `x`,
#'   thus returning both the original and the standardized variables. If `FALSE`,
#'   original variables in `x` will be overwritten by their standardized versions.
#'   If a character value, standardized variables are appended with new column
#'   names (using the defined suffix) to the original data frame.
#' @param reference A data frame or variable from which the centrality and
#'   deviation will be computed instead of from the input variable. Useful for
#'   standardizing a subset or new data according to another data frame.
#' @param center,scale
#' * For `standardize()`: \cr
#'   Numeric values, which can be used as alternative to `reference` to define
#'   a reference centrality and deviation. If `scale` and `center` are of
#'   length 1, they will be recycled to match the length of selected variables
#'   for standardization. Else, `center` and `scale` must be of same length as
#'   the number of selected variables. Values in `center` and `scale` will be
#'   matched to selected variables in the provided order, unless a named vector
#'   is given. In this case, names are matched against the names of the selected
#'   variables.
#'
#' * For `unstandardize()`: \cr
#'   `center` and `scale` correspond to the center (the mean / median) and the scale (SD / MAD) of
#'   the original non-standardized data (for data frames, should be named, or
#'   have column order correspond to the numeric column). However, one can also
#'   directly provide the original data through `reference`, from which the
#'   center and the scale will be computed (according to `robust` and `two_sd`).
#'   Alternatively, if the input contains the attributes `center` and `scale`
#'   (as does the output of `standardize()`), it will take it from there if the
#'   rest of the arguments are absent.
#' @param force Logical, if `TRUE`, forces recoding of factors and character
#'   vectors as well.
#' @param ... Arguments passed to or from other methods.
#' @inheritParams extract_column_names
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return The standardized object (either a standardize data frame or a
#'   statistical model fitted on standardized data).
#'
#' @note When `x` is a vector or a data frame with `remove_na = "none")`,
#'   missing values are preserved, so the return value has the same length /
#'   number of rows as the original input.
#'
#' @seealso See [center()] for grand-mean centering of variables, and
#'   [makepredictcall.dw_transformer()] for use in model formulas.
#'
#' @family transform utilities
#' @family standardize
#'
#' @examples
#' d <- iris[1:4, ]
#'
#' # vectors
#' standardise(d$Petal.Length)
#'
#' # Data frames
#' # overwrite
#' standardise(d, select = c("Sepal.Length", "Sepal.Width"))
#'
#' # append
#' standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE)
#'
#' # append, suffix
#' standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = "_std")
#'
#' # standardizing with reference center and scale
#' d <- data.frame(
#'   a = c(-2, -1, 0, 1, 2),
#'   b = c(3, 4, 5, 6, 7)
#' )
#'
#' # default standardization, based on mean and sd of each variable
#' standardize(d) # means are 0 and 5, sd ~ 1.581139
#'
#' # standardization, based on mean and sd set to the same values
#' standardize(d, center = c(0, 5), scale = c(1.581, 1.581))
#'
#' # standardization, mean and sd for each variable newly defined
#' standardize(d, center = c(3, 4), scale = c(2, 4))
#'
#' # standardization, taking same mean and sd for each variable
#' standardize(d, center = 1, scale = 3)
#' @export
standardize <- function(x, ...) {
  UseMethod("standardize")
}

#' @rdname standardize
#' @export
standardise <- standardize


# Default method is in effectsize

# standardize.default <- function(x, verbose = TRUE, ...) {
#   if (isTRUE(verbose)) {
#     insight::format_alert(sprintf("Standardizing currently not possible for variables of class '%s'.", class(x)[1])))
#   }
#   x
# }

#' @rdname standardize
#' @export
standardize.numeric <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  scale = NULL,
  verbose = TRUE,
  ...
) {
  # set default - need to fix this, else we don't know whether this
  # comes from "center()" or "standardize()". Furthermore, data.frame
  # methods cannot return a vector of NULLs for each variable - instead
  # they return NA. Thus, we have to treat NA like NULL
  if (is.null(scale) || is.na(scale)) {
    scale <- TRUE
  }
  if (is.null(center) || is.na(center)) {
    center <- TRUE
  }

  my_args <- .process_std_center(
    x,
    weights,
    robust,
    verbose,
    reference,
    center,
    scale
  )
  dot_args <- list(...)

  # Perform standardization
  if (is.null(my_args)) {
    # all NA?
    return(x)
  } else if (is.null(my_args$check)) {
    vals <- rep(0, length(my_args$vals)) # If only unique value
  } else if (two_sd) {
    vals <- as.vector((my_args$vals - my_args$center) / (2 * my_args$scale))
  } else {
    vals <- as.vector((my_args$vals - my_args$center) / my_args$scale)
  }

  scaled_x <- rep(NA, length(my_args$valid_x))
  scaled_x[my_args$valid_x] <- vals
  attr(scaled_x, "center") <- my_args$center
  attr(scaled_x, "scale") <- my_args$scale
  attr(scaled_x, "robust") <- robust
  # labels
  z <- .set_back_labels(scaled_x, x, include_values = FALSE)
  if (!isFALSE(dot_args$add_transform_class)) {
    class(z) <- c("dw_transformer", class(z))
  }
  z
}

#' @export
standardize.double <- standardize.numeric

#' @export
standardize.integer <- standardize.numeric

#' @export
standardize.matrix <- function(x, ...) {
  xl <- lapply(seq_len(ncol(x)), function(i) x[, i])

  xz <- lapply(xl, datawizard::standardize, ...)

  x_out <- do.call(cbind, xz)
  dimnames(x_out) <- dimnames(x)

  attr(x_out, "center") <- vapply(xz, attr, "center", FUN.VALUE = numeric(1L))
  attr(x_out, "scale") <- vapply(xz, attr, "scale", FUN.VALUE = numeric(1L))
  attr(x_out, "robust") <- vapply(xz, attr, "robust", FUN.VALUE = logical(1L))[
    1
  ]
  class(x_out) <- c("dw_transformer", class(x_out))

  x_out
}


#' @rdname standardize
#' @export
standardize.factor <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = NULL,
  force = FALSE,
  verbose = TRUE,
  ...
) {
  if (!force) {
    return(x)
  }

  standardize(
    .factor_to_numeric(x),
    robust = robust,
    two_sd = two_sd,
    weights = weights,
    verbose = verbose,
    ...
  )
}


#' @export
standardize.character <- standardize.factor

#' @export
standardize.logical <- standardize.factor

#' @export
standardize.Date <- standardize.factor

#' @export
standardize.AsIs <- standardize.numeric


# Data frames -------------------------------------------------------------

#' @rdname standardize
#' @export
standardize.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  two_sd = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  scale = NULL,
  remove_na = c("none", "selected", "all"),
  force = FALSE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # process arguments
  my_args <- .process_std_args(
    x,
    select,
    exclude,
    weights,
    append,
    append_suffix = "_z",
    keep_factors = force,
    remove_na,
    reference,
    .center = center,
    .scale = scale
  )

  # set new values
  x <- my_args$x

  # Loop through variables and standardize it
  for (var in my_args$select) {
    x[[var]] <- standardize(
      x[[var]],
      robust = robust,
      two_sd = two_sd,
      weights = my_args$weights,
      reference = reference[[var]],
      center = my_args$center[var],
      scale = my_args$scale[var],
      verbose = FALSE,
      force = force,
      add_transform_class = FALSE
    )
  }

  attr(x, "center") <- unlist(lapply(x[my_args$select], function(z) {
    attributes(z)$center
  }))
  attr(x, "scale") <- unlist(lapply(x[my_args$select], function(z) {
    attributes(z)$scale
  }))
  attr(x, "robust") <- robust
  x
}


#' @export
standardize.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  two_sd = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  scale = NULL,
  remove_na = c("none", "selected", "all"),
  force = FALSE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  my_args <- .process_grouped_df(
    x,
    select,
    exclude,
    append,
    append_suffix = "_z",
    reference,
    weights,
    keep_factors = force
  )

  # create column(s) to store dw_transformer attributes
  for (i in select) {
    my_args$info$groups[[paste0("attr_", i)]] <- rep(NA, length(my_args$grps))
  }

  for (rows in seq_along(my_args$grps)) {
    tmp <- standardize(
      my_args$x[my_args$grps[[rows]], , drop = FALSE],
      select = my_args$select,
      exclude = NULL,
      robust = robust,
      two_sd = two_sd,
      weights = my_args$weights,
      remove_na = remove_na,
      verbose = verbose,
      force = force,
      append = FALSE,
      center = center,
      scale = scale,
      add_transform_class = FALSE,
      ...
    )

    # store dw_transformer_attributes
    for (i in select) {
      my_args$info$groups[rows, paste0("attr_", i)][[
        1
      ]] <- list(unlist(attributes(tmp[[i]])))
    }

    my_args$x[my_args$grps[[rows]], ] <- tmp
  }

  # last column of "groups" attributes must be called ".rows"
  my_args$info$groups <- data_relocate(my_args$info$groups, ".rows", after = -1)

  # set back class, so data frame still works with dplyr
  attributes(my_args$x) <- my_args$info
  my_args$x
}


# Datagrid ----------------------------------------------------------------

#' @export
standardize.datagrid <- function(x, ...) {
  x[names(x)] <- standardize(
    as.data.frame(x),
    reference = attributes(x)$data,
    ...
  )
  x
}

#' @export
standardize.visualisation_matrix <- standardize.datagrid


================================================
FILE: R/standardize.models.R
================================================
#' Re-fit a model with standardized data
#'
#' Performs a standardization of data (z-scoring) using
#' [`standardize()`] and then re-fits the model to the standardized data.
#' \cr\cr
#' Standardization is done by completely refitting the model on the standardized
#' data. Hence, this approach is equal to standardizing the variables *before*
#' fitting the model and will return a new model object. This method is
#' particularly recommended for complex models that include interactions or
#' transformations (e.g., polynomial or spline terms). The `robust` (default to
#' `FALSE`) argument enables a robust standardization of data, based on the
#' `median` and the `MAD` instead of the `mean` and the `SD`.
#'
#' @param x A statistical model.
#' @param weights If `TRUE` (default), a weighted-standardization is carried out.
#' @param include_response If `TRUE` (default), the response value will also be
#'   standardized. If `FALSE`, only the predictors will be standardized.
#'   - Note that for GLMs and models with non-linear link functions, the
#'   response value will not be standardized, to make re-fitting the model work.
#'   - If the model contains an [stats::offset()], the offset variable(s) will
#'   be standardized only if the response is standardized. If `two_sd = TRUE`,
#'   offsets are standardized by one-sd (similar to the response).
#'   - (For `mediate` models, the `include_response` refers to the outcome in
#'   the y model; m model's response will always be standardized when possible).
#' @inheritParams standardize
#'
#' @return A statistical model fitted on standardized data
#'
#' @details
#'
#' # Generalized Linear Models
#' Standardization for generalized linear models (GLM, GLMM, etc) is done only
#' with respect to the predictors (while the outcome remains as-is,
#' unstandardized) - maintaining the interpretability of the coefficients (e.g.,
#' in a binomial model: the exponent of the standardized parameter is the OR of
#' a change of 1 SD in the predictor, etc.)
#'
#' # Dealing with Factors
#' `standardize(model)` or `standardize_parameters(model, method = "refit")` do
#' *not* standardize categorical predictors (i.e. factors) / their
#' dummy-variables, which may be a different behaviour compared to other R
#' packages (such as **lm.beta**) or other software packages (like SPSS). To
#' mimic such behaviours, either use `standardize_parameters(model, method =
#' "basic")` to obtain post-hoc standardized parameters, or standardize the data
#' with `standardize(data, force = TRUE)` *before* fitting the
#' model.
#'
#' # Transformed Variables
#' When the model's formula contains transformations (e.g. `y ~ exp(X)`) the
#' transformation effectively takes place after standardization (e.g.,
#' `exp(scale(X))`). Since some transformations are undefined for none positive
#' values, such as `log()` and `sqrt()`, the relevel variables are shifted (post
#' standardization) by `Z - min(Z) + 1` or `Z - min(Z)` (respectively).
#'
#'
#' @family standardize
#' @examples
#' model <- lm(Infant.Mortality ~ Education * Fertility, data = swiss)
#' coef(standardize(model))
#'
#' @export
#' @aliases standardize_models
standardize.default <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = TRUE,
  verbose = TRUE,
  include_response = TRUE,
  ...
) {
  if (!insight::is_model(x)) {
    insight::format_warning(
      paste0(
        "Objects or variables of class '",
        class(x)[1],
        "' cannot be standardized."
      )
    )
    return(x)
  }

  data_std <- NULL # needed to avoid note
  .standardize_models(
    x,
    robust = robust,
    two_sd = two_sd,
    weights = weights,
    verbose = verbose,
    include_response = include_response,
    update_expr = stats::update(x, data = data_std),
    ...
  )
}


.standardize_models <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = TRUE,
  verbose = TRUE,
  include_response = TRUE,
  update_expr,
  ...
) {
  # check model formula. Some notations don't work when standardizing data
  insight::formula_ok(
    x,
    action = "error",
    prefix_msg = "Model cannot be standardized.",
    verbose = verbose
  )

  m_info <- .get_model_info(x, ...)
  model_data <- insight::get_data(x, source = "mf", verbose = FALSE)

  if (isTRUE(attr(model_data, "is_subset"))) {
    insight::format_error("Cannot standardize a model fit with a 'subset = '.")
  }

  if (m_info$is_bayesian && verbose) {
    insight::format_warning(
      "Standardizing variables without adjusting priors may lead to bogus results unless priors are auto-scaled."
    )
  }

  ## ---- Z the RESPONSE? ----
  # 1. Some models have special responses that should not be standardized. This
  # includes:
  # - generalized linear models (counts, binomial, etc...)
  # - Survival models
  # 2. We also don't want to standardize the response when `two_sd = TRUE` -
  # instead we will standardize the response separately.

  include_response <- include_response && .safe_to_standardize_response(m_info)

  resp <- NULL
  if (!include_response || (include_response && two_sd)) {
    resp <- c(
      insight::find_response(x),
      insight::find_response(x, combine = FALSE)
    )
    resp <- insight::clean_names(resp)
    resp <- unique(resp)
  }

  # If there's an offset, don't standardize offset OR response
  offsets <- insight::find_offset(x)
  if (length(offsets)) {
    if (include_response) {
      if (verbose) {
        insight::format_warning("Offset detected and will be standardized.")
      }

      if (two_sd) {
        # Treat offsets like responses - only standardize by 1 SD
        resp <- c(resp, offsets)
        offsets <- NULL
      }
    } else if (!include_response) {
      # Don't standardize offsets if not standardizing the response
      offsets <- NULL
    }
  }

  ## ---- DO NOT Z: ----

  # 1. WEIGHTS:
  # because negative weights will cause errors in "update()"
  weight_variable <- insight::find_weights(x)

  if (
    !is.null(weight_variable) &&
      !weight_variable %in% colnames(model_data) &&
      "(weights)" %in% colnames(model_data)
  ) {
    model_data$.missing_weight <- model_data[["(weights)"]]
    colnames(model_data)[ncol(model_data)] <- weight_variable
    weight_variable <- c(weight_variable, "(weights)")
  }

  # 2. RANDOM-GROUPS:
  random_group_factor <- insight::find_random(
    x,
    flatten = TRUE,
    split_nested = TRUE
  )

  ## ---- SUMMARY: TO Z OR NOT TO Z? ----

  dont_standardize <- c(resp, weight_variable, random_group_factor)
  do_standardize <- setdiff(colnames(model_data), dont_standardize)

  # can't std data$var variables
  doller_vars <- grepl("(.*)\\$(.*)", do_standardize)
  if (any(doller_vars)) {
    doller_vars <- colnames(model_data)[doller_vars]
    insight::format_warning(
      "Unable to standardize variables evaluated in the environment (i.e., not in `data`).",
      "The following variables will not be standardizd:",
      toString(doller_vars)
    )
    do_standardize <- setdiff(do_standardize, doller_vars)
    dont_standardize <- c(dont_standardize, doller_vars)
  }

  if (!length(do_standardize)) {
    insight::format_warning("No variables could be standardized.")
    return(x)
  }

  ## ---- STANDARDIZE! ----

  w <- insight::get_weights(x, remove_na = TRUE)

  data_std <- standardize(
    model_data[do_standardize],
    robust = robust,
    two_sd = two_sd,
    weights = if (weights) w,
    verbose = verbose
  )

  # if two_sd, it must not affect the response!
  if (include_response && two_sd) {
    data_std[resp] <- standardize(
      model_data[resp],
      robust = robust,
      two_sd = FALSE,
      weights = if (weights) w,
      verbose = verbose
    )

    dont_standardize <- setdiff(dont_standardize, resp)
  }

  # FIX LOG-SQRT VARS:
  # if we standardize log-terms, standardization will fail (because log of
  # negative value is NaN). Do some back-transformation here

  log_terms <- .log_terms(x, data_std)
  if (length(log_terms) > 0) {
    data_std[log_terms] <- lapply(
      data_std[log_terms],
      function(i) i - min(i, na.rm = TRUE) + 1
    )
  }

  # same for sqrt
  sqrt_terms <- .sqrt_terms(x, data_std)
  if (length(sqrt_terms) > 0) {
    data_std[sqrt_terms] <- lapply(
      data_std[sqrt_terms],
      function(i) i - min(i, na.rm = TRUE)
    )
  }

  if (verbose && length(c(log_terms, sqrt_terms))) {
    insight::format_alert(
      "Formula contains log- or sqrt-terms.",
      "See help(\"standardize\") for how such terms are standardized."
    )
  }

  ## ---- ADD BACK VARS THAT WHERE NOT Z ----
  if (length(dont_standardize)) {
    remaining_columns <- intersect(colnames(model_data), dont_standardize)
    data_std <- cbind(model_data[, remaining_columns, drop = FALSE], data_std)
  }

  ## ---- UPDATE MODEL WITH Z DATA ----
  on.exit(.update_failed())

  if (isTRUE(verbose)) {
    model_std <- eval(substitute(update_expr))
  } else {
    utils::capture.output({
      model_std <- eval(substitute(update_expr))
    })
  }

  on.exit() # undo previous on.exit()

  model_std
}


# Special methods ---------------------------------------------------------

#' @export
standardize.brmsfit <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = TRUE,
  verbose = TRUE,
  include_response = TRUE,
  ...
) {
  data_std <- NULL # needed to avoid note
  if (insight::is_multivariate(x)) {
    insight::format_error(
      "Multivariate brmsfit models not supported.",
      "As an alternative: you may standardize your data (and adjust your priors), and re-fit the model."
    )
  }

  .standardize_models(
    x,
    robust = robust,
    two_sd = two_sd,
    weights = weights,
    verbose = verbose,
    include_response = include_response,
    update_expr = stats::update(x, newdata = data_std),
    ...
  )
}


#' @export
standardize.mixor <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = TRUE,
  verbose = TRUE,
  include_response = TRUE,
  ...
) {
  data_std <- random_group_factor <- NULL # needed to avoid note
  .standardize_models(
    x,
    robust = robust,
    two_sd = two_sd,
    weights = weights,
    verbose = verbose,
    include_response = include_response,
    update_expr = {
      data_std <- data_std[
        order(data_std[, random_group_factor, drop = FALSE]),
      ]
      stats::update(x, data = data_std)
    },
    ...
  )
}


#' @export
standardize.mediate <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = TRUE,
  verbose = TRUE,
  include_response = TRUE,
  ...
) {
  # models and data
  y <- x$model.y
  m <- x$model.m
  y_data <- insight::get_data(y, source = "mf", verbose = FALSE)
  m_data <- insight::get_data(m, source = "mf", verbose = FALSE)

  # std models and data
  y_std <- standardize(
    y,
    robust = robust,
    two_sd = two_sd,
    weights = weights,
    verbose = verbose,
    include_response = include_response,
    ...
  )
  m_std <- standardize(
    m,
    robust = robust,
    two_sd = two_sd,
    weights = weights,
    verbose = verbose,
    include_response = TRUE,
    ...
  )
  y_data_std <- insight::get_data(y_std, source = "mf", verbose = FALSE)
  m_data_std <- insight::get_data(m_std, source = "mf", verbose = FALSE)

  # fixed values
  covs <- x$covariates
  control.value <- x$control.value
  treat.value <- x$treat.value

  if (!is.null(covs)) {
    covs <- mapply(
      .rescale_fixed_values,
      covs,
      names(covs),
      SIMPLIFY = FALSE,
      MoreArgs = list(
        y_data = y_data,
        m_data = m_data,
        y_data_std = y_data_std,
        m_data_std = m_data_std
      )
    )
    if (verbose) {
      insight::format_alert(
        "Covariates' values have been rescaled to their standardized scales."
      )
    }
  }

  # if (is.numeric(y_data[[x$treat]]) || is.numeric(m_data[[x$treat]])) {
  #   if (!(is.numeric(y_data[[x$treat]]) && is.numeric(m_data[[x$treat]]))) {
  #     stop("'treat' variable is not of same type across both y and m models.",
  #          "\nCannot consistently standardize.", call. = FALSE)
  #   }
  #
  #   temp_vals <- .rescale_fixed_values(c(control.value, treat.value), x$treat,
  #                                      y_data = y_data, m_data = m_data,
  #                                      y_data_std = y_data_std, m_data_std = m_data_std)
  #
  #   control.value <- temp_vals[1]
  #   treat.value <- temp_vals[2]
  #   if (verbose) insight::format_alert("control and treatment values have been
  #   rescaled to their standardized scales.")
  # }

  if (verbose && !all(c(control.value, treat.value) %in% c(0, 1))) {
    insight::format_warning(
      "Control and treat values are not 0 and 1, and have not been re-scaled.",
      "Interpret results with caution."
    )
  }

  junk <- utils::capture.output({
    model_std <- stats::update(
      x,
      model.y = y_std,
      model.m = m_std,
      # control.value = control.value, treat.value = treat.value
      covariates = covs
    )
  })

  model_std
}


# Cannot ------------------------------------------------------------------

#' @export
standardize.wbm <- function(x, ...) {
  .update_failed(class(x))
}

#' @export
standardize.Surv <- standardize.wbm

#' @export
standardize.clm2 <- standardize.wbm

#' @export
standardize.bcplm <- standardize.wbm

#' @export
standardize.wbgee <- standardize.wbm

#' @export
standardize.biglm <- standardize.wbm
# biglm doesn't regit the model to new data - it ADDs MORE data to the model.

#' @export
# Almost the same as `standardize.default()` but we pass `use_calling_env` in
# update().
standardize.fixest <- function(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = TRUE,
  verbose = TRUE,
  include_response = TRUE,
  ...
) {
  data_std <- NULL # needed to avoid note
  .standardize_models(
    x,
    robust = robust,
    two_sd = two_sd,
    weights = weights,
    verbose = verbose,
    include_response = include_response,
    update_expr = stats::update(x, data = data_std, use_calling_env = FALSE),
    ...
  )
}

# helper ----------------------------

# Find log-terms inside model formula, and return "clean" term names
.log_terms <- function(model, data) {
  x <- insight::find_terms(model, flatten = TRUE)
  # log_pattern <- "^log\\((.*)\\)"
  log_pattern <- "(log\\(log|log|log1|log10|log1p|log2)\\(([^,\\+)]*).*"
  out <- insight::trim_ws(gsub(
    log_pattern,
    "\\2",
    grep(log_pattern, x, value = TRUE)
  ))
  intersect(colnames(data), out)
}

# Find log-terms inside model formula, and return "clean" term names
.sqrt_terms <- function(model, data) {
  x <- insight::find_terms(model, flatten = TRUE)
  pattern <- "sqrt\\(([^,\\+)]*).*"
  out <- insight::trim_ws(gsub(pattern, "\\1", grep(pattern, x, value = TRUE)))
  intersect(colnames(data), out)
}


#' @keywords internal
.safe_to_standardize_response <- function(info, verbose = TRUE) {
  if (is.null(info)) {
    if (verbose) {
      insight::format_warning(
        "Unable to verify if response should not be standardized.",
        "Response will be standardized."
      )
    }
    return(TRUE)
  }

  # check if model has a response variable that should not be standardized.
  info$is_linear &&
    info$family != "inverse.gaussian" &&
    !info$is_survival &&
    !info$is_censored

  # # alternative would be to keep something like:
  # !info$is_count &&
  #   !info$is_ordinal &&
  #   !info$is_multinomial &&
  #   !info$is_beta &&
  #   !info$is_censored &&
  #   !info$is_binomial &&
  #   !info$is_survival
  # # And then treating response for "Gamma()" or "inverse.gaussian" similar to
  # # log-terms...
}

#' @keywords internal
.rescale_fixed_values <- function(
  val,
  cov_nm,
  y_data,
  m_data,
  y_data_std,
  m_data_std
) {
  if (cov_nm %in% colnames(y_data)) {
    temp_data <- y_data
    temp_data_std <- y_data_std
  } else {
    temp_data <- m_data
    temp_data_std <- m_data_std
  }

  rescale(
    val,
    to = range(temp_data_std[[cov_nm]]),
    range = range(temp_data[[cov_nm]])
  )
}


#' @keywords internal
.update_failed <- function(class = NULL, ...) {
  if (is.null(class)) {
    msg1 <- "Unable to refit the model with standardized data."
  } else {
    msg1 <- sprintf(
      "Standardization of parameters not possible for models of class '%s'.",
      class
    )
  }

  insight::format_error(
    msg1,
    "Try instead to standardize the data (standardize(data)) and refit the model manually."
  )
}


================================================
FILE: R/text_format.R
================================================
#' Convenient text formatting functionalities
#'
#' Convenience functions to manipulate and format text.
#'
#' @param text,text2 A character string.
#' @param width Positive integer giving the target column width for wrapping
#' lines in the output. Can be "auto", in which case it will select 90\% of the
#' default width.
#' @param pattern Regex pattern to remove from `text`.
#' @param sep Separator.
#' @param last Last separator.
#' @param n The number of characters to find.
#' @param enclose Character that will be used to wrap elements of `text`, so
#'   these can be, e.g., enclosed with quotes or backticks. If `NULL` (default),
#'   text elements will not be enclosed.
#' @param ... Other arguments to be passed to or from other functions.
#'
#' @return A character string.
#'
#' @examples
#' # Add full stop if missing
#' text_fullstop(c("something", "something else."))
#'
#' # Find last characters
#' text_lastchar(c("ABC", "DEF"), n = 2)
#'
#' # Smart concatenation
#' text_concatenate(c("First", "Second", "Last"))
#' text_concatenate(c("First", "Second", "Last"), last = " or ", enclose = "`")
#'
#' # Remove parts of string
#' text_remove(c("one!", "two", "three!"), "!")
#'
#' # Wrap text
#' long_text <- paste(rep("abc ", 100), collapse = "")
#' cat(text_wrap(long_text, width = 50))
#'
#' # Paste with optional separator
#' text_paste(c("A", "", "B"), c("42", "42", "42"))
#' @export
text_format <- function(
  text,
  sep = ", ",
  last = " and ",
  width = NULL,
  enclose = NULL,
  ...
) {
  text_wrap(
    text_concatenate(text, sep = sep, last = last, enclose = enclose),
    width = width
  )
}

#' @rdname text_format
#' @export
text_fullstop <- function(text) {
  text[!text_lastchar(text) %in% c(".", ":", ",", ";", "!", "?")] <- paste0(
    text[text_lastchar(text) != "."],
    "."
  )
  text
}


#' @rdname text_format
#' @export
text_lastchar <- function(text, n = 1) {
  vapply(
    text,
    function(xx) {
      substr(xx, (nchar(xx) - n + 1), nchar(xx))
    },
    FUN.VALUE = character(1L)
  )
}


#' @rdname text_format
#' @export
text_concatenate <- function(text, sep = ", ", last = " and ", enclose = NULL) {
  if (length(text) == 1 && !nzchar(text, keepNA = TRUE)) {
    return(text)
  }
  text <- text[text != ""] # nolint
  if (
    length(text) &&
      !is.null(enclose) &&
      length(enclose) == 1 &&
      nzchar(enclose, keepNA = TRUE)
  ) {
    text <- paste0(enclose, text, enclose)
  }
  if (length(text) == 1) {
    s <- text
  } else {
    s <- paste(text[1:(length(text) - 1)], collapse = sep)
    s <- paste(c(s, text[length(text)]), collapse = last)
  }
  s
}


#' @rdname text_format
#' @export
text_paste <- function(text, text2 = NULL, sep = ", ", enclose = NULL, ...) {
  if (!is.null(text2)) {
    if (
      !is.null(enclose) &&
        length(enclose) == 1 &&
        nzchar(enclose, keepNA = TRUE)
    ) {
      text <- vapply(
        text,
        function(i) {
          if (i != "") {
            i <- paste0(enclose, i, enclose)
          }
          i
        },
        character(1L)
      )
      text2 <- vapply(
        text2,
        function(i) {
          if (i != "") {
            i <- paste0(enclose, i, enclose)
          }
          i
        },
        character(1L)
      )
    }
    paste0(text, ifelse(text == "" | text2 == "", "", sep), text2) # nolint
  }
}


#' @rdname text_format
#' @export
text_remove <- function(text, pattern = "", ...) {
  gsub(pattern, "", text, ...)
}


#' @rdname text_format
#' @export
text_wrap <- function(text, width = NULL, ...) {
  width <- width %||% getOption("width")

  text <- strsplit(text, "\n", fixed = TRUE)
  text <- unlist(text, use.names = FALSE)

  wrapped <- ""

  for (s in text) {
    if (nchar(s) > width) {
      leading_spaces <- nchar(s) - nchar(insight::trim_ws(s))
      s <- strwrap(s, width = width)
      s <- paste(s, collapse = "\n")
      s <- paste0(strrep(" ", leading_spaces), s)
    }
    wrapped <- paste0(wrapped, s, "\n")
  }

  wrapped
}


================================================
FILE: R/to_factor.R
================================================
#' @title Convert data to factors
#' @name to_factor
#'
#' @details
#' Convert variables or data into factors. If the data is labelled, value labels
#' will be used as factor levels. The counterpart to convert variables into
#' numeric is `to_numeric()`.
#'
#' @param x A data frame or vector.
#' @param labels_to_levels Logical, if `TRUE`, value labels are used as factor
#' levels after `x` was converted to factor. Else, factor levels are based on
#' the values of `x` (i.e. as if using `as.factor()`).
#' @param ... Arguments passed to or from other methods.
#' @inheritParams extract_column_names
#' @inheritParams categorize
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return A factor, or a data frame of factors.
#'
#' @note Factors are ignored and returned as is. If you want to use value labels
#' as levels for factors, use [`labels_to_levels()`] instead.
#'
#' @examples
#' str(to_factor(iris))
#'
#' # use labels as levels
#' data(efc)
#' str(efc$c172code)
#' head(to_factor(efc$c172code))
#' @export
to_factor <- function(x, ...) {
  UseMethod("to_factor")
}


#' @export
to_factor.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      sprintf(
        "Converting into factors values currently not possible for variables of class `%s`.",
        class(x)[1]
      )
    )
  }
  x
}

#' @export
to_factor.factor <- function(x, ...) {
  x
}

#' @rdname to_factor
#' @export
to_factor.numeric <- function(x, labels_to_levels = TRUE, verbose = TRUE, ...) {
  # preserve labels
  variable_label <- attr(x, "label", exact = TRUE)
  value_labels <- attr(x, "labels", exact = TRUE)

  # to factor
  x <- as.factor(x)

  # add back labels
  attr(x, "label") <- variable_label
  attr(x, "labels") <- value_labels

  # value labels to factor levels
  if (labels_to_levels) {
    x <- .value_labels_to_levels(x, verbose = verbose, ...)
  }
  x
}

#' @export
to_factor.logical <- to_factor.numeric

#' @export
to_factor.character <- to_factor.numeric

#' @export
to_factor.Date <- to_factor.numeric

#' @export
to_factor.haven_labelled <- to_factor.numeric

#' @export
to_factor.double <- to_factor.numeric

#' @rdname to_factor
#' @export
to_factor.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  append = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # validation check, return as is for complete factor
  if (all(vapply(x, is.factor, FUN.VALUE = logical(1L)))) {
    return(x)
  }

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # drop factors, when append is not FALSE
    select <- colnames(x[select])[
      !vapply(x[select], is.factor, FUN.VALUE = logical(1L))
    ]
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_f",
      keep_factors = FALSE,
      keep_character = TRUE,
      preserve_value_labels = TRUE
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }

  x[select] <- lapply(x[select], to_factor, verbose = verbose, ...)
  x
}


================================================
FILE: R/to_numeric.R
================================================
#' Convert data to numeric
#'
#' Convert data to numeric by converting characters to factors and factors to
#' either numeric levels or dummy variables. The "counterpart" to convert
#' variables into factors is `to_factor()`.
#'
#' @param x A data frame, factor or vector.
#' @param dummy_factors Transform factors to dummy factors (all factor levels as
#'   different columns filled with a binary 0-1 value).
#' @param preserve_levels Logical, only applies if `x` is a factor. If `TRUE`,
#' and `x` has numeric factor levels, these will be converted into the related
#' numeric values. If this is not possible, the converted numeric values will
#' start from 1 to number of levels.
#' @param lowest Numeric, indicating the lowest (minimum) value when converting
#' factors or character vectors to numeric values.
#' @param ... Arguments passed to or from other methods.
#' @inheritParams extract_column_names
#' @inheritParams categorize
#'
#' @note When factors should be converted into multiple "binary" dummies, i.e.
#' each factor level is converted into a separate column filled with a binary
#' 0-1 value, set `dummy_factors = TRUE`. If you want to preserve the original
#' factor levels (in case these represent numeric values), use
#' `preserve_levels = TRUE`.
#'
#' @section Selection of variables - `select` argument:
#' For most functions that have a `select` argument the complete input data
#' frame is returned, even when `select` only selects a range of variables.
#' However, for `to_numeric()`, factors might be converted into dummies,
#' thus, the number of variables of the returned data frame no longer match
#' the input data frame. Hence, when `select` is used, *only* those variables
#' (or their dummies) specified in `select` will be returned. Use `append=TRUE`
#' to also include the original variables in the returned data frame.
#'
#' @examples
#' to_numeric(head(ToothGrowth))
#' to_numeric(head(ToothGrowth), dummy_factors = TRUE)
#'
#' # factors
#' x <- as.factor(mtcars$gear)
#' to_numeric(x)
#' to_numeric(x, preserve_levels = TRUE)
#' # same as:
#' coerce_to_numeric(x)
#'
#' @return A data frame of numeric variables.
#'
#' @export
to_numeric <- function(x, ...) {
  UseMethod("to_numeric")
}

#' @export
to_numeric.default <- function(x, verbose = TRUE, ...) {
  if (isTRUE(verbose)) {
    insight::format_alert(
      sprintf(
        "Converting into numeric values currently not possible for variables of class '%s'.",
        class(x)[1]
      )
    )
  }
  x
}


#' @rdname to_numeric
#' @export
to_numeric.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  dummy_factors = FALSE,
  preserve_levels = FALSE,
  lowest = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # validation check, return as is for complete numeric
  if (all(vapply(x, is.numeric, FUN.VALUE = logical(1L)))) {
    return(x)
  }

  df_attr <- attributes(x)

  # evaluate arguments
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # drop numerics, when append is not FALSE
    select <- colnames(x[select])[
      !vapply(x[select], is.numeric, FUN.VALUE = logical(1L))
    ]
    # process arguments
    fun_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_n",
      keep_factors = TRUE
    )
    # update processed arguments
    x <- fun_args$x
    select <- fun_args$select
  }

  out <- sapply(
    x[select],
    to_numeric,
    dummy_factors = dummy_factors,
    preserve_levels = preserve_levels,
    lowest = lowest,
    verbose = verbose,
    simplify = FALSE
  )

  # save variable attributes
  attr_vars <- lapply(out, attributes)
  # "out" is currently a list, bind columns and to data frame
  out <- as.data.frame(do.call(cbind, out))
  # set back attributes
  for (i in colnames(out)) {
    if (is.list(attr_vars[[i]])) {
      if (is.list(attributes(out[[i]]))) {
        attributes(out[[i]]) <- utils::modifyList(
          attr_vars[[i]],
          attributes(out[[i]])
        )
      } else {
        attributes(out[[i]]) <- attr_vars[[i]]
      }
    }
  }

  # due to the special handling of dummy factors, we need to take care
  # of appending the data here again. usually, "fun_args$x" includes the appended
  # data, which does not work here...

  if (!isFALSE(append)) {
    common_columns <- intersect(colnames(x), colnames(out))
    if (length(common_columns)) {
      x[common_columns] <- NULL
    }
    out <- cbind(x, out)
  }

  # add back custom attributes
  out <- .replace_attrs(out, df_attr)
  out
}


#' @export
to_numeric.numeric <- function(x, verbose = TRUE, ...) {
  .set_back_labels(as.numeric(x), x, reverse_values = FALSE)
}

#' @export
to_numeric.double <- to_numeric.numeric

#' @export
to_numeric.logical <- to_numeric.numeric

#' @export
to_numeric.haven_labelled <- to_numeric.numeric

#' @export
to_numeric.Date <- function(x, verbose = TRUE, ...) {
  if (verbose) {
    insight::format_warning(
      paste0(
        "Converting a date-time variable of class `",
        class(x)[1],
        "` into numeric."
      ),
      "Please note that this conversion probably does not return meaningful results."
    )
  }
  as.numeric(x)
}

#' @export
to_numeric.POSIXt <- to_numeric.Date

#' @export
to_numeric.POSIXct <- to_numeric.Date

#' @export
to_numeric.POSIXlt <- to_numeric.Date


#' @export
to_numeric.factor <- function(
  x,
  dummy_factors = FALSE,
  preserve_levels = FALSE,
  lowest = NULL,
  verbose = TRUE,
  ...
) {
  # preserving levels only works when factor levels are numeric
  if (
    isTRUE(preserve_levels) &&
      anyNA(suppressWarnings(as.numeric(as.character(stats::na.omit(x)))))
  ) {
    preserve_levels <- FALSE
  }

  if (dummy_factors) {
    out <- as.data.frame(stats::model.matrix(
      ~x,
      contrasts.arg = list(x = "contr.treatment")
    ))
    out[1] <- as.numeric(rowSums(out[2:ncol(out)]) == 0)

    # insert back NA rows. if "x" had missing values, model.matrix() creates an
    # array with only non-missing values, so some rows are missing. First, we
    # need to now which rows are missing (na_values) and the length of the
    # original vector (which will be the number of rows in the final data frame)

    na_values <- which(is.na(x))
    rows_x <- length(x)

    if (any(na_values)) {
      # iterate all missing values that have
      for (i in seq_along(na_values)) {
        # if the first observation was missing, add NA row and bind data frame
        if (i == 1 && na_values[i] == 1) {
          out <- rbind(NA, out)
        } else if (na_values[i] == rows_x) {
          # if the last observation was NA, add NA row to data frame
          out <- rbind(out, NA)
        } else {
          # else, pick rows from beginning to current NA value, add NA,
          # and rbind the remaining rows
          out <- rbind(
            out[1:(na_values[i] - 1), ],
            NA,
            out[na_values[i]:nrow(out), ]
          )
        }
      }
      rownames(out) <- NULL
    }
    names(out) <- levels(x)
  } else if (preserve_levels) {
    if (is.unsorted(levels(x))) {
      x_inverse <- rep(NA_real_, length(x))
      for (i in 1:nlevels(x)) {
        x_inverse[x == levels(x)[i]] <- as.numeric(levels(x)[
          nlevels(x) - i + 1
        ])
      }
      x <- factor(x_inverse)
    }
    out <- .set_back_labels(
      as.numeric(as.character(x)),
      x,
      reverse_values = FALSE
    )
  } else {
    out <- .set_back_labels(as.numeric(x), x, reverse_values = FALSE)
  }

  # shift to requested starting value
  if (!is.null(lowest)) {
    difference <- min(out) - lowest
    out <- out - difference
  }

  out
}


#' @export
to_numeric.character <- function(
  x,
  dummy_factors = FALSE,
  lowest = NULL,
  verbose = TRUE,
  ...
) {
  numbers <- vapply(
    x,
    function(i) {
      element <- tryCatch(str2lang(i), error = function(e) NULL)
      !is.null(element) && is.numeric(element)
    },
    FUN.VALUE = logical(1L)
  )
  if (all(numbers)) {
    out <- as.numeric(vapply(x, str2lang, FUN.VALUE = numeric(1L)))
  } else {
    out <- to_numeric(as.factor(x), dummy_factors = dummy_factors)
  }

  # shift to requested starting value
  if (!is.null(lowest)) {
    difference <- min(out) - lowest
    out <- out - difference
  }

  out
}


#' Convert to Numeric (if possible)
#'
#' Tries to convert vector to numeric if possible (if no warnings or errors).
#' Otherwise, leaves it as is.
#'
#' @param x A vector to be converted.
#'
#' @examples
#' coerce_to_numeric(c("1", "2"))
#' coerce_to_numeric(c("1", "2", "A"))
#' @return Numeric vector (if possible)
#' @export
coerce_to_numeric <- function(x) {
  tryCatch(
    as.numeric(as.character(x)),
    error = function(e) x,
    warning = function(w) x
  )
}


================================================
FILE: R/unnormalize.R
================================================
#' @rdname normalize
#' @export
unnormalize <- function(x, ...) {
  UseMethod("unnormalize")
}


#' @export
unnormalize.default <- function(x, ...) {
  insight::format_error(
    "Variables of class '",
    class(x)[1],
    "' can't be unnormalized."
  )
}


#' @rdname normalize
#' @export
unnormalize.numeric <- function(x, verbose = TRUE, ...) {
  ## TODO implement algorithm include_bounds = FALSE

  # if function called from the "grouped_df" method, we use the dw_transformer
  # attributes that were recovered in the "grouped_df" method

  dots <- match.call(expand.dots = FALSE)[["..."]]
  grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))

  if (is.null(grp_attr_dw)) {
    include_bounds <- attr(x, "include_bounds")
    min_value <- attr(x, "min_value")
    range_difference <- attr(x, "range_difference")
    to_range <- attr(x, "to_range")
  } else {
    names(grp_attr_dw) <- gsub(".*\\.", "", names(grp_attr_dw))
    include_bounds <- grp_attr_dw["include_bounds"]
    min_value <- grp_attr_dw["min_value"]
    range_difference <- grp_attr_dw["range_difference"]
    to_range <- grp_attr_dw["to_range"]
    if (is.na(to_range)) {
      to_range <- NULL
    }
  }

  if (is.null(min_value) || is.null(range_difference)) {
    if (verbose) {
      insight::format_error(
        "Can't unnormalize variable. Information about range and/or minimum value is missing."
      )
    }
    return(x)
  }

  if (is.null(to_range)) {
    x * range_difference + min_value
  } else {
    (x - to_range[1]) * (range_difference / diff(to_range)) + min_value
  }
}


#' @rdname normalize
#' @export
unnormalize.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # if function called from the "grouped_df" method, we use the dw_transformer
  # attributes that were recovered in the "grouped_df" method

  dots <- match.call(expand.dots = FALSE)[["..."]]

  if (is.null(dots$grp_attr_dw)) {
    grp_attr_dw <- NULL
  } else {
    grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))
  }

  for (i in select) {
    var_attr <- grep(paste0("^attr\\_", i, "\\."), names(grp_attr_dw))
    attrs <- grp_attr_dw[var_attr]
    x[[i]] <- unnormalize(x[[i]], verbose = verbose, grp_attr_dw = attrs)
  }

  x
}

#' @rdname normalize
#' @export
unnormalize.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    remove_group_var = TRUE,
    verbose = verbose
  )

  info <- attributes(x)

  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  x <- as.data.frame(x)

  for (i in select) {
    if (is.null(info$groups[[paste0("attr_", i)]])) {
      insight::format_error(
        paste(
          "Couldn't retrieve the necessary information to unnormalize",
          text_concatenate(i, enclose = "`")
        )
      )
    }
  }
  for (rows in seq_along(grps)) {
    # get the dw_transformer attributes for this group
    raw_attrs <- unlist(info$groups[
      rows,
      startsWith(names(info$groups), "attr")
    ])
    if (length(select) == 1L) {
      names(raw_attrs) <- paste0("attr_", select, ".", names(raw_attrs))
    }

    tmp <- unnormalize(
      x[grps[[rows]], , drop = FALSE],
      select = select,
      exclude = exclude,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      grp_attr_dw = raw_attrs
    )
    x[grps[[rows]], ] <- tmp
  }
  # set back class, so data frame still works with dplyr
  attributes(x) <- utils::modifyList(info, attributes(x))
  class(x) <- c("grouped_df", class(x))
  x
}


================================================
FILE: R/unstandardize.R
================================================
#' @rdname standardize
#' @export
unstandardize <- function(x, ...) {
  UseMethod("unstandardize")
}

#' @rdname standardize
#' @export
unstandardise <- unstandardize

#' @rdname standardize
#' @export
unstandardize.numeric <- function(
  x,
  center = NULL,
  scale = NULL,
  reference = NULL,
  robust = FALSE,
  two_sd = FALSE,
  ...
) {
  if (!is.null(reference)) {
    if (robust) {
      center <- stats::median(reference, na.rm = TRUE)
      scale <- stats::mad(reference, na.rm = TRUE)
    } else {
      center <- mean(reference, na.rm = TRUE)
      scale <- stats::sd(reference, na.rm = TRUE)
    }
  } else if (is.null(center) || is.null(scale)) {
    if (all(c("center", "scale") %in% names(attributes(x)))) {
      center <- attr(x, "center", exact = TRUE)
      scale <- attr(x, "scale", exact = TRUE)
      attr(x, "scale") <- attr(x, "center") <- NULL
    } else if (
      all(c("scaled:center", "scaled:scale") %in% names(attributes(x)))
    ) {
      center <- attr(x, "scaled:center", exact = TRUE)
      scale <- attr(x, "scaled:scale", exact = TRUE)
      attr(x, "scaled:scale") <- attr(x, "scaled:center") <- NULL
    } else {
      insight::format_error(
        "You must provide the arguments `center`, `scale` or `reference`."
      )
    }
  }

  if (two_sd) {
    scale <- 2 * scale
  }

  x * scale + center
}

#' @rdname standardize
#' @export
unstandardize.data.frame <- function(
  x,
  center = NULL,
  scale = NULL,
  reference = NULL,
  robust = FALSE,
  two_sd = FALSE,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # Select and deselect
  cols <- .select_nse(
    select,
    x,
    exclude = exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  dots <- match.call(expand.dots = FALSE)[["..."]]

  if (is.null(dots$grp_attr_dw)) {
    grp_attr_dw <- NULL
  } else {
    grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))
  }

  if (!is.null(grp_attr_dw)) {
    center <- vapply(
      cols,
      function(x) {
        grp_attr_dw[grep(
          paste0("^attr\\_", x, "\\.center"),
          names(grp_attr_dw)
        )]
      },
      FUN.VALUE = numeric(1L)
    )
    scale <- vapply(
      cols,
      function(x) {
        grp_attr_dw[grep(paste0("^attr\\_", x, "\\.scale"), names(grp_attr_dw))]
      },
      FUN.VALUE = numeric(1L)
    )
    i <- vapply(x[, cols, drop = FALSE], is.numeric, FUN.VALUE = logical(1L))
  } else if (!is.null(reference)) {
    i <- vapply(x[, cols, drop = FALSE], is.numeric, FUN.VALUE = logical(1L))
    i <- i[i]
    reference <- reference[names(i)]
    if (robust) {
      center <- vapply(
        reference,
        FUN.VALUE = numeric(1L),
        stats::median,
        na.rm = TRUE
      )
      scale <- vapply(
        reference,
        FUN.VALUE = numeric(1L),
        stats::mad,
        na.rm = TRUE
      )
    } else {
      center <- vapply(reference, FUN.VALUE = numeric(1L), mean, na.rm = TRUE)
      scale <- vapply(
        reference,
        FUN.VALUE = numeric(1L),
        stats::sd,
        na.rm = TRUE
      )
    }
  } else if (is.null(center) || is.null(scale)) {
    i <- vapply(
      x[, cols, drop = FALSE],
      function(k) {
        a <- attributes(k)
        is.numeric(k) && !is.null(a) && all(c("scale", "center") %in% names(a))
      },
      FUN.VALUE = logical(1L)
    )

    if (any(i)) {
      i <- i[i]
      center <- vapply(
        x[names(i)],
        FUN.VALUE = numeric(1L),
        attr,
        "center",
        exact = TRUE
      )
      scale <- vapply(
        x[names(i)],
        FUN.VALUE = numeric(1L),
        attr,
        "scale",
        exact = TRUE
      )
    } else if (all(c("center", "scale") %in% names(attributes(x)))) {
      center <- attr(x, "center", exact = TRUE)
      scale <- attr(x, "scale", exact = TRUE)
      attr(x, "center") <- attr(x, "scale") <- NULL
      i <- names(x) %in% names(scale)
      i <- i[i]
    } else {
      insight::format_error(
        "You must provide the arguments `center`, `scale` or `reference`."
      )
    }
  } else {
    if (is.null(names(center))) {
      i <- vapply(x, is.numeric, FUN.VALUE = logical(1L))
      names(center) <- names(scale) <- names(x[i])
    }

    i <- names(x) %in% names(center)
    names(i) <- names(x)
    i <- i[i]
  }

  if (two_sd) {
    scale <- 2 * scale
  }

  cols <- names(i)

  # Apply unstandardization to cols
  for (col in cols) {
    x[col] <- unstandardize(
      x[[col]],
      center = center[[col]],
      scale = scale[[col]]
    )
  }
  x
}

#' @export
unstandardize.factor <- function(x, ...) {
  x
}

#' @export
unstandardize.character <- function(x, ...) {
  x
}


#' @export
unstandardize.grouped_df <- function(
  x,
  center = NULL,
  scale = NULL,
  reference = NULL,
  robust = FALSE,
  two_sd = FALSE,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    remove_group_var = TRUE,
    verbose = verbose
  )

  info <- attributes(x)

  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  x <- as.data.frame(x)

  for (i in select) {
    if (is.null(info$groups[[paste0("attr_", i)]])) {
      insight::format_error(
        paste(
          "Couldn't retrieve the necessary information to unstandardize",
          text_concatenate(i, enclose = "`")
        )
      )
    }
  }

  for (rows in seq_along(grps)) {
    # get the dw_transformer attributes for this group
    raw_attrs <- unlist(info$groups[
      rows,
      startsWith(names(info$groups), "attr")
    ])
    if (length(select) == 1L) {
      names(raw_attrs) <- paste0("attr_", select, ".", names(raw_attrs))
    }

    tmp <- unstandardise(
      x[grps[[rows]], , drop = FALSE],
      center = center,
      scale = scale,
      reference = reference,
      robust = robust,
      two_sd = two_sd,
      select = select,
      exclude = exclude,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      grp_attr_dw = raw_attrs
    )
    x[grps[[rows]], ] <- tmp
  }
  # set back class, so data frame still works with dplyr
  attributes(x) <- utils::modifyList(info, attributes(x))
  class(x) <- c("grouped_df", class(x))
  x
}

#' @export
unstandardize.matrix <- function(
  x,
  center = NULL,
  scale = NULL,
  reference = NULL,
  robust = FALSE,
  two_sd = FALSE,
  ...
) {
  if (all(c("scaled:center", "scaled:scale") %in% names(attributes(x)))) {
    center <- attr(x, "scaled:center", exact = TRUE)
    scale <- attr(x, "scaled:scale", exact = TRUE)
    attr(x, "scaled:center") <- attr(x, "scaled:scale") <- NULL

    for (col in seq_len(ncol(x))) {
      x[, col] <- unstandardize.numeric(
        x[, col],
        center = center[col],
        scale = scale[col]
      )
    }
  } else {
    scales <- attr(x, "scale")
    centers <- attr(x, "center")

    xl <- lapply(seq_len(ncol(x)), function(i) {
      tmp <- x[, i]
      attributes(tmp) <- list(center = centers[i], scale = scales[i])
      tmp
    })

    xz <- lapply(xl, datawizard::unstandardize, ...)
    x_out <- do.call(cbind, xz)
    dimnames(x_out) <- dimnames(x)

    x <- x_out
  }
  x
}

#' @export
unstandardize.array <- unstandardize.matrix


# Datagrid ----------------------------------------------------------------

#' @export
unstandardize.datagrid <- function(x, ...) {
  x[names(x)] <- unstandardize(
    as.data.frame(x),
    reference = attributes(x)$data,
    ...
  )
  x
}

#' @export
unstandardize.visualisation_matrix <- unstandardize.datagrid


================================================
FILE: R/utils-cols.R
================================================
#' Tools for working with column names
#'
#' @param x A data frame.
#' @param row Row to use as column names.
#' @param na_prefix Prefix to give to the column name if the row has an `NA`.
#' Default is 'x', and it will be incremented at each `NA` (`x1`, `x2`, etc.).
#' @param verbose Toggle warnings.
#' @param prefix Prefix to give to the column name. Default is 'x', and it will
#' be incremented at each column (`x1`, `x2`, etc.).
#'
#' @return
#' `row_to_colnames()` and `colnames_to_row()` both return a data frame.
#'
#' @rdname colnames
#'
#' @export
#'
#' @examples
#' # Convert a row to column names --------------------------------
#' test <- data.frame(
#'   a = c("iso", 2, 5),
#'   b = c("year", 3, 6),
#'   c = c("value", 5, 7)
#' )
#' test
#' row_to_colnames(test)
#'
#' # Convert column names to row --------------------------------
#' test <- data.frame(
#'   ARG = c("BRA", "FRA"),
#'   `1960` = c(1960, 1960),
#'   `2000` = c(2000, 2000)
#' )
#' test
#' colnames_to_row(test)
#'
row_to_colnames <- function(x, row = 1, na_prefix = "x", verbose = TRUE) {
  if (!is.numeric(row)) {
    insight::format_error("Argument `row` must be of type numeric.")
  }
  if (length(row) != 1) {
    insight::format_error("Argument `row` must be of length 1.")
  }
  if (nrow(x) < row) {
    insight::format_error(
      paste0(
        "You used row = ",
        row,
        " but the dataset only has ",
        nrow(x),
        " rows."
      )
    )
  }

  new_colnames <- as.character(unlist(x[row, ], use.names = FALSE))

  # Create default colnames if there are NAs in the row used
  which_na <- which(is.na(new_colnames))
  n_na <- length(which_na)
  if (n_na > 0) {
    for (i in seq_along(which_na)) {
      new_colnames[which_na[i]] <- paste0(na_prefix, i)
    }
    if (verbose) {
      insight::format_warning(
        paste0(
          "Some values of row ",
          row,
          " were NAs. The corresponding column names are prefixed with `",
          na_prefix,
          "`."
        )
      )
    }
  }
  colnames(x) <- new_colnames
  x[-row, ]
}


#' @rdname colnames
#' @export
colnames_to_row <- function(x, prefix = "x") {
  if (length(prefix) != 1) {
    insight::format_error("Argument `prefix` must be of length 1.")
  }
  if (!is.character(prefix)) {
    insight::format_error("Argument `prefix` must be of type character.")
  }
  x2 <- rbind(colnames(x), x)
  colnames(x2) <- paste0(prefix, seq_len(ncol(x2)))
  x2
}


================================================
FILE: R/utils-rows.R
================================================
#' Tools for working with row names or row ids
#'
#' @param x A data frame.
#' @param var Name of column to use for row names/ids. For `column_as_rownames()`,
#'   this argument can be the variable name or the column number. For
#'   `rownames_as_column()` and `rowid_as_column()`, the column name must not
#'   already exist in the data.
#'
#' @details
#' These are similar to `tibble`'s functions `column_to_rownames()`,
#' `rownames_to_column()` and `rowid_to_column()`. Note that the behavior of
#' `rowid_as_column()` is different for grouped dataframe: instead of making
#' the rowid unique across the full dataframe, it creates rowid per group.
#' Therefore, there can be several rows with the same rowid if they belong to
#' different groups.
#'
#' If you are familiar with `dplyr`, this is similar to doing the following:
#' ```r
#' data |>
#'   group_by(grp) |>
#'   mutate(id = row_number()) |>
#'   ungroup()
#' ```
#'
#' @return
#' A data frame.
#'
#' @rdname rownames
#'
#' @examples
#' # Convert between row names and column --------------------------------
#' test <- rownames_as_column(mtcars, var = "car")
#' test
#' head(column_as_rownames(test, var = "car"))
#'
#' @export
rownames_as_column <- function(x, var = "rowname") {
  if (!insight::object_has_rownames(x)) {
    insight::format_error("The data frame doesn't have rownames.")
  }
  if (is.null(var)) {
    var <- "rowname"
  }
  if (!is.character(var)) {
    insight::format_error("Argument 'var' must be of type character.")
  }
  if (var %in% colnames(x)) {
    insight::format_error(
      paste0("There is already a variable named `", var, "` in your dataset.")
    )
  }
  original_x <- x
  rn <- data.frame(rn = rownames(x), stringsAsFactors = FALSE)
  x <- cbind(rn, x)
  colnames(x)[1] <- var
  rownames(x) <- NULL
  x <- .replace_attrs(x, attributes(original_x))
  x
}

#' @rdname rownames
#' @export
column_as_rownames <- function(x, var = "rowname") {
  if (!is.character(var) && !is.numeric(var)) {
    insight::format_error(
      "Argument `var` must be of type character or numeric."
    )
  }
  if (is.character(var) && !var %in% names(x)) {
    insight::format_error(paste0(
      "Variable \"",
      var,
      "\" is not in the data frame."
    ))
  }
  if (is.numeric(var) && (var > ncol(x) || var <= 0)) {
    insight::format_error(
      "Column ",
      var,
      " does not exist. There are ",
      ncol(x),
      " columns in the data frame."
    )
  }

  original_x <- x
  rownames(x) <- x[[var]]
  x[[var]] <- NULL
  x <- .replace_attrs(x, attributes(original_x))
  x
}


#' @rdname rownames
#' @export
#' @examples
#' test_data <- head(iris)
#'
#' rowid_as_column(test_data)
#' rowid_as_column(test_data, var = "my_id")
rowid_as_column <- function(x, var = "rowid") {
  UseMethod("rowid_as_column")
}

#' @export
rowid_as_column.default <- function(x, var = "rowid") {
  if (is.null(var)) {
    var <- "rowid"
  }
  if (!is.character(var)) {
    insight::format_error("Argument 'var' must be of type character.")
  }
  if (var %in% colnames(x)) {
    insight::format_error(
      paste0("There is already a variable named `", var, "` in your dataset.")
    )
  }
  original_x <- x
  rn <- data.frame(rn = seq_len(nrow(x)), stringsAsFactors = FALSE)
  x <- cbind(rn, x)
  colnames(x)[1] <- var
  rownames(x) <- NULL
  x <- .replace_attrs(x, attributes(original_x))
  x
}


#' @export
rowid_as_column.grouped_df <- function(x, var = "rowid") {
  if (!is.character(var)) {
    insight::format_error("Argument 'var' must be of type character.")
  }
  if (var %in% colnames(x)) {
    insight::format_error(
      paste0("There is already a variable named `", var, "` in your dataset.")
    )
  }

  grps <- attr(x, "groups", exact = TRUE)
  grps <- grps[[".rows"]]

  for (i in seq_along(grps)) {
    x[grps[[i]], var] <- seq_along(grps[[i]])
  }

  # can't just put select = "var" because there could be another variable
  # called var
  x <- data_relocate(x, paste0("^", var, "$"), regex = TRUE, before = 1)

  x
}


================================================
FILE: R/utils.R
================================================
#' @keywords internal
.get_model_info <- function(model, model_info = NULL, ...) {
  if (is.null(model_info)) {
    model_info <- insight::model_info(model)
  }

  model_info
}

#' `NULL` coalescing operator
#'
#' @keywords internal
#' @noRd
`%||%` <- function(x, y) {
  if (is.null(x)) y else x
}


#' Try to convert object to a dataframe
#'
#' @keywords internal
#' @noRd
.coerce_to_dataframe <- function(data) {
  if (!is.data.frame(data)) {
    data <- tryCatch(
      as.data.frame(data, stringsAsFactors = FALSE),
      error = function(e) {
        insight::format_error(
          "`data` must be a data frame, or an object that can be coerced to a data frame."
        )
      }
    )
  }
  data
}


#' Checks dataframes for syntactically valid column names
#' Argument "action" can be "warning", "error", or "message".
#'
#' @keywords internal
#' @noRd
.check_dataframe_names <- function(x, action = "warning", verbose = TRUE) {
  if (verbose && !all(colnames(x) == make.names(colnames(x), unique = TRUE))) {
    insight::format_alert(
      "Bad column names (e.g., with spaces) have been detected which might create issues in many functions.",
      paste0(
        "We recommend to rename following columns: ",
        text_concatenate(
          colnames(x)[colnames(x) != make.names(colnames(x), unique = TRUE)],
          enclose = "`"
        )
      ),
      "You can run `names(mydata) <- make.names(names(mydata))` or use `janitor::clean_names()` for a quick fix.", # nolint
      type = action
    )
  }
}


#' Fuzzy grep, matches pattern that are close, but not identical
#' @examples
#' colnames(iris)
#' p <- sprintf("(%s){~%i}", "Spela", 2)
#' grep(pattern = p, x = colnames(iris), ignore.case = FALSE)
#' @keywords internal
#' @noRd
.fuzzy_grep <- function(x, pattern, precision = NULL) {
  if (is.null(precision)) {
    precision <- round(nchar(pattern) / 3)
  }
  if (precision > nchar(pattern)) {
    return(NULL)
  }
  p <- sprintf("(%s){~%i}", pattern, precision)
  grep(pattern = p, x = x, ignore.case = FALSE)
}


#' create a message string to tell user about matches that could possibly
#' be the string they were looking for
#'
#' @keywords internal
#' @noRd
.misspelled_string <- function(source, searchterm, default_message = NULL) {
  if (is.null(searchterm) || length(searchterm) < 1) {
    return(default_message)
  }
  # used for many matches
  more_found <- ""
  # init default
  msg <- ""
  # guess the misspelled string
  possible_strings <- unlist(
    lapply(searchterm, function(s) {
      source[.fuzzy_grep(source, s)] # nolint
    }),
    use.names = FALSE
  )
  if (length(possible_strings)) {
    msg <- "Did you mean "
    if (length(possible_strings) > 1) {
      # make sure we don't print dozens of alternatives for larger data frames
      if (length(possible_strings) > 5) {
        more_found <- sprintf(
          " We even found %i more possible matches, not shown here.",
          length(possible_strings) - 5
        )
        possible_strings <- possible_strings[1:5]
      }
      msg <- paste0(
        msg,
        "one of ",
        text_concatenate(possible_strings, enclose = "\"", last = " or ")
      )
    } else {
      msg <- paste0(msg, "\"", possible_strings, "\"")
    }
    msg <- paste0(msg, "?", more_found)
  } else {
    msg <- default_message
  }
  # no double white space
  insight::trim_ws(msg)
}


#' Check that a vector is sorted
#' @noRd
#' @keywords internal
.is_sorted <- Negate(is.unsorted)


#' Replace only custom attributes
#'
#' Using "attributes(out) <- attributes(data)" or similar doesn't work so well
#' for big datasets because it takes some time to attribute the row names.
#'
#' This function gives only custom attributes to the new dataset.
#' @noRd
#' @keywords internal
.replace_attrs <- function(data, custom_attr) {
  for (nm in setdiff(names(custom_attr), names(attributes(data.frame())))) {
    attr(data, which = nm) <- custom_attr[[nm]]
  }
  data
}


#' @keywords internal
.is_date <- function(x) {
  inherits(x, "Date")
}


#' @keywords internal
.are_weights <- function(w) {
  !is.null(w) && length(w) && !all(w == 1) && !all(w == w[1])
}


#' @keywords internal
.factor_to_numeric <- function(x) {
  # no need to change for numeric
  if (is.numeric(x)) {
    return(x)
  }

  # Dates can be coerced by as.numeric(), w/o as.character()
  if (inherits(x, "Date")) {
    return(as.numeric(x))
  }

  # Logicals should be 0/1
  if (is.logical(x)) {
    return(as.numeric(x))
  }

  if (anyNA(suppressWarnings(as.numeric(as.character(stats::na.omit(x)))))) {
    if (is.character(x)) {
      x <- as.factor(x)
    }
    levels(x) <- 1:nlevels(x)
  }

  as.numeric(as.character(x))
}


# For standardize_parameters ----------------------------------------------

#' Taken from https://github.com/coolbutuseless/gluestick (licence: MIT)
#' Same functionality as `{glue}`
#'
#' @noRd
#' @keywords internal
.gluestick <- function(
  fmt,
  src = parent.frame(),
  open = "{",
  close = "}",
  eval = TRUE
) {
  nchar_open <- nchar(open)
  nchar_close <- nchar(close)

  # validation checks
  stopifnot(exprs = {
    is.character(fmt)
    length(fmt) == 1L
    is.character(open)
    length(open) == 1L
    nchar_open > 0L
    is.character(close)
    length(close) == 1
    nchar_close > 0
  })

  # Brute force the open/close characters into a regular expression for
  # extracting the expressions from the format string
  open <- gsub("(.)", "\\\\\\1", open) # Escape everything!!
  close <- gsub("(.)", "\\\\\\1", close) # Escape everything!!
  re <- paste0(open, ".*?", close)

  # Extract the delimited expressions
  matches <- gregexpr(re, fmt)
  exprs <- regmatches(fmt, matches)[[1]]

  # Remove the delimiters
  exprs <- substr(exprs, nchar_open + 1L, nchar(exprs) - nchar_close)

  # create a valid sprintf fmt string.
  #  - replace all "{expr}" strings with "%s"
  #  - escape any '%' so sprintf() doesn't try and use them for formatting
  #    but only if the '%' is NOT followed by an 's'
  #
  # gluestick() doesn't deal with any pathological cases
  fmt_sprintf <- gsub(re, "%s", fmt)
  fmt_sprintf <- gsub("%(?!s)", "%%", fmt_sprintf, perl = TRUE)

  # Evaluate
  if (eval) {
    fun_args <- lapply(exprs, function(expr) {
      eval(parse(text = expr), envir = src)
    })
  } else {
    fun_args <- unname(mget(exprs, envir = as.environment(src)))
  }

  # Create the string(s)
  do.call(sprintf, c(list(fmt_sprintf), fun_args))
}


#' help-functions
#' @keywords internal
#' @noRd
.data_frame <- function(...) {
  x <- data.frame(..., stringsAsFactors = FALSE)
  rownames(x) <- NULL
  x
}


================================================
FILE: R/utils_labels.R
================================================
# after data transformation, label attributes get lost. This function
# extracts label attributes from the original vector and adds them back
# to the transformed vector

#' @keywords internal
.set_back_labels <- function(
  new,
  old,
  include_values = TRUE,
  reverse_values = FALSE
) {
  # labelled data?
  attr(new, "label") <- attr(old, "label", exact = TRUE)
  value_labels <- attr(old, "labels", exact = TRUE)
  # "include_values" is used to preserve value labels
  if (isTRUE(include_values) && !is.null(value_labels)) {
    if (reverse_values) {
      # reverse values? Used for "reverse_scale()"
      attr(new, "labels") <- stats::setNames(
        rev(value_labels),
        names(value_labels)
      )
    } else if (is.numeric(new)) {
      # keep value oder? Used for "to_numeric()"
      if (any(grepl("[^0-9]", value_labels))) {
        # if we have any non-numeric characters, convert to numeric
        attr(new, "labels") <- stats::setNames(
          as.numeric(as.factor(value_labels)),
          names(value_labels)
        )
      } else {
        # if we have numeric, or "numeric character" (like "1", "2", "3" etc.)
        attr(new, "labels") <- stats::setNames(
          as.numeric(value_labels),
          names(value_labels)
        )
      }
    } else {
      attr(new, "labels") <- stats::setNames(value_labels, names(value_labels))
    }
  } else if (isFALSE(include_values)) {
    attr(new, "labels") <- NULL
  }
  new
}


# This functions converts value labels that are saved as attributes
# into factor levels

#' @keywords internal
.value_labels_to_levels <- function(x, verbose = TRUE, ...) {
  # extract value labels
  value_labels <- attr(x, "labels", exact = TRUE)
  # return, if none
  if (is.null(value_labels)) {
    return(x)
  }
  # check positions of matching values and levels
  levels_in_labs <- stats::na.omit(match(value_labels, levels(x)))
  labs_in_levels <- stats::na.omit(match(levels(x), value_labels))
  # validation check - if labelled values and levels don't match
  if (!length(levels_in_labs) || !length(labs_in_levels)) {
    if (verbose) {
      insight::format_alert(
        "Could not use value labels as factor levels.",
        "Labelled values and factor levels had no match."
      )
    }
    return(x)
  }
  # check if all levels have matching labels, and if not, tell user
  if (verbose && nlevels(x) != length(levels_in_labs)) {
    insight::format_alert(
      "Not all factor levels had a matching value label. Non-matching levels were preserved."
    )
  }
  # we need to find out which levels have no labelled value
  missing_levels <- levels(x)[!levels(x) %in% value_labels]

  # and we need to remove those value labels that don't have a matching level
  value_labels <- value_labels[value_labels %in% levels(x)]

  # for levels that have no label, we just keep the original factor level
  value_labels <- c(
    value_labels,
    stats::setNames(missing_levels, missing_levels)
  )

  # now we can add back levels
  levels(x) <- names(value_labels)[order(as.numeric(value_labels))]
  attr(x, "labels") <- NULL

  x
}


================================================
FILE: R/utils_standardize_center.R
================================================
# preparation for standardize and center ----
#
# Performs some preparation when standardizing or centering variables,
# like finding the center or scale, also in relation to some reference values.
# This function is applied to *vectors*.
#
#' @keywords internal
.process_std_center <- function(
  x,
  weights,
  robust,
  verbose = TRUE,
  reference = NULL,
  center = NULL,
  scale = NULL
) {
  # Warning if all NaNs
  if (all(is.na(x) | is.infinite(x))) {
    return(NULL)
  }

  if (.are_weights(weights)) {
    valid_x <- !is.na(x) &
      !is.na(weights) &
      !is.infinite(x) &
      !is.infinite(weights)
    na_values <- is.na(x) | is.na(weights)
    inf_values <- is.infinite(x) | is.infinite(weights)
    vals <- x[valid_x]
    weights <- weights[valid_x]
  } else {
    valid_x <- !is.na(x) & !is.infinite(x)
    na_values <- is.na(x)
    inf_values <- is.infinite(x)
    vals <- x[valid_x]
  }

  # validation checks
  check <- .check_standardize_numeric(
    x,
    name = NULL,
    verbose = verbose,
    reference = reference,
    center = center
  )

  if (is.factor(vals) || is.character(vals)) {
    vals <- .factor_to_numeric(vals)
  }

  # Get center and scale
  ref <- .get_center_scale(
    vals,
    robust,
    weights,
    reference,
    .center = center,
    .scale = scale,
    verbose = verbose
  )

  list(
    vals = vals,
    valid_x = valid_x,
    center = ref$center,
    scale = ref$scale,
    check = check,
    na_values = na_values,
    inf_values = inf_values
  )
}


# processing and checking of arguments ----
#
# Performs some preparation when standardizing or centering variables,
# like finding the center or scale, also in relation to some reference values.
# This function is applied to the *data frame methods*.
#
#' @keywords internal
.process_std_args <- function(
  x,
  select,
  exclude,
  weights,
  append,
  append_suffix = "_z",
  keep_factors,
  remove_na = "none",
  reference = NULL,
  .center = NULL,
  .scale = NULL,
  keep_character = FALSE,
  preserve_value_labels = FALSE
) {
  # check append argument, and set default
  if (isFALSE(append)) {
    append <- NULL
  } else if (isTRUE(append)) {
    append <- append_suffix
  }

  if (!is.null(weights) && is.character(weights)) {
    if (weights %in% colnames(x)) {
      exclude <- c(exclude, weights)
    } else {
      insight::format_warning(
        paste0(
          "Could not find weighting column `",
          weights,
          "`. Weighting not carried out."
        )
      )
      weights <- NULL
    }
  }

  select <- .select_variables(x, select, exclude, keep_factors, keep_character)

  # check if selected variables are in reference
  if (!is.null(reference) && !all(select %in% names(reference))) {
    insight::format_error(
      "The `reference` must include all variables from `select`."
    )
  }

  # copy label attributes
  variable_labels <- insight::compact_list(lapply(
    x,
    attr,
    "label",
    exact = TRUE
  ))
  value_labels <- NULL
  if (preserve_value_labels) {
    value_labels <- insight::compact_list(lapply(
      x,
      attr,
      "labels",
      exact = TRUE
    ))
  }

  # drop NAs
  remove_na <- match.arg(remove_na, c("none", "selected", "all"))

  omit <- switch(
    remove_na,
    none = logical(nrow(x)),
    selected = rowSums(vapply(x[select], is.na, FUN.VALUE = logical(nrow(x)))) >
      0,
    all = rowSums(vapply(x, is.na, FUN.VALUE = logical(nrow(x)))) > 0
  )
  x <- x[!omit, , drop = FALSE]

  if (!is.null(weights) && is.character(weights)) {
    weights <- x[[weights]]
  }

  # append standardized variables
  if (!is.null(append) && append != "") {
    new_variables <- x[select]
    colnames(new_variables) <- paste0(colnames(new_variables), append)
    if (length(variable_labels)) {
      variable_labels <- c(
        variable_labels,
        stats::setNames(variable_labels[select], colnames(new_variables))
      )
    }
    if (length(value_labels)) {
      value_labels <- c(
        value_labels,
        stats::setNames(value_labels[select], colnames(new_variables))
      )
    }
    x <- cbind(x, new_variables)
    select <- colnames(new_variables)
  }

  # check for reference center and scale
  if (!is.null(.center)) {
    # for center(), we have no scale - set it to default value
    if (is.null(.scale)) {
      .scale <- rep(1, length(.center))
    }

    # center and scale must have same length
    if (length(.center) != length(.scale)) {
      insight::format_error("`center` and `scale` must be of same length.")
    }

    # center and scale must either be of length 1 or of same length as selected variables
    if (length(.center) > 1 && length(.center) != length(select)) {
      insight::format_error(
        "`center` and `scale` must have the same length as the selected variables for standardization or centering."
      )
    }

    # if of length 1, recycle
    if (length(.center) == 1) {
      .center <- rep(.center, length(select))
      .scale <- rep(.scale, length(select))
    }

    # set names
    if (is.null(names(.center))) {
      .center <- stats::setNames(.center, select)
    }
    if (is.null(names(.scale))) {
      .scale <- stats::setNames(.scale, select)
    }
  } else {
    # use NA if missing, so we can index these as vectors
    .center <- stats::setNames(rep(NA, length(select)), select)
    .scale <- stats::setNames(rep(NA, length(select)), select)
  }

  # add back variable labels
  if (length(variable_labels)) {
    for (i in names(variable_labels)) {
      attr(x[[i]], "label") <- variable_labels[[i]]
    }
  }

  if (preserve_value_labels && length(value_labels)) {
    for (i in names(value_labels)) {
      attr(x[[i]], "labels") <- value_labels[[i]]
    }
  }

  list(
    x = x,
    select = select,
    exclude = exclude,
    weights = weights,
    append = append,
    center = .center,
    scale = .scale
  )
}


# retrieve center and scale information ----
#' @keywords internal
.get_center_scale <- function(
  x,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  .center = NULL,
  .scale = NULL,
  verbose = TRUE
) {
  if (is.null(reference)) {
    reference <- x
  }

  # for center(), we have no scale. default to 1
  if (is.null(.scale) || is.na(.scale) || isFALSE(.scale)) {
    scale <- 1
  } else if (isTRUE(.scale)) {
    if (robust) {
      scale <- weighted_mad(reference, weights)
    } else {
      scale <- weighted_sd(reference, weights)
    }
  } else {
    # we must have a numeric value here
    scale <- .scale
  }

  # process center
  if (is.null(.center) || is.na(.center) || isFALSE(.center)) {
    center <- 0
  } else if (isTRUE(.center)) {
    if (robust) {
      center <- weighted_median(reference, weights)
    } else {
      center <- weighted_mean(reference, weights)
    }
  } else {
    # we must have a numeric value here
    center <- .center
  }

  if (scale == 0) {
    scale <- 1
    if (verbose) {
      insight::format_warning(sprintf(
        "%s is 0 - variable not standardized (only scaled).",
        if (robust) "MAD" else "SD"
      ))
    }
  }

  list(center = center, scale = scale)
}


# check range of input variables ----
#' @keywords internal
.check_standardize_numeric <- function(
  x,
  name = NULL,
  verbose = TRUE,
  reference = NULL,
  center
) {
  # Warning if only one value
  if (
    insight::has_single_value(x) &&
      is.null(reference) &&
      (is.null(center) || isTRUE(center))
  ) {
    if (verbose) {
      if (is.null(name)) {
        insight::format_alert(
          "The variable contains only one unique value and will be set to 0."
        )
      } else {
        insight::format_alert(
          paste0(
            "The variable `",
            name,
            "` contains only one unique value and will be set to 0."
          )
        )
      }
    }
    return(NULL)
  }

  # Warning if logical vector
  if (
    verbose && insight::n_unique(x) == 2 && !is.factor(x) && !is.character(x)
  ) {
    if (is.null(name)) {
      insight::format_alert(
        "The variable contains only two different values. Consider converting it to a factor."
      )
    } else {
      insight::format_alert(
        paste0(
          "Variable `",
          name,
          "` contains only two different values. Consider converting it to a factor."
        )
      )
    }
  }
  x
}


# process append argument ----
#' @keywords internal
.process_append <- function(
  x,
  select,
  append,
  append_suffix = "_z",
  preserve_value_labels = FALSE,
  keep_factors = TRUE,
  keep_character = FALSE
) {
  # check append argument, and set default
  if (isFALSE(append)) {
    append <- NULL
  } else if (isTRUE(append)) {
    append <- append_suffix
  }

  # append recoded variables
  if (!is.null(append) && append != "") {
    # keep or drop factors and characters
    select <- .select_variables(
      x,
      select,
      exclude = NULL,
      keep_factors = keep_factors,
      keep_character = keep_character
    )

    # copy label attributes
    variable_labels <- insight::compact_list(lapply(
      x,
      attr,
      "label",
      exact = TRUE
    ))
    value_labels <- NULL
    if (preserve_value_labels) {
      value_labels <- insight::compact_list(lapply(
        x,
        attr,
        "labels",
        exact = TRUE
      ))
    }

    # add new variables that sould be appended
    new_variables <- x[select]
    colnames(new_variables) <- paste0(colnames(new_variables), append)
    if (length(variable_labels)) {
      variable_labels <- c(
        variable_labels,
        stats::setNames(variable_labels[select], colnames(new_variables))
      )
    }
    if (length(value_labels)) {
      value_labels <- c(
        value_labels,
        stats::setNames(value_labels[select], colnames(new_variables))
      )
    }
    x <- cbind(x, new_variables)
    select <- colnames(new_variables)

    # add back variable labels
    if (length(variable_labels)) {
      for (i in names(variable_labels)) {
        attr(x[[i]], "label") <- variable_labels[[i]]
      }
    }

    if (preserve_value_labels && length(value_labels)) {
      for (i in names(value_labels)) {
        attr(x[[i]], "labels") <- value_labels[[i]]
      }
    }
  }
  list(x = x, select = select)
}


# variables to standardize and center ----
#
# This function mainly serves the purpose to keep or drop factors and
# character vectors from transformation functions.
#
#' @keywords internal
.select_variables <- function(
  x,
  select,
  exclude,
  keep_factors,
  keep_character = FALSE
) {
  if (is.null(select)) {
    select <- names(x)
  }

  if (!is.null(exclude)) {
    select <- setdiff(select, exclude)
  }

  if (!keep_factors) {
    if (!keep_character) {
      factors <- vapply(
        x[select],
        function(i) is.factor(i) | is.character(i),
        FUN.VALUE = logical(1L)
      )
    } else {
      factors <- vapply(x[select], is.factor, FUN.VALUE = logical(1L))
    }
    select <- select[!factors]
  }

  select
}


# for grouped df ---------------------------
#' @keywords internal
.process_grouped_df <- function(
  x,
  select,
  exclude,
  append,
  append_suffix = "_z",
  reference,
  weights,
  keep_factors
) {
  if (!is.null(reference)) {
    insight::format_error(
      "The `reference` argument cannot be used with grouped standardization for now."
    )
  }

  # check append argument, and set default
  if (isFALSE(append)) {
    append <- NULL
  } else if (isTRUE(append)) {
    append <- append_suffix
  }

  info <- attributes(x)

  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]

  # for grouped data frames, we can decide to remove group variable from selection
  grp_vars <- setdiff(colnames(attr(x, "groups", exact = TRUE)), ".rows")

  if (is.numeric(weights)) {
    insight::format_warning(
      "For grouped data frames, `weights` must be a character, not a numeric vector.",
      "Ignoring weightings."
    )
    weights <- NULL
  }

  x <- as.data.frame(x)
  select <- .select_variables(x, select, exclude, keep_factors)
  select <- setdiff(select, grp_vars)

  # append standardized variables
  if (!is.null(append) && append != "") {
    new_variables <- x[select]
    colnames(new_variables) <- paste0(colnames(new_variables), append)
    x <- cbind(x, new_variables)
    select <- colnames(new_variables)
    info$names <- c(info$names, select)
  }

  list(x = x, info = info, select = select, grps = grps, weights = weights)
}


================================================
FILE: R/visualisation_recipe.R
================================================
#' Prepare objects for visualisation
#'
#' @description This function prepares objects for visualisation by returning a list of
#' layers with data and geoms that can be easily plotted using for instance
#' `ggplot2`.
#'
#' If the `see` package is installed, the call to `visualization_recipe()` can be
#' replaced by `plot()`, which will internally call the former and then plot it
#' using `ggplot`. The resulting plot can be customized ad-hoc (by adding
#' ggplot's geoms, theme or specifications), or via some of the arguments
#' of `visualisation_recipe()` that control the aesthetic parameters.
#'
#' See the specific documentation page for your object's class:
#'
#'  - {modelbased}: <https://easystats.github.io/modelbased/reference/visualisation_recipe.estimate_predicted.html>
#'  - {correlation}: <https://easystats.github.io/correlation/reference/visualisation_recipe.easycormatrix.html>
#'
#' @param x An `easystats` object.
#' @param ... Other arguments passed to other functions.
#'
#' @export
visualisation_recipe <- function(x, ...) {
  UseMethod("visualisation_recipe")
}


#' @export
print.visualisation_recipe <- function(x, ...) {
  for (i in seq_along(x)) {
    l <- x[[paste0("l", i)]]
    insight::print_color(paste0("Layer ", i, "\n--------\n"), "blue")
    insight::print_color(
      paste0("Geom type: ", ifelse(is.null(l$geom), "[NULL]", l$geom), "\n"),
      "yellow"
    )

    elements <- names(l)[!vapply(l, is.null, FUN.VALUE = logical(1L))]

    # Loop through all elements of list
    for (element in elements[elements != "geom"]) {
      # Print element name
      if (element == "aes") {
        cat("aes_string(\n")
      } else {
        cat(paste0(element, " = "))
      }

      # Print element
      if (element == "data") {
        cat(paste0("[", paste0(dim(l$data), collapse = " x "), "]"))
      } else if (element == "aes") {
        for (aes in names(l$aes)) {
          if (!is.null(l$aes[[aes]])) {
            if (is.character(l$aes[[aes]])) {
              cat(paste0("  ", aes, " = '", l$aes[[aes]], "'\n"))
            } else {
              cat(paste0("  ", aes, " = ", l$aes[[aes]], "\n"))
            }
          }
        }
        cat(")")
      } else {
        if (
          is.character(l[[element]]) ||
            is.numeric(l[[element]]) ||
            is.factor(l[[element]])
        ) {
          if (is.character(l[[element]])) {
            cat(paste0("'", l[[element]], "'"))
          } else {
            if (length(l[[element]]) == 1) {
              cat(l[[element]])
            } else {
              cat(paste0("c(", toString(l[[element]]), ")"))
            }
          }
        } else {
          cat(paste0("class: ", class(l[[element]]), collapse = "/"))
        }
      }
      cat("\n")
    }
    cat("\n")
  }
}


#' @export
plot.visualisation_recipe <- function(x, ...) {
  insight::check_if_installed("see")
  NextMethod()
}


================================================
FILE: R/weighted_mean_median_sd_mad.R
================================================
#' Weighted Mean, Median, SD, and MAD
#'
#' @inheritParams stats::weighted.mean
#' @inheritParams stats::mad
#' @param weights A numerical vector of weights the same length as `x` giving
#' the weights to use for elements of `x`. If `weights = NULL`, `x` is passed
#' to the non-weighted function.
#' @param verbose Show warning when `weights` are negative?
#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) and infinite
#' values from `x` and `weights`.
#'
#' @examples
#' ## GPA from Siegel 1994
#' x <- c(3.7, 3.3, 3.5, 2.8)
#' wt <- c(5, 5, 4, 1) / 15
#'
#' weighted_mean(x, wt)
#' weighted_median(x, wt)
#'
#' weighted_sd(x, wt)
#' weighted_mad(x, wt)
#'
#' @export
weighted_mean <- function(
  x,
  weights = NULL,
  remove_na = TRUE,
  verbose = TRUE,
  ...
) {
  if (!.are_weights(weights) || !.validate_weights(weights, verbose)) {
    return(mean(x, na.rm = remove_na))
  }

  # remove missings
  complete <- .clean_missings(x, weights, remove_na)
  stats::weighted.mean(complete$x, complete$weights, na.rm = remove_na)
}


#' @export
#' @rdname weighted_mean
weighted_median <- function(
  x,
  weights = NULL,
  remove_na = TRUE,
  verbose = TRUE,
  ...
) {
  if (!.are_weights(weights) || !.validate_weights(weights, verbose)) {
    return(stats::median(x, na.rm = remove_na))
  }

  p <- 0.5 # split probability

  # remove missings
  complete <- .clean_missings(x, weights, remove_na)

  order <- order(complete$x)
  x <- complete$x[order]
  weights <- complete$weights[order]

  rw <- cumsum(weights) / sum(weights)
  # validation check
  if (all(is.na(rw))) {
    return(NA_real_)
  }

  md.values <- min(which(rw >= p))

  if (rw[md.values] == p) {
    q <- mean(x[md.values:(md.values + 1)])
  } else {
    q <- x[md.values]
  }

  q
}


#' @export
#' @rdname weighted_mean
weighted_sd <- function(
  x,
  weights = NULL,
  remove_na = TRUE,
  verbose = TRUE,
  ...
) {
  # from cov.wt
  if (!.are_weights(weights) || !.validate_weights(weights, verbose)) {
    return(stats::sd(x, na.rm = remove_na))
  }

  # remove missings
  complete <- .clean_missings(x, weights, remove_na)

  weights1 <- complete$weights / sum(complete$weights)
  center <- sum(weights1 * complete$x)
  xc <- sqrt(weights1) * (complete$x - center)
  var <- (t(xc) %*% xc) / (1 - sum(weights1^2))
  sqrt(as.vector(var))
}

#' @export
#' @rdname weighted_mean
weighted_mad <- function(
  x,
  weights = NULL,
  constant = 1.4826,
  remove_na = TRUE,
  verbose = TRUE,
  ...
) {
  # From matrixStats
  if (!.are_weights(weights) || !.validate_weights(weights, verbose)) {
    return(stats::mad(x, na.rm = remove_na))
  }

  center <- weighted_median(x, weights = weights, remove_na = remove_na)
  x <- abs(x - center)
  constant * weighted_median(x, weights = weights, remove_na = remove_na)
}


# Utils -------------------------------------------------------------------

.validate_weights <- function(weights, verbose = TRUE) {
  pos <- all(weights > 0, na.rm = TRUE)

  if (isTRUE(!pos) && isTRUE(verbose)) {
    insight::format_warning(
      "Some `weights` were negative. Weighting not carried out."
    )
  }

  pos
}

.clean_missings <- function(x, weights, remove_na) {
  if (isTRUE(remove_na)) {
    flag <- FALSE
    if (any(is.infinite(x)) || any(is.infinite(weights))) {
      # remove Inf
      x[is.infinite(x)] <- NA
      weights[is.infinite(weights)] <- NA
      flag <- TRUE
    }

    if (anyNA(x) || anyNA(weights)) {
      # remove missings
      x[is.na(weights)] <- NA
      weights[is.na(x)] <- NA
      flag <- TRUE
    }

    if (flag) {
      weights <- stats::na.omit(weights)
      x <- stats::na.omit(x)
    }
  }

  list(x = x, weights = weights)
}


================================================
FILE: R/winsorize.R
================================================
#' Winsorize data
#'
#' @details
#'
#' Winsorizing or winsorization is the transformation of statistics by limiting
#' extreme values in the statistical data to reduce the effect of possibly
#' spurious outliers. The distribution of many statistics can be heavily
#' influenced by outliers. A typical strategy is to set all outliers (values
#' beyond a certain threshold) to a specified percentile of the data; for
#' example, a `90%` winsorization would see all data below the 5th percentile set
#' to the 5th percentile, and data above the 95th percentile set to the 95th
#' percentile. Winsorized estimators are usually more robust to outliers than
#' their more standard forms.
#'
#' @return
#'
#' A data frame with winsorized columns or a winsorized vector.
#'
#' @param data data frame or vector.
#' @param threshold The amount of winsorization, depends on the value of `method`:
#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
#'   The value of `threshold` must be between 0 and 0.5 and of length 1.
#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the
#'   *mean*/*median* (see `robust`). The value of `threshold` must be greater
#'   than 0 and of length 1.
#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound
#'   for winsorization.
#' @param method One of "percentile" (default), "zscore", or "raw".
#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is
#'   done via the median and the median absolute deviation (MAD); if FALSE, via
#'   the mean and the standard deviation.
#' @param ... Currently not used.
#' @param verbose Not used anymore since `datawizard` 0.6.6.
#'
#' @examples
#' hist(iris$Sepal.Length, main = "Original data")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
#'   xlim = c(4, 8), main = "Percentile Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
#'   xlim = c(4, 8), main = "Mean (+/- SD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
#'   xlim = c(4, 8), main = "Median (+/- MAD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
#'   xlim = c(4, 8), main = "Raw Thresholds"
#' )
#'
#' # Also works on a data frame:
#' winsorize(iris, threshold = 0.2)
#'
#' @inherit data_rename seealso
#' @export
winsorize <- function(data, ...) {
  UseMethod("winsorize")
}


#' @export
winsorize.factor <- function(data, ...) {
  data
}

#' @export
winsorize.character <- winsorize.factor

#' @export
winsorize.logical <- winsorize.factor

#' @export
winsorize.data.frame <- function(
  data,
  threshold = 0.2,
  method = "percentile",
  robust = FALSE,
  verbose = TRUE,
  ...
) {
  data[] <- lapply(
    data,
    winsorize,
    threshold = threshold,
    method = method,
    robust = robust,
    verbose = verbose
  )
  data
}

#' @rdname winsorize
#' @export
winsorize.numeric <- function(
  data,
  threshold = 0.2,
  method = "percentile",
  robust = FALSE,
  verbose = TRUE,
  ...
) {
  method <- match.arg(method, choices = c("percentile", "zscore", "raw"))

  if (method == "raw" && length(threshold) != 2L) {
    insight::format_error(
      "`threshold` must be of length 2 for lower and upper bound."
    )
  }

  if (method == "percentile") {
    if (threshold < 0 || threshold > 0.5) {
      insight::format_error(
        "`threshold` for winsorization must be a scalar between 0 and 0.5."
      )
    }

    y <- sort(data)
    n <- length(data)
    ibot <- floor(threshold * n) + 1
    itop <- length(data) - ibot + 1

    threshold <- c(y[ibot], y[itop])
  }

  if (method == "zscore") {
    if (threshold <= 0) {
      insight::format_error(
        "'threshold' for winsorization must be a scalar greater than 0."
      )
    }

    if (isTRUE(robust)) {
      centeral <- stats::median(data, na.rm = TRUE)
      deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
    } else {
      centeral <- mean(data, na.rm = TRUE)
      deviation <- stats::sd(data, na.rm = TRUE)
    }

    threshold <- centeral + c(-1, 1) * deviation * threshold
  }

  data[data < threshold[1]] <- threshold[1]
  data[data > threshold[2]] <- threshold[2]
  return(data)
}


================================================
FILE: README.Rmd
================================================
---
output: github_document
---

# `datawizard`: Easy Data Wrangling and Statistical Transformations <img src='man/figures/logo.png' align="right" height="139" />

```{r, echo=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  dpi = 300,
  out.width = "100%",
  fig.path = "man/figures/",
  comment = "#>"
)

set.seed(333)
library(datawizard)
```

[![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684)
[![downloads](https://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard)
[![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/)

<!-- ***:sparkles: Hockety pockety wockety wack, prepare this data forth and back*** -->

<!-- ***Hockety pockety wockety wock, messy data is in shock*** -->

<!-- ***Hockety pockety wockety woss, you can cite i-it from JOSS*** <sup>(soon)</sup> -->

<!-- ***Hockety pockety wockety wass, datawizard saves your ass! :sparkles:*** -->

`{datawizard}` is a lightweight package to easily manipulate, clean, transform, and prepare your data for analysis. It is part of the [easystats ecosystem](https://easystats.github.io/easystats/), a suite of R packages to deal with your entire statistical analysis, from cleaning the data to reporting the results.

It covers two aspects of data preparation:

- **Data manipulation**: `{datawizard}` offers a very similar set of functions to that of the *tidyverse* packages, such as a `{dplyr}` and `{tidyr}`, to select, filter and reshape data, with a few key differences. 1) All data manipulation functions start with the prefix `data_*` (which makes them easy to identify). 2) Although most functions can be used exactly as their *tidyverse* equivalents, they are also string-friendly (which makes them easy to program with and use inside functions). Finally, `{datawizard}` is super lightweight (no dependencies, similar to [poorman](https://github.com/nathaneastwood/poorman)), which makes it awesome for developers to use in their packages.

- **Statistical transformations**: `{datawizard}` also has powerful functions to easily apply common data [transformations](https://easystats.github.io/datawizard/reference/index.html#statistical-transformations), including standardization, normalization, rescaling, rank-transformation, scale reversing, recoding, binning, etc.


</br>

<img src='https://media.giphy.com/media/VcizxCUIgaKpa/giphy.gif' width="300"/>

</br>

# Installation

[![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/datawizard)](https://cran.r-project.org/package=datawizard) [![datawizard status badge](https://easystats.r-universe.dev/badges/datawizard)](https://easystats.r-universe.dev) [![codecov](https://codecov.io/gh/easystats/datawizard/branch/main/graph/badge.svg)](https://app.codecov.io/gh/easystats/datawizard) [![R-CMD-check](https://github.com/easystats/datawizard/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/easystats/datawizard/actions)

Type | Source | Command
---|---|---
Release | CRAN | `install.packages("datawizard")`
Development | r-universe | `install.packages("datawizard", repos = "https://easystats.r-universe.dev")`
Development | GitHub | `remotes::install_github("easystats/datawizard")`

> **Tip**
>
> **Instead of `library(datawizard)`, use `library(easystats)`.**
> **This will make all features of the  easystats-ecosystem available.**
>
> **To stay updated, use `easystats::install_latest()`.**

# Citation

To cite the package, run the following command:

```{r, comment=""}
citation("datawizard")
```

# Features

[![Documentation](https://img.shields.io/badge/documentation-datawizard-orange.svg?colorB=E91E63)](https://easystats.github.io/datawizard/)
[![Blog](https://img.shields.io/badge/blog-easystats-orange.svg?colorB=FF9800)](https://easystats.github.io/blog/posts/)
[![Features](https://img.shields.io/badge/features-datawizard-orange.svg?colorB=2196F3)](https://easystats.github.io/datawizard/reference/index.html)

Most courses and tutorials about statistical modeling assume that you are working with a clean and tidy dataset. In practice, however, a major part of doing statistical modeling is preparing your data--cleaning up values, creating new columns, reshaping the dataset, or transforming some variables. `{datawizard}` provides easy to use tools to perform these common, critical, and sometimes tedious data preparation tasks.

## Data wrangling

### Select, filter and remove variables

The package provides helpers to filter rows meeting certain conditions...

```{r}
data_match(mtcars, data.frame(vs = 0, am = 1))
```

... or logical expressions:

```{r}
data_filter(mtcars, vs == 0 & am == 1)
```

Finding columns in a data frame, or retrieving the data of selected columns, can be  achieved using `extract_column_names()` or `data_select()`:

```{r}
# find column names matching a pattern
extract_column_names(iris, starts_with("Sepal"))

# return data columns matching a pattern
data_select(iris, starts_with("Sepal")) |> head()
```

It is also possible to extract one or more variables:

```{r}
# single variable
data_extract(mtcars, "gear")

# more variables
head(data_extract(iris, ends_with("Width")))
```

Due to the consistent API, removing variables is just as simple:

```{r}
head(data_remove(iris, starts_with("Sepal")))
```

### Reorder or rename

```{r}
head(data_relocate(iris, select = "Species", before = "Sepal.Length"))
```

```{r}
head(data_rename(iris, c("Sepal.Length", "Sepal.Width"), c("length", "width")))
```

### Merge

```{r}
x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4)

x
y

data_merge(x, y, join = "full")

data_merge(x, y, join = "left")

data_merge(x, y, join = "right")

data_merge(x, y, join = "semi", by = "c")

data_merge(x, y, join = "anti", by = "c")

data_merge(x, y, join = "inner")

data_merge(x, y, join = "bind")
```

### Reshape

A common data wrangling task is to reshape data.

Either to go from wide/Cartesian to long/tidy format

```{r}
wide_data <- data.frame(replicate(5, rnorm(10)))

head(data_to_long(wide_data))
```

or the other way

```{r}
long_data <- data_to_long(wide_data, rows_to = "Row_ID") # Save row number

data_to_wide(long_data,
  names_from = "name",
  values_from = "value",
  id_cols = "Row_ID"
)
```

### Empty rows and columns

```{r}
tmp <- data.frame(
  a = c(1, 2, 3, NA, 5),
  b = c(1, NA, 3, NA, 5),
  c = c(NA, NA, NA, NA, NA),
  d = c(1, NA, 3, NA, 5)
)

tmp

# indices of empty columns or rows
empty_columns(tmp)
empty_rows(tmp)

# remove empty columns or rows
remove_empty_columns(tmp)
remove_empty_rows(tmp)

# remove empty columns and rows
remove_empty(tmp)
```

### Recode or cut dataframe

```{r}
set.seed(123)
x <- sample(1:10, size = 50, replace = TRUE)

table(x)

# cut into 3 groups, based on distribution (quantiles)
table(categorize(x, split = "quantile", n_groups = 3))
```

## Data Transformations

The packages also contains multiple functions to help transform data.

### Standardize

For example, to standardize (*z*-score) data:

```{r}
# before
summary(swiss)

# after
summary(standardize(swiss))
```

### Winsorize

To winsorize data:

```{r}
# before
anscombe

# after
winsorize(anscombe)
```

### Center

To grand-mean center data

```{r}
center(anscombe)
```

### Ranktransform

To rank-transform data:

```{r}
# before
head(trees)

# after
head(ranktransform(trees))
```

### Rescale

To rescale a numeric variable to a new range:

```{r}
change_scale(c(0, 1, 5, -5, -2))
```

### Rotate or transpose

```{r}
x <- mtcars[1:3, 1:4]

x

data_rotate(x)
```


## Data properties

`datawizard` provides a way to provide comprehensive descriptive summary for all variables in a dataframe:

```{r}
data(iris)
describe_distribution(iris)
```

Or even just a variable

```{r}
describe_distribution(mtcars$wt)
```

There are also some additional data properties that can be computed using this package.

```{r}
x <- (-10:10)^3 + rnorm(21, 0, 100)
smoothness(x, method = "diff")
```

## Function design and pipe-workflow

The design of the `{datawizard}` functions follows a design principle that makes it easy for user to understand and remember how functions work:

1. the first argument is the data
2. for methods that work on data frames, two arguments are following to `select` and `exclude` variables
3. the following arguments are arguments related to the specific tasks of the functions

Most important, functions that accept data frames usually have this as their first argument, and also return a (modified) data frame again. Thus, `{datawizard}` integrates smoothly into a "pipe-workflow".

```{r}
iris |>
  # all rows where Species is "versicolor" or "virginica"
  data_filter(Species %in% c("versicolor", "virginica")) |>
  # select only columns with "." in names (i.e. drop Species)
  data_select(contains("\\.")) |>
  # move columns that ends with "Length" to start of data frame
  data_relocate(ends_with("Length")) |>
  # remove fourth column
  data_remove(4) |>
  head()
```

# Contributing and Support

In case you want to file an issue or contribute in another way to the package, please follow [this guide](https://easystats.github.io/datawizard/CONTRIBUTING.html). For questions about the functionality, you may either contact us via email or also file an issue.

# Code of Conduct

Please note that this project is released with a
[Contributor Code of Conduct](https://easystats.github.io/datawizard/CODE_OF_CONDUCT.html). By participating in this project you agree to abide by its terms.


================================================
FILE: README.md
================================================

# `datawizard`: Easy Data Wrangling and Statistical Transformations <img src='man/figures/logo.png' align="right" height="139" />

[![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684)
[![downloads](https://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard)
[![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/)

<!-- ***:sparkles: Hockety pockety wockety wack, prepare this data forth and back*** -->

<!-- ***Hockety pockety wockety wock, messy data is in shock*** -->

<!-- ***Hockety pockety wockety woss, you can cite i-it from JOSS*** <sup>(soon)</sup> -->

<!-- ***Hockety pockety wockety wass, datawizard saves your ass! :sparkles:*** -->

`{datawizard}` is a lightweight package to easily manipulate, clean,
transform, and prepare your data for analysis. It is part of the
[easystats ecosystem](https://easystats.github.io/easystats/), a suite
of R packages to deal with your entire statistical analysis, from
cleaning the data to reporting the results.

It covers two aspects of data preparation:

- **Data manipulation**: `{datawizard}` offers a very similar set of
  functions to that of the *tidyverse* packages, such as a `{dplyr}` and
  `{tidyr}`, to select, filter and reshape data, with a few key
  differences. 1) All data manipulation functions start with the prefix
  `data_*` (which makes them easy to identify). 2) Although most
  functions can be used exactly as their *tidyverse* equivalents, they
  are also string-friendly (which makes them easy to program with and
  use inside functions). Finally, `{datawizard}` is super lightweight
  (no dependencies, similar to
  [poorman](https://github.com/nathaneastwood/poorman)), which makes it
  awesome for developers to use in their packages.

- **Statistical transformations**: `{datawizard}` also has powerful
  functions to easily apply common data
  [transformations](https://easystats.github.io/datawizard/reference/index.html#statistical-transformations),
  including standardization, normalization, rescaling,
  rank-transformation, scale reversing, recoding, binning, etc.

</br>

<img src='https://media.giphy.com/media/VcizxCUIgaKpa/giphy.gif' width="300"/>

</br>

# Installation

[![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/datawizard)](https://cran.r-project.org/package=datawizard)
[![datawizard status
badge](https://easystats.r-universe.dev/badges/datawizard)](https://easystats.r-universe.dev)
[![codecov](https://codecov.io/gh/easystats/datawizard/branch/main/graph/badge.svg)](https://app.codecov.io/gh/easystats/datawizard)
[![R-CMD-check](https://github.com/easystats/datawizard/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/easystats/datawizard/actions)

| Type | Source | Command |
|----|----|----|
| Release | CRAN | `install.packages("datawizard")` |
| Development | r-universe | `install.packages("datawizard", repos = "https://easystats.r-universe.dev")` |
| Development | GitHub | `remotes::install_github("easystats/datawizard")` |

> **Tip**
>
> **Instead of `library(datawizard)`, use `library(easystats)`.** **This
> will make all features of the easystats-ecosystem available.**
>
> **To stay updated, use `easystats::install_latest()`.**

# Citation

To cite the package, run the following command:

``` r
citation("datawizard")
To cite package 'datawizard' in publications use:

  Patil et al., (2022). datawizard: An R Package for Easy Data
  Preparation and Statistical Transformations. Journal of Open Source
  Software, 7(78), 4684, https://doi.org/10.21105/joss.04684

A BibTeX entry for LaTeX users is

  @Article{,
    title = {{datawizard}: An {R} Package for Easy Data Preparation and Statistical Transformations},
    author = {Indrajeet Patil and Dominique Makowski and Mattan S. Ben-Shachar and Brenton M. Wiernik and Etienne Bacher and Daniel Lüdecke},
    journal = {Journal of Open Source Software},
    year = {2022},
    volume = {7},
    number = {78},
    pages = {4684},
    doi = {10.21105/joss.04684},
  }
```

# Features

[![Documentation](https://img.shields.io/badge/documentation-datawizard-orange.svg?colorB=E91E63)](https://easystats.github.io/datawizard/)
[![Blog](https://img.shields.io/badge/blog-easystats-orange.svg?colorB=FF9800)](https://easystats.github.io/blog/posts/)
[![Features](https://img.shields.io/badge/features-datawizard-orange.svg?colorB=2196F3)](https://easystats.github.io/datawizard/reference/index.html)

Most courses and tutorials about statistical modeling assume that you
are working with a clean and tidy dataset. In practice, however, a major
part of doing statistical modeling is preparing your data–cleaning up
values, creating new columns, reshaping the dataset, or transforming
some variables. `{datawizard}` provides easy to use tools to perform
these common, critical, and sometimes tedious data preparation tasks.

## Data wrangling

### Select, filter and remove variables

The package provides helpers to filter rows meeting certain conditions…

``` r
data_match(mtcars, data.frame(vs = 0, am = 1))
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
#> Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
#> Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
#> Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
#> Maserati Bora  15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
```

… or logical expressions:

``` r
data_filter(mtcars, vs == 0 & am == 1)
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
#> Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
#> Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
#> Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
#> Maserati Bora  15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
```

Finding columns in a data frame, or retrieving the data of selected
columns, can be achieved using `extract_column_names()` or
`data_select()`:

``` r
# find column names matching a pattern
extract_column_names(iris, starts_with("Sepal"))
#> [1] "Sepal.Length" "Sepal.Width"

# return data columns matching a pattern
data_select(iris, starts_with("Sepal")) |> head()
#>   Sepal.Length Sepal.Width
#> 1          5.1         3.5
#> 2          4.9         3.0
#> 3          4.7         3.2
#> 4          4.6         3.1
#> 5          5.0         3.6
#> 6          5.4         3.9
```

It is also possible to extract one or more variables:

``` r
# single variable
data_extract(mtcars, "gear")
#>  [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4

# more variables
head(data_extract(iris, ends_with("Width")))
#>   Sepal.Width Petal.Width
#> 1         3.5         0.2
#> 2         3.0         0.2
#> 3         3.2         0.2
#> 4         3.1         0.2
#> 5         3.6         0.2
#> 6         3.9         0.4
```

Due to the consistent API, removing variables is just as simple:

``` r
head(data_remove(iris, starts_with("Sepal")))
#>   Petal.Length Petal.Width Species
#> 1          1.4         0.2  setosa
#> 2          1.4         0.2  setosa
#> 3          1.3         0.2  setosa
#> 4          1.5         0.2  setosa
#> 5          1.4         0.2  setosa
#> 6          1.7         0.4  setosa
```

### Reorder or rename

``` r
head(data_relocate(iris, select = "Species", before = "Sepal.Length"))
#>   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#> 1  setosa          5.1         3.5          1.4         0.2
#> 2  setosa          4.9         3.0          1.4         0.2
#> 3  setosa          4.7         3.2          1.3         0.2
#> 4  setosa          4.6         3.1          1.5         0.2
#> 5  setosa          5.0         3.6          1.4         0.2
#> 6  setosa          5.4         3.9          1.7         0.4
```

``` r
head(data_rename(iris, c("Sepal.Length", "Sepal.Width"), c("length", "width")))
#>   length width Petal.Length Petal.Width Species
#> 1    5.1   3.5          1.4         0.2  setosa
#> 2    4.9   3.0          1.4         0.2  setosa
#> 3    4.7   3.2          1.3         0.2  setosa
#> 4    4.6   3.1          1.5         0.2  setosa
#> 5    5.0   3.6          1.4         0.2  setosa
#> 6    5.4   3.9          1.7         0.4  setosa
```

### Merge

``` r
x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4)

x
#>   a b c id
#> 1 1 a 5  1
#> 2 2 b 6  2
#> 3 3 c 7  3
y
#>   c d   e id
#> 1 6 f 100  2
#> 2 7 g 101  3
#> 3 8 h 102  4

data_merge(x, y, join = "full")
#>    a    b c id    d   e
#> 3  1    a 5  1 <NA>  NA
#> 1  2    b 6  2    f 100
#> 2  3    c 7  3    g 101
#> 4 NA <NA> 8  4    h 102

data_merge(x, y, join = "left")
#>   a b c id    d   e
#> 3 1 a 5  1 <NA>  NA
#> 1 2 b 6  2    f 100
#> 2 3 c 7  3    g 101

data_merge(x, y, join = "right")
#>    a    b c id d   e
#> 1  2    b 6  2 f 100
#> 2  3    c 7  3 g 101
#> 3 NA <NA> 8  4 h 102

data_merge(x, y, join = "semi", by = "c")
#>   a b c id
#> 2 2 b 6  2
#> 3 3 c 7  3

data_merge(x, y, join = "anti", by = "c")
#>   a b c id
#> 1 1 a 5  1

data_merge(x, y, join = "inner")
#>   a b c id d   e
#> 1 2 b 6  2 f 100
#> 2 3 c 7  3 g 101

data_merge(x, y, join = "bind")
#>    a    b c id    d   e
#> 1  1    a 5  1 <NA>  NA
#> 2  2    b 6  2 <NA>  NA
#> 3  3    c 7  3 <NA>  NA
#> 4 NA <NA> 6  2    f 100
#> 5 NA <NA> 7  3    g 101
#> 6 NA <NA> 8  4    h 102
```

### Reshape

A common data wrangling task is to reshape data.

Either to go from wide/Cartesian to long/tidy format

``` r
wide_data <- data.frame(replicate(5, rnorm(10)))

head(data_to_long(wide_data))
#>   name       value
#> 1   X1 -0.08281164
#> 2   X2 -1.12490028
#> 3   X3 -0.70632036
#> 4   X4 -0.70278946
#> 5   X5  0.07633326
#> 6   X1  1.93468099
```

or the other way

``` r
long_data <- data_to_long(wide_data, rows_to = "Row_ID") # Save row number

data_to_wide(long_data,
  names_from = "name",
  values_from = "value",
  id_cols = "Row_ID"
)
#>    Row_ID          X1          X2          X3         X4          X5
#> 1       1 -0.08281164 -1.12490028 -0.70632036 -0.7027895  0.07633326
#> 2       2  1.93468099 -0.87430362  0.96687656  0.2998642 -0.23035595
#> 3       3 -2.05128979  0.04386162 -0.71016648  1.1494697  0.31746484
#> 4       4  0.27773897 -0.58397514 -0.05917365 -0.3016415 -1.59268440
#> 5       5 -1.52596060 -0.82329858 -0.23094342 -0.5473394 -0.18194062
#> 6       6 -0.26916362  0.11059280  0.69200045 -0.3854041  1.75614174
#> 7       7  1.23305388  0.36472778  1.35682290  0.2763720  0.11394932
#> 8       8  0.63360774  0.05370100  1.78872284  0.1518608 -0.29216508
#> 9       9  0.35271746  1.36867235  0.41071582 -0.4313808  1.75409316
#> 10     10 -0.56048248 -0.38045724 -2.18785470 -1.8705001  1.80958455
```

### Empty rows and columns

``` r
tmp <- data.frame(
  a = c(1, 2, 3, NA, 5),
  b = c(1, NA, 3, NA, 5),
  c = c(NA, NA, NA, NA, NA),
  d = c(1, NA, 3, NA, 5)
)

tmp
#>    a  b  c  d
#> 1  1  1 NA  1
#> 2  2 NA NA NA
#> 3  3  3 NA  3
#> 4 NA NA NA NA
#> 5  5  5 NA  5

# indices of empty columns or rows
empty_columns(tmp)
#> c 
#> 3
empty_rows(tmp)
#> [1] 4

# remove empty columns or rows
remove_empty_columns(tmp)
#>    a  b  d
#> 1  1  1  1
#> 2  2 NA NA
#> 3  3  3  3
#> 4 NA NA NA
#> 5  5  5  5
remove_empty_rows(tmp)
#>   a  b  c  d
#> 1 1  1 NA  1
#> 2 2 NA NA NA
#> 3 3  3 NA  3
#> 5 5  5 NA  5

# remove empty columns and rows
remove_empty(tmp)
#>   a  b  d
#> 1 1  1  1
#> 2 2 NA NA
#> 3 3  3  3
#> 5 5  5  5
```

### Recode or cut dataframe

``` r
set.seed(123)
x <- sample(1:10, size = 50, replace = TRUE)

table(x)
#> x
#>  1  2  3  4  5  6  7  8  9 10 
#>  2  3  5  3  7  5  5  2 11  7

# cut into 3 groups, based on distribution (quantiles)
table(categorize(x, split = "quantile", n_groups = 3))
#> 
#>  1  2  3 
#> 13 19 18
```

## Data Transformations

The packages also contains multiple functions to help transform data.

### Standardize

For example, to standardize (*z*-score) data:

``` r
# before
summary(swiss)
#>    Fertility      Agriculture     Examination      Education    
#>  Min.   :35.00   Min.   : 1.20   Min.   : 3.00   Min.   : 1.00  
#>  1st Qu.:64.70   1st Qu.:35.90   1st Qu.:12.00   1st Qu.: 6.00  
#>  Median :70.40   Median :54.10   Median :16.00   Median : 8.00  
#>  Mean   :70.14   Mean   :50.66   Mean   :16.49   Mean   :10.98  
#>  3rd Qu.:78.45   3rd Qu.:67.65   3rd Qu.:22.00   3rd Qu.:12.00  
#>  Max.   :92.50   Max.   :89.70   Max.   :37.00   Max.   :53.00  
#>     Catholic       Infant.Mortality
#>  Min.   :  2.150   Min.   :10.80   
#>  1st Qu.:  5.195   1st Qu.:18.15   
#>  Median : 15.140   Median :20.00   
#>  Mean   : 41.144   Mean   :19.94   
#>  3rd Qu.: 93.125   3rd Qu.:21.70   
#>  Max.   :100.000   Max.   :26.60

# after
summary(standardize(swiss))
#>    Fertility         Agriculture       Examination         Education      
#>  Min.   :-2.81327   Min.   :-2.1778   Min.   :-1.69084   Min.   :-1.0378  
#>  1st Qu.:-0.43569   1st Qu.:-0.6499   1st Qu.:-0.56273   1st Qu.:-0.5178  
#>  Median : 0.02061   Median : 0.1515   Median :-0.06134   Median :-0.3098  
#>  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
#>  3rd Qu.: 0.66504   3rd Qu.: 0.7481   3rd Qu.: 0.69074   3rd Qu.: 0.1062  
#>  Max.   : 1.78978   Max.   : 1.7190   Max.   : 2.57094   Max.   : 4.3702  
#>     Catholic       Infant.Mortality  
#>  Min.   :-0.9350   Min.   :-3.13886  
#>  1st Qu.:-0.8620   1st Qu.:-0.61543  
#>  Median :-0.6235   Median : 0.01972  
#>  Mean   : 0.0000   Mean   : 0.00000  
#>  3rd Qu.: 1.2464   3rd Qu.: 0.60337  
#>  Max.   : 1.4113   Max.   : 2.28566
```

### Winsorize

To winsorize data:

``` r
# before
anscombe
#>    x1 x2 x3 x4    y1   y2    y3    y4
#> 1  10 10 10  8  8.04 9.14  7.46  6.58
#> 2   8  8  8  8  6.95 8.14  6.77  5.76
#> 3  13 13 13  8  7.58 8.74 12.74  7.71
#> 4   9  9  9  8  8.81 8.77  7.11  8.84
#> 5  11 11 11  8  8.33 9.26  7.81  8.47
#> 6  14 14 14  8  9.96 8.10  8.84  7.04
#> 7   6  6  6  8  7.24 6.13  6.08  5.25
#> 8   4  4  4 19  4.26 3.10  5.39 12.50
#> 9  12 12 12  8 10.84 9.13  8.15  5.56
#> 10  7  7  7  8  4.82 7.26  6.42  7.91
#> 11  5  5  5  8  5.68 4.74  5.73  6.89

# after
winsorize(anscombe)
#>    x1 x2 x3 x4   y1   y2   y3   y4
#> 1  10 10 10  8 8.04 9.13 7.46 6.58
#> 2   8  8  8  8 6.95 8.14 6.77 5.76
#> 3  12 12 12  8 7.58 8.74 8.15 7.71
#> 4   9  9  9  8 8.81 8.77 7.11 8.47
#> 5  11 11 11  8 8.33 9.13 7.81 8.47
#> 6  12 12 12  8 8.81 8.10 8.15 7.04
#> 7   6  6  6  8 7.24 6.13 6.08 5.76
#> 8   6  6  6  8 5.68 6.13 6.08 8.47
#> 9  12 12 12  8 8.81 9.13 8.15 5.76
#> 10  7  7  7  8 5.68 7.26 6.42 7.91
#> 11  6  6  6  8 5.68 6.13 6.08 6.89
```

### Center

To grand-mean center data

``` r
center(anscombe)
#>    x1 x2 x3 x4          y1         y2    y3         y4
#> 1   1  1  1 -1  0.53909091  1.6390909 -0.04 -0.9209091
#> 2  -1 -1 -1 -1 -0.55090909  0.6390909 -0.73 -1.7409091
#> 3   4  4  4 -1  0.07909091  1.2390909  5.24  0.2090909
#> 4   0  0  0 -1  1.30909091  1.2690909 -0.39  1.3390909
#> 5   2  2  2 -1  0.82909091  1.7590909  0.31  0.9690909
#> 6   5  5  5 -1  2.45909091  0.5990909  1.34 -0.4609091
#> 7  -3 -3 -3 -1 -0.26090909 -1.3709091 -1.42 -2.2509091
#> 8  -5 -5 -5 10 -3.24090909 -4.4009091 -2.11  4.9990909
#> 9   3  3  3 -1  3.33909091  1.6290909  0.65 -1.9409091
#> 10 -2 -2 -2 -1 -2.68090909 -0.2409091 -1.08  0.4090909
#> 11 -4 -4 -4 -1 -1.82090909 -2.7609091 -1.77 -0.6109091
```

### Ranktransform

To rank-transform data:

``` r
# before
head(trees)
#>   Girth Height Volume
#> 1   8.3     70   10.3
#> 2   8.6     65   10.3
#> 3   8.8     63   10.2
#> 4  10.5     72   16.4
#> 5  10.7     81   18.8
#> 6  10.8     83   19.7

# after
head(ranktransform(trees))
#>   Girth Height Volume
#> 1     1    6.0    2.5
#> 2     2    3.0    2.5
#> 3     3    1.0    1.0
#> 4     4    8.5    5.0
#> 5     5   25.5    7.0
#> 6     6   28.0    9.0
```

### Rescale

To rescale a numeric variable to a new range:

``` r
change_scale(c(0, 1, 5, -5, -2))
#> [1]  50  60 100   0  30
#> (original range = -5 to 5)
```

### Rotate or transpose

``` r
x <- mtcars[1:3, 1:4]

x
#>                mpg cyl disp  hp
#> Mazda RX4     21.0   6  160 110
#> Mazda RX4 Wag 21.0   6  160 110
#> Datsun 710    22.8   4  108  93

data_rotate(x)
#>      Mazda RX4 Mazda RX4 Wag Datsun 710
#> mpg         21            21       22.8
#> cyl          6             6        4.0
#> disp       160           160      108.0
#> hp         110           110       93.0
```

## Data properties

`datawizard` provides a way to provide comprehensive descriptive summary
for all variables in a dataframe:

``` r
data(iris)
describe_distribution(iris)
#> Variable     | Mean |   SD |  IQR |        Range | Skewness | Kurtosis |   n | n_Missing
#> ----------------------------------------------------------------------------------------
#> Sepal.Length | 5.84 | 0.83 | 1.30 | [4.30, 7.90] |     0.31 |    -0.55 | 150 |         0
#> Sepal.Width  | 3.06 | 0.44 | 0.52 | [2.00, 4.40] |     0.32 |     0.23 | 150 |         0
#> Petal.Length | 3.76 | 1.77 | 3.52 | [1.00, 6.90] |    -0.27 |    -1.40 | 150 |         0
#> Petal.Width  | 1.20 | 0.76 | 1.50 | [0.10, 2.50] |    -0.10 |    -1.34 | 150 |         0
```

Or even just a variable

``` r
describe_distribution(mtcars$wt)
#> Mean |   SD |  IQR |        Range | Skewness | Kurtosis |  n | n_Missing
#> ------------------------------------------------------------------------
#> 3.22 | 0.98 | 1.19 | [1.51, 5.42] |     0.47 |     0.42 | 32 |         0
```

There are also some additional data properties that can be computed
using this package.

``` r
x <- (-10:10)^3 + rnorm(21, 0, 100)
smoothness(x, method = "diff")
#> [1] 1.791243
#> attr(,"class")
#> [1] "parameters_smoothness" "numeric"
```

## Function design and pipe-workflow

The design of the `{datawizard}` functions follows a design principle
that makes it easy for user to understand and remember how functions
work:

1.  the first argument is the data
2.  for methods that work on data frames, two arguments are following to
    `select` and `exclude` variables
3.  the following arguments are arguments related to the specific tasks
    of the functions

Most important, functions that accept data frames usually have this as
their first argument, and also return a (modified) data frame again.
Thus, `{datawizard}` integrates smoothly into a “pipe-workflow”.

``` r
iris |>
  # all rows where Species is "versicolor" or "virginica"
  data_filter(Species %in% c("versicolor", "virginica")) |>
  # select only columns with "." in names (i.e. drop Species)
  data_select(contains("\\.")) |>
  # move columns that ends with "Length" to start of data frame
  data_relocate(ends_with("Length")) |>
  # remove fourth column
  data_remove(4) |>
  head()
#>    Sepal.Length Petal.Length Sepal.Width
#> 51          7.0          4.7         3.2
#> 52          6.4          4.5         3.2
#> 53          6.9          4.9         3.1
#> 54          5.5          4.0         2.3
#> 55          6.5          4.6         2.8
#> 56          5.7          4.5         2.8
```

# Contributing and Support

In case you want to file an issue or contribute in another way to the
package, please follow [this
guide](https://easystats.github.io/datawizard/CONTRIBUTING.html). For
questions about the functionality, you may either contact us via email
or also file an issue.

# Code of Conduct

Please note that this project is released with a [Contributor Code of
Conduct](https://easystats.github.io/datawizard/CODE_OF_CONDUCT.html).
By participating in this project you agree to abide by its terms.


================================================
FILE: air.toml
================================================
[format]
line-width = 80
indent-width = 2
indent-style = "space"
line-ending = "lf"
persistent-line-breaks = true
skip = ["tribble"]


================================================
FILE: cran-comments.md
================================================
This fixes R-devel errors reported on 2026-04-23.


================================================
FILE: datawizard.Rproj
================================================
Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: No

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: knitr
LaTeX: XeLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace

QuitChildProcessesOnExit: Yes
DisableExecuteRprofile: Yes


================================================
FILE: datawizard.code-workspace
================================================
{
	"folders": [
		{
			"path": "."
		}
  ],
  "launch": {
    "version": "0.2.0",
    "configurations": [
      {
        "type": "R-Debugger",
        "name": "Launch R-Workspace",
        "request": "launch",
        "debugMode": "workspace",
        "workingDirectory": ""
      }
    ]
  }
}


================================================
FILE: inst/CITATION
================================================
bibentry(
  bibtype="Article",
  title="{datawizard}: An {R} Package for Easy Data Preparation and Statistical Transformations",
  author=c(person("Indrajeet", "Patil"), person("Dominique", "Makowski"), person("Mattan S.", "Ben-Shachar"), person("Brenton M.", "Wiernik"), person("Etienne", "Bacher"), person("Daniel", "Lüdecke")),
  journal="Journal of Open Source Software",
  year = 2022,
  volume = 7,
  number = 78,
  pages = 4684,
  doi = "10.21105/joss.04684",
  textVersion = "Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. Journal of Open Source Software, 7(78), 4684, https://doi.org/10.21105/joss.04684"
)


================================================
FILE: inst/WORDLIST
================================================
AES
Analysing
Asparouhov
BMC
Bafumi
Brincks
Bulotsky
CMD
Carle
Catran
Crosstables
DEPRECATIONS
De
Dhaliwal
Disaggregating
EFC
EUROFAMCARE
Enders
Fairbrother
GCM
GLMM
Gelman
Giesecke
Giesselmann
Guo
Heisig
Herrington
Hoffmann
Joanes
Kish
Llabre
Lumley
MADs
Mattan
Minitab
ORCID
OpenBLAS
PSU
Ranktransform
Routledge
SDs
Schaeffer
Shachar
Stata
Tidyverse
TitleCase
Verkuilen
Verkuilen's
Winsorize
Winsorized
Winsorizing
al
behaviour
behaviours
brms
codebook
codebooks
codecov
crosstable
crosstables
csv
de
decrypt
decrypted
doi
easystats
effectsize
endogeneity
et
geoms
ggplot's
https
ing
interpretability
inversed
joss
labelled
labelling
leptokurtic
lm
lme
meaned
mesokurtic
midhinge
modelbased
nanoparquet
nd
noLD
openssl
panelr
partialization
platykurtic
poorman
pre
px
readr
readxl
relevel
rio
rowid
rstanarm
sd
stackexchange
tailedness
th
tibble
tibbles
tidyverse
unitless
unstored
visualisation
wikipedia
winsorization
winsorize
winsorized
winsorizes
winsorizing
zscore


================================================
FILE: man/adjust.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/adjust.R
\name{adjust}
\alias{adjust}
\alias{data_adjust}
\title{Adjust data for the effect of other variable(s)}
\usage{
adjust(
  data,
  effect = NULL,
  select = is.numeric,
  exclude = NULL,
  multilevel = FALSE,
  additive = FALSE,
  bayesian = FALSE,
  keep_intercept = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE
)

data_adjust(
  data,
  effect = NULL,
  select = is.numeric,
  exclude = NULL,
  multilevel = FALSE,
  additive = FALSE,
  bayesian = FALSE,
  keep_intercept = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE
)
}
\arguments{
\item{data}{A data frame.}

\item{effect}{Character vector of column names to be adjusted for (regressed
out). If \code{NULL} (the default), all variables will be selected.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{multilevel}{If \code{TRUE}, the factors are included as random factors.
Else, if \code{FALSE} (default), they are included as fixed effects in the
simple regression model.}

\item{additive}{If \code{TRUE}, continuous variables as included as smooth terms
in additive models. The goal is to regress-out potential non-linear
effects.}

\item{bayesian}{If \code{TRUE}, the models are fitted under the Bayesian framework
using \code{rstanarm}.}

\item{keep_intercept}{If \code{FALSE} (default), the intercept of the model is
re-added. This avoids the centering around 0 that happens by default
when regressing out another variable (see the examples below for a
visual representation of this).}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A data frame comparable to \code{data}, with adjusted variables.
}
\description{
This function can be used to adjust the data for the effect of other
variables present in the dataset. It is based on an underlying fitting of
regressions models, allowing for quite some flexibility, such as including
factors as random effects in mixed models (multilevel partialization),
continuous variables as smooth terms in general additive models (non-linear
partialization) and/or fitting these models under a Bayesian framework. The
values returned by this function are the residuals of the regression models.
Note that a regular correlation between two "adjusted" variables is
equivalent to the partial correlation between them.
}
\examples{
\dontshow{if (all(insight::check_if_installed(c("bayestestR", "rstanarm", "gamm4"), quietly = TRUE))) withAutoprint(\{ # examplesIf}
adjusted_all <- adjust(attitude)
head(adjusted_all)
adjusted_one <- adjust(attitude, effect = "complaints", select = "rating")
head(adjusted_one)
\donttest{
adjust(attitude, effect = "complaints", select = "rating", bayesian = TRUE)
adjust(attitude, effect = "complaints", select = "rating", additive = TRUE)
attitude$complaints_LMH <- cut(attitude$complaints, 3)
adjust(attitude, effect = "complaints_LMH", select = "rating", multilevel = TRUE)
}

# Generate data
data <- bayestestR::simulate_correlation(n = 100, r = 0.7)
data$V2 <- (5 * data$V2) + 20 # Add intercept

# Adjust
adjusted <- adjust(data, effect = "V1", select = "V2")
adjusted_icpt <- adjust(data, effect = "V1", select = "V2", keep_intercept = TRUE)

# Visualize
plot(
  data$V1, data$V2,
  pch = 19, col = "blue",
  ylim = c(min(adjusted$V2), max(data$V2)),
  main = "Original (blue), adjusted (green), and adjusted - intercept kept (red) data"
)
abline(lm(V2 ~ V1, data = data), col = "blue")
points(adjusted$V1, adjusted$V2, pch = 19, col = "green")
abline(lm(V2 ~ V1, data = adjusted), col = "green")
points(adjusted_icpt$V1, adjusted_icpt$V2, pch = 19, col = "red")
abline(lm(V2 ~ V1, data = adjusted_icpt), col = "red")
\dontshow{\}) # examplesIf}
}


================================================
FILE: man/as.prop.table.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_tabulate.R
\name{as.prop.table}
\alias{as.prop.table}
\alias{as.prop.table.datawizard_crosstab}
\alias{as.data.frame.datawizard_tables}
\alias{as.table.datawizard_table}
\title{Convert a crosstable to a frequency or a propensity table}
\usage{
as.prop.table(x, ...)

\method{as.prop.table}{datawizard_crosstab}(x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ...)

\method{as.data.frame}{datawizard_tables}(
  x,
  row.names = NULL,
  optional = FALSE,
  ...,
  stringsAsFactors = FALSE,
  add_total = FALSE
)

\method{as.table}{datawizard_table}(x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ...)
}
\arguments{
\item{x}{An object created by \code{data_tabulate()}. It must be of class
\code{datawizard_crosstab} for \code{as.prop.table()}.}

\item{...}{not used.}

\item{remove_na}{Logical, if \code{FALSE}, missing values are included in the
frequency or crosstable, else missing values are omitted. Note that the
default for the \code{as.table()} method is \code{remove_na = TRUE}, so that missing
values are not included in the returned table, which makes more sense for
post-processing of the table, e.g. using \code{chisq.test()}.}

\item{simplify}{Logical, if \code{TRUE}, the returned table is simplified to a
single table object if there is only one frequency or contingency table
input. Else, always for multiple table inputs or when \code{simplify = FALSE}, a
list of tables is returned. This is only relevant for the \code{as.table()}
methods. To ensure consistent output, the default is \code{FALSE}.}

\item{verbose}{Toggle warnings and messages.}

\item{row.names}{\code{NULL} or a character vector giving the row
    names for the data frame.  Missing values are not allowed.}

\item{optional}{logical. If \code{TRUE}, setting row names and
    converting column names (to syntactic names: see
    \code{\link[base]{make.names}}) is optional.  Note that all of \R's
    \pkg{base} package \code{as.data.frame()} methods use
    \code{optional} only for column names treatment, basically with the
    meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}.
    See also the \code{make.names} argument of the \code{matrix} method.}

\item{stringsAsFactors}{logical: should the character vector be converted
    to a factor?}

\item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and
column with the total N values are added to the data frame. \code{add_total} has
no effect in \code{as.data.frame()} for simple frequency tables.}
}
\description{
\code{as.prop.table()} is an S3 generic. It can be used on objects of class
\code{datawizard_crosstab} created by \code{data_tabulate()} when it was run with the
arguments \code{by} and \code{proportions}.
}
\examples{
data(efc)

# Some cross tabulation
cross <- data_tabulate(efc, select = "e42dep", by = "c172code", proportions = "row")
cross

# Convert to a propensity table
as.prop.table(cross)

# Convert to data.frame
result <- data_tabulate(efc, "c172code", by = "e16sex")
as.data.frame(result)
as.data.frame(result)$table
as.data.frame(result, add_total = TRUE)$table

# Convert to a table that can be passed to chisq.test()

out <- data_tabulate(efc, "c172code", by = "e16sex")
# we need to simplify the output, else we get a list of tables
tbl <- as.table(out, simplify = TRUE)
tbl
suppressWarnings(chisq.test(tbl))

# apply chisq.test to each table
out <- data_tabulate(efc, c("c172code", "e16sex"))
suppressWarnings(lapply(as.table(out), chisq.test))

# can also handle grouped data frames
d <- data_group(mtcars, "am")
x <- data_tabulate(d, "cyl", by = "gear")
as.table(x)
}
\seealso{
\link{data_tabulate}
}


================================================
FILE: man/assign_labels.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/assign_labels.R
\name{assign_labels}
\alias{assign_labels}
\alias{assign_labels.numeric}
\alias{assign_labels.data.frame}
\title{Assign variable and value labels}
\usage{
assign_labels(x, ...)

\method{assign_labels}{numeric}(x, variable = NULL, values = NULL, ...)

\method{assign_labels}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  values = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame, factor or vector.}

\item{...}{Currently not used.}

\item{variable}{The variable label as string.}

\item{values}{The value labels as (named) character vector. If \code{values} is
\emph{not} a named vector, the length of labels must be equal to the length of
unique values. For a named vector, the left-hand side (LHS) is the value in
\code{x}, the right-hand side (RHS) the associated value label. Non-matching
labels are omitted.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A labelled variable, or a data frame of labelled variables.
}
\description{
Assign variable and values labels to a variable or variables in a data frame.
Labels are stored as attributes (\code{"label"} for variable labels and \code{"labels"})
for value labels.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
x <- 1:3
# labelling by providing required number of labels
assign_labels(
  x,
  variable = "My x",
  values = c("one", "two", "three")
)

# labelling using named vectors
data(iris)
out <- assign_labels(
  iris$Species,
  variable = "Labelled Species",
  values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3")
)
str(out)

# data frame example
out <- assign_labels(
  iris,
  select = "Species",
  variable = "Labelled Species",
  values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3")
)
str(out$Species)

# Partial labelling
x <- 1:5
assign_labels(
  x,
  variable = "My x",
  values = c(`1` = "lowest", `5` = "highest")
)
}


================================================
FILE: man/categorize.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/categorize.R
\name{categorize}
\alias{categorize}
\alias{categorize.numeric}
\alias{categorize.data.frame}
\title{Recode (or "cut" / "bin") data into groups of values.}
\usage{
categorize(x, ...)

\method{categorize}{numeric}(
  x,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  verbose = TRUE,
  ...
)

\method{categorize}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A (grouped) data frame, numeric vector or factor.}

\item{...}{not used.}

\item{split}{Character vector, indicating at which breaks to split variables,
or numeric values with values indicating breaks. If character, may be one
of \code{"median"}, \code{"mean"}, \code{"quantile"}, \code{"equal_length"}, or \code{"equal_range"}.
\code{"median"} or \code{"mean"} will return dichotomous variables, split at their
mean or median, respectively. \code{"quantile"} and \code{"equal_length"} will split
the variable into \code{n_groups} groups, where each group refers to an interval
of a specific range of values. Thus, the length of each interval will be
based on the number of groups. \code{"equal_range"} also splits the variable
into multiple groups, however, the length of the interval is given, and
the number of resulting groups (and hence, the number of breaks) will be
determined by how many intervals can be generated, based on the full range
of the variable.}

\item{n_groups}{If \code{split} is \code{"quantile"} or \code{"equal_length"}, this defines
the number of requested groups (i.e. resulting number of levels or values)
for the recoded variable(s). \code{"quantile"} will define intervals based
on the distribution of the variable, while \code{"equal_length"} tries to
divide the range of the variable into pieces of equal length.}

\item{range}{If \code{split = "equal_range"}, this defines the range of values
that are recoded into a new value.}

\item{lowest}{Minimum value of the recoded variable(s). If \code{NULL} (the default),
for numeric variables, the minimum of the original input is preserved. For
factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the
default minimum is always \code{1}, unless specified otherwise in \code{lowest}.}

\item{breaks}{Character, indicating whether breaks for categorizing data are
\code{"inclusive"} (values indicate the \emph{upper} bound of the \emph{previous} group or
interval) or \code{"exclusive"} (values indicate the \emph{lower} bound of the \emph{next}
group or interval to begin). Use \code{labels = "range"} to make this behaviour
easier to see.}

\item{labels}{Character vector of value labels. If not \code{NULL}, \code{categorize()}
will returns factors instead of numeric variables, with \code{labels} used
for labelling the factor levels. Can also be \code{"mean"}, \code{"median"},
\code{"range"} or \code{"observed"} for a factor with labels as the mean/median,
the requested range (even if not all values of that range are present in
the data) or observed range (range of the actual recoded values) of each
group. See 'Examples'.}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
\code{x}, recoded into groups. By default \code{x} is numeric, unless \code{labels}
is specified. In this case, a factor is returned, where the factor levels
(i.e. recoded groups are labelled accordingly.
}
\description{
This functions divides the range of variables into intervals and recodes
the values inside these intervals according to their related interval.
It is basically a wrapper around base R's \code{cut()}, providing a simplified
and more accessible way to define the interval breaks (cut-off values).
}
\section{Splits and breaks (cut-off values)}{
Breaks are by default \emph{exclusive}, this means that these values indicate
the lower bound of the next group or interval to begin. Take a simple
example, a numeric variable with values from 1 to 9. The median would be 5,
thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
would turn into 2 (compare \code{cbind(1:9, categorize(1:9))}). The same variable,
using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3.67
and 6.33 (see \code{quantile(1:9, probs = c(1/3, 2/3))}), which means that values
from 1 to 3 belong to the first interval and are recoded into 1 (because
the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.

The opposite behaviour can be achieved using \code{breaks = "inclusive"}, in which
case
}

\section{Recoding into groups with equal size or range}{
\code{split = "equal_length"} and \code{split = "equal_range"} try to divide the
range of \code{x} into intervals of similar (or same) length. The difference is
that \code{split = "equal_length"} will divide the range of \code{x} into \code{n_groups}
pieces and thereby defining the intervals used as breaks (hence, it is
equivalent to \code{cut(x, breaks = n_groups)}), while  \code{split = "equal_range"}
will cut \code{x} into intervals that all have the length of \code{range}, where the
first interval by defaults starts at \code{1}. The lowest (or starting) value
of that interval can be defined using the \code{lowest} argument.
}

\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
set.seed(123)
x <- sample(1:10, size = 50, replace = TRUE)

table(x)

# by default, at median
table(categorize(x))

# into 3 groups, based on distribution (quantiles)
table(categorize(x, split = "quantile", n_groups = 3))

# into 3 groups, user-defined break
table(categorize(x, split = c(3, 5)))

set.seed(123)
x <- sample(1:100, size = 500, replace = TRUE)

# into 5 groups, try to recode into intervals of similar length,
# i.e. the range within groups is the same for all groups
table(categorize(x, split = "equal_length", n_groups = 5))

# into 5 groups, try to return same range within groups
# i.e. 1-20, 21-40, 41-60, etc. Since the range of "x" is
# 1-100, and we have a range of 20, this results into 5
# groups, and thus is for this particular case identical
# to the previous result.
table(categorize(x, split = "equal_range", range = 20))

# return factor with value labels instead of numeric value
set.seed(123)
x <- sample(1:10, size = 30, replace = TRUE)
categorize(x, "equal_length", n_groups = 3)
categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high"))

# cut numeric into groups with the mean or median as a label name
x <- sample(1:10, size = 30, replace = TRUE)
categorize(x, "equal_length", n_groups = 3, labels = "mean")
categorize(x, "equal_length", n_groups = 3, labels = "median")

# cut numeric into groups with the requested range as a label name
# each category has the same range, and labels indicate this range
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
# in this example, each category has the same range, but labels only refer
# to the ranges of the actual values (present in the data) inside each group
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/center.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/center.R
\name{center}
\alias{center}
\alias{centre}
\alias{center.numeric}
\alias{center.data.frame}
\title{Centering (Grand-Mean Centering)}
\usage{
center(x, ...)

centre(x, ...)

\method{center}{numeric}(
  x,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  verbose = TRUE,
  ...
)

\method{center}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  force = FALSE,
  remove_na = c("none", "selected", "all"),
  append = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
)
}
\arguments{
\item{x}{A (grouped) data frame, a (numeric or character) vector or a factor.}

\item{...}{Currently not used.}

\item{robust}{Logical, if \code{TRUE}, centering is done by subtracting the
median from the variables. If \code{FALSE}, variables are centered by
subtracting the mean.}

\item{weights}{Can be \code{NULL} (for no weighting), or:
\itemize{
\item For data frames: a numeric vector of weights, or a character of the
name of a column in the \code{data.frame} that contains the weights.
\item For numeric vectors: a numeric vector of weights.
}}

\item{reference}{A data frame or variable from which the centrality and
deviation will be computed instead of from the input variable. Useful for
standardizing a subset or new data according to another data frame.}

\item{center}{Numeric value, which can be used as alternative to
\code{reference} to define a reference centrality. If \code{center} is of length 1,
it will be recycled to match the length of selected variables for centering.
Else, \code{center} must be of same length as the number of selected variables.
Values in \code{center} will be matched to selected variables in the provided
order, unless a named vector is given. In this case, names are matched
against the names of the selected variables.}

\item{verbose}{Toggle warnings and messages.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{force}{Logical, if \code{TRUE}, forces centering of factors as
well. Factors are converted to numerical values, with the lowest level
being the value \code{1} (unless the factor has numeric levels, which are
converted to the corresponding numeric value).}

\item{remove_na}{How should missing values (\code{NA}) be treated: if \code{"none"}
(default): each column's standardization is done separately, ignoring
\code{NA}s. Else, rows with \code{NA} in the columns selected with \code{select} /
\code{exclude} (\code{"selected"}) or in all columns (\code{"all"}) are dropped before
standardization, and the resulting data frame does not include these cases.}

\item{append}{Logical or string. If \code{TRUE}, centered variables get new
column names (with the suffix \code{"_c"}) and are appended (column bind) to \code{x},
thus returning both the original and the centered variables. If \code{FALSE},
original variables in \code{x} will be overwritten by their centered versions.
If a character value, centered variables are appended with new column
names (using the defined suffix) to the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
The centered variables.
}
\description{
Performs a grand-mean centering of data.
}
\note{
\strong{Difference between centering and standardizing}: Standardized variables
are computed by subtracting the mean of the variable and then dividing it by
the standard deviation, while centering variables involves only the
subtraction.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
data(iris)

# entire data frame or a vector
head(iris$Sepal.Width)
head(center(iris$Sepal.Width))
head(center(iris))
head(center(iris, force = TRUE))

# only the selected columns from a data frame
center(anscombe, select = c("x1", "x3"))
center(anscombe, exclude = c("x1", "x3"))

# centering with reference center and scale
d <- data.frame(
  a = c(-2, -1, 0, 1, 2),
  b = c(3, 4, 5, 6, 7)
)

# default centering at mean
center(d)

# centering, using 0 as mean
center(d, center = 0)

# centering, using -5 as mean
center(d, center = -5)
}
\seealso{
If centering within-clusters (instead of grand-mean centering)
is required, see \code{\link[=demean]{demean()}}. For standardizing, see \code{\link[=standardize]{standardize()}}, and
\code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas.
}


================================================
FILE: man/coef_var.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/descriptives.R
\name{coef_var}
\alias{coef_var}
\alias{distribution_cv}
\alias{distribution_coef_var}
\alias{coef_var.numeric}
\title{Compute the coefficient of variation}
\usage{
coef_var(x, ...)

distribution_coef_var(x, ...)

\method{coef_var}{numeric}(
  x,
  mu = NULL,
  sigma = NULL,
  method = c("standard", "unbiased", "median_mad", "qcd"),
  trim = 0,
  remove_na = FALSE,
  n = NULL,
  ...
)
}
\arguments{
\item{x}{A numeric vector of ratio scale (see details), or vector of values than can be coerced to one.}

\item{...}{Further arguments passed to computation functions.}

\item{mu}{A numeric vector of mean values to use to compute the coefficient
of variation. If supplied, \code{x} is not used to compute the mean.}

\item{sigma}{A numeric vector of standard deviation values to use to compute the coefficient
of variation. If supplied, \code{x} is not used to compute the SD.}

\item{method}{Method to use to compute the CV. Can be \code{"standard"} to compute
by dividing the standard deviation by the mean, \code{"unbiased"} for the
unbiased estimator for normally distributed data, or one of two robust
alternatives: \code{"median_mad"} to divide the median by the \code{\link[stats:mad]{stats::mad()}},
or \code{"qcd"} (quartile coefficient of dispersion, interquartile range divided
by the sum of the quartiles [twice the midhinge]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}.}

\item{trim}{the fraction (0 to 0.5) of values to be trimmed from
each end of \code{x} before the mean and standard deviation (or other measures)
are computed. Values of \code{trim} outside the range of (0 to 0.5) are taken
as the nearest endpoint.}

\item{remove_na}{Logical. Should \code{NA} values be removed before computing (\code{TRUE})
or not (\code{FALSE}, default)?}

\item{n}{If \code{method = "unbiased"} and both \code{mu} and \code{sigma} are provided (not
computed from \code{x}), what sample size to use to adjust the computed CV
for small-sample bias?}
}
\value{
The computed coefficient of variation for \code{x}.
}
\description{
Compute the coefficient of variation (CV, ratio of the standard deviation to
the mean, \eqn{\sigma/\mu}) for a set of numeric values.
}
\details{
CV is only applicable of values taken on a ratio scale: values that have a
\emph{fixed} meaningfully defined 0 (which is either the lowest or highest
possible value), and that ratios between them are interpretable For example,
how many sandwiches have I eaten this week? 0 means "none" and 20 sandwiches
is 4 times more than 5 sandwiches. If I were to center the number of
sandwiches, it will no longer be on a ratio scale (0 is no "none" it is the
mean, and the ratio between 4 and -2 is not meaningful). Scaling a ratio
scale still results in a ratio scale. So I can re define "how many half
sandwiches did I eat this week ( = sandwiches * 0.5) and 0 would still mean
"none", and 20 half-sandwiches is still 4 times more than 5 half-sandwiches.

This means that CV is \strong{NOT} invariant to shifting, but it is to scaling:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7)
coef_var(sandwiches)
#> [1] 1.239094

coef_var(sandwiches / 2) # same
#> [1] 1.239094

coef_var(sandwiches + 4) # different! 0 is no longer meaningful!
#> [1] 0.6290784
}\if{html}{\out{</div>}}
}
\examples{
coef_var(1:10)
coef_var(c(1:10, 100), method = "median_mad")
coef_var(c(1:10, 100), method = "qcd")
coef_var(mu = 10, sigma = 20)
coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30)
}


================================================
FILE: man/coerce_to_numeric.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/to_numeric.R
\name{coerce_to_numeric}
\alias{coerce_to_numeric}
\title{Convert to Numeric (if possible)}
\usage{
coerce_to_numeric(x)
}
\arguments{
\item{x}{A vector to be converted.}
}
\value{
Numeric vector (if possible)
}
\description{
Tries to convert vector to numeric if possible (if no warnings or errors).
Otherwise, leaves it as is.
}
\examples{
coerce_to_numeric(c("1", "2"))
coerce_to_numeric(c("1", "2", "A"))
}


================================================
FILE: man/colnames.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils-cols.R
\name{row_to_colnames}
\alias{row_to_colnames}
\alias{colnames_to_row}
\title{Tools for working with column names}
\usage{
row_to_colnames(x, row = 1, na_prefix = "x", verbose = TRUE)

colnames_to_row(x, prefix = "x")
}
\arguments{
\item{x}{A data frame.}

\item{row}{Row to use as column names.}

\item{na_prefix}{Prefix to give to the column name if the row has an \code{NA}.
Default is 'x', and it will be incremented at each \code{NA} (\code{x1}, \code{x2}, etc.).}

\item{verbose}{Toggle warnings.}

\item{prefix}{Prefix to give to the column name. Default is 'x', and it will
be incremented at each column (\code{x1}, \code{x2}, etc.).}
}
\value{
\code{row_to_colnames()} and \code{colnames_to_row()} both return a data frame.
}
\description{
Tools for working with column names
}
\examples{
# Convert a row to column names --------------------------------
test <- data.frame(
  a = c("iso", 2, 5),
  b = c("year", 3, 6),
  c = c("value", 5, 7)
)
test
row_to_colnames(test)

# Convert column names to row --------------------------------
test <- data.frame(
  ARG = c("BRA", "FRA"),
  `1960` = c(1960, 1960),
  `2000` = c(2000, 2000)
)
test
colnames_to_row(test)

}


================================================
FILE: man/contr.deviation.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/contrs.R
\name{contr.deviation}
\alias{contr.deviation}
\title{Deviation Contrast Matrix}
\usage{
contr.deviation(n, base = 1, contrasts = TRUE, sparse = FALSE)
}
\arguments{
\item{n}{a vector of levels for a factor, or the number of levels.}

\item{base}{an integer specifying which group is considered the
    baseline group. Ignored if \code{contrasts} is \code{FALSE}.}

\item{contrasts}{a logical indicating whether contrasts should be
    computed.}

\item{sparse}{logical indicating if the result should be sparse
    (of class \code{\link[Matrix:dgCMatrix-class]{dgCMatrix}}), using
    package \href{https://CRAN.R-project.org/package=Matrix}{\pkg{Matrix}}.}
}
\description{
Build a deviation contrast matrix, a type of \emph{effects contrast} matrix.
}
\details{
In effects coding, unlike treatment/dummy coding
(\code{\link[stats:contrast]{stats::contr.treatment()}}), each contrast sums to 0. In regressions models,
this results in an intercept that represents the (unweighted) average of the
group means. In ANOVA settings, this also guarantees that lower order effects
represent \emph{main} effects (and not \emph{simple} or \emph{conditional} effects, as is
the case when using R's default \code{\link[stats:contrast]{stats::contr.treatment()}}).
\cr\cr
Deviation coding (\code{contr.deviation}) is a type of effects coding. With
deviation coding, the coefficients for factor variables are interpreted as
the difference of each factor level from the base level (this is the same
interpretation as with treatment/dummy coding). For example, for a factor
\code{group} with levels "A", "B", and "C", with \code{contr.devation}, the intercept
represents the overall mean (average of the group means for the 3 groups),
and the coefficients \code{groupB} and \code{groupC} represent the differences between
the A group mean and the B and C group means, respectively.
\cr\cr
Sum coding (\code{\link[stats:contrast]{stats::contr.sum()}}) is another type of effects coding. With sum
coding, the coefficients for factor variables are interpreted as the
difference of each factor level from \strong{the grand (across-groups) mean}. For
example, for a factor \code{group} with levels "A", "B", and "C", with
\code{contr.sum}, the intercept represents the overall mean (average of the group
means for the 3 groups), and the coefficients \code{group1} and \code{group2} represent
the differences the
\strong{A} and \strong{B} group means from the overall mean, respectively.
}
\examples{
\dontshow{if (!identical(Sys.getenv("IN_PKGDOWN"), "true")) withAutoprint(\{ # examplesIf}
\donttest{
data("mtcars")

mtcars <- data_modify(mtcars, cyl = factor(cyl))

c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
solve(c.treatment)
#>            4 6 8
#> Intercept  1 0 0  # mean of the 1st level
#> 6         -1 1 0  # 2nd level - 1st level
#> 8         -1 0 1  # 3rd level - 1st level

contrasts(mtcars$cyl) <- contr.sum
c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
solve(c.sum)
#>                4      6      8
#> Intercept  0.333  0.333  0.333   # overall mean
#>            0.667 -0.333 -0.333   # deviation of 1st from overall mean
#>           -0.333  0.667 -0.333   # deviation of 2nd from overall mean


contrasts(mtcars$cyl) <- contr.deviation
c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
solve(c.deviation)
#>                4     6     8
#> Intercept  0.333 0.333 0.333   # overall mean
#> 6         -1.000 1.000 0.000   # 2nd level - 1st level
#> 8         -1.000 0.000 1.000   # 3rd level - 1st level

## With Interactions -----------------------------------------
mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
mtcars <- data_arrange(mtcars, select = c("cyl", "am"))

mm <- unique(model.matrix(~ cyl * am, data = mtcars))
rownames(mm) <- c(
  "cyl4.am0", "cyl4.am1", "cyl6.am0",
  "cyl6.am1", "cyl8.am0", "cyl8.am1"
)

solve(mm)
#>             cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
#> (Intercept)    0.167    0.167    0.167    0.167    0.167    0.167  # overall mean
#> cyl6          -0.500   -0.500    0.500    0.500    0.000    0.000  # cyl MAIN eff: 2nd - 1st
#> cyl8          -0.500   -0.500    0.000    0.000    0.500    0.500  # cyl MAIN eff: 2nd - 1st
#> am1           -0.333    0.333   -0.333    0.333   -0.333    0.333  # am MAIN eff
#> cyl6:am1       1.000   -1.000   -1.000    1.000    0.000    0.000
#> cyl8:am1       1.000   -1.000    0.000    0.000   -1.000    1.000
}
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[stats:contrast]{stats::contr.sum()}}
}


================================================
FILE: man/convert_na_to.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/convert_na_to.R
\name{convert_na_to}
\alias{convert_na_to}
\alias{convert_na_to.numeric}
\alias{convert_na_to.character}
\alias{convert_na_to.data.frame}
\title{Replace missing values in a variable or a data frame.}
\usage{
convert_na_to(x, ...)

\method{convert_na_to}{numeric}(x, replacement = NULL, verbose = TRUE, ...)

\method{convert_na_to}{character}(x, replacement = NULL, verbose = TRUE, ...)

\method{convert_na_to}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  replacement = NULL,
  replace_num = replacement,
  replace_char = replacement,
  replace_fac = replacement,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A numeric, factor, or character vector, or a data frame.}

\item{...}{Not used.}

\item{replacement}{Numeric or character value that will be used to
replace \code{NA}.}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{replace_num}{Value to replace \code{NA} when variable is of type numeric.}

\item{replace_char}{Value to replace \code{NA} when variable is of type character.}

\item{replace_fac}{Value to replace \code{NA} when variable is of type factor.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
\code{x}, where \code{NA} values are replaced by \code{replacement}.
}
\description{
Replace missing values in a variable or a data frame.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
# Convert NA to 0 in a numeric vector
convert_na_to(
  c(9, 3, NA, 2, 3, 1, NA, 8),
  replacement = 0
)

# Convert NA to "missing" in a character vector
convert_na_to(
  c("a", NA, "d", "z", NA, "t"),
  replacement = "missing"
)

### For data frames

test_df <- data.frame(
  x = c(1, 2, NA),
  x2 = c(4, 5, NA),
  y = c("a", "b", NA)
)

# Convert all NA to 0 in numeric variables, and all NA to "missing" in
# character variables
convert_na_to(
  test_df,
  replace_num = 0,
  replace_char = "missing"
)

# Convert a specific variable in the data frame
convert_na_to(
  test_df,
  replace_num = 0,
  replace_char = "missing",
  select = "x"
)

# Convert all variables starting with "x"
convert_na_to(
  test_df,
  replace_num = 0,
  replace_char = "missing",
  select = starts_with("x")
)

# Convert NA to 1 in variable 'x2' and to 0 in all other numeric
# variables
convert_na_to(
  test_df,
  replace_num = 0,
  select = list(x2 = 1)
)

}


================================================
FILE: man/convert_to_na.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/convert_to_na.R
\name{convert_to_na}
\alias{convert_to_na}
\alias{convert_to_na.numeric}
\alias{convert_to_na.factor}
\alias{convert_to_na.data.frame}
\title{Convert non-missing values in a variable into missing values.}
\usage{
convert_to_na(x, ...)

\method{convert_to_na}{numeric}(x, na = NULL, verbose = TRUE, ...)

\method{convert_to_na}{factor}(x, na = NULL, drop_levels = FALSE, verbose = TRUE, ...)

\method{convert_to_na}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  na = NULL,
  drop_levels = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A vector, factor or a data frame.}

\item{...}{Not used.}

\item{na}{Numeric, character vector or logical (or a list of numeric, character
vectors or logicals) with values that should be converted to \code{NA}. Numeric
values applied to numeric vectors, character values are used for factors,
character vectors or date variables, and logical values for logical vectors.}

\item{verbose}{Toggle warnings.}

\item{drop_levels}{Logical, for factors, when specific levels are replaced
by \code{NA}, should unused levels be dropped?}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
\code{x}, where all values in \code{na} are converted to \code{NA}.
}
\description{
Convert non-missing values in a variable into missing values.
}
\examples{
x <- sample(1:6, size = 30, replace = TRUE)
x
# values 4 and 5 to NA
convert_to_na(x, na = 4:5)

# data frames
set.seed(123)
x <- data.frame(
  a = sample(1:6, size = 20, replace = TRUE),
  b = sample(letters[1:6], size = 20, replace = TRUE),
  c = sample(c(30:33, 99), size = 20, replace = TRUE)
)
# for all numerics, convert 5 to NA. Character/factor will be ignored.
convert_to_na(x, na = 5)

# for numerics, 5 to NA, for character/factor, "f" to NA
convert_to_na(x, na = list(6, "f"))

# select specific variables
convert_to_na(x, select = c("a", "b"), na = list(6, "f"))
}


================================================
FILE: man/data_arrange.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_arrange.R
\name{data_arrange}
\alias{data_arrange}
\title{Arrange rows by column values}
\usage{
data_arrange(data, select = NULL, safe = TRUE)
}
\arguments{
\item{data}{A data frame, or an object that can be coerced to a data frame.}

\item{select}{Character vector of column names. Use a dash just before column
name to arrange in decreasing order, for example \code{"-x1"}.}

\item{safe}{Do not throw an error if one of the variables specified doesn't
exist.}
}
\value{
A data frame.
}
\description{
\code{data_arrange()} orders the rows of a data frame by the values of selected
columns.
}
\examples{

# Arrange using several variables
data_arrange(head(mtcars), c("gear", "carb"))

# Arrange in decreasing order
data_arrange(head(mtcars), "-carb")

# Throw an error if one of the variables specified doesn't exist
try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE))
}


================================================
FILE: man/data_codebook.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_codebook.R
\name{data_codebook}
\alias{data_codebook}
\alias{print_html.data_codebook}
\alias{display.data_codebook}
\title{Generate a codebook of a data frame.}
\usage{
data_codebook(
  data,
  select = NULL,
  exclude = NULL,
  variable_label_width = NULL,
  value_label_width = NULL,
  max_values = 10,
  range_at = 6,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

\method{print_html}{data_codebook}(
  x,
  font_size = "100\%",
  line_padding = 3,
  row_color = "#eeeeee",
  ...
)

\method{display}{data_codebook}(
  object,
  format = "markdown",
  font_size = "100\%",
  line_padding = 3,
  row_color = "#eeeeee",
  ...
)
}
\arguments{
\item{data}{A data frame, or an object that can be coerced to a data frame.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{variable_label_width}{Length of variable labels. Longer labels will be
wrapped at \code{variable_label_width} chars. If \code{NULL}, longer labels will not
be split into multiple lines. Only applies to \emph{labelled data}.}

\item{value_label_width}{Length of value labels. Longer labels will be
shortened, where the remaining part is truncated. Only applies to
\emph{labelled data} or factor levels.}

\item{max_values}{Number of maximum values that should be displayed. Can be
used to avoid too many rows when variables have lots of unique values.}

\item{range_at}{Indicates how many unique values in a numeric vector are
needed in order to print a range for that variable instead of a frequency
table for all numeric values. Can be useful if the data contains numeric
variables with only a few unique values and where full frequency tables
instead of value ranges should be displayed.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings and messages on or off.}

\item{...}{Arguments passed to or from other methods.}

\item{x}{A (grouped) data frame, a vector or a statistical model (for
\code{unstandardize()} cannot be a model).}

\item{font_size}{For HTML tables, the font size.}

\item{line_padding}{For HTML tables, the distance (in pixel) between lines.}

\item{row_color}{For HTML tables, the fill color for odd rows.}

\item{object}{An object returned by \code{data_tabulate()}.}

\item{format}{String, indicating the output format. Can be \code{"markdown"}
\code{"html"}, or \code{"tt"}. \code{format = "html"} create an HTML table using the \emph{gt}
package. \code{format = "tt"} creates a \code{tinytable} object, which is either
printed as markdown or HTML table, depending on the environment. See
\code{\link[insight:export_table]{insight::export_table()}} for details.}
}
\value{
A formatted data frame, summarizing the content of the data frame.
Returned columns include the column index of the variables in the original
data frame (\code{ID}), column name, variable label (if data is labelled), type
of variable, number of missing values, unique values (or value range),
value labels (for labelled data), and a frequency table (N for each value).
Most columns are formatted as character vectors.
}
\description{
\code{data_codebook()} generates codebooks from data frames, i.e. overviews
of all variables and some more information about each variable (like
labels, values or value range, frequencies, amount of missing values).
}
\note{
There are methods to \code{print()} the data frame in a nicer output, as
well methods for printing in markdown or HTML format (\code{print_md()} and
\code{print_html()}). The \code{print()} method for text outputs passes arguments in
\code{...} to \code{\link[insight:export_table]{insight::export_table()}}.
}
\examples{
data(iris)
data_codebook(iris, select = starts_with("Sepal"))

data(efc)
data_codebook(efc)

# shorten labels
data_codebook(efc, variable_label_width = 20, value_label_width = 15)

# automatic range for numerics at more than 5 unique values
data(mtcars)
data_codebook(mtcars, select = starts_with("c"))

# force all values to be displayed
data_codebook(mtcars, select = starts_with("c"), range_at = 100)
}


================================================
FILE: man/data_duplicated.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_duplicated.R
\name{data_duplicated}
\alias{data_duplicated}
\title{Extract all duplicates}
\usage{
data_duplicated(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A dataframe, containing all duplicates.
}
\description{
Extract all duplicates, for visual inspection.
Note that it also contains the first occurrence of future
duplicates, unlike \code{\link[=duplicated]{duplicated()}} or \code{\link[dplyr:distinct]{dplyr::distinct()}}). Also
contains an additional column reporting the number of missing
values for that row, to help in the decision-making when
selecting which duplicates to keep.
}
\examples{
df1 <- data.frame(
  id = c(1, 2, 3, 1, 3),
  year = c(2022, 2022, 2022, 2022, 2000),
  item1 = c(NA, 1, 1, 2, 3),
  item2 = c(NA, 1, 1, 2, 3),
  item3 = c(NA, 1, 1, 2, 3)
)

data_duplicated(df1, select = "id")

data_duplicated(df1, select = c("id", "year"))

# Filter to exclude duplicates
df2 <- df1[-c(1, 5), ]
df2

}
\seealso{
\code{\link[=data_unique]{data_unique()}}
}
\keyword{duplicates}


================================================
FILE: man/data_extract.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_extract.R
\name{data_extract}
\alias{data_extract}
\alias{data_extract.data.frame}
\title{Extract one or more columns or elements from an object}
\usage{
data_extract(data, select, ...)

\method{data_extract}{data.frame}(
  data,
  select,
  name = NULL,
  extract = "all",
  as_data_frame = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{data}{The object to subset. Methods are currently available for data frames
and data frame extensions (e.g., tibbles).}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{...}{For use by future methods.}

\item{name}{An optional argument that specifies the column to be used as
names for the vector elements after extraction. Must be specified either
as literal variable name (e.g., \code{column_name}) or as string
(\code{"column_name"}). \code{name} will be ignored when a data frame is returned.}

\item{extract}{String, indicating which element will be extracted when \code{select}
matches multiple variables. Can be \code{"all"} (the default) to return all
matched variables, \code{"first"} or \code{"last"} to return the first or last match,
or \code{"odd"} and \code{"even"} to return all odd-numbered or even-numbered
matches. Note that \code{"first"} or \code{"last"} return a vector (unless
\code{as_data_frame = TRUE}), while \code{"all"} can return a vector (if only one
match was found) \emph{or} a data frame (for more than one match). Type safe
return values are only possible when \code{extract} is \code{"first"} or \code{"last"} (will
always return a vector) or when \code{as_data_frame = TRUE} (always returns a
data frame).}

\item{as_data_frame}{Logical, if \code{TRUE}, will always return a data frame,
even if only one variable was matched. If \code{FALSE}, either returns a vector
or a data frame. See \code{extract} for details.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A vector (or a data frame) containing the extracted element, or
\code{NULL} if no matching variable was found.
}
\description{
\code{data_extract()} (or its alias \code{extract()}) is similar to \code{$}. It extracts
either a single column or element from an object (e.g., a data frame, list),
or multiple columns resp. elements.
}
\details{
\code{data_extract()} can be used to select multiple variables or pull a
single variable from a data frame. Thus, the return value is by default not
type safe - \code{data_extract()} either returns a vector or a data frame.
\subsection{Extracting single variables (vectors)}{
When \code{select} is the name of a single column, or when select only matches
one column, a vector is returned. A single variable is also returned when
\code{extract} is either \verb{"first} or \code{"last"}. Setting \code{as_data_frame} to \code{TRUE}
overrides this behaviour and \emph{always} returns a data frame.
}
\subsection{Extracting a data frame of variables}{
When \code{select} is a character vector containing more than one column name (or
a numeric vector with more than one valid column indices), or when \code{select}
uses one of the supported select-helpers that match multiple columns, a
data frame is returned. Setting \code{as_data_frame} to \code{TRUE} \emph{always} returns
a data frame.
}
}
\examples{
# single variable
data_extract(mtcars, cyl, name = gear)
data_extract(mtcars, "cyl", name = gear)
data_extract(mtcars, -1, name = gear)
data_extract(mtcars, cyl, name = 0)
data_extract(mtcars, cyl, name = "row.names")

# selecting multiple variables
head(data_extract(iris, starts_with("Sepal")))
head(data_extract(iris, ends_with("Width")))
head(data_extract(iris, 2:4))

# select first of multiple variables
data_extract(iris, starts_with("Sepal"), extract = "first")

# select first of multiple variables, return as data frame
head(data_extract(iris, starts_with("Sepal"), extract = "first", as_data_frame = TRUE))
}


================================================
FILE: man/data_group.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_group.R
\name{data_group}
\alias{data_group}
\alias{data_ungroup}
\title{Create a grouped data frame}
\usage{
data_group(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

data_ungroup(data, verbose = TRUE, ...)
}
\arguments{
\item{data}{A data frame}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}

\item{...}{Arguments passed down to other functions. Mostly not used yet.}
}
\value{
A grouped data frame, i.e. a data frame with additional information
about the grouping structure saved as attributes.
}
\description{
This function is comparable to \code{dplyr::group_by()}, but just
following the \strong{datawizard} function design. \code{data_ungroup()} removes the
grouping information from a grouped data frame.
}
\examples{
\dontshow{if (requireNamespace("poorman")) withAutoprint(\{ # examplesIf}
data(efc)
suppressPackageStartupMessages(library(poorman, quietly = TRUE))

# total mean
efc \%>\%
  summarize(mean_hours = mean(c12hour, na.rm = TRUE))

# mean by educational level
efc \%>\%
  data_group(c172code) \%>\%
  summarize(mean_hours = mean(c12hour, na.rm = TRUE))
\dontshow{\}) # examplesIf}
}


================================================
FILE: man/data_match.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_match.R
\name{data_match}
\alias{data_match}
\alias{data_filter}
\title{Return filtered or sliced data frame, or row indices}
\usage{
data_match(x, to, match = "and", return_indices = FALSE, remove_na = TRUE, ...)

data_filter(x, ...)
}
\arguments{
\item{x}{A data frame.}

\item{to}{A data frame matching the specified conditions. Note that if
\code{match} is a value other than \code{"and"}, the original row order might be
changed. See 'Details'.}

\item{match}{String, indicating with which logical operation matching
conditions should be combined. Can be \code{"and"} (or \code{"&"}), \code{"or"} (or \code{"|"})
or \code{"not"} (or \code{"!"}).}

\item{return_indices}{Logical, if \code{TRUE}, return the vector of rows that
can be used to filter the original data frame. If \code{FALSE} (default),
returns directly the filtered data frame instead of the row indices.}

\item{remove_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before
filtering the data. This is the default behaviour, however, sometimes when
row indices are requested (i.e. \code{return_indices=TRUE}), it might be useful
to preserve \code{NA} values, so returned row indices match the row indices of
the original data frame.}

\item{...}{A sequence of logical expressions indicating which rows to keep,
or a numeric vector indicating the row indices of rows to keep. Can also be
a string representation of a logical expression (e.g. \code{"x > 4"}), a
character vector (e.g. \code{c("x > 4", "y == 2")}) or a variable that contains
the string representation of a logical expression. These might be useful
when used in packages to avoid defining undefined global variables.}
}
\value{
A filtered data frame, or the row indices that match the specified
configuration.
}
\description{
Return a filtered (or sliced) data frame or row indices of a data frame that
match a specific condition. \code{data_filter()} works like \code{data_match()}, but works
with logical expressions or row indices of a data frame to specify matching
conditions.
}
\details{
For \code{data_match()}, if \code{match} is either \code{"or"} or \code{"not"}, the
original row order from \code{x} might be changed. If preserving row order is
required, use \code{data_filter()} instead.

\if{html}{\out{<div class="sourceCode">}}\preformatted{# mimics subset() behaviour, preserving original row order
head(data_filter(mtcars[c("mpg", "vs", "am")], vs == 0 | am == 1))
#>                    mpg vs am
#> Mazda RX4         21.0  0  1
#> Mazda RX4 Wag     21.0  0  1
#> Datsun 710        22.8  1  1
#> Hornet Sportabout 18.7  0  0
#> Duster 360        14.3  0  0
#> Merc 450SE        16.4  0  0

# re-sorting rows
head(data_match(mtcars[c("mpg", "vs", "am")],
                data.frame(vs = 0, am = 1),
                match = "or"))
#>                    mpg vs am
#> Mazda RX4         21.0  0  1
#> Mazda RX4 Wag     21.0  0  1
#> Hornet Sportabout 18.7  0  0
#> Duster 360        14.3  0  0
#> Merc 450SE        16.4  0  0
#> Merc 450SL        17.3  0  0
}\if{html}{\out{</div>}}

While \code{data_match()} works with data frames to match conditions against,
\code{data_filter()} is basically a wrapper around \verb{subset(subset = <filter>)}.
However, unlike \code{subset()}, it preserves label attributes and is useful when
working with labelled data.
}
\examples{
data_match(mtcars, data.frame(vs = 0, am = 1))
data_match(mtcars, data.frame(vs = 0, am = c(0, 1)))

# observations where "vs" is NOT 0 AND "am" is NOT 1
data_match(mtcars, data.frame(vs = 0, am = 1), match = "not")
# equivalent to
data_filter(mtcars, vs != 0 & am != 1)

# observations where EITHER "vs" is 0 OR "am" is 1
data_match(mtcars, data.frame(vs = 0, am = 1), match = "or")
# equivalent to
data_filter(mtcars, vs == 0 | am == 1)

# slice data frame by row indices
data_filter(mtcars, 5:10)

# Define a custom function containing data_filter()
my_filter <- function(data, variable) {
  data_filter(data, variable)
}
my_filter(mtcars, "cyl == 6")

# Pass complete filter-condition as string.
my_filter <- function(data, condition) {
  data_filter(data, condition)
}
my_filter(mtcars, "am != 0")

# string can also be used directly as argument
data_filter(mtcars, "am != 0")

# or as variable
fl <- "am != 0"
data_filter(mtcars, fl)
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_merge.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_merge.R
\name{data_merge}
\alias{data_merge}
\alias{data_join}
\alias{data_merge.data.frame}
\alias{data_merge.list}
\title{Merge (join) two data frames, or a list of data frames}
\usage{
data_merge(x, ...)

data_join(x, ...)

\method{data_merge}{data.frame}(x, y, join = "left", by = NULL, id = NULL, verbose = TRUE, ...)

\method{data_merge}{list}(x, join = "left", by = NULL, id = NULL, verbose = TRUE, ...)
}
\arguments{
\item{x, y}{A data frame to merge. \code{x} may also be a list of data frames
that will be merged. Note that the list-method has no \code{y} argument.}

\item{...}{Not used.}

\item{join}{Character vector, indicating the method of joining the data frames.
Can be \code{"full"}, \code{"left"} (default), \code{"right"}, \code{"inner"}, \code{"anti"}, \code{"semi"}
or \code{"bind"}. See details below.}

\item{by}{Specifications of the columns used for merging.}

\item{id}{Optional name for ID column that will be created to indicate the
source data frames for appended rows. Only applies if \code{join = "bind"}.}

\item{verbose}{Toggle warnings.}
}
\value{
A merged data frame.
}
\description{
Merge (join) two data frames, or a list of data frames. However, unlike
base R's \code{merge()}, \code{data_merge()} offers a few more methods to join data
frames, and it does not drop data frame nor column attributes.
}
\section{Merging data frames}{


Merging data frames is performed by adding rows (cases), columns
(variables) or both from the source data frame (\code{y}) to the target
data frame (\code{x}). This usually requires one or more variables which
are included in both data frames and that are used for merging, typically
indicated with the \code{by} argument. When \code{by} contains a variable present
in both data frames, cases are matched and filtered by identical values
of \code{by} in \code{x} and \code{y}.
}

\section{Left- and right-joins}{


Left- and right joins usually don't add new rows (cases), but only new
columns (variables) for existing cases in \code{x}. For \code{join = "left"} or
\code{join = "right"} to work, \code{by} \emph{must} indicate one or more columns that
are included in both data frames. For \code{join = "left"}, if \code{by} is an
identifier variable, which is included in both \code{x} and \code{y}, all variables
from \code{y} are copied to \code{x}, but only those cases from \code{y} that have
matching values in their identifier variable in \code{x} (i.e. all cases
in \code{x} that are also found in \code{y} get the related values from the new
columns in \code{y}). If there is no match between identifiers in \code{x} and \code{y},
the copied variable from \code{y} will get a \code{NA} value for this particular
case. Other variables that occur both in \code{x} and \code{y}, but are not used
as identifiers (with \code{by}), will be renamed to avoid multiple identical
variable names. Cases in \code{y} where values from the identifier have no
match in \code{x}'s identifier are removed. \code{join = "right"} works in
a similar way as \code{join = "left"}, just that only cases from \code{x} that
have matching values in their identifier variable in \code{y} are chosen.

In base R, these are equivalent to \code{merge(x, y, all.x = TRUE)} and
\code{merge(x, y, all.y = TRUE)}.
}

\section{Full joins}{


Full joins copy all cases from \code{y} to \code{x}. For matching cases in both
data frames, values for new variables are copied from \code{y} to \code{x}. For
cases in \code{y} not present in \code{x}, these will be added as new rows to \code{x}.
Thus, full joins not only add new columns (variables), but also might
add new rows (cases).

In base R, this is equivalent to \code{merge(x, y, all = TRUE)}.
}

\section{Inner joins}{


Inner joins merge two data frames, however, only those rows (cases) are
kept that are present in both data frames. Thus, inner joins usually
add new columns (variables), but also remove rows (cases) that only
occur in one data frame.

In base R, this is equivalent to \code{merge(x, y)}.
}

\section{Binds}{


\code{join = "bind"} row-binds the complete second data frame \code{y} to \code{x}.
Unlike simple \code{rbind()}, which requires the same columns for both data
frames, \code{join = "bind"} will bind shared columns from \code{y} to \code{x}, and
add new columns from \code{y} to \code{x}.
}

\examples{

x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4)

x
y

# "by" will default to all shared columns, i.e. "c" and "id". new columns
# "d" and "e" will be copied from "y" to "x", but there are only two cases
# in "x" that have the same values for "c" and "id" in "y". only those cases
# have values in the copied columns, the other case gets "NA".
data_merge(x, y, join = "left")

# we change the id-value here
x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 3:5)

x
y

# no cases in "y" have the same matching "c" and "id" as in "x", thus
# copied variables from "y" to "x" copy no values, all get NA.
data_merge(x, y, join = "left")

# one case in "y" has a match in "id" with "x", thus values for this
# case from the remaining variables in "y" are copied to "x", all other
# values (cases) in those remaining variables get NA
data_merge(x, y, join = "left", by = "id")

data(mtcars)
x <- mtcars[1:5, 1:3]
y <- mtcars[28:32, 4:6]

# add ID common column
x$id <- 1:5
y$id <- 3:7

# left-join, add new variables and copy values from y to x,
# where "id" values match
data_merge(x, y)

# right-join, add new variables and copy values from x to y,
# where "id" values match
data_merge(x, y, join = "right")

# full-join
data_merge(x, y, join = "full")


data(mtcars)
x <- mtcars[1:5, 1:3]
y <- mtcars[28:32, c(1, 4:5)]

# add ID common column
x$id <- 1:5
y$id <- 3:7

# left-join, no matching rows (because columns "id" and "disp" are used)
# new variables get all NA values
data_merge(x, y)

# one common value in "mpg", so one row from y is copied to x
data_merge(x, y, by = "mpg")

# only keep rows with matching values in by-column
data_merge(x, y, join = "semi", by = "mpg")

# only keep rows with non-matching values in by-column
data_merge(x, y, join = "anti", by = "mpg")

# merge list of data frames. can be of different rows
x <- mtcars[1:5, 1:3]
y <- mtcars[28:31, 3:5]
z <- mtcars[11:18, c(1, 3:4, 6:8)]
x$id <- 1:5
y$id <- 4:7
z$id <- 3:10
data_merge(list(x, y, z), join = "bind", by = "id", id = "source")
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_modify.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_modify.R
\name{data_modify}
\alias{data_modify}
\alias{data_modify.data.frame}
\title{Create new variables in a data frame}
\usage{
data_modify(data, ...)

\method{data_modify}{data.frame}(data, ..., .if = NULL, .at = NULL, .modify = NULL)
}
\arguments{
\item{data}{A data frame}

\item{...}{One or more expressions that define the new variable name and the
values or recoding of those new variables. These expressions can be one of:
\itemize{
\item A sequence of named, literal expressions, where the left-hand side refers
to the name of the new variable, while the right-hand side represent the
values of the new variable. Example: \code{Sepal.Width = center(Sepal.Width)}.
\item A vector of length 1 (which will be recycled to match the number of rows
in the data), or of same length as the data.
\item A variable that contains a value to be used. Example:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{a <- "abc"
data_modify(iris, var_abc = a) # var_abc contains "abc"
}\if{html}{\out{</div>}}
\item An expression can also be provided as string and wrapped in
\code{as_expr()}. Example:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{data_modify(iris, as_expr("Sepal.Width = center(Sepal.Width)"))
# or
a <- "center(Sepal.Width)"
data_modify(iris, Sepal.Width = as_expr(a))
# or
a <- "Sepal.Width = center(Sepal.Width)"
data_modify(iris, as_expr(a))
}\if{html}{\out{</div>}}

Note that \code{as_expr()} is no real function, which cannot be used outside
of \code{data_modify()}, and hence it is not exported nor documented. Rather,
it is only used for internally processing expressions.
\item Using \code{NULL} as right-hand side removes a variable from the data frame.
Example: \code{Petal.Width = NULL}.
\item For data frames (including grouped ones), the function \code{n()} can be used to
count the number of observations and thereby, for instance, create index
values by using \code{id = 1:n()} or \code{id = 3:(n()+2)} and similar. Note that,
like \code{as_expr()}, \code{n()} is also no true function and cannot be used outside
of \code{data_modify()}.
}

Note that newly created variables can be used in subsequent expressions,
including \code{.at} or \code{.if}. See also 'Examples'.}

\item{.if}{A function that returns \code{TRUE} for columns in the data frame where
\code{.if} applies. This argument is used in combination with the \code{.modify} argument.
Note that only one of \code{.at} or \code{.if} can be provided, but not both at the same
time. Newly created variables in \code{...} can also be selected, see 'Examples'.}

\item{.at}{A character vector of variable names that should be modified. This
argument is used in combination with the \code{.modify} argument. Note that only one
of \code{.at} or \code{.if} can be provided, but not both at the same time. Newly created
variables in \code{...} can also be selected, see 'Examples'.}

\item{.modify}{A function that modifies the variables defined in \code{.at} or \code{.if}.
This argument is used in combination with either the \code{.at} or the \code{.if} argument.
Note that the modified variable (i.e. the result from \code{.modify}) must be either
of length 1 or of same length as the input variable.}
}
\description{
Create new variables or modify existing variables in a data frame. Unlike \code{base::transform()}, \code{data_modify()}
can be used on grouped data frames, and newly created variables can be directly
used.
}
\note{
\code{data_modify()} can also be used inside functions. However, it is
recommended to pass the recode-expression as character vector or list of
characters.
}
\examples{
data(efc)
new_efc <- data_modify(
  efc,
  c12hour_c = center(c12hour),
  c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
  c12hour_z2 = standardize(c12hour)
)
head(new_efc)

# using strings instead of literal expressions
new_efc <- data_modify(
  efc,
  as_expr("c12hour_c = center(c12hour)"),
  as_expr("c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"),
  as_expr("c12hour_z2 = standardize(c12hour)")
)
head(new_efc)

# using a character vector, provided a variable
xpr <- c(
  "c12hour_c = center(c12hour)",
  "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
  "c12hour_z2 = standardize(c12hour)"
)
new_efc <- data_modify(efc, as_expr(xpr))
head(new_efc)

# using character strings, provided as variable
stand <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
new_efc <- data_modify(
  efc,
  c12hour_c = center(c12hour),
  c12hour_z = as_expr(stand)
)
head(new_efc)

# attributes - in this case, value and variable labels - are preserved
str(new_efc)

# using `paste()` to build a string-expression
to_standardize <- c("Petal.Length", "Sepal.Length")
out <- data_modify(
  iris,
  as_expr(
    paste0(to_standardize, "_stand = standardize(", to_standardize, ")")
  )
)
head(out)

# overwrite existing variable, remove old variable
out <- data_modify(iris, Petal.Length = 1 / Sepal.Length, Sepal.Length = NULL)
head(out)

# works on grouped data
grouped_efc <- data_group(efc, "c172code")
new_efc <- data_modify(
  grouped_efc,
  c12hour_c = center(c12hour),
  c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
  c12hour_z2 = standardize(c12hour),
  id = 1:n()
)
head(new_efc)

# works from inside functions
foo1 <- function(data, ...) {
  head(data_modify(data, ...))
}
foo1(iris, SW_fraction = Sepal.Width / 10)
# or
foo1(iris, as_expr("SW_fraction = Sepal.Width / 10"))

# also with string arguments, using `as_expr()`
foo2 <- function(data, modification) {
  head(data_modify(data, as_expr(modification)))
}
foo2(iris, "SW_fraction = Sepal.Width / 10")

# modify at specific positions or if condition is met
d <- iris[1:5, ]
data_modify(d, .at = "Species", .modify = as.numeric)
data_modify(d, .if = is.factor, .modify = as.numeric)

# can be combined with dots
data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric)

# new variables used in `.at` or `.if`
data_modify(
  d,
  new_length = Petal.Length * 2,
  .at = c("Petal.Length", "new_length"),
  .modify = round
)

# combine "extract_column_names()" and ".at" argument
out <- data_modify(
  d,
  .at = extract_column_names(d, select = starts_with("Sepal")),
  .modify = as.factor
)
# "Sepal.Length" and "Sepal.Width" are now factors
str(out)

}


================================================
FILE: man/data_partition.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_partition.R
\name{data_partition}
\alias{data_partition}
\title{Partition data}
\usage{
data_partition(
  data,
  proportion = 0.7,
  by = NULL,
  seed = NULL,
  row_id = ".row_id",
  verbose = TRUE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{proportion}{Scalar (between 0 and 1) or numeric vector, indicating the
proportion(s) of the training set(s). The sum of \code{proportion} must not be
greater than 1. The remaining part will be used for the test set.}

\item{by}{A character vector indicating the name(s) of the column(s) used
for stratified partitioning.}

\item{seed}{A random number generator seed. Enter an integer (e.g. 123) so
that the random sampling will be the same each time you run the function.}

\item{row_id}{Character string, indicating the name of the column that
contains the row-id's.}

\item{verbose}{Toggle messages and warnings.}

\item{...}{Other arguments passed to or from other functions.}
}
\value{
A list of data frames. The list includes one training set per given
proportion and the remaining data as test set. List elements of training
sets are named after the given proportions (e.g., \verb{$p_0.7}), the test set
is named \verb{$test}.
}
\description{
Creates data partitions (for instance, a training and a test set) based on a
data frame that can also be stratified (i.e., evenly spread a given factor)
using the \code{by} argument.
}
\examples{
data(iris)
out <- data_partition(iris, proportion = 0.9)
out$test
nrow(out$p_0.9)

# Stratify by group (equal proportions of each species)
out <- data_partition(iris, proportion = 0.9, by = "Species")
out$test

# Create multiple partitions
out <- data_partition(iris, proportion = c(0.3, 0.3))
lapply(out, head)

# Create multiple partitions, stratified by group - 30\% equally sampled
# from species in first training set, 50\% in second training set and
# remaining 20\% equally sampled from each species in test set.
out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species")
lapply(out, function(i) table(i$Species))

}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_peek.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_peek.R
\name{data_peek}
\alias{data_peek}
\alias{data_peek.data.frame}
\title{Peek at values and type of variables in a data frame}
\usage{
data_peek(x, ...)

\method{data_peek}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  width = NULL,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame.}

\item{...}{not used.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{width}{Maximum width of line length to display. If \code{NULL}, width will
be determined using \code{options()$width}.}

\item{verbose}{Toggle warnings.}
}
\value{
A data frame with three columns, containing information about
the name, type and first values of the input data frame.
}
\description{
This function creates a table a data frame, showing all
column names, variable types and the first values (as many as fit into
the screen).
}
\note{
To show only specific or a limited number of variables, use the
\code{select} argument, e.g. \code{select = 1:5} to show only the first five variables.
}
\examples{
data(efc)
data_peek(efc)
# show variables two to four
data_peek(efc, select = 2:4)
}


================================================
FILE: man/data_prefix_suffix.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_addprefix.R
\name{data_addprefix}
\alias{data_addprefix}
\alias{data_addsuffix}
\title{Add a prefix or suffix to column names}
\usage{
data_addprefix(
  data,
  pattern,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

data_addsuffix(
  data,
  pattern,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{pattern}{A character string, which will be added as prefix or suffix
to the column names.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}

\item{...}{Other arguments passed to or from other functions.}
}
\description{
Add a prefix or suffix to column names
}
\examples{
# Add prefix / suffix to all columns
head(data_addprefix(iris, "NEW_"))
head(data_addsuffix(iris, "_OLD"))

}
\seealso{
\code{\link[=data_rename]{data_rename()}} for more fine-grained column renaming.
}


================================================
FILE: man/data_read.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_read.R, R/data_write.R
\name{data_read}
\alias{data_read}
\alias{data_write}
\title{Read (import) data files from various sources}
\usage{
data_read(
  path,
  path_catalog = NULL,
  encoding = NULL,
  convert_factors = TRUE,
  password = NULL,
  verbose = TRUE,
  ...
)

data_write(
  data,
  path,
  delimiter = ",",
  convert_factors = FALSE,
  save_labels = FALSE,
  password = NULL,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{path}{Character string, the file path to the data file.}

\item{path_catalog}{Character string, path to the catalog file. Only relevant
for SAS data files.}

\item{encoding}{The character encoding used for the file. Usually not needed.}

\item{convert_factors}{If \code{TRUE} (default), numeric variables, where all
values have a value label, are assumed to be categorical and converted into
factors. If \code{FALSE}, no variable types are guessed and no conversion of
numeric variables into factors will be performed. For \code{data_read()}, this
argument only applies to file types with \emph{labelled data}, e.g. files from
SPSS, SAS or Stata. See also section 'Differences to other packages'. For
\code{data_write()}, this argument only applies to the text (e.g. \code{.txt} or
\code{.csv}) or spreadsheet file formats (like \code{.xlsx}). Converting to factors
might be useful for these formats because labelled numeric variables are then
converted into factors and exported as character columns - else, value labels
would be lost and only numeric values are written to the file.}

\item{password}{Password for data encryption. If not \code{NULL}, the data will be
encrypted (for \code{data_write()}) or decrypted (for \code{data_read()}) using the
provided password. Encryption is currently only supported for R file formats
(\code{.rds}, \code{.rda} and \code{.rdata}). See the section "Data encryption" below for more
information on the encryption method used.}

\item{verbose}{Toggle warnings and messages.}

\item{...}{Arguments passed to the related \verb{read_*()} or \verb{write_*()} functions.}

\item{data}{The data frame that should be written to a file.}

\item{delimiter}{For CSV-files, specifies the delimiter. Defaults to \code{","},
but in particular in European regions, \code{";"} might be a useful alternative,
especially when exported CSV-files should be opened in Excel.}

\item{save_labels}{Only applies to CSV files. If \code{TRUE}, value and variable
labels (if any) will be saved as additional CSV file. This file has the same
file name as the exported CSV file, but includes a \code{"_labels"} suffix (i.e.
when the file name is \code{"mydat.csv"}, the additional file with value and
variable labels is named \code{"mydat_labels.csv"}).}
}
\value{
A data frame.
}
\description{
This functions imports data from various file types. It is a small wrapper
around \code{haven::read_spss()}, \code{haven::read_stata()}, \code{haven::read_sas()},
\code{readxl::read_excel()} and \code{data.table::fread()} resp. \code{readr::read_delim()}
(the latter if package \strong{data.table} is not installed). Thus, supported file
types for importing data are data files from SPSS, SAS or Stata, Excel files
or text files (like '.csv' files). All other file types are passed to
\code{rio::import()}. \code{data_write()} works in a similar way.
}
\section{Supported file types}{

\itemize{
\item \code{data_read()} is a wrapper around the \strong{haven}, \strong{data.table}, \strong{readr}
\strong{readxl}, \strong{nanoparquet} and \strong{rio} packages. Currently supported file
types are \code{.txt}, \code{.csv}, \code{.xls}, \code{.xlsx}, \code{.sav}, \code{.por}, \code{.dta}, \code{.sas},
\code{.rda}, \code{.parquet}, \code{.rdata}, and \code{.rds} (and related files). All other file
types are passed to \code{rio::import()}.
\item \code{data_write()} is a wrapper around \strong{haven}, \strong{readr}, \strong{nanoparquet},
and \strong{rio} packages, and supports writing files into all formats supported
by these packages.
}
}

\section{Compressed files (zip) and URLs}{

\code{data_read()} can also read the above mentioned files from URLs or from
inside zip-compressed files. Thus, \code{path} can also be a URL to a file like
\code{"http://www.url.com/file.csv"}. When \code{path} points to a zip-compressed file,
and there are multiple files inside the zip-archive, then the first supported
file is extracted and loaded.
}

\section{General behaviour}{

\code{data_read()} detects the appropriate \verb{read_*()} function based on the
file-extension of the data file. Thus, in most cases it should be enough to
only specify the \code{path} argument. However, if more control is needed, all
arguments in \code{...} are passed down to the related \verb{read_*()} function. The
same applies to \code{data_write()}, i.e. based on the file extension provided in
\code{path}, the appropriate \verb{write_*()} function is used automatically.
}

\section{SPSS specific behaviour}{

\code{data_read()} does \emph{not} import user-defined ("tagged") \code{NA} values from
SPSS, i.e. argument \code{user_na} is always set to \code{FALSE} when importing SPSS
data with the \strong{haven} package. Use \code{convert_to_na()} to define missing
values in the imported data, if necessary. Furthermore, \code{data_write()}
compresses SPSS files by default. If this causes problems with (older) SPSS
versions, use \code{compress = "none"}, for example
\code{data_write(data, "myfile.sav", compress = "none")}.
}

\section{Differences to other packages that read foreign data formats}{

\code{data_read()} is most comparable to \code{rio::import()}. For data files from
SPSS, SAS or Stata, which support labelled data, variables are converted into
their most appropriate type. The major difference to \code{rio::import()} is for
data files from SPSS, SAS, or Stata, i.e. file types that support
\emph{labelled data}. \code{data_read()} automatically converts fully labelled numeric
variables into factors, where imported value labels will be set as factor
levels. If a numeric variable has \emph{no} value labels or less value labels than
values, it is not converted to factor. In this case, value labels are
preserved as \code{"labels"} attribute. Character vectors are preserved. Use
\code{convert_factors = FALSE} to remove the automatic conversion of numeric
variables to factors.
}

\section{Data encryption}{

\code{data_read()} and \code{data_write()} support data encryption for R file formats
(\code{.rds}, \code{.rda} and \code{.rdata}). To encrypt a file, provide a password to the
\code{password} argument in \code{data_write()}. To decrypt the file, provide the same
password to \code{data_read()}. The encryption is based on the \strong{openssl} package
and uses the AES-GCM algorithm (see \code{?openssl::aes_gcm_encrypt}) with a
256-bit key (see \code{?openssl::sha256}). Thus, data can also be decrypted without
relying on the \strong{datawizard} package, e.g. using following code:

\if{html}{\out{<div class="sourceCode">}}\preformatted{encrypted_data <- readRDS(datafile)
key <- openssl::sha256(charToRaw("<password>"))
out <- openssl::aes_gcm_decrypt(encrypted_data, key = key)
decrypted_data <- unserialize(out)
}\if{html}{\out{</div>}}

\strong{Warning:} Do not lose your \code{password}, else you will not be able to
decrypt the data again!
}


================================================
FILE: man/data_relocate.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_relocate.R, R/data_remove.R
\name{data_relocate}
\alias{data_relocate}
\alias{data_reorder}
\alias{data_remove}
\title{Relocate (reorder) columns of a data frame}
\usage{
data_relocate(
  data,
  select,
  before = NULL,
  after = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

data_reorder(
  data,
  select,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

data_remove(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{before, after}{Destination of columns. Supplying neither will move
columns to the left-hand side; specifying both is an error. Can be a
character vector, indicating the name of the destination column, or a
numeric value, indicating the index number of the destination column.
If \code{-1}, will be added before or after the last column.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}

\item{...}{Arguments passed down to other functions. Mostly not used yet.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}
}
\value{
A data frame with reordered columns.
}
\description{
\code{data_relocate()} will reorder columns to specific positions, indicated by
\code{before} or \code{after}. \code{data_reorder()} will instead move selected columns to
the beginning of a data frame. Finally, \code{data_remove()} removes columns
from a data frame. All functions support select-helpers that allow flexible
specification of a search pattern to find matching columns, which should
be reordered or removed.
}
\examples{
# Reorder columns
head(data_relocate(iris, select = "Species", before = "Sepal.Length"))
head(data_relocate(iris, select = "Species", before = "Sepal.Width"))
head(data_relocate(iris, select = "Sepal.Width", after = "Species"))
# which is same as
head(data_relocate(iris, select = "Sepal.Width", after = -1))

# Reorder multiple columns
head(data_relocate(iris, select = c("Species", "Petal.Length"), after = "Sepal.Width"))
# which is same as
head(data_relocate(iris, select = c("Species", "Petal.Length"), after = 2))

# Reorder columns
head(data_reorder(iris, c("Species", "Sepal.Length")))

# Remove columns
head(data_remove(iris, "Sepal.Length"))
head(data_remove(iris, starts_with("Sepal")))
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_rename.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_rename.R
\name{data_rename}
\alias{data_rename}
\alias{data_rename_rows}
\title{Rename columns and variable names}
\usage{
data_rename(data, select = NULL, replacement = NULL, ...)

data_rename_rows(data, rows = NULL)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{replacement}{Character vector. Can be one of the following:
\itemize{
\item A character vector that indicates the new names of the columns selected
in \code{select}. \code{select} and \code{replacement} must be of the same length.
\item A string (i.e. character vector of length 1) with a "glue" styled
pattern. Currently supported tokens are:
\itemize{
\item \code{{col}} which will be replaced by the column name, i.e. the
corresponding value in \code{select}.
\item \code{{n}} will be replaced by the number of the variable that is replaced.
\item \code{{letter}} will be replaced by alphabetical letters in sequential
order.
If more than 26 letters are required, letters are repeated, but have
sequential numeric indices (e.g., \code{a1} to \code{z1}, followed by \code{a2} to
\code{z2}).
\item Finally, the name of a user-defined object that is available in the
environment can be used. Note that the object's name is not allowed to
be one of the pre-defined tokens, \code{"col"}, \code{"n"} and \code{"letter"}.
}

An example for the use of tokens is...

\if{html}{\out{<div class="sourceCode r">}}\preformatted{data_rename(
  mtcars,
  select = c("am", "vs"),
  replacement = "new_name_from_\{col\}"
)
}\if{html}{\out{</div>}}

... which would return new column names \code{new_name_from_am} and
\code{new_name_from_vs}. See 'Examples'.
}

If \code{select} is a named vector, \code{replacement} is ignored.}

\item{...}{Other arguments passed to or from other functions.}

\item{rows}{Vector of row names.}
}
\value{
A modified data frame.
}
\description{
Safe and intuitive functions to rename variables or rows in
data frames. \code{data_rename()} will rename column names, i.e. it facilitates
renaming variables. \code{data_rename_rows()} is a convenient shortcut
to add or rename row names of a data frame, but unlike \code{row.names()}, its
input and output is a data frame, thus, integrating smoothly into a
possible pipe-workflow.
}
\details{
\code{select} can also be a named character vector. In this case, the names are
used to rename the columns in the output data frame. If you have a named
list, use \code{unlist()} to convert it to a named vector. See 'Examples'.
}
\examples{
# Rename columns
head(data_rename(iris, "Sepal.Length", "length"))

# Use named vector to rename
head(data_rename(iris, c(length = "Sepal.Length", width = "Sepal.Width")))

# Change all
head(data_rename(iris, replacement = paste0("Var", 1:5)))

# Use glue-styled patterns
head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}"))
head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "{col}_is_column_{n}"))
head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}"))

# User-defined glue-styled patterns from objects in environment
x <- c("hi", "there", "!")
head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}"))
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_replicate.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_replicate.R
\name{data_replicate}
\alias{data_replicate}
\title{Expand (i.e. replicate rows) a data frame}
\usage{
data_replicate(
  data,
  expand = NULL,
  select = NULL,
  exclude = NULL,
  remove_na = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{expand}{The name of the column that contains the counts of replications
for each row. Can also be a numeric value, indicating the position of that
column. Note that the variable indicated by \code{expand} must be an integer vector.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{remove_na}{Logical. If \code{TRUE}, missing values in the column
provided in \code{expand} are removed from the data frame. If \code{FALSE} and \code{expand}
contains missing values, the function will throw an error.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{verbose}{Toggle warnings.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{...}{Currently not used.}
}
\value{
A dataframe with each row replicated as many times as defined in \code{expand}.
}
\description{
Expand a data frame by replicating rows based on another variable that
contains the counts of replications per row.
}
\examples{
data(mtcars)
data_replicate(head(mtcars), "carb")
}


================================================
FILE: man/data_restoretype.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_restoretype.R
\name{data_restoretype}
\alias{data_restoretype}
\title{Restore the type of columns according to a reference data frame}
\usage{
data_restoretype(data, reference = NULL, ...)
}
\arguments{
\item{data}{A data frame for which to restore the column types.}

\item{reference}{A reference data frame from which to find the correct
column types. If \code{NULL}, each column is converted to numeric if it doesn't
generate \code{NA}s. For example, \code{c("1", "2")} can be converted to numeric but not
\code{c("Sepal.Length")}.}

\item{...}{Currently not used.}
}
\value{
A data frame with columns whose types have been restored based on the
reference data frame.
}
\description{
Restore the type of columns according to a reference data frame
}
\examples{
data <- data.frame(
  Sepal.Length = c("1", "3", "2"),
  Species = c("setosa", "versicolor", "setosa"),
  New = c("1", "3", "4")
)

fixed <- data_restoretype(data, reference = iris)
summary(fixed)
}


================================================
FILE: man/data_rotate.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_rotate.R
\name{data_rotate}
\alias{data_rotate}
\alias{data_transpose}
\title{Rotate a data frame}
\usage{
data_rotate(data, rownames = NULL, colnames = FALSE, verbose = TRUE)

data_transpose(data, rownames = NULL, colnames = FALSE, verbose = TRUE)
}
\arguments{
\item{data}{A data frame.}

\item{rownames}{Character vector (optional). If not \code{NULL}, the data frame's
rownames will be added as (first) column to the output, with \code{rownames}
being the name of this column.}

\item{colnames}{Logical or character vector (optional). If \code{TRUE}, the values
of the first column in \code{x} will be used as column names in the rotated data
frame. If a character vector, values from that column are used as column
names.}

\item{verbose}{Toggle warnings.}
}
\value{
A (rotated) data frame.
}
\description{
This function rotates a data frame, i.e. columns become rows and vice versa.
It's the equivalent of using \code{t()} but restores the \code{data.frame} class,
preserves attributes and prints a warning if the data type is
modified (see example).
}
\examples{
x <- mtcars[1:3, 1:4]

x

data_rotate(x)
data_rotate(x, rownames = "property")

# use values in 1. column as column name
data_rotate(x, colnames = TRUE)
data_rotate(x, rownames = "property", colnames = TRUE)

# use either first column or specific column for column names
x <- data.frame(a = 1:5, b = 11:15, c = 21:25)
data_rotate(x, colnames = TRUE)
data_rotate(x, colnames = "c")

}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_seek.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_seek.R
\name{data_seek}
\alias{data_seek}
\title{Find variables by their names, variable or value labels}
\usage{
data_seek(data, pattern, seek = c("names", "labels"), fuzzy = FALSE)
}
\arguments{
\item{data}{A data frame.}

\item{pattern}{Character string (regular expression) to be matched in \code{data}.
May also be a character vector of length > 1. \code{pattern} is searched for in
column names, variable label and value labels attributes, or factor levels of
variables in \code{data}.}

\item{seek}{Character vector, indicating where \code{pattern} is sought. Use one
or more of the following options:
\itemize{
\item \code{"names"}: Searches in column names. \code{"column_names"} and \code{"columns"} are
aliases for \code{"names"}.
\item \code{"labels"}: Searches in variable labels. Only applies when a \code{label} attribute
is set for a variable.
\item \code{"values"}: Searches in value labels or factor levels. Only applies when a
\code{labels} attribute is set for a variable, or if a variable is a factor.
\code{"levels"} is an alias for \code{"values"}.
\item \code{"all"}: Searches in all of the above.
}}

\item{fuzzy}{Logical. If \code{TRUE}, "fuzzy matching" (partial and close distance
matching) will be used to find \code{pattern}.}
}
\value{
A data frame with three columns: the column index, the column name
and - if available - the variable label of all matched variables in \code{data}.
}
\description{
This functions seeks variables in a data frame, based on patterns
that either match the variable name (column name), variable labels, value labels
or factor levels. Matching variable and value labels only works for "labelled"
data, i.e. when the variables either have a \code{label} attribute or \code{labels}
attribute.

\code{data_seek()} is particular useful for larger data frames with labelled
data - finding the correct variable name can be a challenge. This function
helps to find the required variables, when only certain patterns of variable
names or labels are known.
}
\examples{
# seek variables with "Length" in variable name or labels
data_seek(iris, "Length")

# seek variables with "dependency" in names or labels
# column "e42dep" has a label-attribute "elder's dependency"
data(efc)
data_seek(efc, "dependency")

# "female" only appears as value label attribute - default search is in
# variable names and labels only, so no match
data_seek(efc, "female")
# when we seek in all sources, we find the variable "e16sex"
data_seek(efc, "female", seek = "all")

# typo, no match
data_seek(iris, "Lenght")
# typo, fuzzy match
data_seek(iris, "Lenght", fuzzy = TRUE)
}


================================================
FILE: man/data_separate.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_separate.R
\name{data_separate}
\alias{data_separate}
\title{Separate single variable into multiple variables}
\usage{
data_separate(
  data,
  select = NULL,
  new_columns = NULL,
  separator = "[^[:alnum:]]+",
  guess_columns = NULL,
  merge_multiple = FALSE,
  merge_separator = "",
  fill = "right",
  extra = "drop_right",
  convert_na = TRUE,
  exclude = NULL,
  append = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{new_columns}{The names of the new columns, as character vector. If
more than one variable was selected (in \code{select}), the new names are prefixed
with the name of the original column. \code{new_columns} can also be a list of
(named) character vectors when multiple variables should be separated. See
'Examples'.}

\item{separator}{Separator between columns. Can be a character vector, which
is then treated as regular expression, or a numeric vector that indicates at
which positions the string values will be split.}

\item{guess_columns}{If \code{new_columns} is not given, the required number of
new columns is guessed based on the results of value splitting. For example,
if a variable is split into three new columns, this will be considered as
the required number of new columns, and columns are named \code{"split_1"},
\code{"split_2"} and \code{"split_3"}. When values from a variable are split into
different amount of new columns, the \code{guess_column} can be either \code{"mode"}
(number of new columns is based on the most common number of splits), \code{"min"}
or \code{"max"} to use the minimum resp. maximum number of possible splits as
required number of columns.}

\item{merge_multiple}{Logical, if \code{TRUE} and more than one variable is selected
for separating, new columns can be merged. Value pairs of all split variables
are merged.}

\item{merge_separator}{Separator string when \code{merge_multiple = TRUE}. Defines
the string that is used to merge values together.}

\item{fill}{How to deal with values that return fewer new columns after
splitting? Can be \code{"left"} (fill missing columns from the left with \code{NA}),
\code{"right"} (fill missing columns from the right with \code{NA}) or \code{"value_left"}
or \code{"value_right"} to fill missing columns from left or right with the
left-most or right-most values.}

\item{extra}{How to deal with values that return too many new columns after
splitting? Can be \code{"drop_left"} or \code{"drop_right"} to drop the left-most or
right-most values, or \code{"merge_left"} or \code{"merge_right"} to merge the left-
or right-most value together, and keeping all remaining values as is.}

\item{convert_na}{Logical, if \code{TRUE}, character \code{"NA"} values are converted
into real \code{NA} values.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{append}{Logical, if \code{FALSE} (default), removes original columns that
were separated. If \code{TRUE}, all columns are preserved and the new columns are
appended to the data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{verbose}{Toggle warnings.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{...}{Currently not used.}
}
\value{
A data frame with the newly created variable(s), or - when \code{append = TRUE} -
\code{data} including new variables.
}
\description{
Separates a single variable into multiple new variables.
}
\examples{
# simple case
d <- data.frame(
  x = c("1.a.6", "2.b.7", "3.c.8"),
  stringsAsFactors = FALSE
)
d
data_separate(d, new_columns = c("a", "b", "c"))

# guess number of columns
d <- data.frame(
  x = c("1.a.6", NA, "2.b.6.7", "3.c", "x.y.z"),
  stringsAsFactors = FALSE
)
d
data_separate(d, guess_columns = "mode")

data_separate(d, guess_columns = "max")

# drop left-most column
data_separate(d, guess_columns = "mode", extra = "drop_left")

# merge right-most column
data_separate(d, guess_columns = "mode", extra = "merge_right")

# fill columns with fewer values with left-most values
data_separate(d, guess_columns = "mode", fill = "value_left")

# fill and merge
data_separate(
  d,
  guess_columns = "mode",
  fill = "value_left",
  extra = "merge_right"
)

# multiple columns to split
d <- data.frame(
  x = c("1.a.6", "2.b.7", "3.c.8"),
  y = c("x.y.z", "10.11.12", "m.n.o"),
  stringsAsFactors = FALSE
)
d
# split two columns, default column names
data_separate(d, guess_columns = "mode")

# split into new named columns, repeating column names
data_separate(d, new_columns = c("a", "b", "c"))

# split selected variable new columns
data_separate(d, select = "y", new_columns = c("a", "b", "c"))

# merge multiple split columns
data_separate(
  d,
  new_columns = c("a", "b", "c"),
  merge_multiple = TRUE
)

# merge multiple split columns
data_separate(
  d,
  new_columns = c("a", "b", "c"),
  merge_multiple = TRUE,
  merge_separator = "-"
)

# separate multiple columns, give proper column names
d_sep <- data.frame(
  x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
  y = c("m.n.99.22", "77.f.g.34", "44.9", NA),
  stringsAsFactors = FALSE
)

data_separate(
  d_sep,
  select = c("x", "y"),
  new_columns = list(
    x = c("A", "B", "C"), # separate "x" into three columns
    y = c("EE", "FF", "GG", "HH") # separate "y" into four columns
  ),
  verbose = FALSE
)
}
\seealso{
\code{\link[=data_unite]{data_unite()}}
}


================================================
FILE: man/data_summary.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_summary.R
\name{data_summary}
\alias{data_summary}
\alias{data_summary.data.frame}
\title{Summarize data}
\usage{
data_summary(x, ...)

\method{data_summary}{data.frame}(x, ..., by = NULL, remove_na = FALSE, suffix = NULL)
}
\arguments{
\item{x}{A (grouped) data frame.}

\item{...}{One or more named expressions that define the new variable name
and the function to compute the summary statistic. Example:
\code{mean_sepal_width = mean(Sepal.Width)}. The expression can also be provided
as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}. The
summary function \code{n()} can be used to count the number of observations.}

\item{by}{Optional character string, indicating the names of one or more
variables in the data frame. If supplied, the data will be split by these
variables and summary statistics will be computed for each group.}

\item{remove_na}{Logical. If \code{TRUE}, missing values are omitted from the
grouping variable. If \code{FALSE} (default), missing values are included as a
level in the grouping variable.}

\item{suffix}{Optional, suffixes to be added to the new variable names,
especially useful when a function returns several values (e.g. \code{quantile()}).
Can be:
\itemize{
\item a character vector: all expressions in \code{...} must return the same number
of values as elements in \code{suffix}.
\item a list of named character vectors: the names of elements in \code{suffix} must
match the names of the expressions. It is also allowed to specify suffixes
for selected expressions only.
}

The new column names are a combination of the left-hand side (i.e.,
the name) of the expression and the related suffixes. If \code{suffix = NULL} (the
default), and a summary expression returns multiple values, either the names
of the returned values (if any) or automatically numbered suffixes such as
\verb{_1}, \verb{_2}, etc. are used. See 'Examples'.}
}
\value{
A data frame with the requested summary statistics.
}
\description{
This function can be used to compute summary statistics for a
data frame or a matrix.
}
\examples{
data(iris)
data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
data_summary(
  iris,
  MW = mean(Sepal.Width),
  SD = sd(Sepal.Width),
  by = "Species"
)

# same as
d <- data_group(iris, "Species")
data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))

# multiple groups
data(mtcars)
data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))

# expressions can also be supplied as character strings
data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear"))

# count observations within groups
data_summary(mtcars, observations = n(), by = c("am", "gear"))

# first and last observations of "mpg" within groups
data_summary(
  mtcars,
  first = mpg[1],
  last = mpg[length(mpg)],
  by = c("am", "gear")
)

# allow more than one-column-summaries for expressions
d <- data.frame(
  x = rnorm(100, 1, 1),
  y = rnorm(100, 2, 2),
  groups = rep(1:4, each = 25)
)

# since we have multiple columns for one expression, the names of the
# returned summary results are used as suffix by default
data_summary(
  d,
  quant_x = quantile(x, c(0.25, 0.75)),
  mean_x = mean(x),
  quant_y = quantile(y, c(0.25, 0.5, 0.75))
)

# if a summary function, like `fivenum()`, returns no named vector, suffixes
# are automatically numbered
data_summary(
  d,
  quant_x = quantile(x, c(0.25, 0.75)),
  mean_x = mean(x),
  fivenum_y = fivenum(y)
)

# specify column suffix for expressions, matching by names
data_summary(
  d,
  quant_x = quantile(x, c(0.25, 0.75)),
  mean_x = mean(x),
  quant_y = quantile(y, c(0.25, 0.5, 0.75)),
  suffix = list(quant_y = c("_Q1", "_Q2", "_Q3"))
)

# name multiple expression suffixes, grouped by variable
data_summary(
  d,
  quant_x = quantile(x, c(0.25, 0.75)),
  mean_x = mean(x),
  quant_y = quantile(y, c(0.25, 0.5, 0.75)),
  suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q3")),
  by = "groups"
)

}


================================================
FILE: man/data_tabulate.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_tabulate.R
\name{data_tabulate}
\alias{data_tabulate}
\alias{data_tabulate.default}
\alias{data_tabulate.data.frame}
\alias{print.datawizard_table}
\alias{display.datawizard_table}
\title{Create frequency and crosstables of variables}
\usage{
data_tabulate(x, ...)

\method{data_tabulate}{default}(
  x,
  by = NULL,
  drop_levels = FALSE,
  weights = NULL,
  remove_na = FALSE,
  proportions = NULL,
  name = NULL,
  verbose = TRUE,
  ...
)

\method{data_tabulate}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  by = NULL,
  drop_levels = FALSE,
  weights = NULL,
  remove_na = FALSE,
  proportions = NULL,
  collapse = FALSE,
  verbose = TRUE,
  ...
)

\method{print}{datawizard_table}(x, big_mark = NULL, ...)

\method{display}{datawizard_table}(object, big_mark = NULL, format = "markdown", ...)
}
\arguments{
\item{x}{A (grouped) data frame, a vector or factor.}

\item{...}{not used.}

\item{by}{Optional vector or factor. If supplied, a crosstable is created.
If \code{x} is a data frame, \code{by} can also be a character string indicating the
name of a variable in \code{x}.}

\item{drop_levels}{Logical, if \code{FALSE}, factor levels that do not occur in
the data are included in the table (with frequency of zero), else unused
factor levels are dropped from the frequency table.}

\item{weights}{Optional numeric vector of weights. Must be of the same length
as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.}

\item{remove_na}{Logical, if \code{FALSE}, missing values are included in the
frequency or crosstable, else missing values are omitted. Note that the
default for the \code{as.table()} method is \code{remove_na = TRUE}, so that missing
values are not included in the returned table, which makes more sense for
post-processing of the table, e.g. using \code{chisq.test()}.}

\item{proportions}{Optional character string, indicating the type of
percentages to be calculated. Only applies to crosstables, i.e. when \code{by} is
not \code{NULL}. Can be \code{"row"} (row percentages), \code{"column"} (column percentages)
or \code{"full"} (to calculate relative frequencies for the full table).}

\item{name}{Optional character string, which includes the name that is used
for printing.}

\item{verbose}{Toggle warnings and messages.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger
table for printing. This affects only printing, not the returned object.}

\item{big_mark}{Optional character string, indicating the big mark that is
used for large numbers. If \code{NULL} (default), a big mark is added automatically for
large numbers (i.e. numbers with more than 5 digits). If you want to remove
the big mark, set \code{big_mark = ""}.}

\item{object}{An object returned by \code{data_tabulate()}.}

\item{format}{String, indicating the output format. Can be \code{"markdown"}
\code{"html"}, or \code{"tt"}. \code{format = "html"} create an HTML table using the \emph{gt}
package. \code{format = "tt"} creates a \code{tinytable} object, which is either
printed as markdown or HTML table, depending on the environment. See
\code{\link[insight:export_table]{insight::export_table()}} for details.}
}
\value{
A data frame, or a list of data frames, with one frequency table
as data frame per variable.
}
\description{
This function creates frequency or crosstables of variables,
including the number of levels/values as well as the distribution of raw,
valid and cumulative percentages. For crosstables, row, column and cell
percentages can be calculated.
}
\details{
There is an \code{as.data.frame()} method, to return the frequency tables as a
data frame. The structure of the returned object is a nested data frame,
where the first column contains name of the variable for which frequencies
were calculated, and the second column is a list column that contains the
frequency tables as data frame. See \link{as.table.datawizard_table}.

There is also an \code{as.table()} method, which returns a table object with the
frequencies of the variable. This is useful for further statistical analysis,
e.g. for using \code{chisq.test()} on the frequency table. See
\link{as.table.datawizard_table}.
}
\note{
There are \code{print_html()} and \code{print_md()} methods available for printing
frequency or crosstables in HTML and markdown format, e.g.
\code{print_html(data_tabulate(x))}. The \code{print()} method for text outputs passes
arguments in \code{...} to \code{\link[insight:export_table]{insight::export_table()}}.
}
\section{Crosstables}{

If \code{by} is supplied, a crosstable is created. The crosstable includes \verb{<NA>}
(missing) values by default. The first column indicates values of \code{x}, the
first row indicates values of \code{by} (including missing values). The last row
and column contain the total frequencies for each row and column, respectively.
Setting \code{remove_na = FALSE} will omit missing values from the crosstable.
Setting \code{proportions} to \code{"row"} or \code{"column"} will add row or column
percentages. Setting \code{proportions} to \code{"full"} will add relative frequencies
for the full table.
}

\examples{
\dontshow{if (requireNamespace("poorman")) withAutoprint(\{ # examplesIf}
# frequency tables -------
# ------------------------
data(efc)

# vector/factor
data_tabulate(efc$c172code)

# drop missing values
data_tabulate(efc$c172code, remove_na = TRUE)

# data frame
data_tabulate(efc, c("e42dep", "c172code"))

# grouped data frame
suppressPackageStartupMessages(library(poorman, quietly = TRUE))
efc \%>\%
  group_by(c172code) \%>\%
  data_tabulate("e16sex")

# collapse tables
efc \%>\%
  group_by(c172code) \%>\%
  data_tabulate("e16sex", collapse = TRUE)

# for larger N's (> 100000), a big mark is automatically added
set.seed(123)
x <- sample(1:3, 1e6, TRUE)
data_tabulate(x, name = "Large Number")

# to remove the big mark, use "print(..., big_mark = "")"
print(data_tabulate(x), big_mark = "")

# weighted frequencies
set.seed(123)
efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
data_tabulate(efc$e42dep, weights = efc$weights)

# crosstables ------
# ------------------

# add some missing values
set.seed(123)
efc$e16sex[sample.int(nrow(efc), 5)] <- NA

data_tabulate(efc, "c172code", by = "e16sex")

# add row and column percentages
data_tabulate(efc, "c172code", by = "e16sex", proportions = "row")
data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")

# omit missing values
data_tabulate(
  efc$c172code,
  by = efc$e16sex,
  proportions = "column",
  remove_na = TRUE
)

# round percentages
out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
print(out, digits = 0)
\dontshow{\}) # examplesIf}
}
\seealso{
\link{as.prop.table}
}


================================================
FILE: man/data_to_long.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_to_long.R
\name{data_to_long}
\alias{data_to_long}
\alias{reshape_longer}
\title{Reshape (pivot) data from wide to long}
\usage{
data_to_long(
  data,
  select = "all",
  names_to = "name",
  names_prefix = NULL,
  names_sep = NULL,
  names_pattern = NULL,
  values_to = "value",
  values_drop_na = FALSE,
  rows_to = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  ...,
  cols
)

reshape_longer(
  data,
  select = "all",
  names_to = "name",
  names_prefix = NULL,
  names_sep = NULL,
  names_pattern = NULL,
  values_to = "value",
  values_drop_na = FALSE,
  rows_to = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  ...,
  cols
)
}
\arguments{
\item{data}{A data frame to convert to long format, so that it has more
rows and fewer columns after the operation.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{names_to}{The name of the new column (variable) that will contain the
\emph{names} from columns in \code{select} as values, to identify the source of the
values. \code{names_to} can be a character vector with more than one column name,
in which case \code{names_sep} or \code{names_pattern} must be provided in order to
identify which parts of the column names go into newly created columns.
See also 'Examples'.}

\item{names_prefix}{A regular expression used to remove matching text from
the start of each variable name.}

\item{names_sep, names_pattern}{If \code{names_to} contains multiple values, this
argument controls how the column name is broken up. \code{names_pattern} takes a
regular expression containing matching groups, i.e. "()".}

\item{values_to}{The name of the new column that will contain the \emph{values} of
the columns in \code{select}.}

\item{values_drop_na}{If \code{TRUE}, will drop rows that contain only \code{NA} in the
\code{values_to} column. This effectively converts explicit missing values to
implicit missing values, and should generally be used only when missing values
in data were created by its structure.}

\item{rows_to}{The name of the column that will contain the row names or row
numbers from the original data. If \code{NULL}, will be removed.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{...}{Currently not used.}

\item{cols}{Identical to \code{select}. This argument is here to ensure compatibility
with \code{tidyr::pivot_longer()}. If both \code{select} and \code{cols} are provided, \code{cols}
is used.}
}
\value{
If a tibble was provided as input, \code{reshape_longer()} also returns a
tibble. Otherwise, it returns a data frame.
}
\description{
This function "lengthens" data, increasing the number of rows and decreasing
the number of columns. This is a dependency-free base-R equivalent of
\code{tidyr::pivot_longer()}.
}
\details{
Reshaping data into long format usually means that the input data frame is
in \emph{wide} format, where multiple measurements taken on the same subject are
stored in multiple columns (variables). The long format stores the same
information in a single column, with each measurement per subject stored in
a separate row. The values of all variables that are not in \code{select} will
be repeated.

The necessary information for \code{data_to_long()} is:
\itemize{
\item The columns that contain the repeated measurements (\code{select}).
\item The name of the newly created column that will contain the names of the
columns in \code{select} (\code{names_to}), to identify the source of the values.
\code{names_to} can also be a character vector with more than one column name,
in which case \code{names_sep} or \code{names_pattern} must be provided to specify
which parts of the column names go into the newly created columns.
\item The name of the newly created column that contains the values of the
columns in \code{select} (\code{values_to}).
}

In other words: repeated measurements that are spread across several columns
will be gathered into a single column (\code{values_to}), with the original column
names, that identify the source of the gathered values, stored in one or more
new columns (\code{names_to}).
}
\examples{
\dontshow{if (all(insight::check_if_installed(c("psych", "tidyr"), quietly = TRUE))) withAutoprint(\{ # examplesIf}
wide_data <- setNames(
  data.frame(replicate(2, rnorm(8))),
  c("Time1", "Time2")
)
wide_data$ID <- 1:8
wide_data

# Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3))
# probably doesn't make much sense to mix "time" and "id"
data_to_long(wide_data)

# Customizing the names
data_to_long(
  wide_data,
  select = c("Time1", "Time2"),
  names_to = "Timepoint",
  values_to = "Score"
)

# Reshape multiple columns into long format.
mydat <- data.frame(
  age = c(20, 30, 40),
  sex = c("Female", "Male", "Male"),
  score_t1 = c(30, 35, 32),
  score_t2 = c(33, 34, 37),
  score_t3 = c(36, 35, 38),
  speed_t1 = c(2, 3, 1),
  speed_t2 = c(3, 4, 5),
  speed_t3 = c(1, 8, 6)
)
# The column names are split into two columns: "type" and "time". The
# pattern for splitting column names is provided in `names_pattern`. Values
# of all "score_*" and "speed_*" columns are gathered into a single column
# named "count".
data_to_long(
  mydat,
  select = 3:8,
  names_to = c("type", "time"),
  names_pattern = "(score|speed)_t(\\\\d+)",
  values_to = "count"
)

# Full example
# ------------------
data <- psych::bfi # Wide format with one row per participant's personality test

# Pivot long format
very_long_data <- data_to_long(data,
  select = regex("\\\\d"), # Select all columns that contain a digit
  names_to = "Item",
  values_to = "Score",
  rows_to = "Participant"
)
head(very_long_data)

even_longer_data <- data_to_long(
  tidyr::who,
  select = new_sp_m014:newrel_f65,
  names_to = c("diagnosis", "gender", "age"),
  names_pattern = "new_?(.*)_(.)(.*)",
  values_to = "count"
)
head(even_longer_data)
\dontshow{\}) # examplesIf}
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_to_wide.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_to_wide.R
\name{data_to_wide}
\alias{data_to_wide}
\alias{reshape_wider}
\title{Reshape (pivot) data from long to wide}
\usage{
data_to_wide(
  data,
  id_cols = NULL,
  values_from = "Value",
  names_from = "Name",
  names_sep = "_",
  names_prefix = "",
  names_glue = NULL,
  values_fill = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

reshape_wider(
  data,
  id_cols = NULL,
  values_from = "Value",
  names_from = "Name",
  names_sep = "_",
  names_prefix = "",
  names_glue = NULL,
  values_fill = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{data}{A data frame to convert to wide format, so that it has more
columns and fewer rows post-widening than pre-widening.}

\item{id_cols}{The name of the column that identifies the rows in the data
by which observations are grouped and the gathered data is spread into new
columns. Usually, this is a variable containing an ID for observations that
have been repeatedly measured. If \code{NULL}, it will use all remaining columns
that are not in \code{names_from} or \code{values_from} as ID columns. \code{id_cols} can
also be a character vector with more than one name of identifier columns. See
also 'Details' and 'Examples'.}

\item{values_from}{The name of the columns in the original data that contains
the values used to fill the new columns created in the widened data. Can also
be one of the selection helpers (see argument \code{select} in \code{\link[=data_select]{data_select()}}).}

\item{names_from}{The name of the column in the original data whose values
will be used for naming the new columns created in the widened data. Each
unique value in this column will become the name of one of these new columns.
In case \code{names_prefix} is provided, column names will be concatenated with
the string given in \code{names_prefix}. If \code{values_from} specifies more than one
variable that should be widened, the new column names are a combination of
the old column names in \code{values_from} and the \emph{values} from \code{names_from}, to
avoid duplicate column names.}

\item{names_sep}{If \code{names_from} or \code{values_from} contains multiple variables,
this will be used to join their values together into a single string to use
as a column name.}

\item{names_prefix}{String added to the start of every variable name. This is
particularly useful if \code{names_from} is a numeric vector and you want to create
syntactic variable names.}

\item{names_glue}{Instead of \code{names_sep} and \code{names_prefix}, you can supply a
\href{https://glue.tidyverse.org/index.html}{glue specification} that uses the
\code{names_from} columns to create custom column names. Note that the only
delimiters supported by \code{names_glue} are curly brackets, \verb{\{} and \verb{\}}.}

\item{values_fill}{Defunct argument, which has no function anymore. Will be
removed in future versions.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}

\item{...}{Not used for now.}
}
\value{
If a tibble was provided as input, \code{data_to_wide()} also returns a
tibble. Otherwise, it returns a data frame.
}
\description{
This function "widens" data, increasing the number of columns and decreasing
the number of rows. This is a dependency-free base-R equivalent of
\code{tidyr::pivot_wider()}.
}
\details{
Reshaping data into wide format usually means that the input data frame is
in \emph{long} format, where multiple measurements taken on the same subject are
stored in multiple rows. The wide format stores the same information in a
single row, with each measurement stored in a separate column. Thus, the
necessary information for \code{data_to_wide()} is:
\itemize{
\item The name of the column(s) that identify the groups or repeated measurements
(\code{id_cols}).
\item The name of the column whose \emph{values} will become the new column names
(\code{names_from}). Since these values may not necessarily reflect appropriate
column names, you can use \code{names_prefix} to add a prefix to each newly
created column name.
\item The name of the column(s) that contain the values (\code{values_from}) for the
new columns that are created by \code{names_from}.
}

In other words: repeated measurements, as indicated by \code{id_cols}, that are
saved into the column \code{values_from} will be spread into new columns, which
will be named after the values in \code{names_from}. See also 'Examples'.
}
\examples{
\dontshow{if (requireNamespace("lme4", quietly = TRUE)) withAutoprint(\{ # examplesIf}
data_long <- read.table(header = TRUE, text = "
 subject sex condition measurement
       1   M   control         7.9
       1   M     cond1        12.3
       1   M     cond2        10.7
       2   F   control         6.3
       2   F     cond1        10.6
       2   F     cond2        11.1
       3   F   control         9.5
       3   F     cond1        13.1
       3   F     cond2        13.8
       4   M   control        11.5
       4   M     cond1        13.4
       4   M     cond2        12.9")

# converting long data into wide format
data_to_wide(
  data_long,
  id_cols = "subject",
  names_from = "condition",
  values_from = "measurement"
)

# converting long data into wide format with custom column names
data_to_wide(
  data_long,
  id_cols = "subject",
  names_from = "condition",
  values_from = "measurement",
  names_prefix = "Var.",
  names_sep = "."
)

# converting long data into wide format, combining multiple columns
production <- expand.grid(
  product = c("A", "B"),
  country = c("AI", "EI"),
  year = 2000:2014
)
production <- data_filter(production, (product == "A" & country == "AI") | product == "B")
production$production <- rnorm(nrow(production))

data_to_wide(
  production,
  names_from = c("product", "country"),
  values_from = "production",
  names_glue = "prod_{product}_{country}"
)

# reshaping multiple long columns into wide format. to avoid duplicate
# column names, new names are a combination of the old column names in
# `values_from` and the values from `names_from`
data_long <- read.table(header = TRUE, text = "
subject_id time score anxiety test
         1    1    10       5   NA
         1    2    NA       7   NA
         2    1    15       6   NA
         2    2    12      NA   NA
         3    1    18       8   NA
         5    2    11       4   NA
         4    1    NA       5   NA
         4    2    14      NA   NA")

data_to_wide(
  data_long,
  id_cols = "subject_id",
  names_from = "time",
  values_from = c("score", "anxiety", "test")
)

# using the "sleepstudy" dataset
data(sleepstudy, package = "lme4")

# the sleepstudy data contains repeated measurements of average reaction
# times for each subjects over multiple days, in a sleep deprivation study.
# It is in long-format, i.e. each row corresponds to a single measurement.
# The variable "Days" contains the timepoint of the measurement, and
# "Reaction" contains the measurement itself. Converting this data to wide
# format will create a new column for each day, with the reaction time as the
# value.
head(sleepstudy)

data_to_wide(
  sleepstudy,
  id_cols = "Subject",
  names_from = "Days",
  values_from = "Reaction"
)

# clearer column names
data_to_wide(
  sleepstudy,
  id_cols = "Subject",
  names_from = "Days",
  values_from = "Reaction",
  names_prefix = "Reaction_Day_"
)

# For unequal group sizes, missing information is filled with NA
d <- subset(sleepstudy, Days \%in\% c(0, 1, 2, 3, 4))[c(1:9, 11:13, 16:17, 21), ]

# long format, different number of "Subjects"
d

data_to_wide(
  d,
  id_cols = "Subject",
  names_from = "Days",
  values_from = "Reaction",
  names_prefix = "Reaction_Day_"
)
\dontshow{\}) # examplesIf}
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/data_unique.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_unique.R
\name{data_unique}
\alias{data_unique}
\title{Keep only one row from all with duplicated IDs}
\usage{
data_unique(
  data,
  select = NULL,
  keep = "best",
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{keep}{The method to be used for duplicate selection, either "best"
(the default), "first", or "last".}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A data frame, containing only the chosen duplicates.
}
\description{
From all rows with at least one duplicated ID,
keep only one. Methods for selecting the duplicated row are
either the first duplicate, the last duplicate, or the "best"
duplicate (default), based on the duplicate with the smallest
number of \code{NA}. In case of ties, it picks the first
duplicate, as it is the one most likely to be valid and
authentic, given practice effects.

Contrarily to \code{dplyr::distinct()}, \code{data_unique()} keeps all columns.
}
\examples{
df1 <- data.frame(
  id = c(1, 2, 3, 1, 3),
  item1 = c(NA, 1, 1, 2, 3),
  item2 = c(NA, 1, 1, 2, 3),
  item3 = c(NA, 1, 1, 2, 3)
)

data_unique(df1, select = "id")
}
\seealso{
\code{\link[=data_duplicated]{data_duplicated()}}
}


================================================
FILE: man/data_unite.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_unite.R
\name{data_unite}
\alias{data_unite}
\title{Unite ("merge") multiple variables}
\usage{
data_unite(
  data,
  new_column = NULL,
  select = NULL,
  exclude = NULL,
  separator = "_",
  append = FALSE,
  remove_na = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{new_column}{The name of the new column, as a string.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{separator}{A character to use between values.}

\item{append}{Logical, if \code{FALSE} (default), removes original columns that
were united. If \code{TRUE}, all columns are preserved and the new column is
appended to the data frame.}

\item{remove_na}{Logical, if \code{TRUE}, missing values (\code{NA}) are not included
in the united values. If \code{FALSE}, missing values are represented as \code{"NA"}
in the united values.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{verbose}{Toggle warnings.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{...}{Currently not used.}
}
\value{
\code{data}, with a newly created variable.
}
\description{
Merge values of multiple variables per observation into one new variable.
}
\examples{
d <- data.frame(
  x = 1:3,
  y = letters[1:3],
  z = 6:8
)
d
data_unite(d, new_column = "xyz")
data_unite(d, new_column = "xyz", remove = FALSE)
data_unite(d, new_column = "xyz", select = c("x", "z"))
data_unite(d, new_column = "xyz", select = c("x", "z"), append = TRUE)
}
\seealso{
\code{\link[=data_separate]{data_separate()}}
}


================================================
FILE: man/datawizard-package.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/datawizard-package.R
\docType{package}
\name{datawizard-package}
\alias{datawizard-package}
\alias{datawizard}
\title{datawizard: Easy Data Wrangling and Statistical Transformations}
\description{
A lightweight package to assist in key steps involved in any data analysis
workflow:
\itemize{
\item wrangling the raw data to get it in the needed form,
\item applying preprocessing steps and statistical transformations, and
\item compute statistical summaries of data properties and distributions.
}

It is also the data wrangling backend for packages in 'easystats' ecosystem.
Reference: Patil et al. (2022) \doi{10.21105/joss.04684}.
}
\details{
\code{datawizard}
}
\seealso{
Useful links:
\itemize{
  \item \url{https://easystats.github.io/datawizard/}
  \item Report bugs at \url{https://github.com/easystats/datawizard/issues}
}

}
\author{
\strong{Maintainer}: Etienne Bacher \email{etienne.bacher@protonmail.com} (\href{https://orcid.org/0000-0002-9271-5075}{ORCID})

Authors:
\itemize{
  \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID})
  \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID})
  \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID})
  \item Mattan S. Ben-Shachar \email{matanshm@post.bgu.ac.il} (\href{https://orcid.org/0000-0002-4287-4801}{ORCID})
  \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID})
}

Other contributors:
\itemize{
  \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) [contributor]
  \item Thomas J. Faulkenberry \email{faulkenberry@tarleton.edu} [reviewer]
  \item Robert Garrett \email{rcg4@illinois.edu} [reviewer]
}

}
\keyword{internal}


================================================
FILE: man/demean.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/demean.R
\name{demean}
\alias{demean}
\alias{degroup}
\alias{detrend}
\title{Compute group-meaned and de-meaned variables}
\usage{
demean(
  x,
  select,
  by,
  nested = FALSE,
  suffix_demean = "_within",
  suffix_groupmean = "_between",
  append = TRUE,
  add_attributes = TRUE,
  verbose = TRUE
)

degroup(
  x,
  select,
  by,
  nested = FALSE,
  center = "mean",
  suffix_demean = "_within",
  suffix_groupmean = "_between",
  append = TRUE,
  add_attributes = TRUE,
  verbose = TRUE
)

detrend(
  x,
  select,
  by,
  nested = FALSE,
  center = "mean",
  suffix_demean = "_within",
  suffix_groupmean = "_between",
  append = TRUE,
  add_attributes = TRUE,
  verbose = TRUE
)
}
\arguments{
\item{x}{A data frame.}

\item{select}{Character vector (or formula) with names of variables to select
that should be group- and de-meaned.}

\item{by}{Character vector (or formula) with the name of the variable that
indicates the group- or cluster-ID. For cross-classified or nested designs,
\code{by} can also identify two or more variables as group- or cluster-IDs. If
the data is nested and should be treated as such, set \code{nested = TRUE}. Else,
if \code{by} defines two or more variables and \code{nested = FALSE}, a cross-classified
design is assumed. Note that \code{demean()} and \code{degroup()} can't handle a mix
of nested and cross-classified designs in one model.

For nested designs, \code{by} can be:
\itemize{
\item a character vector with the name of the variable that indicates the
levels, ordered from \emph{highest} level to \emph{lowest} (e.g.
\code{by = c("L4", "L3", "L2")}.
\item a character vector with variable names in the format \code{by = "L4/L3/L2"},
where the levels are separated by \code{/}.
}

See also section \emph{De-meaning for cross-classified designs} and
\emph{De-meaning for nested designs} below.}

\item{nested}{Logical, if \code{TRUE}, the data is treated as nested. If \code{FALSE},
the data is treated as cross-classified. Only applies if \code{by} contains more
than one variable.}

\item{suffix_demean, suffix_groupmean}{String value, will be appended to the
names of the group-meaned and de-meaned variables of \code{x}. By default,
de-meaned variables will be suffixed with \code{"_within"} and
grouped-meaned variables with \code{"_between"}.}

\item{append}{Logical, if \code{TRUE} (default), the group- and de-meaned
variables will be appended (column bind) to the original data \code{x},
thus returning both the original and the de-/group-meaned variables.}

\item{add_attributes}{Logical, if \code{TRUE}, the returned variables gain
attributes to indicate the within- and between-effects. This is only
relevant when printing \code{model_parameters()} - in such cases, the
within- and between-effects are printed in separated blocks.}

\item{verbose}{Toggle warnings and messages.}

\item{center}{Method for centering. \code{demean()} always performs
mean-centering, while \code{degroup()} can use \code{center = "median"} or
\code{center = "mode"} for median- or mode-centering, and also \code{"min"}
or \code{"max"}.}
}
\value{
A data frame with the group-/de-meaned variables, which get the suffix
\code{"_between"} (for the group-meaned variable) and \code{"_within"} (for the
de-meaned variable) by default. For cross-classified or nested designs,
the name pattern of the group-meaned variables is the name of the centered
variable followed by the name of the variable that indicates the related
grouping level, e.g. \code{predictor_L3_between} and \code{predictor_L2_between}.
}
\description{
\code{demean()} computes group- and de-meaned versions of a variable that can be
used in regression analysis to model the between- and within-subject effect
(person-mean centering or centering within clusters). \code{degroup()} is more
generic in terms of the centering-operation. While \code{demean()} always uses
mean-centering, \code{degroup()} can also use the mode or median for centering.
}
\section{Heterogeneity Bias}{


Mixed models include different levels of sources of variability, i.e.
error terms at each level. When macro-indicators (or level-2 predictors,
or higher-level units, or more general: \emph{group-level predictors that
\strong{vary} within and across groups}) are included as fixed effects (i.e.
treated as covariate at level-1), the variance that is left unaccounted for
this covariate will be absorbed into the error terms of level-1 and level-2
(\emph{Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6.}):
"Such covariates contain two parts: one that is specific to the higher-level
entity that does not vary between occasions, and one that represents the
difference between occasions, within higher-level entities" (\emph{Bell et al. 2015}).
Hence, the error terms will be correlated with the covariate, which violates
one of the assumptions of mixed models (iid, independent and identically
distributed error terms). This bias is also called the \emph{heterogeneity bias}
(\emph{Bell et al. 2015}). To resolve this problem, level-2 predictors used as
(level-1) covariates should be separated into their "within" and "between"
effects by "de-meaning" and "group-meaning": After demeaning time-varying
predictors, "at the higher level, the mean term is no longer constrained by
Level 1 effects, so it is free to account for all the higher-level variance
associated with that variable" (\emph{Bell et al. 2015}).
}

\section{Panel data and correlating fixed and group effects}{


\code{demean()} is intended to create group- and de-meaned variables for panel
regression models (fixed effects models), or for complex
random-effect-within-between models (see \emph{Bell et al. 2015, 2018}), where
group-effects (random effects) and fixed effects correlate (see
\emph{Bafumi and Gelman 2006}). This can happen, for instance, when analyzing
panel data, which can lead to \emph{Heterogeneity Bias}. To control for correlating
predictors and group effects, it is recommended to include the group-meaned
and de-meaned version of \emph{time-varying covariates} (and group-meaned version
of \emph{time-invariant covariates} that are on a higher level, e.g. level-2
predictors) in the model. By this, one can fit complex multilevel models for
panel data, including time-varying predictors, time-invariant predictors and
random effects.
}

\section{Why mixed models are preferred over fixed effects models}{


A mixed models approach can model the causes of endogeneity explicitly
by including the (separated) within- and between-effects of time-varying
fixed effects and including time-constant fixed effects. Furthermore,
mixed models also include random effects, thus a mixed models approach
is superior to classic fixed-effects models, which lack information of
variation in the group-effects or between-subject effects. Furthermore,
fixed effects regression cannot include random slopes, which means that
fixed effects regressions are neglecting "cross-cluster differences in the
effects of lower-level controls (which) reduces the precision of estimated
context effects, resulting in unnecessarily wide confidence intervals and
low statistical power" (\emph{Heisig et al. 2017}).
}

\section{Terminology}{


The group-meaned variable is simply the mean of an independent variable
within each group (or id-level or cluster) represented by \code{by}. It represents
the cluster-mean of an independent variable. The regression coefficient of a
group-meaned variable is the \emph{between-subject-effect}. The de-meaned variable
is then the centered version of the group-meaned variable. De-meaning is
sometimes also called person-mean centering or centering within clusters.
The regression coefficient of a de-meaned variable represents the
\emph{within-subject-effect}.
}

\section{De-meaning with continuous predictors}{


For continuous time-varying predictors, the recommendation is to include
both their de-meaned and group-meaned versions as fixed effects, but not
the raw (untransformed) time-varying predictors themselves. The de-meaned
predictor should also be included as random effect (random slope). In
regression models, the coefficient of the de-meaned predictors indicates
the within-subject effect, while the coefficient of the group-meaned
predictor indicates the between-subject effect.
}

\section{De-meaning with binary predictors}{


For binary time-varying predictors, there are two recommendations. First
is to include the raw (untransformed) binary predictor as fixed effect
only and the \emph{de-meaned} variable as random effect (random slope).
The alternative would be to add the de-meaned version(s) of binary
time-varying covariates as additional fixed effect as well (instead of
adding it as random slope). Centering time-varying binary variables to
obtain within-effects (level 1) isn't necessary. They have a sensible
interpretation when left in the typical 0/1 format (\emph{Hoffmann 2015,
chapter 8-2.I}). \code{demean()} will thus coerce categorical time-varying
predictors to numeric to compute the de- and group-meaned versions for
these variables, where the raw (untransformed) binary predictor and the
de-meaned version should be added to the model.
}

\section{De-meaning of factors with more than 2 levels}{


Factors with more than two levels are demeaned in two ways: first, these
are also converted to numeric and de-meaned; second, dummy variables
are created (binary, with 0/1 coding for each level) and these binary
dummy-variables are de-meaned in the same way (as described above).
Packages like \strong{panelr} internally convert factors to dummies before
demeaning, so this behaviour can be mimicked here.
}

\section{De-meaning interaction terms}{


There are multiple ways to deal with interaction terms of within- and
between-effects.
\itemize{
\item A classical approach is to simply use the product term of the de-meaned
variables (i.e. introducing the de-meaned variables as interaction term
in the model formula, e.g. \code{y ~ x_within * time_within}). This approach,
however, might be subject to bias (see \emph{Giesselmann & Schmidt-Catran 2020}).
\item Another option is to first calculate the product term and then apply the
de-meaning to it. This approach produces an estimator "that reflects
unit-level differences of interacted variables whose moderators vary
within units", which is desirable if \emph{no} within interaction of
two time-dependent variables is required. This is what \code{demean()} does
internally when \code{select} contains interaction terms.
\item A third option, when the interaction should result in a genuine within
estimator, is to "double de-mean" the interaction terms
(\emph{Giesselmann & Schmidt-Catran 2018}), however, this is currently
not supported by \code{demean()}. If this is required, the \code{wmb()}
function from the \strong{panelr} package should be used.
}

To de-mean interaction terms for within-between models, simply specify
the term as interaction for the \code{select}-argument, e.g. \code{select = "a*b"}
(see 'Examples').
}

\section{De-meaning for cross-classified designs}{


\code{demean()} can handle cross-classified designs, where the data has two or
more groups at the higher (i.e. second) level. In such cases, the
\code{by}-argument can identify two or more variables that represent the
cross-classified group- or cluster-IDs. The de-meaned variables for
cross-classified designs are simply subtracting all group means from each
individual value, i.e. \emph{fully cluster-mean-centering} (see \emph{Guo et al. 2024}
for details). Note that de-meaning for cross-classified designs is \emph{not}
equivalent to de-meaning of nested data structures from models with three or
more levels. Set \code{nested = TRUE} to explicitly assume a nested design. For
cross-classified designs, de-meaning is supposed to work for models like
\code{y ~ x + (1|level3) + (1|level2)}, but \emph{not} for models like
\code{y ~ x + (1|level3/level2)}. Note that \code{demean()} and \code{degroup()} can't
handle a mix of nested and cross-classified designs in one model.
}

\section{De-meaning for nested designs}{


\emph{Brincks et al. (2017)} have suggested an algorithm to center variables for
nested designs, which is implemented in \code{demean()}. For nested designs, set
\code{nested = TRUE} \emph{and} specify the variables that indicate the different
levels in descending order in the \code{by} argument. E.g.,
\verb{by = c("level4", "level3, "level2")} assumes a model like
\code{y ~ x + (1|level4/level3/level2)}. An alternative notation for the
\code{by}-argument would be \code{by = "level4/level3/level2"}, similar to the
formula notation.
}

\section{Analysing panel data with mixed models using lme4}{


A description of how to translate the formulas described in \emph{Bell et al. 2018}
into R using \code{lmer()} from \strong{lme4} can be found in
\href{https://easystats.github.io/parameters/articles/demean.html}{this vignette}.
}

\examples{

data(iris)
iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
iris$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable

x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
head(x)

x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
head(x)


# demean interaction term x*y
dat <- data.frame(
  a = c(1, 2, 3, 4, 1, 2, 3, 4),
  x = c(4, 3, 3, 4, 1, 2, 1, 2),
  y = c(1, 2, 1, 2, 4, 3, 2, 1),
  ID = c(1, 2, 3, 1, 2, 3, 1, 2)
)
demean(dat, select = c("a", "x*y"), by = "ID")

# or in formula-notation
demean(dat, select = ~ a + x * y, by = ~ID)

}
\references{
\itemize{
\item Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors
and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the
American Political Science Association.
\item Bell A, Fairbrother M, Jones K. 2019. Fixed and Random Effects
Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074
\item Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects
Modeling of Time-Series Cross-Sectional and Panel Data. Political Science
Research and Methods, 3(1), 133–153.
\item Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J.,
Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in
Three-Level Contextual Models. Multivariate Behavioral Research, 52(2),
149–163. https://doi.org/10.1080/00273171.2016.1256753
\item Gelman A, Hill J. 2007. Data Analysis Using Regression and
Multilevel/Hierarchical Models. Analytical Methods for Social Research.
Cambridge, New York: Cambridge University Press
\item Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed
effects regression models. Sociological Methods & Research, 1–28.
https://doi.org/10.1177/0049124120914934
\item Guo Y, Dhaliwal J, Rights JD. 2024. Disaggregating level-specific effects
in cross-classified multilevel models. Behavior Research Methods, 56(4),
3023–3057.
\item Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity:
Why Multilevel Models May Benefit from Accounting for Cross-Cluster
Differences in the Effects of Controls. American Sociological Review 82
(4): 796–827.
\item Hoffman L. 2015. Longitudinal analysis: modeling within-person
fluctuation and change. New York: Routledge
}
}
\seealso{
If grand-mean centering (instead of centering within-clusters)
is required, see \code{\link[=center]{center()}}. See \code{\link[performance:check_group_variation]{performance::check_group_variation()}}
to check for heterogeneity bias.
}


================================================
FILE: man/describe_distribution.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/describe_distribution.R
\name{describe_distribution}
\alias{describe_distribution}
\alias{describe_distribution.numeric}
\alias{describe_distribution.factor}
\alias{describe_distribution.data.frame}
\title{Describe a distribution}
\usage{
describe_distribution(x, ...)

\method{describe_distribution}{numeric}(
  x,
  centrality = "mean",
  dispersion = TRUE,
  iqr = TRUE,
  range = TRUE,
  quartiles = FALSE,
  ci = NULL,
  iterations = 100,
  threshold = 0.1,
  verbose = TRUE,
  ...
)

\method{describe_distribution}{factor}(x, dispersion = TRUE, range = TRUE, verbose = TRUE, ...)

\method{describe_distribution}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  centrality = "mean",
  dispersion = TRUE,
  iqr = TRUE,
  range = TRUE,
  quartiles = FALSE,
  include_factors = FALSE,
  ci = NULL,
  iterations = 100,
  threshold = 0.1,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  by = NULL,
  ...
)
}
\arguments{
\item{x}{A numeric vector, a character vector, a data frame, or a list. See
\code{Details}.}

\item{...}{Additional arguments to be passed to or from methods.}

\item{centrality}{The point-estimates (centrality indices) to compute. Character
(vector) or list with one or more of these options: \code{"median"}, \code{"mean"}, \code{"MAP"}
(see \code{\link[bayestestR:map_estimate]{map_estimate()}}), \code{"trimmed"} (which is just \code{mean(x, trim = threshold)}),
\code{"mode"} or \code{"all"}.}

\item{dispersion}{Logical, if \code{TRUE}, computes indices of dispersion related
to the estimate(s) (\code{SD} and \code{MAD} for \code{mean} and \code{median}, respectively).
Dispersion is not available for \code{"MAP"} or \code{"mode"} centrality indices.}

\item{iqr}{Logical, if \code{TRUE}, the interquartile range is calculated (based
on \code{\link[stats:IQR]{stats::IQR()}}, using \code{type = 6}).}

\item{range}{Return the range (min and max).}

\item{quartiles}{Return the first and third quartiles (25th and 75th
percentiles).}

\item{ci}{Confidence Interval (CI) level. Default is \code{NULL}, i.e. no
confidence intervals are computed. If not \code{NULL}, confidence intervals are
based on bootstrap replicates (see \code{iterations}).}

\item{iterations}{The number of bootstrap replicates for computing confidence
intervals. Only applies when \code{ci} is not \code{NULL}. Defaults to \code{100}. For
more stable results, increase the number of \code{iterations}, but note that this
can also increase the computation time significantly.}

\item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates
the fraction (0 to 0.5) of observations to be trimmed from each end of the
vector before the mean is computed.}

\item{verbose}{Show or silence warnings and messages.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{include_factors}{Logical, if \code{TRUE}, factors are included in the
output, however, only columns for range (first and last factor levels) as
well as n and missing will contain information.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{by}{Column names indicating how to split the data in various groups
before describing the distribution. \code{by} groups will be added to potentially
existing groups created by \code{data_group()}.}
}
\value{
A data frame with columns that describe the properties of the variables.
}
\description{
This function describes a distribution by a set of indices (e.g., measures of
centrality, dispersion, range, skewness, (excess) kurtosis).
}
\details{
If \code{x} is a data frame, only numeric variables are kept and will be
displayed in the summary by default.

If \code{x} is a list, the behavior is different whether \code{x} is a stored list. If
\code{x} is stored (for example, \code{describe_distribution(mylist)} where \code{mylist}
was created before), artificial variable names are used in the summary
(\code{Var_1}, \code{Var_2}, etc.). If \code{x} is an unstored list (for example,
\code{describe_distribution(list(mtcars$mpg))}), then \code{"mtcars$mpg"} is used as
variable name.
}
\note{
There is also a
\href{https://easystats.github.io/see/articles/parameters.html}{\code{plot()}-method}
implemented in the \href{https://easystats.github.io/see/}{\strong{see}-package}.
}
\examples{
\dontshow{if (require("bayestestR", quietly = TRUE)) withAutoprint(\{ # examplesIf}
describe_distribution(rnorm(100))

data(iris)
describe_distribution(iris)
describe_distribution(iris, include_factors = TRUE, quartiles = TRUE)
describe_distribution(list(mtcars$mpg, mtcars$cyl))
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[=kurtosis]{kurtosis()}} to compute kurtosis (recognized as excess kurtosis).
}


================================================
FILE: man/distribution_mode.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/descriptives.R
\name{distribution_mode}
\alias{distribution_mode}
\title{Compute mode for a statistical distribution}
\usage{
distribution_mode(x)
}
\arguments{
\item{x}{An atomic vector, a list, or a data frame.}
}
\value{
The value that appears most frequently in the provided data.
The returned data structure will be the same as the entered one.
}
\description{
Compute mode for a statistical distribution
}
\examples{

distribution_mode(c(1, 2, 3, 3, 4, 5))
distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5))

}
\seealso{
For continuous variables, the
\strong{Highest Maximum a Posteriori probability estimate (MAP)} may be
a more useful way to estimate the most commonly-observed value
than the mode. See \code{\link[bayestestR:map_estimate]{bayestestR::map_estimate()}}.
}


================================================
FILE: man/efc.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{efc}
\alias{efc}
\title{Sample dataset from the EFC Survey}
\description{
Selected variables from the EUROFAMCARE survey. Useful when
testing on "real-life" data sets, including random missing values. This
data set also has value and variable label attributes.
}
\keyword{data}


================================================
FILE: man/extract_column_names.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_select.R, R/extract_column_names.R
\name{data_select}
\alias{data_select}
\alias{extract_column_names}
\alias{find_columns}
\title{Find or get columns in a data frame based on search patterns}
\usage{
data_select(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

extract_column_names(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

find_columns(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{data}{A data frame.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}

\item{...}{Arguments passed down to other functions. Mostly not used yet.}
}
\value{
\code{extract_column_names()} returns a character vector with column names that
matched the pattern in \code{select} and \code{exclude}, or \code{NULL} if no matching
column name was found. \code{data_select()} returns a data frame with matching
columns.
}
\description{
\code{extract_column_names()} returns column names from a data set that
match a certain search pattern, while \code{data_select()} returns the found data.
}
\details{
Specifically for \code{data_select()}, \code{select} can also be a named character
vector. In this case, the names are used to rename the columns in the
output data frame. See 'Examples'.

Note that it is possible to either pass an entire select helper or only the
pattern inside a select helper as a function argument:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{foo <- function(data, pattern) \{
  extract_column_names(data, select = starts_with(pattern))
\}
foo(iris, pattern = "Sep")

foo2 <- function(data, pattern) \{
  extract_column_names(data, select = pattern)
\}
foo2(iris, pattern = starts_with("Sep"))
}\if{html}{\out{</div>}}

This means that it is also possible to use loop values as arguments or patterns:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{for (i in c("Sepal", "Sp")) \{
  head(iris) |>
    extract_column_names(select = starts_with(i)) |>
    print()
\}
}\if{html}{\out{</div>}}

However, this behavior is limited to a "single-level function". It will not
work in nested functions, like below:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{inner <- function(data, arg) \{
  extract_column_names(data, select = arg)
\}
outer <- function(data, arg) \{
  inner(data, starts_with(arg))
\}
outer(iris, "Sep")
}\if{html}{\out{</div>}}

In this case, it is better to pass the whole select helper as the argument of
\code{outer()}:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{outer <- function(data, arg) \{
  inner(data, arg)
\}
outer(iris, starts_with("Sep"))
}\if{html}{\out{</div>}}
}
\examples{
# Find column names by pattern
extract_column_names(iris, starts_with("Sepal"))
extract_column_names(iris, ends_with("Width"))
extract_column_names(iris, regex("\\\\."))
extract_column_names(iris, c("Petal.Width", "Sepal.Length"))

# starts with "Sepal", but not allowed to end with "width"
extract_column_names(iris, starts_with("Sepal"), exclude = contains("Width"))

# find numeric with mean > 3.5
numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5
extract_column_names(iris, numeric_mean_35)

# find column names, using range
extract_column_names(mtcars, c(cyl:hp, wt))

# find range of column names by range, using character vector
extract_column_names(mtcars, c("cyl:hp", "wt"))

# rename returned columns for "data_select()"
head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl")))
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/labels_to_levels.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/labels_to_levels.R
\name{labels_to_levels}
\alias{labels_to_levels}
\alias{labels_to_levels.factor}
\alias{labels_to_levels.data.frame}
\title{Convert value labels into factor levels}
\usage{
labels_to_levels(x, ...)

\method{labels_to_levels}{factor}(x, verbose = TRUE, ...)

\method{labels_to_levels}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  append = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame or factor. Other variable types (e.g. numerics) are not
allowed.}

\item{...}{Currently not used.}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
\code{x}, where for all factors former levels are replaced by their value
labels.
}
\description{
Convert value labels into factor levels
}
\details{
\code{labels_to_levels()} allows to use value labels of factors as their levels.
}
\examples{
data(efc)
# create factor
x <- as.factor(efc$c172code)
# add value labels - these are not factor levels yet
x <- assign_labels(x, values = c(`1` = "low", `2` = "mid", `3` = "high"))
levels(x)
data_tabulate(x)

x <- labels_to_levels(x)
levels(x)
data_tabulate(x)
}


================================================
FILE: man/makepredictcall.dw_transformer.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/makepredictcall.R
\name{makepredictcall.dw_transformer}
\alias{makepredictcall.dw_transformer}
\title{Utility Function for Safe Prediction with \code{datawizard} transformers}
\usage{
\method{makepredictcall}{dw_transformer}(var, call)
}
\arguments{
\item{var}{A variable.}

\item{call}{The term in the formula, as a call.}
}
\value{
A replacement for \code{call} for the \code{predvars} attribute of
  the terms.
}
\description{
This function allows for the use of (some of) \code{datawizard}'s transformers
inside a model formula. See examples below.
\cr\cr
Currently, \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}}, \code{\link[=normalize]{normalize()}}, & \code{\link[=rescale]{rescale()}} are
supported.
}
\examples{

data("mtcars")
train <- mtcars[1:30, ]
test <- mtcars[31:32, ]

m1 <- lm(mpg ~ center(hp), data = train)
predict(m1, newdata = test) # Data is "centered" before the prediction is made,
# according to the center of the old data

m2 <- lm(mpg ~ standardize(hp), data = train)
m3 <- lm(mpg ~ scale(hp), data = train) # same as above
predict(m2, newdata = test) # Data is "standardized" before the prediction is made.
predict(m3, newdata = test) # Data is "standardized" before the prediction is made.


m4 <- lm(mpg ~ normalize(hp), data = mtcars)
m5 <- lm(mpg ~ rescale(hp, to = c(-3, 3)), data = mtcars)

(newdata <- data.frame(hp = c(range(mtcars$hp), 400))) # 400 is outside original range!

model.frame(delete.response(terms(m4)), data = newdata)
model.frame(delete.response(terms(m5)), data = newdata)

}
\seealso{
\code{\link[stats:makepredictcall]{stats::makepredictcall()}}
}
\concept{datawizard-transformers}


================================================
FILE: man/mean_sd.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mean_sd.R
\name{mean_sd}
\alias{mean_sd}
\alias{median_mad}
\title{Summary Helpers}
\usage{
mean_sd(x, times = 1L, remove_na = TRUE, named = TRUE, ...)

median_mad(
  x,
  times = 1L,
  remove_na = TRUE,
  constant = 1.4826,
  named = TRUE,
  ...
)
}
\arguments{
\item{x}{A numeric vector (or one that can be coerced to one via
\code{as.numeric()}) to be summarized.}

\item{times}{How many SDs above and below the Mean (or MADs around the Median)}

\item{remove_na}{Logical. Should \code{NA} values be removed before computing (\code{TRUE})
or not (\code{FALSE}, default)?}

\item{named}{Should the vector be named?
(E.g., \code{c("-SD" = -1, Mean = 1, "+SD" = 2)}.)}

\item{...}{Not used.}

\item{constant}{scale factor.}
}
\value{
A (possibly named) numeric vector of length \code{2*times + 1} of SDs
below the mean, the mean, and SDs above the mean (or median and MAD).
}
\description{
Summary Helpers
}
\examples{
mean_sd(mtcars$mpg)

mean_sd(mtcars$mpg, times = 2L)

median_mad(mtcars$mpg)

}


================================================
FILE: man/means_by_group.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/means_by_group.R
\name{means_by_group}
\alias{means_by_group}
\alias{means_by_group.numeric}
\alias{means_by_group.data.frame}
\title{Summary of mean values by group}
\usage{
means_by_group(x, ...)

\method{means_by_group}{numeric}(x, by = NULL, ci = 0.95, weights = NULL, digits = NULL, ...)

\method{means_by_group}{data.frame}(
  x,
  select = NULL,
  by = NULL,
  ci = 0.95,
  weights = NULL,
  digits = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A vector or a data frame.}

\item{...}{Currently not used}

\item{by}{If \code{x} is a numeric vector, \code{by} should be a factor that
indicates the group-classifying categories. If \code{x} is a data frame, \code{by}
should be a character string, naming the variable in \code{x} that is used for
grouping. Numeric vectors are coerced to factors. Not that \code{by} should
only refer to a single variable.}

\item{ci}{Level of confidence interval for mean estimates. Default is \code{0.95}.
Use \code{ci = NA} to suppress confidence intervals.}

\item{weights}{If \code{x} is a numeric vector, \code{weights} should be a vector of
weights that will be applied to weight all observations. If \code{x} is a data
frame, \code{weights} can also be a character string indicating the name of the
variable in \code{x} that should be used for weighting. Default is \code{NULL}, so no
weights are used.}

\item{digits}{Optional scalar, indicating the amount of digits after decimal
point when rounding estimates and values.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A data frame with information on mean and further summary statistics
for each sub-group.
}
\description{
Computes summary table of means by groups.
}
\details{
This function is comparable to \code{aggregate(x, by, mean)}, but provides
some further information, including summary statistics from a One-Way-ANOVA
using \code{x} as dependent and \code{by} as independent variable. \code{\link[emmeans:contrast]{emmeans::contrast()}}
is used to get p-values for each sub-group. P-values indicate whether each
group-mean is significantly different from the total mean.
}
\examples{
data(efc)
means_by_group(efc, "c12hour", "e42dep")

data(iris)
means_by_group(iris, "Sepal.Width", "Species")

# weighting
efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
means_by_group(efc, "c12hour", "e42dep", weights = "weight")
}


================================================
FILE: man/nhanes_sample.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{nhanes_sample}
\alias{nhanes_sample}
\title{Sample dataset from the National Health and Nutrition Examination Survey}
\description{
Selected variables from the National Health and Nutrition Examination
Survey that are used in the example from Lumley (2010), Appendix E.
}
\references{
Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
}
\keyword{data}


================================================
FILE: man/normalize.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/normalize.R, R/unnormalize.R
\name{normalize}
\alias{normalize}
\alias{normalize.numeric}
\alias{normalize.data.frame}
\alias{unnormalize}
\alias{unnormalize.numeric}
\alias{unnormalize.data.frame}
\alias{unnormalize.grouped_df}
\title{Normalize numeric variable to 0-1 range}
\usage{
normalize(x, ...)

\method{normalize}{numeric}(x, include_bounds = TRUE, verbose = TRUE, ...)

\method{normalize}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  include_bounds = TRUE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

unnormalize(x, ...)

\method{unnormalize}{numeric}(x, verbose = TRUE, ...)

\method{unnormalize}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

\method{unnormalize}{grouped_df}(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A numeric vector, (grouped) data frame, or matrix. See 'Details'.}

\item{...}{Arguments passed to or from other methods.}

\item{include_bounds}{Numeric or logical. Using this can be useful in case of
beta-regression, where the response variable is not allowed to include
zeros and ones. If \code{TRUE}, the input is normalized to a range that includes
zero and one. If \code{FALSE}, the return value is compressed, using
Smithson and Verkuilen's (2006) formula \code{(x * (n - 1) + 0.5) / n}, to avoid
zeros and ones in the normalized variables. Else, if numeric (e.g., \code{0.001}),
\code{include_bounds} defines the "distance" to the lower and upper bound, i.e.
the normalized vectors are rescaled to a range from \code{0 + include_bounds} to
\code{1 - include_bounds}.}

\item{verbose}{Toggle warnings and messages on or off.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{append}{Logical or string. If \code{TRUE}, standardized variables get new
column names (with the suffix \code{"_z"}) and are appended (column bind) to \code{x},
thus returning both the original and the standardized variables. If \code{FALSE},
original variables in \code{x} will be overwritten by their standardized versions.
If a character value, standardized variables are appended with new column
names (using the defined suffix) to the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
A normalized object.
}
\description{
Performs a normalization of data, i.e., it scales variables in the range
0 - 1. This is a special case of \code{\link[=rescale]{rescale()}}. \code{unnormalize()} is the
counterpart, but only works for variables that have been normalized with
\code{normalize()}.
}
\details{
\itemize{
\item If \code{x} is a matrix, normalization is performed across all values (not
column- or row-wise). For column-wise normalization, convert the matrix to a
data.frame.
\item If \code{x} is a grouped data frame (\code{grouped_df}), normalization is performed
separately for each group.
}
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{

normalize(c(0, 1, 5, -5, -2))
normalize(c(0, 1, 5, -5, -2), include_bounds = FALSE)
# use a value defining the bounds
normalize(c(0, 1, 5, -5, -2), include_bounds = 0.001)

head(normalize(trees))

}
\references{
Smithson M, Verkuilen J (2006). A Better Lemon Squeezer? Maximum-Likelihood
Regression with Beta-Distributed Dependent Variables. Psychological Methods,
11(1), 54–71.
}
\seealso{
See \code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas.

Other transform utilities: 
\code{\link{ranktransform}()},
\code{\link{rescale}()},
\code{\link{reverse}()},
\code{\link{standardize}()}
}
\concept{transform utilities}


================================================
FILE: man/ranktransform.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ranktransform.R
\name{ranktransform}
\alias{ranktransform}
\alias{ranktransform.numeric}
\alias{ranktransform.data.frame}
\title{(Signed) rank transformation}
\usage{
ranktransform(x, ...)

\method{ranktransform}{numeric}(
  x,
  sign = FALSE,
  method = "average",
  zeros = "na",
  verbose = TRUE,
  ...
)

\method{ranktransform}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  sign = FALSE,
  method = "average",
  ignore_case = FALSE,
  regex = FALSE,
  zeros = "na",
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{Object.}

\item{...}{Arguments passed to or from other methods.}

\item{sign}{Logical, if \code{TRUE}, return signed ranks.}

\item{method}{Treatment of ties. Can be one of \code{"average"} (default),
\code{"first"}, \code{"last"}, \code{"random"}, \code{"max"} or \code{"min"}. See \code{\link[=rank]{rank()}} for
details.}

\item{zeros}{How to handle zeros. If \code{"na"} (default), they are marked as
\code{NA}. If \code{"signrank"}, they are kept during the ranking and marked as zeros.
This is only used when \code{sign = TRUE}.}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
A rank-transformed object.
}
\description{
Transform numeric values with the integers of their rank (i.e., 1st smallest,
2nd smallest, 3rd smallest, etc.). Setting the \code{sign} argument to \code{TRUE} will
give you signed ranks, where the ranking is done according to absolute size
but where the sign is preserved (i.e., 2, 1, -3, 4).
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
ranktransform(c(0, 1, 5, -5, -2))

# By default, zeros are converted to NA
suppressWarnings(
  ranktransform(c(0, 1, 5, -5, -2), sign = TRUE)
)
ranktransform(c(0, 1, 5, -5, -2), sign = TRUE, zeros = "signrank")

head(ranktransform(trees))
}
\seealso{
Other transform utilities: 
\code{\link{normalize}()},
\code{\link{rescale}()},
\code{\link{reverse}()},
\code{\link{standardize}()}
}
\concept{transform utilities}


================================================
FILE: man/recode_into.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/recode_into.R
\name{recode_into}
\alias{recode_into}
\title{Recode values from one or more variables into a new variable}
\usage{
recode_into(
  ...,
  data = NULL,
  default = NA,
  overwrite = TRUE,
  preserve_na = FALSE,
  verbose = TRUE
)
}
\arguments{
\item{...}{A sequence of two-sided formulas, where the left hand side (LHS)
is a logical matching condition that determines which values match this case.
The LHS of this formula is also called "recode pattern" (e.g., in messages).
The right hand side (RHS) indicates the replacement value.}

\item{data}{Optional, name of a data frame. This can be used to avoid writing
the data name multiple times in \code{...}. See 'Examples'.}

\item{default}{Indicates the default value that is chosen when no match in
the formulas in \code{...} is found. If not provided, \code{NA} is used as default
value.}

\item{overwrite}{Logical, if \code{TRUE} (default) and more than one recode pattern
apply to the same case, already recoded values will be overwritten by subsequent
recode patterns. If \code{FALSE}, former recoded cases will not be altered by later
recode patterns that would apply to those cases again. A warning message is
printed to alert such situations and to avoid unintentional recodings.}

\item{preserve_na}{Logical, if \code{TRUE} and \code{default} is not \code{NA}, missing
values in the original variable will be set back to \code{NA} in the recoded
variable (unless overwritten by other recode patterns). If \code{FALSE}, missing
values in the original variable will be recoded to \code{default}. Setting
\code{preserve_na = TRUE} prevents unintentional overwriting of missing values
with \code{default}, which means that you won't find valid values where the
original data only had missing values. See 'Examples'.}

\item{verbose}{Toggle warnings.}
}
\value{
A vector with recoded values.
}
\description{
This functions recodes values from one or more variables into a new variable.
It is a convenient function to avoid nested \code{\link[=ifelse]{ifelse()}} statements, which
is similar to \code{dplyr::case_when()}.
}
\examples{
x <- 1:30
recode_into(
  x > 15 ~ "a",
  x > 10 & x <= 15 ~ "b",
  default = "c"
)

x <- 1:10
# default behaviour: second recode pattern "x > 5" overwrites
# some of the formerly recoded cases from pattern "x >= 3 & x <= 7"
recode_into(
  x >= 3 & x <= 7 ~ 1,
  x > 5 ~ 2,
  default = 0,
  verbose = FALSE
)

# setting "overwrite = FALSE" will not alter formerly recoded cases
recode_into(
  x >= 3 & x <= 7 ~ 1,
  x > 5 ~ 2,
  default = 0,
  overwrite = FALSE,
  verbose = FALSE
)

set.seed(123)
d <- data.frame(
  x = sample(1:5, 30, TRUE),
  y = sample(letters[1:5], 30, TRUE),
  stringsAsFactors = FALSE
)

# from different variables into new vector
recode_into(
  d$x \%in\% 1:3 & d$y \%in\% c("a", "b") ~ 1,
  d$x > 3 ~ 2,
  default = 0
)

# no need to write name of data frame each time
recode_into(
  x \%in\% 1:3 & y \%in\% c("a", "b") ~ 1,
  x > 3 ~ 2,
  data = d,
  default = 0
)

# handling of missing values
d <- data.frame(
  x = c(1, NA, 2, NA, 3, 4),
  y = c(1, 11, 3, NA, 5, 6)
)
# first NA in x is overwritten by valid value from y
# we have no known value for second NA in x and y,
# thus we get one NA in the result
recode_into(
  x <= 3 ~ 1,
  y > 5 ~ 2,
  data = d,
  default = 0,
  preserve_na = TRUE
)
# first NA in x is overwritten by valid value from y
# default value is used for second NA
recode_into(
  x <= 3 ~ 1,
  y > 5 ~ 2,
  data = d,
  default = 0,
  preserve_na = FALSE
)
}


================================================
FILE: man/recode_values.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/recode_values.R
\name{recode_values}
\alias{recode_values}
\alias{recode_values.numeric}
\alias{recode_values.data.frame}
\title{Recode old values of variables into new values}
\usage{
recode_values(x, ...)

\method{recode_values}{numeric}(
  x,
  recode = NULL,
  default = NULL,
  preserve_na = TRUE,
  verbose = TRUE,
  ...
)

\method{recode_values}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  recode = NULL,
  default = NULL,
  preserve_na = TRUE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame, numeric or character vector, or factor.}

\item{...}{not used.}

\item{recode}{A list of named vectors, which indicate the recode pairs.
The \emph{names} of the list-elements (i.e. the left-hand side) represent the
\emph{new} values, while the values of the list-elements indicate the original
(old) values that should be replaced. When recoding numeric vectors,
element names have to be surrounded in backticks. For example,
\code{recode=list(`0`=1)} would recode all \code{1} into \code{0} in a numeric
vector. See also 'Examples' and 'Details'.}

\item{default}{Defines the default value for all values that have no match in
the recode-pairs. If \code{NULL}, original values will be preserved when there
is no match. Note that, if \code{preserve_na=FALSE}, missing values (\code{NA}) are
also captured by the \code{default} argument, and thus will also be recoded into
the specified value. See 'Examples' and 'Details'.}

\item{preserve_na}{Logical, if \code{TRUE}, \code{NA} (missing values) are preserved.
This overrides any other arguments, including \code{default}. Hence, if
\code{preserve_na=TRUE}, \code{default} will no longer convert \code{NA} into the specified
default value.}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
\code{x}, where old values are replaced by new values.
}
\description{
This functions recodes old values into new values and can be used to to
recode numeric or character vectors, or factors.
}
\details{
This section describes the pattern of the \code{recode} arguments, which also
provides some shortcuts, in particular when recoding numeric values.
\itemize{
\item Single values

Single values either need to be wrapped in backticks (in case of numeric
values) or "as is" (for character or factor levels). Example:
\code{recode=list(`0`=1,`1`=2)} would recode 1 into 0, and 2 into 1.
For factors or character vectors, an example is:
\code{recode=list(x="a",y="b")} (recode "a" into "x" and "b" into "y").
\item Multiple values

Multiple values that should be recoded into a new value can be separated
with comma. Example: \code{recode=list(`1`=c(1,4),`2`=c(2,3))} would recode the
values 1 and 4 into 1, and 2 and 3 into 2. It is also possible to define  the
old values as a character string, like:  \code{recode=list(`1`="1,4",`2`="2,3")}
For factors or character vectors, an example is:
\code{recode=list(x=c("a","b"),y=c("c","d"))}.
\item Value range

Numeric value ranges can be defined using the \code{:}. Example:
\code{recode=list(`1`=1:3,`2`=4:6)} would recode all values from 1 to 3 into
1, and 4 to 6 into 2.
\item \code{min} and \code{max}

placeholder to use the minimum or maximum value of the
(numeric) variable. Useful, e.g., when recoding ranges of values.
Example: \code{recode=list(`1`="min:10",`2`="11:max")}.
\item \code{default} values

The \code{default} argument defines the default value for all values that have
no match in the recode-pairs. For example,
\verb{recode=list(`1`=c(1,2),`2`=c(3,4)), default=9} would
recode values 1 and 2 into 1, 3 and 4 into 2, and all other values into 9.
If \code{preserve_na} is set to \code{FALSE}, \code{NA} (missing values) will also be
recoded into the specified default value.
\item Reversing and rescaling

See \code{\link[=reverse]{reverse()}} and \code{\link[=rescale]{rescale()}}.
}
}
\note{
You can use \code{options(data_recode_pattern = "old=new")} to switch the
behaviour of the \code{recode}-argument, i.e. recode-pairs are now following the
pattern \verb{old values = new values}, e.g. if \code{getOption("data_recode_pattern")}
is set to \code{"old=new"}, then \code{recode(`1`=0)} would recode all 1 into 0.
The default for \code{recode(`1`=0)} is to recode all 0 into 1.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
# numeric ----------
set.seed(123)
x <- sample(c(1:4, NA), 15, TRUE)
table(x, useNA = "always")

out <- recode_values(x, list(`0` = 1, `1` = 2:3, `2` = 4))
out
table(out, useNA = "always")

# to recode NA values, set preserve_na to FALSE
out <- recode_values(
  x,
  list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA),
  preserve_na = FALSE
)
out
table(out, useNA = "always")

# preserve na ----------
out <- recode_values(x, list(`0` = 1, `1` = 2:3), default = 77)
out
table(out, useNA = "always")

# recode na into default ----------
out <- recode_values(
  x,
  list(`0` = 1, `1` = 2:3),
  default = 77,
  preserve_na = FALSE
)
out
table(out, useNA = "always")


# factors (character vectors are similar) ----------
set.seed(123)
x <- as.factor(sample(c("a", "b", "c"), 15, TRUE))
table(x)

out <- recode_values(x, list(x = "a", y = c("b", "c")))
out
table(out)

out <- recode_values(x, list(x = "a", y = "b", z = "c"))
out
table(out)

out <- recode_values(x, list(y = "b,c"), default = 77)
# same as
# recode_values(x, list(y = c("b", "c")), default = 77)
out
table(out)


# data frames ----------
set.seed(123)
d <- data.frame(
  x = sample(c(1:4, NA), 12, TRUE),
  y = as.factor(sample(c("a", "b", "c"), 12, TRUE)),
  stringsAsFactors = FALSE
)

recode_values(
  d,
  recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c")),
  append = TRUE
)


# switch recode pattern to "old=new" ----------
options(data_recode_pattern = "old=new")

# numeric
set.seed(123)
x <- sample(c(1:4, NA), 15, TRUE)
table(x, useNA = "always")

out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2))
table(out, useNA = "always")

# factors (character vectors are similar)
set.seed(123)
x <- as.factor(sample(c("a", "b", "c"), 15, TRUE))
table(x)

out <- recode_values(x, list(a = "x", `b, c` = "y"))
table(out)

# reset options
options(data_recode_pattern = NULL)
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/reexports.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_tabulate.R
\docType{import}
\name{reexports}
\alias{reexports}
\alias{print_html}
\alias{print_md}
\alias{display}
\title{Objects exported from other packages}
\keyword{internal}
\description{
These objects are imported from other packages. Follow the links
below to see their documentation.

\describe{
  \item{insight}{\code{\link[insight]{display}}, \code{\link[insight:display]{print_html}}, \code{\link[insight:display]{print_md}}}
}}


================================================
FILE: man/remove_empty.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/remove_empty.R
\name{remove_empty}
\alias{remove_empty}
\alias{empty_columns}
\alias{empty_rows}
\alias{remove_empty_columns}
\alias{remove_empty_rows}
\title{Return or remove variables or observations that are completely missing}
\usage{
empty_columns(x)

empty_rows(x)

remove_empty_columns(x)

remove_empty_rows(x)

remove_empty(x)
}
\arguments{
\item{x}{A data frame.}
}
\value{
\itemize{
\item For \code{empty_columns()} and \code{empty_rows()}, a numeric (named) vector with row
or column indices of those variables that completely have missing values.
\item For \code{remove_empty_columns()} and \code{remove_empty_rows()}, a data frame with
"empty" columns or rows removed, respectively.
\item For \code{remove_empty()}, \strong{both} empty rows and columns will be removed.
}
}
\description{
These functions check which rows or columns of a data frame completely
contain missing values, i.e. which observations or variables completely have
missing values, and either (1) returns their indices; or (2) removes them
from the data frame.
}
\details{
For character vectors, empty string values (i.e. \code{""}) are also
considered as missing value. Thus, if a character vector only contains \code{NA}
and \code{""}, it is considered as empty variable and will be removed. Same
applies to observations (rows) that only contain \code{NA} or \code{""}.
}
\examples{
tmp <- data.frame(
  a = c(1, 2, 3, NA, 5),
  b = c(1, NA, 3, NA, 5),
  c = c(NA, NA, NA, NA, NA),
  d = c(1, NA, 3, NA, 5)
)

tmp

# indices of empty columns or rows
empty_columns(tmp)
empty_rows(tmp)

# remove empty columns or rows
remove_empty_columns(tmp)
remove_empty_rows(tmp)

# remove empty columns and rows
remove_empty(tmp)

# also remove "empty" character vectors
tmp <- data.frame(
  a = c(1, 2, 3, NA, 5),
  b = c(1, NA, 3, NA, 5),
  c = c("", "", "", "", ""),
  stringsAsFactors = FALSE
)
empty_columns(tmp)

}


================================================
FILE: man/replace_nan_inf.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/replace_nan_inf.R
\name{replace_nan_inf}
\alias{replace_nan_inf}
\title{Convert infinite or \code{NaN} values into \code{NA}}
\usage{
replace_nan_inf(x, ...)
}
\arguments{
\item{x}{A vector or a dataframe}

\item{...}{Currently not used.}
}
\value{
Data with \code{Inf}, \code{-Inf}, and \code{NaN} converted to \code{NA}.
}
\description{
Replaces all infinite (\code{Inf} and \code{-Inf}) or \code{NaN} values with \code{NA}.
}
\examples{
# a vector
x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7)
replace_nan_inf(x)

# a data frame
df <- data.frame(
  x = c(1, NA, 5, Inf, 2, NA),
  y = c(3, NaN, 4, -Inf, 6, 7),
  stringsAsFactors = FALSE
)
replace_nan_inf(df)
}


================================================
FILE: man/rescale.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_rescale.R
\name{rescale}
\alias{rescale}
\alias{change_scale}
\alias{rescale.numeric}
\alias{rescale.data.frame}
\title{Rescale Variables to a New Range}
\usage{
rescale(x, ...)

change_scale(x, ...)

\method{rescale}{numeric}(
  x,
  to = c(0, 100),
  multiply = NULL,
  add = NULL,
  range = NULL,
  verbose = TRUE,
  ...
)

\method{rescale}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  to = c(0, 100),
  multiply = NULL,
  add = NULL,
  range = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{x}{A (grouped) data frame, numeric vector or factor.}

\item{...}{Arguments passed to or from other methods.}

\item{to}{Numeric vector of length 2 giving the new range that the variable
will have after rescaling. To reverse-score a variable, the range should
be given with the maximum value first. See examples.}

\item{multiply}{If not \code{NULL}, \code{to} is ignored and \code{multiply} will be used,
giving the factor by which the actual range of \code{x} should be expanded.
For example, if a vector ranges from 5 to 15 and \code{multiply = 1.1}, the current
range of 10 will be expanded by the factor of 1.1, giving a new range of
11. Thus, the rescaled vector would range from 4.5 to 15.5.}

\item{add}{A vector of length 1 or 2. If not \code{NULL}, \code{to} is ignored and \code{add}
will be used, giving the amount by which the minimum and maximum of the
actual range of \code{x} should be expanded. For example, if a vector ranges from
5 to 15 and \code{add = 1}, the range will be expanded from 4 to 16. If \code{add} is
of length 2, then the first value is used for the lower bound and the second
value for the upper bound.}

\item{range}{Initial (old) range of values. If \code{NULL}, will take the range of
the input vector (\code{range(x)}).}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
A rescaled object.
}
\description{
Rescale variables to a new range. Can also be used to reverse-score variables
(change the keying/scoring direction), or to expand a range.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
rescale(c(0, 1, 5, -5, -2))
rescale(c(0, 1, 5, -5, -2), to = c(-5, 5))
rescale(c(1, 2, 3, 4, 5), to = c(-2, 2))

# Specify the "theoretical" range of the input vector
rescale(c(1, 3, 4), to = c(0, 40), range = c(0, 4))

# Reverse-score a variable
rescale(c(1, 2, 3, 4, 5), to = c(5, 1))
rescale(c(1, 2, 3, 4, 5), to = c(2, -2))

# Data frames
head(rescale(iris, to = c(0, 1)))
head(rescale(iris, to = c(0, 1), select = "Sepal.Length"))

# One can specify a list of ranges
head(rescale(iris, to = list(
  "Sepal.Length" = c(0, 1),
  "Petal.Length" = c(-1, 0)
)))

# "expand" ranges by a factor or a given value
x <- 5:15
x
# both will expand the range by 10\%
rescale(x, multiply = 1.1)
rescale(x, add = 0.5)

# expand range by different values
rescale(x, add = c(1, 3))

# Specify list of multipliers
d <- data.frame(x = 5:15, y = 5:15)
rescale(d, multiply = list(x = 1.1, y = 0.5))
}
\seealso{
See \code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas.

Other transform utilities: 
\code{\link{normalize}()},
\code{\link{ranktransform}()},
\code{\link{reverse}()},
\code{\link{standardize}()}
}
\concept{transform utilities}


================================================
FILE: man/rescale_weights.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rescale_weights.R
\name{rescale_weights}
\alias{rescale_weights}
\title{Rescale design weights for multilevel analysis}
\usage{
rescale_weights(
  data,
  probability_weights = NULL,
  by = NULL,
  nest = FALSE,
  method = "carle"
)
}
\arguments{
\item{data}{A data frame.}

\item{probability_weights}{Variable indicating the probability (design or
sampling) weights of the survey data (level-1-weight), provided as character
string or formula.}

\item{by}{Variable names (as character vector, or as formula), indicating
the grouping structure (strata) of the survey data (level-2-cluster
variable). It is also possible to create weights for multiple group
variables; in such cases, each created weighting variable will be suffixed
by the name of the group variable. This argument is required for
\code{method = "carle"}, but optional for \code{method = "kish"}.}

\item{nest}{Logical, if \code{TRUE} and \code{by} indicates at least two group
variables, then groups are "nested", i.e. groups are now a combination from
each group level of the variables in \code{by}. This argument is not used when
\code{method = "kish"}.}

\item{method}{String, indicating which rescale-method is used for rescaling
weights. Can be either \code{"carle"} (default) or \code{"kish"}. See 'Details'. If
\code{method = "carle"}, the \code{by} argument is required.}
}
\value{
\code{data}, including the new weighting variable(s). For \code{method = "carle"}, new
columns \code{rescaled_weights_a} and \code{rescaled_weights_b} are returned, and for
\code{method = "kish"}, the returned data contains a column \code{rescaled_weights}.
These represent the rescaled design weights to use in multilevel models (use
these variables for the \code{weights} argument).
}
\description{
Most functions to fit multilevel and mixed effects models only
allow the user to specify frequency weights, but not design (i.e., sampling
or probability) weights, which should be used when analyzing complex samples
(e.g., probability samples). \code{rescale_weights()} implements two algorithms,
one proposed by \cite{Asparouhov (2006)} and \cite{Carle (2009)}, to rescale
design weights in survey data to account for the grouping structure of
multilevel models, and one based on the design effect proposed by
\cite{Kish (1965)}, to rescale weights by the design effect to account for
additional sampling error introduced by weighting.
}
\details{
\itemize{
\item \code{method = "carle"}

Rescaling is based on two methods: For \code{rescaled_weights_a}, the sample
weights \code{probability_weights} are adjusted by a factor that represents the
proportion of group size divided by the sum of sampling weights within each
group. The adjustment factor for \code{rescaled_weights_b} is the sum of sample
weights within each group divided by the sum of squared sample weights
within each group (see Carle (2009), Appendix B). In other words,
\code{rescaled_weights_a} "scales the weights so that the new weights sum to the
cluster sample size" while \code{rescaled_weights_b} "scales the weights so that
the new weights sum to the effective cluster size".

Regarding the choice between scaling methods A and B, Carle suggests that
"analysts who wish to discuss point estimates should report results based
on weighting method A. For analysts more interested in residual
between-group variance, method B may generally provide the least biased
estimates". In general, it is recommended to fit a non-weighted model and
weighted models with both scaling methods and when comparing the models,
see whether the "inferential decisions converge", to gain confidence in the
results.

Though the bias of scaled weights decreases with increasing group size,
method A is preferred when insufficient or low group size is a concern.

The group ID and probably PSU may be used as random effects (e.g. nested
design, or group and PSU as varying intercepts), depending on the survey
design that should be mimicked.
\item \code{method = "kish"}

Rescaling is based on scaling the sample weights so the mean value is 1,
which means the sum of all weights equals the sample size. Next, the design
effect (\emph{Kish 1965}) is calculated, which is the mean of the squared
weights divided by the squared mean of the weights. The scaled sample
weights are then divided by the design effect. This method is most
appropriate when weights are based on additional variables beyond the
grouping variables in the model (e.g., other demographic characteristics),
but may also be useful in other contexts.

Some tests on real-world survey-data suggest that, in comparison to the
Carle-method, the Kish-method comes closer to estimates from a regular
survey-design using the \strong{survey} package. Note that these tests are not
representative and it is recommended to check your results against a
standard survey-design.
}
}
\examples{
\dontshow{if (all(insight::check_if_installed(c("lme4", "parameters"), quietly = TRUE))) withAutoprint(\{ # examplesIf}
data(nhanes_sample)
head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA"))

# also works with multiple group-variables
head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU")))

# or nested structures.
x <- rescale_weights(
  data = nhanes_sample,
  probability_weights = "WTINT2YR",
  by = c("SDMVSTRA", "SDMVPSU"),
  nest = TRUE
)
head(x)

\donttest{
# compare different methods, using multilevel-Poisson regression

d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")
result1 <- lme4::glmer(
  total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
  family = poisson(),
  data = d,
  weights = rescaled_weights_a
)
result2 <- lme4::glmer(
  total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
  family = poisson(),
  data = d,
  weights = rescaled_weights_b
)

d <- rescale_weights(
  nhanes_sample,
  "WTINT2YR",
  method = "kish"
)
result3 <- lme4::glmer(
  total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
  family = poisson(),
  data = d,
  weights = rescaled_weights
)
d <- rescale_weights(
  nhanes_sample,
  "WTINT2YR",
  "SDMVSTRA",
  method = "kish"
)
result4 <- lme4::glmer(
  total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
  family = poisson(),
  data = d,
  weights = rescaled_weights
)
parameters::compare_parameters(
  list(result1, result2, result3, result4),
  exponentiate = TRUE,
  column_names = c("Carle (A)", "Carle (B)", "Kish", "Kish (grouped)")
)
}
\dontshow{\}) # examplesIf}
}
\references{
\itemize{
\item Asparouhov T. (2006). General Multi-Level Modeling with Sampling
Weights. Communications in Statistics - Theory and Methods 35: 439-460
\item Carle A.C. (2009). Fitting multilevel models in complex survey data
with design weights: Recommendations. BMC Medical Research Methodology
9(49): 1-13
\item Kish, L. (1965) Survey Sampling. London: Wiley.
}
}


================================================
FILE: man/reshape_ci.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/reshape_ci.R
\name{reshape_ci}
\alias{reshape_ci}
\title{Reshape CI between wide/long formats}
\usage{
reshape_ci(x, ci_type = "CI")
}
\arguments{
\item{x}{A data frame containing columns named \code{CI_low} and \code{CI_high} (or
similar, see \code{ci_type}).}

\item{ci_type}{String indicating the "type" (i.e. prefix) of the interval
columns. Per \emph{easystats} convention, confidence or credible intervals are
named \code{CI_low} and \code{CI_high}, and the related \code{ci_type} would be \code{"CI"}.
If column names for other intervals differ, \code{ci_type} can be used to
indicate the name, e.g. \code{ci_type = "SI"} can be used for support intervals,
where the column names in the data frame would be \code{SI_low} and \code{SI_high}.}
}
\value{
A data frame with columns corresponding to confidence intervals reshaped
either to wide or long format.
}
\description{
Reshape CI between wide/long formats.
}
\examples{
x <- data.frame(
  Parameter = c("Term 1", "Term 2", "Term 1", "Term 2"),
  CI = c(0.8, 0.8, 0.9, 0.9),
  CI_low = c(0.2, 0.3, 0.1, 0.15),
  CI_high = c(0.5, 0.6, 0.8, 0.85),
  stringsAsFactors = FALSE
)

reshape_ci(x)
reshape_ci(reshape_ci(x))
}


================================================
FILE: man/reverse.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_reverse.R
\name{reverse}
\alias{reverse}
\alias{reverse_scale}
\alias{reverse.numeric}
\alias{reverse.data.frame}
\title{Reverse-Score Variables}
\usage{
reverse(x, ...)

reverse_scale(x, ...)

\method{reverse}{numeric}(x, range = NULL, verbose = TRUE, ...)

\method{reverse}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  range = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{x}{A (grouped) data frame, numeric vector or factor.}

\item{...}{Arguments passed to or from other methods.}

\item{range}{Range of values that is used as reference for reversing the
scale. For numeric variables, can be \code{NULL} or a numeric vector of length
two, indicating the lowest and highest value of the reference range. If
\code{NULL}, will take the range of the input vector (\code{range(x)}). For factors,
\code{range} can be \code{NULL}, a numeric vector of length two, or a (numeric)
vector of at least the same length as factor levels (i.e. must be equal
to or larger than \code{nlevels(x)}). Note that providing a \code{range} for factors
usually only makes sense when factor levels are numeric, not characters.}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
A reverse-scored object.
}
\description{
Reverse-score variables (change the keying/scoring direction).
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
reverse(c(1, 2, 3, 4, 5))
reverse(c(-2, -1, 0, 2, 1))

# Specify the "theoretical" range of the input vector
reverse(c(1, 3, 4), range = c(0, 4))

# Factor variables
reverse(factor(c(1, 2, 3, 4, 5)))
reverse(factor(c(1, 2, 3, 4, 5)), range = 0:10)

# Data frames
head(reverse(iris))
head(reverse(iris, select = "Sepal.Length"))

}
\seealso{
Other transform utilities: 
\code{\link{normalize}()},
\code{\link{ranktransform}()},
\code{\link{rescale}()},
\code{\link{standardize}()}
}
\concept{transform utilities}


================================================
FILE: man/row_count.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/row_count.R
\name{row_count}
\alias{row_count}
\title{Count specific values row-wise}
\usage{
row_count(
  data,
  select = NULL,
  exclude = NULL,
  count = NULL,
  allow_coercion = TRUE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
)
}
\arguments{
\item{data}{A data frame with at least two columns, where number of specific
values are counted row-wise.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{count}{The value for which the row sum should be computed. May be a
numeric value, a character string (for factors or character vectors), \code{NA} or
\code{Inf}.}

\item{allow_coercion}{Logical. If \code{FALSE}, \code{count} matches only values of same
class (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa).
By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. In
order to count factor levels in the data, use \code{count = factor("level")}. See
'Examples'.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A vector with row-wise counts of values specified in \code{count}.
}
\description{
\code{row_count()} mimics base R's \code{rowSums()}, with sums for a
specific value indicated by \code{count}. Hence, it is similar to
\code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including
strict comparisons. Comparisons using \code{==} coerce values to atomic vectors,
thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also
possible to make "type safe" comparisons using the \code{allow_coercion} argument,
where \code{"2" == 2} is not true.
}
\examples{
dat <- data.frame(
  c1 = c(1, 2, NA, 4),
  c2 = c(NA, 2, NA, 5),
  c3 = c(NA, 4, NA, NA),
  c4 = c(2, 3, 7, 8)
)

# count all 4s per row
row_count(dat, count = 4)
# count all missing values per row
row_count(dat, count = NA)

dat <- data.frame(
  c1 = c("1", "2", NA, "3"),
  c2 = c(NA, "2", NA, "3"),
  c3 = c(NA, 4, NA, NA),
  c4 = c(2, 3, 7, Inf)
)
# count all 2s and "2"s per row
row_count(dat, count = 2)
# only count 2s, but not "2"s
row_count(dat, count = 2, allow_coercion = FALSE)

dat <- data.frame(
  c1 = factor(c("1", "2", NA, "3")),
  c2 = c("2", "1", NA, "3"),
  c3 = c(NA, 4, NA, NA),
  c4 = c(2, 3, 7, Inf)
)
# find only character "2"s
row_count(dat, count = "2", allow_coercion = FALSE)
# find only factor level "2"s
row_count(dat, count = factor("2"), allow_coercion = FALSE)

}


================================================
FILE: man/row_means.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/row_means.R
\name{row_means}
\alias{row_means}
\alias{row_sums}
\title{Row means or sums (optionally with minimum amount of valid values)}
\usage{
row_means(
  data,
  select = NULL,
  exclude = NULL,
  min_valid = NULL,
  digits = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  remove_na = FALSE,
  verbose = TRUE
)

row_sums(
  data,
  select = NULL,
  exclude = NULL,
  min_valid = NULL,
  digits = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  remove_na = FALSE,
  verbose = TRUE
)
}
\arguments{
\item{data}{A data frame with at least two columns, where row means or row
sums are applied.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{min_valid}{Optional, a numeric value of length 1. May either be
\itemize{
\item a numeric value that indicates the amount of valid values per row to
calculate the row mean or row sum;
\item or a value between \code{0} and \code{1}, indicating a proportion of valid values per
row to calculate the row mean or row sum (see 'Details').
\item \code{NULL} (default), in which all cases are considered.
}

If a row's sum of valid values is less than \code{min_valid}, \code{NA} will be returned.}

\item{digits}{Numeric value indicating the number of decimal places to be
used for rounding mean values. Negative values are allowed (see 'Details').
By default, \code{digits = NULL} and no rounding is used.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) values
before calculating row means or row sums. Only applies if \code{min_valid} is not
specified.}

\item{verbose}{Toggle warnings.}
}
\value{
A vector with row means (for \code{row_means()}) or row sums (for
\code{row_sums()}) for those rows with at least \code{n} valid values.
}
\description{
This function is similar to the SPSS \code{MEAN.n} or \code{SUM.n}
function and computes row means or row sums from a data frame or matrix if at
least \code{min_valid} values of a row are valid (and not \code{NA}).
}
\details{
Rounding to a negative number of \code{digits} means rounding to a power
of ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest
hundred. For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value
from \code{0} to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid}
non-missing values, the row mean or row sum is returned. If \code{min_valid} is a
non-integer value from 0 to 1, \code{min_valid} is considered to indicate the
proportion of required non-missing values per row. E.g., if
\code{min_valid = 0.75}, a row must have at least \code{ncol(data) * min_valid}
non-missing values for the row mean or row sum to be calculated. See
'Examples'.
}
\examples{
dat <- data.frame(
  c1 = c(1, 2, NA, 4),
  c2 = c(NA, 2, NA, 5),
  c3 = c(NA, 4, NA, NA),
  c4 = c(2, 3, 7, 8)
)

# default, all means are shown, if no NA values are present
row_means(dat)

# remove all NA before computing row means
row_means(dat, remove_na = TRUE)

# needs at least 4 non-missing values per row
row_means(dat, min_valid = 4) # 1 valid return value
row_sums(dat, min_valid = 4) # 1 valid return value

# needs at least 3 non-missing values per row
row_means(dat, min_valid = 3) # 2 valid return values

# needs at least 2 non-missing values per row
row_means(dat, min_valid = 2)

# needs at least 1 non-missing value per row, for two selected variables
row_means(dat, select = c("c1", "c3"), min_valid = 1)

# needs at least 50\% of non-missing values per row
row_means(dat, min_valid = 0.5) # 3 valid return values
row_sums(dat, min_valid = 0.5)

# needs at least 75\% of non-missing values per row
row_means(dat, min_valid = 0.75) # 2 valid return values

}


================================================
FILE: man/rownames.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils-rows.R
\name{rownames_as_column}
\alias{rownames_as_column}
\alias{column_as_rownames}
\alias{rowid_as_column}
\title{Tools for working with row names or row ids}
\usage{
rownames_as_column(x, var = "rowname")

column_as_rownames(x, var = "rowname")

rowid_as_column(x, var = "rowid")
}
\arguments{
\item{x}{A data frame.}

\item{var}{Name of column to use for row names/ids. For \code{column_as_rownames()},
this argument can be the variable name or the column number. For
\code{rownames_as_column()} and \code{rowid_as_column()}, the column name must not
already exist in the data.}
}
\value{
A data frame.
}
\description{
Tools for working with row names or row ids
}
\details{
These are similar to \code{tibble}'s functions \code{column_to_rownames()},
\code{rownames_to_column()} and \code{rowid_to_column()}. Note that the behavior of
\code{rowid_as_column()} is different for grouped dataframe: instead of making
the rowid unique across the full dataframe, it creates rowid per group.
Therefore, there can be several rows with the same rowid if they belong to
different groups.

If you are familiar with \code{dplyr}, this is similar to doing the following:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{data |>
  group_by(grp) |>
  mutate(id = row_number()) |>
  ungroup()
}\if{html}{\out{</div>}}
}
\examples{
# Convert between row names and column --------------------------------
test <- rownames_as_column(mtcars, var = "car")
test
head(column_as_rownames(test, var = "car"))

test_data <- head(iris)

rowid_as_column(test_data)
rowid_as_column(test_data, var = "my_id")
}


================================================
FILE: man/skewness.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/skewness_kurtosis.R
\name{skewness}
\alias{skewness}
\alias{skewness.numeric}
\alias{kurtosis}
\alias{kurtosis.numeric}
\alias{print.parameters_kurtosis}
\alias{print.parameters_skewness}
\alias{summary.parameters_skewness}
\alias{summary.parameters_kurtosis}
\title{Compute Skewness and (Excess) Kurtosis}
\usage{
skewness(x, ...)

\method{skewness}{numeric}(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  verbose = TRUE,
  ...
)

kurtosis(x, ...)

\method{kurtosis}{numeric}(
  x,
  remove_na = TRUE,
  type = "2",
  iterations = NULL,
  verbose = TRUE,
  ...
)

\method{print}{parameters_kurtosis}(x, digits = 3, test = FALSE, ...)

\method{print}{parameters_skewness}(x, digits = 3, test = FALSE, ...)

\method{summary}{parameters_skewness}(object, test = FALSE, ...)

\method{summary}{parameters_kurtosis}(object, test = FALSE, ...)
}
\arguments{
\item{x}{A numeric vector or data.frame.}

\item{...}{Arguments passed to or from other methods.}

\item{remove_na}{Logical. Should \code{NA} values be removed before computing (\code{TRUE})
or not (\code{FALSE}, default)?}

\item{type}{Type of algorithm for computing skewness. May be one of \code{1}
(or \code{"1"}, \code{"I"} or \code{"classic"}), \code{2} (or \code{"2"},
\code{"II"} or \code{"SPSS"} or \code{"SAS"}) or \code{3} (or  \code{"3"},
\code{"III"} or \code{"Minitab"}). See 'Details'.}

\item{iterations}{The number of bootstrap replicates for computing standard
errors. If \code{NULL} (default), parametric standard errors are computed.}

\item{verbose}{Toggle warnings and messages.}

\item{digits}{Number of decimal places.}

\item{test}{Logical, if \code{TRUE}, tests if skewness or kurtosis is
significantly different from zero.}

\item{object}{An object returned by \code{skewness()} or \code{kurtosis()}.}
}
\value{
Values of skewness or kurtosis.
}
\description{
Compute Skewness and (Excess) Kurtosis
}
\details{
\subsection{Skewness}{
Symmetric distributions have a \code{skewness} around zero, while
a negative skewness values indicates a "left-skewed" distribution, and a
positive skewness values indicates a "right-skewed" distribution. Examples
for the relationship of skewness and distributions are:
\itemize{
\item Normal distribution (and other symmetric distribution) has a skewness
of 0
\item Half-normal distribution has a skewness just below 1
\item Exponential distribution has a skewness of 2
\item Lognormal distribution can have a skewness of any positive value,
depending on its parameters
}

(\cite{https://en.wikipedia.org/wiki/Skewness})
}

\subsection{Types of Skewness}{
\code{skewness()} supports three different methods for estimating skewness,
as discussed in \cite{Joanes and Gill (1988)}:
\itemize{
\item Type "1" is the "classical" method, which is \code{g1 = (sum((x - mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5}
\item Type "2" first calculates the type-1 skewness, then adjusts the result:
\code{G1 = g1 * sqrt(n * (n - 1)) / (n - 2)}. This is what SAS and SPSS
usually return.
\item Type "3" first calculates the type-1 skewness, then adjusts the result:
\code{b1 = g1 * ((1 - 1 / n))^1.5}. This is what Minitab usually returns.
}
}

\subsection{Kurtosis}{
The \code{kurtosis} is a measure of "tailedness" of a distribution. A
distribution with a kurtosis values of about zero is called "mesokurtic". A
kurtosis value larger than zero indicates a "leptokurtic" distribution with
\emph{fatter} tails. A kurtosis value below zero indicates a "platykurtic"
distribution with \emph{thinner} tails
(\cite{https://en.wikipedia.org/wiki/Kurtosis}).
}

\subsection{Types of Kurtosis}{
\code{kurtosis()} supports three different methods for estimating kurtosis,
as discussed in \cite{Joanes and Gill (1988)}:
\itemize{
\item Type "1" is the "classical" method, which is \code{g2 = n * sum((x - mean(x))^4) / (sum((x - mean(x))^2)^2) - 3}.
\item Type "2" first calculates the type-1 kurtosis, then adjusts the result:
\code{G2 = ((n + 1) * g2 + 6) * (n - 1)/((n - 2) * (n - 3))}. This is what
SAS and SPSS usually return
\item Type "3" first calculates the type-1 kurtosis, then adjusts the result:
\code{b2 = (g2 + 3) * (1 - 1 / n)^2 - 3}. This is what Minitab usually
returns.
}

}

\subsection{Standard Errors}{
It is recommended to compute empirical (bootstrapped) standard errors (via
the \code{iterations} argument) than relying on analytic standard errors
(\cite{Wright & Herrington, 2011}).
}
}
\examples{
skewness(rnorm(1000))
kurtosis(rnorm(1000))
}
\references{
\itemize{
\item D. N. Joanes and C. A. Gill (1998). Comparing measures of sample
skewness and kurtosis. The Statistician, 47, 183–189.
\item Wright, D. B., & Herrington, J. A. (2011). Problematic standard
errors and confidence intervals for skewness and kurtosis. Behavior
research methods, 43(1), 8-17.
}
}


================================================
FILE: man/slide.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/slide.R
\name{slide}
\alias{slide}
\alias{slide.numeric}
\alias{slide.data.frame}
\title{Shift numeric value range}
\usage{
slide(x, ...)

\method{slide}{numeric}(x, lowest = 0, ...)

\method{slide}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  lowest = 0,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame or numeric vector.}

\item{...}{not used.}

\item{lowest}{Numeric, indicating the lowest (minimum) value when converting
factors or character vectors to numeric values.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
\code{x}, where the range of numeric variables starts at a new value.
}
\description{
This functions shifts the value range of a numeric variable, so that the
new range starts at a given value.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
# numeric
head(mtcars$gear)
head(slide(mtcars$gear))
head(slide(mtcars$gear, lowest = 10))

# data frame
sapply(slide(mtcars, lowest = 1), min)
sapply(mtcars, min)
}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: man/smoothness.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/smoothness.R
\name{smoothness}
\alias{smoothness}
\title{Quantify the smoothness of a vector}
\usage{
smoothness(x, method = "cor", lag = 1, iterations = NULL, ...)
}
\arguments{
\item{x}{Numeric vector (similar to a time series).}

\item{method}{Can be \code{"diff"} (the standard deviation of the standardized
differences) or \code{"cor"} (default, lag-one autocorrelation).}

\item{lag}{An integer indicating which lag to use. If less than \code{1}, will be
interpreted as expressed in percentage of the length of the vector.}

\item{iterations}{The number of bootstrap replicates for computing standard
errors. If \code{NULL} (default), parametric standard errors are computed.}

\item{...}{Arguments passed to or from other methods.}
}
\value{
Value of smoothness.
}
\description{
Quantify the smoothness of a vector
}
\examples{
x <- (-10:10)^3 + rnorm(21, 0, 100)
plot(x)
smoothness(x, method = "cor")
smoothness(x, method = "diff")
}
\references{
https://stats.stackexchange.com/questions/24607/how-to-measure-smoothness-of-a-time-series-in-r
}


================================================
FILE: man/standardize.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/standardize.R, R/unstandardize.R
\name{standardize}
\alias{standardize}
\alias{standardise}
\alias{standardize.numeric}
\alias{standardize.factor}
\alias{standardize.data.frame}
\alias{unstandardize}
\alias{unstandardise}
\alias{unstandardize.numeric}
\alias{unstandardize.data.frame}
\title{Standardization (Z-scoring)}
\usage{
standardize(x, ...)

standardise(x, ...)

\method{standardize}{numeric}(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  scale = NULL,
  verbose = TRUE,
  ...
)

\method{standardize}{factor}(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = NULL,
  force = FALSE,
  verbose = TRUE,
  ...
)

\method{standardize}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  two_sd = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  scale = NULL,
  remove_na = c("none", "selected", "all"),
  force = FALSE,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)

unstandardize(x, ...)

unstandardise(x, ...)

\method{unstandardize}{numeric}(
  x,
  center = NULL,
  scale = NULL,
  reference = NULL,
  robust = FALSE,
  two_sd = FALSE,
  ...
)

\method{unstandardize}{data.frame}(
  x,
  center = NULL,
  scale = NULL,
  reference = NULL,
  robust = FALSE,
  two_sd = FALSE,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A (grouped) data frame, a vector or a statistical model (for
\code{unstandardize()} cannot be a model).}

\item{...}{Arguments passed to or from other methods.}

\item{robust}{Logical, if \code{TRUE}, centering is done by subtracting the
median from the variables and dividing it by the median absolute deviation
(MAD). If \code{FALSE}, variables are standardized by subtracting the
mean and dividing it by the standard deviation (SD).}

\item{two_sd}{If \code{TRUE}, the variables are scaled by two times the deviation
(SD or MAD depending on \code{robust}). This method can be useful to obtain
model coefficients of continuous parameters comparable to coefficients
related to binary predictors, when applied to \strong{the predictors} (not the
outcome) (Gelman, 2008).}

\item{weights}{Can be \code{NULL} (for no weighting), or:
\itemize{
\item For model: if \code{TRUE} (default), a weighted-standardization is carried out.
\item For \code{data.frame}s: a numeric vector of weights, or a character of the
name of a column in the \code{data.frame} that contains the weights.
\item For numeric vectors: a numeric vector of weights.
}}

\item{reference}{A data frame or variable from which the centrality and
deviation will be computed instead of from the input variable. Useful for
standardizing a subset or new data according to another data frame.}

\item{center, scale}{\itemize{
\item For \code{standardize()}: \cr
Numeric values, which can be used as alternative to \code{reference} to define
a reference centrality and deviation. If \code{scale} and \code{center} are of
length 1, they will be recycled to match the length of selected variables
for standardization. Else, \code{center} and \code{scale} must be of same length as
the number of selected variables. Values in \code{center} and \code{scale} will be
matched to selected variables in the provided order, unless a named vector
is given. In this case, names are matched against the names of the selected
variables.
\item For \code{unstandardize()}: \cr
\code{center} and \code{scale} correspond to the center (the mean / median) and the scale (SD / MAD) of
the original non-standardized data (for data frames, should be named, or
have column order correspond to the numeric column). However, one can also
directly provide the original data through \code{reference}, from which the
center and the scale will be computed (according to \code{robust} and \code{two_sd}).
Alternatively, if the input contains the attributes \code{center} and \code{scale}
(as does the output of \code{standardize()}), it will take it from there if the
rest of the arguments are absent.
}}

\item{verbose}{Toggle warnings and messages on or off.}

\item{force}{Logical, if \code{TRUE}, forces recoding of factors and character
vectors as well.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{remove_na}{How should missing values (\code{NA}) be treated: if \code{"none"}
(default): each column's standardization is done separately, ignoring
\code{NA}s. Else, rows with \code{NA} in the columns selected with \code{select} /
\code{exclude} (\code{"selected"}) or in all columns (\code{"all"}) are dropped before
standardization, and the resulting data frame does not include these cases.}

\item{append}{Logical or string. If \code{TRUE}, standardized variables get new
column names (with the suffix \code{"_z"}) and are appended (column bind) to \code{x},
thus returning both the original and the standardized variables. If \code{FALSE},
original variables in \code{x} will be overwritten by their standardized versions.
If a character value, standardized variables are appended with new column
names (using the defined suffix) to the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
The standardized object (either a standardize data frame or a
statistical model fitted on standardized data).
}
\description{
Performs a standardization of data (z-scoring), i.e., centering and scaling,
so that the data is expressed in terms of standard deviation (i.e., mean = 0,
SD = 1) or Median Absolute Deviance (median = 0, MAD = 1). When applied to a
statistical model, this function extracts the dataset, standardizes it, and
refits the model with this standardized version of the dataset. The
\code{\link[=normalize]{normalize()}} function can also be used to scale all numeric variables within
the 0 - 1 range.
\cr\cr
For model standardization, see \code{\link[=standardize.default]{standardize.default()}}.
}
\note{
When \code{x} is a vector or a data frame with \verb{remove_na = "none")},
missing values are preserved, so the return value has the same length /
number of rows as the original input.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
d <- iris[1:4, ]

# vectors
standardise(d$Petal.Length)

# Data frames
# overwrite
standardise(d, select = c("Sepal.Length", "Sepal.Width"))

# append
standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE)

# append, suffix
standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = "_std")

# standardizing with reference center and scale
d <- data.frame(
  a = c(-2, -1, 0, 1, 2),
  b = c(3, 4, 5, 6, 7)
)

# default standardization, based on mean and sd of each variable
standardize(d) # means are 0 and 5, sd ~ 1.581139

# standardization, based on mean and sd set to the same values
standardize(d, center = c(0, 5), scale = c(1.581, 1.581))

# standardization, mean and sd for each variable newly defined
standardize(d, center = c(3, 4), scale = c(2, 4))

# standardization, taking same mean and sd for each variable
standardize(d, center = 1, scale = 3)
}
\seealso{
See \code{\link[=center]{center()}} for grand-mean centering of variables, and
\code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas.

Other transform utilities: 
\code{\link{normalize}()},
\code{\link{ranktransform}()},
\code{\link{rescale}()},
\code{\link{reverse}()}

Other standardize: 
\code{\link{standardize.default}()}
}
\concept{standardize}
\concept{transform utilities}


================================================
FILE: man/standardize.default.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/standardize.models.R
\name{standardize.default}
\alias{standardize.default}
\alias{standardize_models}
\title{Re-fit a model with standardized data}
\usage{
\method{standardize}{default}(
  x,
  robust = FALSE,
  two_sd = FALSE,
  weights = TRUE,
  verbose = TRUE,
  include_response = TRUE,
  ...
)
}
\arguments{
\item{x}{A statistical model.}

\item{robust}{Logical, if \code{TRUE}, centering is done by subtracting the
median from the variables and dividing it by the median absolute deviation
(MAD). If \code{FALSE}, variables are standardized by subtracting the
mean and dividing it by the standard deviation (SD).}

\item{two_sd}{If \code{TRUE}, the variables are scaled by two times the deviation
(SD or MAD depending on \code{robust}). This method can be useful to obtain
model coefficients of continuous parameters comparable to coefficients
related to binary predictors, when applied to \strong{the predictors} (not the
outcome) (Gelman, 2008).}

\item{weights}{If \code{TRUE} (default), a weighted-standardization is carried out.}

\item{verbose}{Toggle warnings and messages on or off.}

\item{include_response}{If \code{TRUE} (default), the response value will also be
standardized. If \code{FALSE}, only the predictors will be standardized.
\itemize{
\item Note that for GLMs and models with non-linear link functions, the
response value will not be standardized, to make re-fitting the model work.
\item If the model contains an \code{\link[stats:offset]{stats::offset()}}, the offset variable(s) will
be standardized only if the response is standardized. If \code{two_sd = TRUE},
offsets are standardized by one-sd (similar to the response).
\item (For \code{mediate} models, the \code{include_response} refers to the outcome in
the y model; m model's response will always be standardized when possible).
}}

\item{...}{Arguments passed to or from other methods.}
}
\value{
A statistical model fitted on standardized data
}
\description{
Performs a standardization of data (z-scoring) using
\code{\link[=standardize]{standardize()}} and then re-fits the model to the standardized data.
\cr\cr
Standardization is done by completely refitting the model on the standardized
data. Hence, this approach is equal to standardizing the variables \emph{before}
fitting the model and will return a new model object. This method is
particularly recommended for complex models that include interactions or
transformations (e.g., polynomial or spline terms). The \code{robust} (default to
\code{FALSE}) argument enables a robust standardization of data, based on the
\code{median} and the \code{MAD} instead of the \code{mean} and the \code{SD}.
}
\section{Generalized Linear Models}{
Standardization for generalized linear models (GLM, GLMM, etc) is done only
with respect to the predictors (while the outcome remains as-is,
unstandardized) - maintaining the interpretability of the coefficients (e.g.,
in a binomial model: the exponent of the standardized parameter is the OR of
a change of 1 SD in the predictor, etc.)
}

\section{Dealing with Factors}{
\code{standardize(model)} or \code{standardize_parameters(model, method = "refit")} do
\emph{not} standardize categorical predictors (i.e. factors) / their
dummy-variables, which may be a different behaviour compared to other R
packages (such as \strong{lm.beta}) or other software packages (like SPSS). To
mimic such behaviours, either use \code{standardize_parameters(model, method = "basic")} to obtain post-hoc standardized parameters, or standardize the data
with \code{standardize(data, force = TRUE)} \emph{before} fitting the
model.
}

\section{Transformed Variables}{
When the model's formula contains transformations (e.g. \code{y ~ exp(X)}) the
transformation effectively takes place after standardization (e.g.,
\code{exp(scale(X))}). Since some transformations are undefined for none positive
values, such as \code{log()} and \code{sqrt()}, the relevel variables are shifted (post
standardization) by \code{Z - min(Z) + 1} or \code{Z - min(Z)} (respectively).
}

\examples{
model <- lm(Infant.Mortality ~ Education * Fertility, data = swiss)
coef(standardize(model))

}
\seealso{
Other standardize: 
\code{\link{standardize}()}
}
\concept{standardize}


================================================
FILE: man/text_format.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/text_format.R
\name{text_format}
\alias{text_format}
\alias{text_fullstop}
\alias{text_lastchar}
\alias{text_concatenate}
\alias{text_paste}
\alias{text_remove}
\alias{text_wrap}
\title{Convenient text formatting functionalities}
\usage{
text_format(
  text,
  sep = ", ",
  last = " and ",
  width = NULL,
  enclose = NULL,
  ...
)

text_fullstop(text)

text_lastchar(text, n = 1)

text_concatenate(text, sep = ", ", last = " and ", enclose = NULL)

text_paste(text, text2 = NULL, sep = ", ", enclose = NULL, ...)

text_remove(text, pattern = "", ...)

text_wrap(text, width = NULL, ...)
}
\arguments{
\item{text, text2}{A character string.}

\item{sep}{Separator.}

\item{last}{Last separator.}

\item{width}{Positive integer giving the target column width for wrapping
lines in the output. Can be "auto", in which case it will select 90\\% of the
default width.}

\item{enclose}{Character that will be used to wrap elements of \code{text}, so
these can be, e.g., enclosed with quotes or backticks. If \code{NULL} (default),
text elements will not be enclosed.}

\item{...}{Other arguments to be passed to or from other functions.}

\item{n}{The number of characters to find.}

\item{pattern}{Regex pattern to remove from \code{text}.}
}
\value{
A character string.
}
\description{
Convenience functions to manipulate and format text.
}
\examples{
# Add full stop if missing
text_fullstop(c("something", "something else."))

# Find last characters
text_lastchar(c("ABC", "DEF"), n = 2)

# Smart concatenation
text_concatenate(c("First", "Second", "Last"))
text_concatenate(c("First", "Second", "Last"), last = " or ", enclose = "`")

# Remove parts of string
text_remove(c("one!", "two", "three!"), "!")

# Wrap text
long_text <- paste(rep("abc ", 100), collapse = "")
cat(text_wrap(long_text, width = 50))

# Paste with optional separator
text_paste(c("A", "", "B"), c("42", "42", "42"))
}


================================================
FILE: man/to_factor.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/to_factor.R
\name{to_factor}
\alias{to_factor}
\alias{to_factor.numeric}
\alias{to_factor.data.frame}
\title{Convert data to factors}
\usage{
to_factor(x, ...)

\method{to_factor}{numeric}(x, labels_to_levels = TRUE, verbose = TRUE, ...)

\method{to_factor}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  append = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame or vector.}

\item{...}{Arguments passed to or from other methods.}

\item{labels_to_levels}{Logical, if \code{TRUE}, value labels are used as factor
levels after \code{x} was converted to factor. Else, factor levels are based on
the values of \code{x} (i.e. as if using \code{as.factor()}).}

\item{verbose}{Toggle warnings.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}
}
\value{
A factor, or a data frame of factors.
}
\description{
Convert data to factors
}
\details{
Convert variables or data into factors. If the data is labelled, value labels
will be used as factor levels. The counterpart to convert variables into
numeric is \code{to_numeric()}.
}
\note{
Factors are ignored and returned as is. If you want to use value labels
as levels for factors, use \code{\link[=labels_to_levels]{labels_to_levels()}} instead.
}
\section{Selection of variables - the \code{select} argument}{

For most functions that have a \code{select} argument (including this function),
the complete input data frame is returned, even when \code{select} only selects
a range of variables. That is, the function is only applied to those variables
that have a match in \code{select}, while all other variables remain unchanged.
In other words: for this function, \code{select} will not omit any non-included
variables, so that the returned data frame will include all variables
from the input data frame.
}

\examples{
str(to_factor(iris))

# use labels as levels
data(efc)
str(efc$c172code)
head(to_factor(efc$c172code))
}


================================================
FILE: man/to_numeric.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/to_numeric.R
\name{to_numeric}
\alias{to_numeric}
\alias{to_numeric.data.frame}
\title{Convert data to numeric}
\usage{
to_numeric(x, ...)

\method{to_numeric}{data.frame}(
  x,
  select = NULL,
  exclude = NULL,
  dummy_factors = FALSE,
  preserve_levels = FALSE,
  lowest = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame, factor or vector.}

\item{...}{Arguments passed to or from other methods.}

\item{select}{Variables that will be included when performing the required
tasks. Can be either
\itemize{
\item a variable specified as a literal variable name (e.g., \code{column_name}),
\item a string with the variable name (e.g., \code{"column_name"}), a character
vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a
character vector of variable names including ranges specified via \code{:}
(e.g., \code{c("col1:col3", "col5")}),
\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can
be a named character vector. In this case, the names are used to rename
the columns in the output data frame. See 'Details' in the related
functions to see where this option applies.
\item a formula with variable names (e.g., \code{~column_1 + column_2}),
\item a vector of positive integers, giving the positions counting from the left
(e.g. \code{1} or \code{c(1, 3, 5)}),
\item a vector of negative integers, giving the positions counting from the
right (e.g., \code{-1} or \code{-1:-3}),
\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()},
\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular
expression patterns.
\item a function testing for logical conditions, e.g. \code{is.numeric()} (or
\code{is.numeric}), or any user-defined function that selects the variables
for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
\item ranges specified via literal variable names, select-helpers (except
\code{regex()}) and (user-defined) functions can be negated, i.e. return
non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()},
\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
used alternatively. For instance, \code{select=-ends_with("Length")} (with
\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
negation should not work as expected, use the \code{exclude} argument instead.
}

If \code{NULL}, selects all columns. Patterns that found no matches are silently
ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))}
will just return \code{"Species"}.}

\item{exclude}{See \code{select}, however, column names matched by the pattern
from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
excludes no columns.}

\item{dummy_factors}{Transform factors to dummy factors (all factor levels as
different columns filled with a binary 0-1 value).}

\item{preserve_levels}{Logical, only applies if \code{x} is a factor. If \code{TRUE},
and \code{x} has numeric factor levels, these will be converted into the related
numeric values. If this is not possible, the converted numeric values will
start from 1 to number of levels.}

\item{lowest}{Numeric, indicating the lowest (minimum) value when converting
factors or character vectors to numeric values.}

\item{append}{Logical or string. If \code{TRUE}, recoded or converted variables
get new column names and are appended (column bind) to \code{x}, thus returning
both the original and the recoded variables. The new columns get a suffix,
based on the calling function: \code{"_r"} for recode functions, \code{"_n"} for
\code{to_numeric()}, \code{"_f"} for \code{to_factor()}, or \code{"_s"} for
\code{slide()}. If \code{append=FALSE}, original variables in \code{x} will be
overwritten by their recoded versions. If a character value, recoded
variables are appended with new column names (using the defined suffix) to
the original data frame.}

\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
a regular expression is used in \code{select}, ignores lower/upper case in the
search pattern when matching against variable names.}

\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
character string (or a variable containing a character string) and is not
allowed to be one of the supported select-helpers or a character vector
of length > 1. \code{regex = TRUE} is comparable to using one of the two
select-helpers, \code{select = contains()} or \code{select = regex()}, however,
since the select-helpers may not work when called from inside other
functions (see 'Details'), this argument may be used as workaround.}

\item{verbose}{Toggle warnings.}
}
\value{
A data frame of numeric variables.
}
\description{
Convert data to numeric by converting characters to factors and factors to
either numeric levels or dummy variables. The "counterpart" to convert
variables into factors is \code{to_factor()}.
}
\note{
When factors should be converted into multiple "binary" dummies, i.e.
each factor level is converted into a separate column filled with a binary
0-1 value, set \code{dummy_factors = TRUE}. If you want to preserve the original
factor levels (in case these represent numeric values), use
\code{preserve_levels = TRUE}.
}
\section{Selection of variables - \code{select} argument}{

For most functions that have a \code{select} argument the complete input data
frame is returned, even when \code{select} only selects a range of variables.
However, for \code{to_numeric()}, factors might be converted into dummies,
thus, the number of variables of the returned data frame no longer match
the input data frame. Hence, when \code{select} is used, \emph{only} those variables
(or their dummies) specified in \code{select} will be returned. Use \code{append=TRUE}
to also include the original variables in the returned data frame.
}

\examples{
to_numeric(head(ToothGrowth))
to_numeric(head(ToothGrowth), dummy_factors = TRUE)

# factors
x <- as.factor(mtcars$gear)
to_numeric(x)
to_numeric(x, preserve_levels = TRUE)
# same as:
coerce_to_numeric(x)

}


================================================
FILE: man/visualisation_recipe.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/visualisation_recipe.R
\name{visualisation_recipe}
\alias{visualisation_recipe}
\title{Prepare objects for visualisation}
\usage{
visualisation_recipe(x, ...)
}
\arguments{
\item{x}{An \code{easystats} object.}

\item{...}{Other arguments passed to other functions.}
}
\description{
This function prepares objects for visualisation by returning a list of
layers with data and geoms that can be easily plotted using for instance
\code{ggplot2}.

If the \code{see} package is installed, the call to \code{visualization_recipe()} can be
replaced by \code{plot()}, which will internally call the former and then plot it
using \code{ggplot}. The resulting plot can be customized ad-hoc (by adding
ggplot's geoms, theme or specifications), or via some of the arguments
of \code{visualisation_recipe()} that control the aesthetic parameters.

See the specific documentation page for your object's class:
\itemize{
\item {modelbased}: \url{https://easystats.github.io/modelbased/reference/visualisation_recipe.estimate_predicted.html}
\item {correlation}: \url{https://easystats.github.io/correlation/reference/visualisation_recipe.easycormatrix.html}
}
}


================================================
FILE: man/weighted_mean.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/weighted_mean_median_sd_mad.R
\name{weighted_mean}
\alias{weighted_mean}
\alias{weighted_median}
\alias{weighted_sd}
\alias{weighted_mad}
\title{Weighted Mean, Median, SD, and MAD}
\usage{
weighted_mean(x, weights = NULL, remove_na = TRUE, verbose = TRUE, ...)

weighted_median(x, weights = NULL, remove_na = TRUE, verbose = TRUE, ...)

weighted_sd(x, weights = NULL, remove_na = TRUE, verbose = TRUE, ...)

weighted_mad(
  x,
  weights = NULL,
  constant = 1.4826,
  remove_na = TRUE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{an object containing the values whose weighted mean is to be
    computed.}

\item{weights}{A numerical vector of weights the same length as \code{x} giving
the weights to use for elements of \code{x}. If \code{weights = NULL}, \code{x} is passed
to the non-weighted function.}

\item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) and infinite
values from \code{x} and \code{weights}.}

\item{verbose}{Show warning when \code{weights} are negative?}

\item{...}{arguments to be passed to or from methods.}

\item{constant}{scale factor.}
}
\description{
Weighted Mean, Median, SD, and MAD
}
\examples{
## GPA from Siegel 1994
x <- c(3.7, 3.3, 3.5, 2.8)
wt <- c(5, 5, 4, 1) / 15

weighted_mean(x, wt)
weighted_median(x, wt)

weighted_sd(x, wt)
weighted_mad(x, wt)

}


================================================
FILE: man/winsorize.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/winsorize.R
\name{winsorize}
\alias{winsorize}
\alias{winsorize.numeric}
\title{Winsorize data}
\usage{
winsorize(data, ...)

\method{winsorize}{numeric}(
  data,
  threshold = 0.2,
  method = "percentile",
  robust = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{data}{data frame or vector.}

\item{...}{Currently not used.}

\item{threshold}{The amount of winsorization, depends on the value of \code{method}:
\itemize{
\item For \code{method = "percentile"}: the amount to winsorize from \emph{each} tail.
The value of \code{threshold} must be between 0 and 0.5 and of length 1.
\item For \code{method = "zscore"}: the number of \emph{SD}/\emph{MAD}-deviations from the
\emph{mean}/\emph{median} (see \code{robust}). The value of \code{threshold} must be greater
than 0 and of length 1.
\item For \code{method = "raw"}: a vector of length 2 with the lower and upper bound
for winsorization.
}}

\item{method}{One of "percentile" (default), "zscore", or "raw".}

\item{robust}{Logical, if TRUE, winsorizing through the "zscore" method is
done via the median and the median absolute deviation (MAD); if FALSE, via
the mean and the standard deviation.}

\item{verbose}{Not used anymore since \code{datawizard} 0.6.6.}
}
\value{
A data frame with winsorized columns or a winsorized vector.
}
\description{
Winsorize data
}
\details{
Winsorizing or winsorization is the transformation of statistics by limiting
extreme values in the statistical data to reduce the effect of possibly
spurious outliers. The distribution of many statistics can be heavily
influenced by outliers. A typical strategy is to set all outliers (values
beyond a certain threshold) to a specified percentile of the data; for
example, a \verb{90\%} winsorization would see all data below the 5th percentile set
to the 5th percentile, and data above the 95th percentile set to the 95th
percentile. Winsorized estimators are usually more robust to outliers than
their more standard forms.
}
\examples{
hist(iris$Sepal.Length, main = "Original data")

hist(winsorize(iris$Sepal.Length, threshold = 0.2),
  xlim = c(4, 8), main = "Percentile Winsorization"
)

hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
  xlim = c(4, 8), main = "Mean (+/- SD) Winsorization"
)

hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
  xlim = c(4, 8), main = "Median (+/- MAD) Winsorization"
)

hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
  xlim = c(4, 8), main = "Raw Thresholds"
)

# Also works on a data frame:
winsorize(iris, threshold = 0.2)

}
\seealso{
\itemize{
\item Add a prefix or suffix to column names: \code{\link[=data_addprefix]{data_addprefix()}}, \code{\link[=data_addsuffix]{data_addsuffix()}}
\item Functions to reorder or remove columns: \code{\link[=data_reorder]{data_reorder()}}, \code{\link[=data_relocate]{data_relocate()}},
\code{\link[=data_remove]{data_remove()}}
\item Functions to reshape, pivot or rotate data frames: \code{\link[=data_to_long]{data_to_long()}},
\code{\link[=data_to_wide]{data_to_wide()}}, \code{\link[=data_rotate]{data_rotate()}}
\item Functions to recode data: \code{\link[=rescale]{rescale()}}, \code{\link[=reverse]{reverse()}}, \code{\link[=categorize]{categorize()}},
\code{\link[=recode_values]{recode_values()}}, \code{\link[=slide]{slide()}}
\item Functions to standardize, normalize, rank-transform: \code{\link[=center]{center()}}, \code{\link[=standardize]{standardize()}},
\code{\link[=normalize]{normalize()}}, \code{\link[=ranktransform]{ranktransform()}}, \code{\link[=winsorize]{winsorize()}}
\item Split and merge data frames: \code{\link[=data_partition]{data_partition()}}, \code{\link[=data_merge]{data_merge()}}
\item Functions to find or select columns: \code{\link[=data_select]{data_select()}}, \code{\link[=extract_column_names]{extract_column_names()}}
\item Functions to filter rows: \code{\link[=data_match]{data_match()}}, \code{\link[=data_filter]{data_filter()}}
}
}


================================================
FILE: paper/JOSS_files/apa.csl
================================================
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="never" page-range-format="expanded">
  <info>
    <title>American Psychological Association 7th edition</title>
    <title-short>APA</title-short>
    <id>http://www.zotero.org/styles/apa</id>
    <link href="http://www.zotero.org/styles/apa" rel="self"/>
    <link href="http://www.zotero.org/styles/apa-6th-edition" rel="template"/>
    <link href="https://apastyle.apa.org/style-grammar-guidelines/references/examples" rel="documentation"/>
    <author>
      <name>Brenton M. Wiernik</name>
      <email>zotero@wiernik.org</email>
    </author>
    <category citation-format="author-date"/>
    <category field="psychology"/>
    <category field="generic-base"/>
    <updated>2020-11-04T04:07:19+00:00</updated>
    <rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
  </info>
  <locale xml:lang="en">
    <terms>
      <term name="editortranslator" form="short">
        <single>ed. &amp; trans.</single>
        <multiple>eds. &amp; trans.</multiple>
      </term>
      <term name="translator" form="short">trans.</term>
      <term name="interviewer" form="short">
        <single>interviewer</single>
        <multiple>interviewers</multiple>
      </term>
      <term name="collection-editor" form="short">
        <single>ed.</single>
        <multiple>eds.</multiple>
      </term>
      <term name="circa" form="short">ca.</term>
      <term name="bc"> B.C.E.</term>
      <term name="ad"> C.E.</term>
      <term name="letter">personal communication</term>
      <term name="letter" form="short">letter</term>
      <term name="issue" form="long">
        <single>issue</single>
        <multiple>issues</multiple>
      </term>
    </terms>
  </locale>
  <locale xml:lang="af">
    <terms>
      <term name="letter">persoonlike kommunikasie</term>
      <term name="letter" form="short">brief</term>
    </terms>
  </locale>
  <locale xml:lang="ar">
    <terms>
      <term name="letter">اتصال شخصي</term>
      <term name="letter" form="short">خطاب</term>
    </terms>
  </locale>
  <locale xml:lang="bg">
    <terms>
      <term name="letter">лична комуникация</term>
      <term name="letter" form="short">писмо</term>
    </terms>
  </locale>
  <locale xml:lang="ca">
    <terms>
      <term name="letter">comunicació personal</term>
      <term name="letter" form="short">carta</term>
    </terms>
  </locale>
  <locale xml:lang="cs">
    <terms>
      <term name="letter">osobní komunikace</term>
      <term name="letter" form="short">dopis</term>
    </terms>
  </locale>
  <locale xml:lang="cy">
    <terms>
      <term name="letter">cyfathrebu personol</term>
      <term name="letter" form="short">llythyr</term>
    </terms>
  </locale>
  <locale xml:lang="da">
    <terms>
      <term name="et-al">et al.</term>
      <term name="letter">personlig kommunikation</term>
      <term name="letter" form="short">brev</term>
    </terms>
  </locale>
  <locale xml:lang="de">
    <terms>
      <term name="et-al">et al.</term>
      <term name="letter">persönliche Kommunikation</term>
      <term name="letter" form="short">Brief</term>
    </terms>
  </locale>
  <locale xml:lang="el">
    <terms>
      <term name="letter">προσωπική επικοινωνία</term>
      <term name="letter" form="short">επιστολή</term>
    </terms>
  </locale>
  <locale xml:lang="es">
    <terms>
      <term name="from">de</term>
      <term name="letter">comunicación personal</term>
      <term name="letter" form="short">carta</term>
    </terms>
  </locale>
  <locale xml:lang="et">
    <terms>
      <term name="letter">isiklik suhtlus</term>
      <term name="letter" form="short">kiri</term>
    </terms>
  </locale>
  <locale xml:lang="eu">
    <terms>
      <term name="letter">komunikazio pertsonala</term>
      <term name="letter" form="short">gutuna</term>
    </terms>
  </locale>
  <locale xml:lang="fa">
    <terms>
      <term name="letter">ارتباط شخصی</term>
      <term name="letter" form="short">نامه</term>
    </terms>
  </locale>
  <locale xml:lang="fi">
    <terms>
      <term name="letter">henkilökohtainen viestintä</term>
      <term name="letter" form="short">kirje</term>
    </terms>
  </locale>
  <locale xml:lang="fr">
    <terms>
      <term name="letter">communication personnelle</term>
      <term name="letter" form="short">lettre</term>
      <term name="editor" form="short">
        <single>éd.</single>
        <multiple>éds.</multiple>
      </term>
    </terms>
  </locale>
  <locale xml:lang="he">
    <terms>
      <term name="letter">תקשורת אישית</term>
      <term name="letter" form="short">מכתב</term>
    </terms>
  </locale>
  <locale xml:lang="hr">
    <terms>
      <term name="letter">osobna komunikacija</term>
      <term name="letter" form="short">pismo</term>
    </terms>
  </locale>
  <locale xml:lang="hu">
    <terms>
      <term name="letter">személyes kommunikáció</term>
      <term name="letter" form="short">levél</term>
    </terms>
  </locale>
  <locale xml:lang="id">
    <terms>
      <term name="letter">komunikasi pribadi</term>
      <term name="letter" form="short">surat</term>
    </terms>
  </locale>
  <locale xml:lang="is">
    <terms>
      <term name="letter">persónuleg samskipti</term>
      <term name="letter" form="short">bréf</term>
    </terms>
  </locale>
  <locale xml:lang="it">
    <terms>
      <term name="letter">comunicazione personale</term>
      <term name="letter" form="short">lettera</term>
    </terms>
  </locale>
  <locale xml:lang="ja">
    <terms>
      <term name="letter">個人的なやり取り</term>
      <term name="letter" form="short">手紙</term>
    </terms>
  </locale>
  <locale xml:lang="ko">
    <terms>
      <term name="letter">개인 서신</term>
      <term name="letter" form="short">편지</term>
    </terms>
  </locale>
  <locale xml:lang="la">
    <terms>
      <term name="letter"/>
      <term name="letter" form="short">epistula</term>
    </terms>
  </locale>
  <locale xml:lang="lt">
    <terms>
      <term name="letter">communicationis personalis</term>
      <term name="letter" form="short"/>
    </terms>
  </locale>
  <locale xml:lang="lv">
    <terms>
      <term name="letter">personīga komunikācija</term>
      <term name="letter" form="short">vēstule</term>
    </terms>
  </locale>
  <locale xml:lang="mn">
    <terms>
      <term name="letter">хувийн харилцаа холбоо</term>
      <term name="letter" form="short">захиа</term>
    </terms>
  </locale>
  <locale xml:lang="nb">
    <terms>
      <term name="et-al">et al.</term>
      <term name="letter">personlig kommunikasjon</term>
      <term name="letter" form="short">brev</term>
    </terms>
  </locale>
  <locale xml:lang="nl">
    <terms>
      <term name="et-al">et al.</term>
      <term name="letter">persoonlijke communicatie</term>
      <term name="letter" form="short">brief</term>
    </terms>
  </locale>
  <locale xml:lang="nn">
    <terms>
      <term name="et-al">et al.</term>
      <term name="letter">personlig kommunikasjon</term>
      <term name="letter" form="short">brev</term>
    </terms>
  </locale>
  <locale xml:lang="pl">
    <terms>
      <term name="letter">osobista komunikacja</term>
      <term name="letter" form="short">list</term>
    </terms>
  </locale>
  <locale xml:lang="pt">
    <terms>
      <term name="letter">comunicação pessoal</term>
      <term name="letter" form="short">carta</term>
    </terms>
  </locale>
  <locale xml:lang="ro">
    <terms>
      <term name="letter">comunicare personală</term>
      <term name="letter" form="short">scrisoare</term>
    </terms>
  </locale>
  <locale xml:lang="ru">
    <terms>
      <term name="letter">личная переписка</term>
      <term name="letter" form="short">письмо</term>
    </terms>
  </locale>
  <locale xml:lang="sk">
    <terms>
      <term name="letter">osobná komunikácia</term>
      <term name="letter" form="short">list</term>
    </terms>
  </locale>
  <locale xml:lang="sl">
    <terms>
      <term name="letter">osebna komunikacija</term>
      <term name="letter" form="short">pismo</term>
    </terms>
  </locale>
  <locale xml:lang="sr">
    <terms>
      <term name="letter">лична комуникација</term>
      <term name="letter" form="short">писмо</term>
    </terms>
  </locale>
  <locale xml:lang="sv">
    <terms>
      <term name="letter">personlig kommunikation</term>
      <term name="letter" form="short">brev</term>
    </terms>
  </locale>
  <locale xml:lang="th">
    <terms>
      <term name="letter">การสื่อสารส่วนบุคคล</term>
      <term name="letter" form="short">จดหมาย</term>
    </terms>
  </locale>
  <locale xml:lang="tr">
    <terms>
      <term name="letter">kişisel iletişim</term>
      <term name="letter" form="short">mektup</term>
    </terms>
  </locale>
  <locale xml:lang="uk">
    <terms>
      <term name="letter">особисте спілкування</term>
      <term name="letter" form="short">лист</term>
    </terms>
  </locale>
  <locale xml:lang="vi">
    <terms>
      <term name="letter">giao tiếp cá nhân</term>
      <term name="letter" form="short">thư</term>
    </terms>
  </locale>
  <locale xml:lang="zh-CN">
    <terms>
      <term name="letter">的私人交流</term>
      <term name="letter" form="short">信函</term>
    </terms>
  </locale>
  <locale xml:lang="zh-TW">
    <terms>
      <term name="letter">私人通訊</term>
      <term name="letter" form="short">信函</term>
    </terms>
  </locale>
  <!-- General categories of item types:
       Periodical: article-journal article-magazine article-newspaper post-weblog review review-book
       Periodical or Booklike: paper-conference
       Booklike: article book broadcast chapter dataset entry entry-dictionary entry-encyclopedia figure 
                 graphic interview manuscript map motion_picture musical_score pamphlet patent 
                 personal_communication report song speech thesis post webpage
       Legal: bill legal_case legislation treaty
  -->
  <!-- APA references contain four parts: author, date, title, source -->
  <macro name="author-bib">
    <names variable="composer" delimiter=", ">
      <name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with=". " delimiter=", " delimiter-precedes-last="always"/>
      <substitute>
        <names variable="author"/>
        <names variable="illustrator"/>
        <names variable="director">
          <name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with=". " delimiter=", " delimiter-precedes-last="always"/>
          <label form="long" prefix=" (" suffix=")" text-case="title"/>
        </names>
        <choose>
          <if variable="container-title">
            <choose>
              <if type="book entry entry-dictionary entry-encyclopedia" match="any">
                <choose>
                  <if variable="title">
                    <group delimiter=" ">
                      <text macro="title"/>
                      <text macro="parenthetical"/>
                    </group>
                  </if>
                  <else>
                    <text macro="title-and-descriptions"/>
                  </else>
                </choose>
              </if>
            </choose>
          </if>
        </choose>
        <!-- Test for editortranslator and put that first as that becomes available -->
        <names variable="editor" delimiter=", ">
          <name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with=". " delimiter=", " delimiter-precedes-last="always"/>
          <label form="short" prefix=" (" suffix=")" text-case="title"/>
        </names>
        <names variable="editorial-director">
          <name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with=". " delimiter=", " delimiter-precedes-last="always"/>
          <label form="short" prefix=" (" suffix=")" text-case="title"/>
        </names>
        <names variable="collection-editor">
          <name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with=". " delimiter=", " delimiter-precedes-last="always"/>
          <label form="short" prefix=" (" suffix=")" text-case="title"/>
        </names>
        <choose>
          <if variable="title">
            <group delimiter=" ">
              <text macro="title"/>
              <text macro="parenthetical"/>
            </group>
          </if>
          <else>
            <text macro="title-and-descriptions"/>
          </else>
        </choose>
      </substitute>
    </names>
  </macro>
  <macro name="author-intext">
    <choose>
      <if type="bill legal_case legislation treaty" match="any">
        <text macro="title-intext"/>
      </if>
      <else-if type="interview personal_communication" match="any">
        <choose>
          <!-- These variables indicate that the letter is retrievable by the reader. 
                If not, then use the APA in-text-only personal communication format -->
          <if variable="archive container-title DOI publisher URL" match="none">
            <group delimiter=", ">
              <names variable="author">
                <name and="symbol" delimiter=", " initialize-with=". "/>
                <substitute>
                  <text macro="title-intext"/>
                </substitute>
              </names>
              <!-- Replace with term="personal-communication" if that becomes available -->
              <text term="letter"/>
            </group>
          </if>
          <else>
            <names variable="author" delimiter=", ">
              <name form="short" and="symbol" delimiter=", " initialize-with=". "/>
              <substitute>
                <text macro="title-intext"/>
              </substitute>
            </names>
          </else>
        </choose>
      </else-if>
      <else>
        <names variable="composer" delimiter=", ">
          <name form="short" and="symbol" delimiter=", " initialize-with=". "/>
          <substitute>
            <names variable="author"/>
            <names variable="illustrator"/>
            <names variable="director"/>
            <choose>
              <if variable="container-title">
                <choose>
                  <if type="book entry entry-dictionary entry-encyclopedia" match="any">
                    <text macro="title-intext"/>
                  </if>
                </choose>
              </if>
            </choose>
            <names variable="editor"/>
            <names variable="editorial-director"/>
            <text macro="title-intext"/>
          </substitute>
        </names>
      </else>
    </choose>
  </macro>
  <macro name="date-bib">
    <group delimiter=" " prefix="(" suffix=")">
      <choose>
        <if is-uncertain-date="issued">
          <text term="circa" form="short"/>
        </if>
      </choose>
      <group>
        <choose>
          <if variable="issued">
            <date variable="issued">
              <date-part name="year"/>
            </date>
            <text variable="year-suffix"/>
            <choose>
              <if type="article-magazine article-newspaper broadcast interview motion_picture pamphlet personal_communication post post-weblog song speech webpage" match="any">
                <!-- Many video and audio examples in manual give full dates. Err on the side of too much information. -->
                <date variable="issued">
                  <date-part prefix=", " name="month"/>
                  <date-part prefix=" " name="day"/>
                </date>
              </if>
              <else-if type="paper-conference">
                <!-- Capture 'speech' stored as 'paper-conference' -->
                <choose>
                  <if variable="collection-editor editor editorial-director issue page volume" match="none">
                    <date variable="issued">
                      <date-part prefix=", " name="month"/>
                      <date-part prefix=" " name="day"/>
                    </date>
                  </if>
                </choose>
              </else-if>
              <!-- Only year: article article-journal book chapter entry entry-dictionary entry-encyclopedia dataset figure graphic 
                   manuscript map musical_score paper-conference[published] patent report review review-book thesis -->
            </choose>
          </if>
          <else-if variable="status">
            <group>
              <text variable="status" text-case="lowercase"/>
              <text variable="year-suffix" prefix="-"/>
            </group>
          </else-if>
          <else>
            <group>
              <text term="no date" form="short"/>
              <text variable="year-suffix" prefix="-"/>
            </group>
          </else>
        </choose>
      </group>
    </group>
  </macro>
  <macro name="date-sort-group">
    <choose>
      <if variable="issued">
        <text value="1"/>
      </if>
      <else-if variable="status">
        <text value="2"/>
      </else-if>
      <else>
        <text value="0"/>
      </else>
    </choose>
  </macro>
  <macro name="date-sort-date">
    <choose>
      <if type="article-magazine article-newspaper broadcast interview pamphlet personal_communication post post-weblog speech treaty webpage" match="any">
        <date variable="issued" form="numeric"/>
      </if>
      <else-if type="paper-conference">
        <!-- Capture 'speech' stored as 'paper-conference' -->
        <choose>
          <if variable="collection-editor editor editorial-director issue page volume" match="none">
            <date variable="issued" form="numeric"/>
          </if>
        </choose>
      </else-if>
      <else>
        <date variable="issued" form="numeric"/>
      </else>
    </choose>
  </macro>
  <macro name="date-intext">
    <choose>
      <if variable="issued">
        <group delimiter="/">
          <group delimiter=" ">
            <choose>
              <if is-uncertain-date="original-date">
                <text term="circa" form="short"/>
              </if>
            </choose>
            <date variable="original-date">
              <date-part name="year"/>
            </date>
          </group>
          <group delimiter=" ">
            <choose>
              <if is-uncertain-date="issued">
                <text term="circa" form="short"/>
              </if>
            </choose>
            <group>
              <choose>
                <if type="interview personal_communication" match="any">
                  <choose>
                    <if variable="archive container-title DOI publisher URL" match="none">
                      <!-- These variables indicate that the communication is retrievable by the reader. 
                           If not, then use the in-text-only personal communication format -->
                      <date variable="issued" form="text"/>
                    </if>
                    <else>
                      <date variable="issued">
                        <date-part name="year"/>
                      </date>
                    </else>
                  </choose>
                </if>
                <else>
                  <date variable="issued">
                    <date-part name="year"/>
                  </date>
                </else>
              </choose>
              <text variable="year-suffix"/>
            </group>
          </group>
        </group>
      </if>
      <else-if variable="status">
        <text variable="status" text-case="lowercase"/>
        <text variable="year-suffix" prefix="-"/>
      </else-if>
      <else>
        <text term="no date" form="short"/>
        <text variable="year-suffix" prefix="-"/>
      </else>
    </choose>
  </macro>
  <!-- APA has two description elements following the title:
       title (parenthetical) [bracketed]  -->
  <macro name="title-and-descriptions">
    <choose>
      <if variable="title">
        <group delimiter=" ">
          <text macro="title"/>
          <text macro="parenthetical"/>
          <text macro="bracketed"/>
        </group>
      </if>
      <else>
        <group delimiter=" ">
          <text macro="bracketed"/>
          <text macro="parenthetical"/>
        </group>
      </else>
    </choose>
  </macro>
  <macro name="title">
    <choose>
      <if type="post webpage" match="any">
        <!-- Webpages are always italicized -->
        <text variable="title" font-style="italic"/>
      </if>
      <else-if variable="container-title" match="any">
        <!-- Other types are italicized based on presence of container-title.
             Assume that review and review-book are published in periodicals/blogs,
             not just on a web page (ex. 69) -->
        <text variable="title"/>
      </else-if>
      <else>
        <choose>
          <if type="article-journal article-magazine article-newspaper post-weblog review review-book" match="any">
            <text variable="title" font-style="italic"/>
          </if>
          <else-if type="paper-conference">
            <choose>
              <if variable="collection-editor editor editorial-director" match="any">
                <group delimiter=": " font-style="italic">
                  <text variable="title"/>
                  <!-- Replace with volume-title as that becomes available -->
                  <choose>
                    <if is-numeric="volume" match="none">
                      <group delimiter=" ">
                        <label variable="volume" form="short" text-case="capitalize-first"/>
                        <text variable="volume"/>
                      </group>
                    </if>
                  </choose>
                </group>
              </if>
              <else>
                <text variable="title" font-style="italic"/>
              </else>
            </choose>
          </else-if>
          <else>
            <group delimiter=": " font-style="italic">
              <text variable="title"/>
              <!-- Replace with volume-title as that becomes available -->
              <choose>
                <if is-numeric="volume" match="none">
                  <group delimiter=" ">
                    <label variable="volume" form="short" text-case="capitalize-first"/>
                    <text variable="volume"/>
                  </group>
                </if>
              </choose>
            </group>
          </else>
        </choose>
      </else>
    </choose>
  </macro>
  <macro name="title-intext">
    <choose>
      <if variable="title" match="none">
        <text macro="bracketed-intext" prefix="[" suffix="]"/>
      </if>
      <else-if type="bill">
        <!-- If a bill has no number or container-title, assume it is a hearing; italic -->
        <choose>
          <if variable="number container-title" match="none">
            <text variable="title" form="short" font-style="italic" text-case="title"/>
          </if>
          <else-if variable="title">
            <text variable="title" form="short" text-case="title"/>
          </else-if>
          <else>
            <group delimiter=" ">
              <text variable="genre"/>
              <group delimiter=" ">
                <choose>
                  <if variable="chapter-number container-title" match="none">
                    <!-- Replace with label variable="number" as that becomes available -->
                    <text term="issue" form="short"/>
                  </if>
                </choose>
                <text variable="number"/>
              </group>
            </group>
          </else>
        </choose>
      </else-if>
      <else-if type="legal_case" match="any">
        <!-- Cases are italicized -->
        <text variable="title" font-style="italic"/>
      </else-if>
      <else-if type="legislation treaty" match="any">
        <!-- Legislation and treaties not italicized or quoted -->
        <text variable="title" form="short" text-case="title"/>
      </else-if>
      <else-if type="post webpage" match="any">
        <!-- Webpages are always italicized -->
        <text variable="title" form="short" font-style="italic" text-case="title"/>
      </else-if>
      <else-if variable="container-title" match="any">
        <!-- Other types are italicized or quoted based on presence of container-title. As in title macro. -->
        <text variable="title" form="short" quotes="true" text-case="title"/>
      </else-if>
      <else>
        <text variable="title" form="short" font-style="italic" text-case="title"/>
      </else>
    </choose>
  </macro>
  <macro name="parenthetical">
    <!-- (Secondary contributors; Database location; Genre no. 123; Report Series 123, Version, Edition, Volume, Page) -->
    <group prefix="(" suffix=")">
      <choose>
        <if type="patent">
          <!-- authority: U.S. ; genre: patent ; number: 123,445 -->
          <group delimiter=" ">
            <text variable="authority" form="short"/>
            <choose>
              <if variable="genre">
                <text variable="genre" text-case="capitalize-first"/>
              </if>
              <else>
                <!-- This should be localized -->
                <text value="patent" text-case="capitalize-first"/>
              </else>
            </choose>
            <group delimiter=" ">
              <!-- Replace with label variable="number" if that becomes available -->
              <text term="issue" form="short" text-case="capitalize-first"/>
              <text variable="number"/>
            </group>
          </group>
        </if>
        <else-if type="post webpage" match="any">
          <!-- For post webpage, container-title is treated as publisher -->
          <group delimiter="; ">
            <text macro="secondary-contributors"/>
            <text macro="database-location"/>
            <text macro="number"/>
            <text macro="locators-booklike"/>
          </group>
        </else-if>
        <else-if variable="container-title">
          <group delimiter="; ">
            <text macro="secondary-contributors"/>
            <choose>
              <if type="broadcast graphic map motion_picture song" match="any">
                <!-- For audiovisual media, number information comes after title, not container-title -->
                <text macro="number"/>
              </if>
            </choose>
          </group>
        </else-if>
        <else>
          <group delimiter="; ">
            <text macro="secondary-contributors"/>
            <text macro="database-location"/>
            <text macro="number"/>
            <text macro="locators-booklike"/>
          </group>
        </else>
      </choose>
    </group>
  </macro>
  <macro name="parenthetical-container">
    <choose>
      <if variable="container-title" match="any">
        <group prefix="(" suffix=")">
          <group delimiter="; ">
            <text macro="database-location"/>
            <choose>
              <if type="broadcast graphic map motion_picture song" match="none">
                <!-- For audiovisual media, number information comes after title, not container-title -->
                <text macro="number"/>
              </if>
            </choose>
            <text macro="locators-booklike"/>
          </group>
        </group>
      </if>
    </choose>
  </macro>
  <macro name="bracketed">
    <!-- [Descriptive information] -->
    <!-- If there is a number, genre is already printed in macro="number" -->
    <group prefix="[" suffix="]">
      <choose>
        <if variable="reviewed-author reviewed-title" type="review review-book" match="any">
          <!-- Reviewed item -->
          <group delimiter="; ">
            <group delimiter=", ">
              <group delimiter=" ">
                <!-- Assume that genre is entered as 'Review of the book' or similar -->
                <choose>
                  <if variable="number" match="none">
                    <choose>
                      <if variable="genre">
                        <text variable="genre" text-case="capitalize-first"/>
                      </if>
                      <else-if variable="medium">
                        <text variable="medium" text-case="capitalize-first"/>
                      </else-if>
                      <else>
                        <!-- Replace with term="review" as that becomes available -->
                        <text value="Review of"/>
                      </else>
                    </choose>
                  </if>
                  <else>
                    <choose>
                      <if variable="medium">
                        <text variable="medium" text-case="capitalize-first"/>
                      </if>
                      <else>
                        <!-- Replace with term="review" as that becomes available -->
                        <text value="Review of"/>
                      </else>
                    </choose>
                  </else>
                </choose>
                <text macro="reviewed-title"/>
              </group>
              <names variable="reviewed-author">
                <label form="verb-short" suffix=" "/>
                <name and="symbol" initialize-with=". " delimiter=", "/>
              </names>
            </group>
            <choose>
              <if variable="genre" match="any">
                <choose>
                  <if variable="number" match="none">
                    <text variable="medium" text-case="capitalize-first"/>
                  </if>
                </choose>
              </if>
            </choose>
          </group>
        </if>
        <else-if type="thesis">
          <!-- Thesis type and institution -->
          <group delimiter="; ">
            <choose>
              <if variable="number" match="none">
                <group delimiter=", ">
                  <text variable="genre" text-case="capitalize-first"/>
                  <choose>
                    <if variable="archive DOI URL" match="any">
                      <!-- Include the university in brackets if thesis is published -->
                      <text variable="publisher"/>
                    </if>
                  </choose>
                </group>
              </if>
            </choose>
            <text variable="medium" text-case="capitalize-first"/>
          </group>
        </else-if>
        <else-if variable="interviewer" type="interview" match="any">
          <!-- Interview information -->
          <choose>
            <if variable="title">
              <text macro="format"/>
            </if>
            <else-if variable="genre">
              <group delimiter="; ">
                <group delimiter=" ">
                  <text variable="genre" text-case="capitalize-first"/>
                  <group delimiter=" ">
                    <text term="author" form="verb"/>
                    <names variable="interviewer">
                      <name and="symbol" initialize-with=". " delimiter=", "/>
                    </names>
                  </group>
                </group>
              </group>
            </else-if>
            <else-if variable="interviewer">
              <group delimiter="; ">
                <names variable="interviewer">
                  <label form="verb" suffix=" " text-case="capitalize-first"/>
                  <name and="symbol" initialize-with=". " delimiter=", "/>
                </names>
                <text variable="medium" text-case="capitalize-first"/>
              </group>
            </else-if>
            <else>
              <text macro="format"/>
            </else>
          </choose>
        </else-if>
        <else-if type="personal_communication">
          <!-- Letter information -->
          <choose>
            <if variable="recipient">
              <group delimiter="; ">
                <group delimiter=" ">
                  <choose>
                    <if variable="number" match="none">
                      <choose>
                        <if variable="genre">
                          <text variable="genre" text-case="capitalize-first"/>
                        </if>
                        <else-if variable="medium">
                          <text variable="medium" text-case="capitalize-first"/>
                        </else-if>
                        <else>
                          <text term="letter" form="short" text-case="capitalize-first"/>
                        </else>
                      </choose>
                    </if>
                    <else>
                      <choose>
                        <if variable="medium">
                          <text variable="medium" text-case="capitalize-first"/>
                        </if>
                        <else>
                          <text term="letter" form="short" text-case="capitalize-first"/>
                        </else>
                      </choose>
                    </else>
                  </choose>
                  <names variable="recipient" delimiter=", ">
                    <label form="verb" suffix=" "/>
                    <name and="symbol" delimiter=", "/>
                  </names>
                </group>
                <choose>
                  <if variable="genre" match="any">
                    <choose>
                      <if variable="number" match="none">
                        <text variable="medium" text-case="capitalize-first"/>
                      </if>
                    </choose>
                  </if>
                </choose>
              </group>
            </if>
            <else>
              <text macro="format"/>
            </else>
          </choose>
        </else-if>
        <else-if variable="composer" type="song" match="all">
          <!-- Performer of classical music works -->
          <group delimiter="; ">
            <choose>
              <if variable="number" match="none">
                <group delimiter=" ">
                  <choose>
                    <if variable="genre">
                      <text variable="genre" text-case="capitalize-first"/>
                      <!-- Replace prefix with performer label as that becomes available -->
                      <names variable="author" prefix="recorded by ">
                        <name and="symbol" initialize-with=". " delimiter=", "/>
                      </names>
                    </if>
                    <else-if variable="medium">
                      <text variable="medium" text-case="capitalize-first"/>
                      <!-- Replace prefix with performer label as that becomes available -->
                      <names variable="author" prefix="recorded by ">
                        <name and="symbol" initialize-with=". " delimiter=", "/>
                      </names>
                    </else-if>
                    <else>
                      <!-- Replace prefix with performer label as that becomes available -->
                      <names variable="author" prefix="Recorded by ">
                        <name and="symbol" initialize-with=". " delimiter=", "/>
                      </names>
                    </else>
                  </choose>
                </group>
              </if>
              <else>
                <group delimiter=" ">
                  <choose>
                    <if variable="medium">
                      <text variable="medium" text-case="capitalize-first"/>
                      <!-- Replace prefix with performer label as that becomes available -->
                      <names variable="author" prefix="recorded by ">
                        <name and="symbol" initialize-with=". " delimiter=", "/>
                      </names>
                    </if>
                    <else>
                      <!-- Replace prefix with performer label as that becomes available -->
                      <names variable="author" prefix="Recorded by ">
                        <name and="symbol" initialize-with=". " delimiter=", "/>
                      </names>
                    </else>
                  </choose>
                </group>
              </else>
            </choose>
            <choose>
              <if variable="genre" match="any">
                <choose>
                  <if variable="number" match="none">
                    <text variable="medium" text-case="capitalize-first"/>
                  </if>
                </choose>
              </if>
            </choose>
          </group>
        </else-if>
        <else-if variable="container-title" match="none">
          <!-- Other description -->
          <text macro="format"/>
        </else-if>
        <else>
          <!-- For conference presentations, chapters in reports, software, place bracketed after the container title -->
          <choose>
            <if type="paper-conference speech" match="any">
              <choose>
                <if variable="collection-editor editor editorial-director issue page volume" match="any">
                  <text macro="format"/>
                </if>
              </choose>
            </if>
            <else-if type="book">
              <choose>
                <if variable="version" match="none">
                  <text macro="format"/>
                </if>
              </choose>
            </else-if>
            <else-if type="report" match="none">
              <text macro="format"/>
            </else-if>
          </choose>
        </else>
      </choose>
    </group>
  </macro>
  <macro name="bracketed-intext">
    <group prefix="[" suffix="]">
      <choose>
        <if variable="reviewed-author reviewed-title" type="review review-book" match="any">
          <!-- This should be localized -->
          <text macro="reviewed-title-intext" prefix="Review of "/>
        </if>
        <else-if variable="interviewer" type="interview" match="any">
          <names variable="interviewer">
            <label form="verb" suffix=" " text-case="capitalize-first"/>
            <name and="symbol" initialize-with=". " delimiter=", "/>
            <substitute>
              <text macro="format-intext"/>
            </substitute>
          </names>
        </else-if>
        <else-if type="personal_communication">
          <!-- Letter information -->
          <choose>
            <if variable="recipient">
              <group delimiter=" ">
                <choose>
                  <if variable="number" match="none">
                    <text variable="genre" text-case="capitalize-first"/>
                  </if>
                  <else>
                    <text term="letter" form="short" text-case="capitalize-first"/>
                  </else>
                </choose>
                <names variable="recipient" delimiter=", ">
                  <label form="verb" suffix=" "/>
                  <name and="symbol" delimiter=", "/>
                </names>
              </group>
            </if>
            <else>
              <text macro="format-intext"/>
            </else>
          </choose>
        </else-if>
        <else>
          <text macro="format-intext"/>
        </else>
      </choose>
    </group>
  </macro>
  <macro name="bracketed-container">
    <group prefix="[" suffix="]">
      <choose>
        <if type="paper-conference speech" match="any">
          <!-- Conference presentations should describe the session [container] in bracketed unless published in a proceedings -->
          <choose>
            <if variable="collection-editor editor editorial-director issue page volume" match="none">
              <text macro="format"/>
            </if>
          </choose>
        </if>
        <else-if type="book" variable="version" match="all">
          <!-- For entries in mobile app reference works, place bracketed after the container-title -->
          <text macro="format"/>
        </else-if>
        <else-if type="report">
          <!-- For chapters in reports, place bracketed after the container title -->
          <text macro="format"/>
        </else-if>
      </choose>
    </group>
  </macro>
  <macro name="secondary-contributors">
    <choose>
      <if type="article-journal article-magazine article-newspaper post-weblog review review-book" match="any">
        <text macro="secondary-contributors-periodical"/>
      </if>
      <else-if type="paper-conference">
        <choose>
          <if variable="collection-editor editor editorial-director" match="any">
            <text macro="secondary-contributors-booklike"/>
          </if>
          <else>
            <text macro="secondary-contributors-periodical"/>
          </else>
        </choose>
      </else-if>
      <else>
        <text macro="secondary-contributors-booklike"/>
      </else>
    </choose>
  </macro>
  <macro name="secondary-contributors-periodical">
    <group delimiter="; ">
      <choose>
        <if variable="title">
          <names variable="interviewer" delimiter="; ">
            <name and="symbol" initialize-with=". " delimiter=", "/>
            <label form="short" prefix=", " text-case="title"/>
          </names>
        </if>
      </choose>
      <names variable="translator" delimiter="; ">
        <name and="symbol" initialize-with=". " delimiter=", "/>
        <label form="short" prefix=", " text-case="title"/>
      </names>
    </group>
  </macro>
  <macro name="secondary-contributors-booklike">
    <group delimiter="; ">
      <choose>
        <if variable="title">
          <names variable="interviewer">
            <name and="symbol" initialize-with=". " delimiter=", "/>
            <label form="short" prefix=", " text-case="title"/>
          </names>
        </if>
      </choose>
      <!-- When editortranslator becomes available, add a test: variable="editortranslator" match="none"; then print translator -->
      <choose>
        <if type="post webpage" match="none">
          <!-- Webpages treat container-title like publisher -->
          <choose>
            <if variable="container-title" match="none">
              <group delimiter="; ">
                <names variable="container-author">
                  <label form="verb-short" suffix=" " text-case="title"/>
                  <name and="symbol" initialize-with=". " delimiter=", "/>
                </names>
                <names variable="editor translator" delimiter="; ">
                  <name and="symbol" initialize-with=". " delimiter=", "/>
                  <label form="short" prefix=", " text-case="title"/>
                </names>
              </group>
            </if>
          </choose>
        </if>
        <else>
          <group delimiter="; ">
            <names variable="container-author">
              <label form="verb-short" suffix=" " text-case="title"/>
              <name and="symbol" initialize-with=". " delimiter=", "/>
            </names>
            <names variable="editor translator" delimiter="; ">
              <name and="symbol" initialize-with=". " delimiter=", "/>
              <label form="short" prefix=", " text-case="title"/>
            </names>
          </group>
        </else>
      </choose>
    </group>
  </macro>
  <macro name="database-location">
    <choose>
      <if variable="archive-place" match="none">
        <!-- With `archive-place`: physical archives. Without: online archives. -->
        <!-- Add archive_collection as that becomes available -->
        <text variable="archive_location"/>
      </if>
    </choose>
  </macro>
  <macro name="number">
    <choose>
      <if variable="number">
        <group delimiter=", ">
          <group delimiter=" ">
            <text variable="genre" text-case="title"/>
            <choose>
              <if is-numeric="number">
                <!-- Replace with label variable="number" if that becomes available -->
                <text term="issue" form="short" text-case="capitalize-first"/>
                <text variable="number"/>
              </if>
              <else>
                <text variable="number"/>
              </else>
            </choose>
          </group>
          <choose>
            <if type="thesis">
              <choose>
                <!-- Include the university in brackets if thesis is published -->
                <if variable="archive DOI URL" match="any">
                  <text variable="publisher"/>
                </if>
              </choose>
            </if>
          </choose>
        </group>
      </if>
    </choose>
  </macro>
  <macro name="locators-booklike">
    <choose>
      <if type="article-journal article-magazine article-newspaper broadcast interview patent post post-weblog review review-book speech webpage" match="any"/>
      <else-if type="paper-conference">
        <choose>
          <if variable="collection-editor editor editorial-director" match="any">
            <group delimiter=", ">
              <text macro="version"/>
              <text macro="edition"/>
              <text macro="volume-booklike"/>
            </group>
          </if>
        </choose>
      </else-if>
      <else>
        <group delimiter=", ">
          <text macro="version"/>
          <text macro="edition"/>
          <text macro="volume-booklike"/>
        </group>
      </else>
    </choose>
  </macro>
  <macro name="version">
    <choose>
      <if is-numeric="version">
        <group delimiter=" ">
          <!-- replace with label variable="version" if that becomes available -->
          <text term="version" text-case="capitalize-first"/>
          <text variable="version"/>
        </group>
      </if>
      <else>
        <text variable="version"/>
      </else>
    </choose>
  </macro>
  <macro name="edition">
    <choose>
      <if is-numeric="edition">
        <group delimiter=" ">
          <number variable="edition" form="ordinal"/>
          <label variable="edition" form="short"/>
        </group>
      </if>
      <else>
        <text variable="edition"/>
      </else>
    </choose>
  </macro>
  <macro name="volume-booklike">
    <group delimiter=", ">
      <!-- Report series [ex. 52] -->
      <choose>
        <if type="report">
          <group delimiter=" ">
            <text variable="collection-title" text-case="title"/>
            <text variable="collection-number"/>
          </group>
        </if>
      </choose>
      <choose>
        <if variable="volume" match="any">
          <choose>
            <!-- Non-numeric volumes are already printed as part of the book title -->
            <if is-numeric="volume" match="none"/>
            <else>
              <group delimiter=" ">
                <label variable="volume" form="short" text-case="capitalize-first"/>
                <number variable="volume" form="numeric"/>
              </group>
            </else>
          </choose>
        </if>
        <else>
          <group>
            <!-- Replace with label variable="number-of-volumes" if that becomes available -->
            <text term="volume" form="short" text-case="capitalize-first" suffix=" "/>
            <text term="page-range-delimiter" prefix="1"/>
            <number variable="number-of-volumes" form="numeric"/>
          </group>
        </else>
      </choose>
      <group delimiter=" ">
        <label variable="issue" text-case="capitalize-first"/>
        <text variable="issue"/>
      </group>
      <group delimiter=" ">
        <label variable="page" form="short" suffix=" "/>
        <text variable="page"/>
      </group>
    </group>
  </macro>
  <macro name="reviewed-title">
    <choose>
      <if variable="reviewed-title">
        <!-- Not possible to distinguish TV series episode from other reviewed 
              works [Ex. 69] -->
        <text variable="reviewed-title" font-style="italic"/>
      </if>
      <else>
        <!-- Assume title is title of reviewed work -->
        <text variable="title" font-style="italic"/>
      </else>
    </choose>
  </macro>
  <macro name="reviewed-title-intext">
    <choose>
      <if variable="reviewed-title">
        <!-- Not possible to distinguish TV series episode from other reviewed works [Ex. 69] -->
        <text variable="reviewed-title" form="short" font-style="italic" text-case="title"/>
      </if>
      <else>
        <!-- Assume title is title of reviewed work -->
        <text variable="title" form="short" font-style="italic" text-case="title"/>
      </else>
    </choose>
  </macro>
  <macro name="format">
    <choose>
      <if variable="genre medium" match="any">
        <group delimiter="; ">
          <choose>
            <if variable="number" match="none">
              <text variable="genre" text-case="capitalize-first"/>
            </if>
          </choose>
          <text variable="medium" text-case="capitalize-first"/>
        </group>
      </if>
      <!-- Generic labels for specific types -->
      <!-- These should be localized when possible -->
      <else-if type="dataset">
        <text value="Data set"/>
      </else-if>
      <else-if type="book" variable="version" match="all">
        <!-- Replace with type="software" and term="software" as that becomes available -->
        <text value="Computer software"/>
      </else-if>
      <else-if type="interview personal_communication" match="any">
        <choose>
          <if variable="archive container-title DOI publisher URL" match="none">
            <text term="letter" text-case="capitalize-first"/>
          </if>
          <else-if type="interview">
            <text term="interview" text-case="capitalize-first"/>
          </else-if>
        </choose>
      </else-if>
      <else-if type="map">
        <text value="Map"/>
      </else-if>
    </choose>
  </macro>
  <macro name="format-intext">
    <choose>
      <if variable="genre" match="any">
        <text variable="genre" text-case="capitalize-first"/>
      </if>
      <else-if variable="medium">
        <text variable="medium" text-case="capitalize-first"/>
      </else-if>
      <!-- Generic labels for specific types -->
      <!-- These should be localized when possible -->
      <else-if type="dataset">
        <text value="Data set"/>
      </else-if>
      <else-if type="book" variable="version" match="all">
        <!-- Replace with type="software" and term="software" as that becomes available -->
        <text value="Computer software"/>
      </else-if>
      <else-if type="interview personal_communication" match="any">
        <choose>
          <if variable="archive container-title DOI publisher URL" match="none">
            <text term="letter" text-case="capitalize-first"/>
          </if>
          <else-if type="interview">
            <text term="interview" text-case="capitalize-first"/>
          </else-if>
        </choose>
      </else-if>
      <else-if type="map">
        <text value="Map"/>
      </else-if>
    </choose>
  </macro>
  <!-- APA 'source' element contains four parts:
       container, event, publisher, access -->
  <macro name="container">
    <choose>
      <if type="article-journal article-magazine article-newspaper post-weblog review review-book" match="any">
        <!-- Periodical items -->
        <text macro="container-periodical"/>
      </if>
      <else-if type="paper-conference">
        <!-- Determine if paper-conference is a periodical or booklike -->
        <choose>
          <if variable="editor editorial-director collection-editor container-author" match="any">
            <text macro="container-booklike"/>
          </if>
          <else>
            <text macro="container-periodical"/>
          </else>
        </choose>
      </else-if>
      <else-if type="post webpage" match="none">
        <!-- post and webpage treat container-title like publisher -->
        <text macro="container-booklike"/>
      </else-if>
    </choose>
  </macro>
  <macro name="container-periodical">
    <group delimiter=". ">
      <group delimiter=", ">
        <text variable="container-title" font-style="italic" text-case="title"/>
        <choose>
          <if variable="volume">
            <group>
              <text variable="volume" font-style="italic"/>
              <text variable="issue" prefix="(" suffix=")"/>
            </group>
          </if>
          <else>
            <text variable="issue" font-style="italic"/>
          </else>
        </choose>
        <choose>
          <if variable="page">
            <text variable="page"/>
          </if>
          <else>
            <!-- Ex. 6: Journal article with article number or eLocator -->
            <!-- This should be localized -->
            <text variable="number" prefix="Article "/>
          </else>
        </choose>
      </group>
      <choose>
        <if variable="issued">
          <choose>
            <if variable="issue page volume" match="none">
              <text variable="status" text-case="capitalize-first"/>
            </if>
          </choose>
        </if>
      </choose>
    </group>
  </macro>
  <macro name="container-booklike">
    <choose>
      <if variable="container-title" match="any">
        <group delimiter=" ">
          <text term="in" text-case="capitalize-first"/>
          <group delimiter=", ">
            <names variable="editor translator" delimiter=", &amp; ">
              <!-- Change to editortranslator and move editor to substitute as that becomes available -->
              <name and="symbol" initialize-with=". " delimiter=", "/>
              <label form="short" text-case="title" prefix=" (" suffix=")"/>
              <substitute>
                <names variable="editorial-director"/>
                <names variable="collection-editor"/>
                <names variable="container-author"/>
              </substitute>
            </names>
            <group delimiter=": " font-style="italic">
              <text variable="container-title"/>
              <!-- Replace with volume-title as that becomes available -->
              <choose>
                <if is-numeric="volume" match="none">
                  <group delimiter=" ">
                    <label variable="volume" form="short" text-case="capitalize-first"/>
                    <text variable="volume"/>
                  </group>
                </if>
              </choose>
            </group>
          </group>
          <text macro="parenthetical-container"/>
          <text macro="bracketed-container"/>
        </group>
      </if>
    </choose>
  </macro>
  <macro name="publisher">
    <group delimiter="; ">
      <choose>
        <if type="thesis">
          <choose>
            <if variable="archive DOI URL" match="none">
              <text variable="publisher"/>
            </if>
          </choose>
        </if>
        <else-if type="post webpage" match="any">
          <!-- For websites, treat container title like publisher -->
          <group delimiter="; ">
            <text variable="container-title" text-case="title"/>
            <text variable="publisher"/>
          </group>
        </else-if>
        <else-if type="paper-conference">
          <!-- For paper-conference, don't print publisher if in a journal-like proceedings -->
          <choose>
            <if variable="collection-editor editor editorial-director" match="any">
              <text variable="publisher"/>
            </if>
          </choose>
        </else-if>
        <else-if type="article-journal article-magazine article-newspaper post-weblog" match="none">
          <text variable="publisher"/>
        </else-if>
      </choose>
      <group delimiter=", ">
        <choose>
          <if variable="archive-place">
            <!-- With `archive-place`: physical archives. Without: online archives. -->
            <!-- For physical archives, print the location before the archive name.
                For electronic archives, these are printed in macro="description". -->
            <!-- Split "archive_location" into "archive_collection" and "archive_location" as that becomes available -->
            <!-- Must test for archive_collection:
                With collection: archive_collection (archive_location), archive, archive-place
                No collection: archive (archive_location), archive-place
            -->
            <text variable="archive_location"/>
          </if>
        </choose>
        <text variable="archive"/>
        <text variable="archive-place"/>
      </group>
    </group>
  </macro>
  <macro name="access">
    <choose>
      <if variable="DOI" match="any">
        <text variable="DOI" prefix="https://doi.org/"/>
      </if>
      <else-if variable="URL">
        <group delimiter=" ">
          <choose>
            <if variable="issued status" match="none">
              <group delimiter=" ">
                <text term="retrieved" text-case="capitalize-first"/>
                <date variable="accessed" form="text" suffix=","/>
                <text term="from"/>
              </group>
            </if>
          </choose>
          <text variable="URL"/>
        </group>
      </else-if>
    </choose>
  </macro>
  <macro name="event">
    <choose>
      <if variable="event">
        <!-- To prevent Zotero from printing event-place due to its double-mapping of all 'place' to
             both publisher-place and event-place. Remove this 'choose' when that is changed. -->
        <choose>
          <if variable="collection-editor editor editorial-director issue page volume" match="none">
            <!-- Don't print event info if published in a proceedings -->
            <group delimiter=", ">
              <text variable="event"/>
              <text variable="event-place"/>
            </group>
          </if>
        </choose>
      </if>
    </choose>
  </macro>
  <!-- After 'source', APA also prints publication history (original publication, reprint info, retraction info) -->
  <macro name="publication-history">
    <choose>
      <if type="patent" match="none">
        <group prefix="(" suffix=")">
          <choose>
            <if variable="references">
              <!-- This provides the option for more elaborate description 
                   of publication history, such as full "reprinted" references
                   (examples 11, 43, 44) or retracted references -->
              <text variable="references"/>
            </if>
            <else>
              <group delimiter=" ">
                <text value="Original work published"/>
                <choose>
                  <if is-uncertain-date="original-date">
                    <text term="circa" form="short"/>
                  </if>
                </choose>
                <date variable="original-date">
                  <date-part name="year"/>
                </date>
              </group>
            </else>
          </choose>
        </group>
      </if>
      <else>
        <text variable="references" prefix="(" suffix=")"/>
      </else>
    </choose>
  </macro>
  <!-- Legal citations have their own rules -->
  <macro name="legal-cites">
    <choose>
      <if type="legal_case">
        <group delimiter=". ">
          <group delimiter=", ">
            <text variable="title"/>
            <group delimiter=" ">
              <text macro="container-legal"/>
              <text macro="date-legal"/>
            </group>
            <text variable="references"/>
          </group>
          <text macro="access"/>
        </group>
      </if>
      <else-if type="bill">
        <!-- Currently designed to handle bills, resolutions, hearings, rederal reports. -->
        <group delimiter=". ">
          <group delimiter=", ">
            <choose>
              <if variable="number container-title" match="none">
                <!-- If no number or container-title, then assume it is a hearing -->
                <text variable="title" font-style="italic"/>
              </if>
              <else>
                <text variable="title"/>
              </else>
            </choose>
            <group delimiter=" ">
              <text macro="container-legal"/>
              <text macro="date-legal"/>
              <choose>
                <if variable="number container-title" match="none">
                  <!-- If no number or container-title, then assume it is a hearing -->
                  <names variable="author" prefix="(testimony of " suffix=")">
                    <name and="symbol" delimiter=", "/>
                  </names>
                </if>
                <else>
                  <text variable="status" prefix="(" suffix=")"/>
                </else>
              </choose>
            </group>
            <text variable="references"/>
          </group>
          <text macro="access"/>
        </group>
      </else-if>
      <else-if type="legislation">
        <!-- Currently designed to handle statutes, codified regulations, executive orders.
             For uncodified regulations, assume future code section is in status. -->
        <group delimiter=". ">
          <group delimiter=", ">
            <text variable="title"/>
            <group delimiter=" ">
              <text macro="container-legal"/>
              <text macro="date-legal"/>
              <text variable="status" prefix="(" suffix=")"/>
            </group>
            <text variable="references"/>
          </group>
          <text macro="access"/>
        </group>
      </else-if>
      <else-if type="treaty">
        <!-- APA generally defers to Bluebook for legal citations, but diverges without
             explanation for treaty items. The Bluebook format that was used in APA 6th
             ed. is used here. -->
        <group delimiter=", ">
          <text variable="title" text-case="title"/>
          <names variable="author">
            <name initialize-with="." form="short" delimiter="-"/>
          </names>
          <text macro="date-legal"/>
          <text macro="container-legal"/>
          <text macro="access"/>
        </group>
      </else-if>
    </choose>
  </macro>
  <macro name="date-legal">
    <choose>
      <if type="legal_case">
        <group prefix="(" suffix=")" delimiter=" ">
          <text variable="authority"/>
          <choose>
            <if variable="container-title" match="any">
              <!-- Print only year for cases published in reporters-->
              <date variable="issued" form="numeric" date-parts="year"/>
            </if>
            <else>
              <date variable="issued" form="text"/>
            </else>
          </choose>
        </group>
      </if>
      <else-if type="bill legislation" match="any">
        <group prefix="(" suffix=")" delimiter=" ">
          <group delimiter=" ">
            <date variable="original-date">
              <date-part name="year"/>
            </date>
            <text term="and" form="symbol"/>
          </group>
          <date variable="issued">
            <date-part name="year"/>
          </date>
        </group>
      </else-if>
      <else-if type="treaty">
        <date variable="issued" form="text"/>
      </else-if>
    </choose>
  </macro>
  <macro name="container-legal">
    <!-- Expect legal item container-titles to be stored in short form -->
    <choose>
      <if type="legal_case">
        <group delimiter=" ">
          <choose>
            <if variable="container-title">
              <group delimiter=" ">
                <text variable="volume"/>
                <text variable="container-title"/>
                <group delimiter=" ">
                  <!-- Change to label variable="section" as that becomes available -->
                  <text term="section" form="symbol"/>
                  <text variable="section"/>
                </group>
                <choose>
                  <if variable="page page-first" match="any">
                    <text variable="page-first"/>
                  </if>
                  <else>
                    <text value="___"/>
                  </else>
                </choose>
              </group>
            </if>
            <else>
              <group delimiter=" ">
                <choose>
                  <if is-numeric="number">
                    <!-- Replace with label variable="number" if that becomes available -->
                    <text term="issue" form="short" text-case="capitalize-first"/>
                  </if>
                </choose>
                <text variable="number"/>
              </group>
            </else>
          </choose>
        </group>
      </if>
      <else-if type="bill">
        <group delimiter=", ">
          <group delimiter=" ">
            <text variable="genre"/>
            <group delimiter=" ">
              <choose>
                <if variable="chapter-number container-title" match="none">
                  <!-- Replace with label variable="number" as that becomes available -->
                  <text term="issue" form="short"/>
                </if>
              </choose>
              <text variable="number"/>
            </group>
          </group>
          <text variable="authority"/>
          <text variable="chapter-number"/>
          <group delimiter=" ">
            <text variable="volume"/>
            <text variable="container-title"/>
            <text variable="page-first"/>
          </group>
        </group>
      </else-if>
      <else-if type="legislation">
        <choose>
          <if variable="number">
            <!--There's a public law number-->
            <group delimiter=", ">
              <text variable="number" prefix="Pub. L. No. "/>
              <group delimiter=" ">
                <text variable="volume"/>
                <text variable="container-title"/>
                <text variable="page-first"/>
              </group>
            </group>
          </if>
          <else>
            <group delimiter=" ">
              <text variable="volume"/>
              <text variable="container-title"/>
              <choose>
                <if variable="section">
                  <group delimiter=" ">
                    <!-- Change to label variable="section" as that becomes available -->
                    <text term="section" form="symbol"/>
                    <text variable="section"/>
                  </group>
                </if>
                <else>
                  <text variable="page-first"/>
                </else>
              </choose>
            </group>
          </else>
        </choose>
      </else-if>
      <else-if type="treaty">
        <group delimiter=" ">
          <number variable="volume"/>
          <text variable="container-title"/>
          <choose>
            <if variable="page page-first" match="any">
              <text variable="page-first"/>
            </if>
            <else>
              <group delimiter=" ">
                <!-- Replace with label variable="number" if that becomes available -->
                <text term="issue" form="short" text-case="capitalize-first"/>
                <text variable="number"/>
              </group>
            </else>
          </choose>
        </group>
      </else-if>
    </choose>
  </macro>
  <macro name="citation-locator">
    <group delimiter=" ">
      <choose>
        <if locator="chapter">
          <label variable="locator" text-case="capitalize-first"/>
        </if>
        <else>
          <label variable="locator" form="short"/>
        </else>
      </choose>
      <text variable="locator"/>
    </group>
  </macro>
  <citation et-al-min="3" et-al-use-first="1" disambiguate-add-year-suffix="true" disambiguate-add-names="true" disambiguate-add-givenname="true" collapse="year" givenname-disambiguation-rule="primary-name">
    <sort>
      <key macro="author-bib" names-min="3" names-use-first="1"/>
      <key macro="date-sort-group"/>
      <key macro="date-sort-date" sort="ascending"/>
      <key variable="status"/>
    </sort>
    <layout prefix="(" suffix=")" delimiter="; ">
      <group delimiter=", ">
        <text macro="author-intext"/>
        <text macro="date-intext"/>
        <text macro="citation-locator"/>
      </group>
    </layout>
  </citation>
  <bibliography hanging-indent="true" et-al-min="21" et-al-use-first="19" et-al-use-last="true" entry-spacing="0" line-spacing="2">
    <sort>
      <key macro="author-bib"/>
      <key macro="date-sort-group"/>
      <key macro="date-sort-date" sort="ascending"/>
      <key variable="status"/>
      <key macro="title"/>
    </sort>
    <layout>
      <choose>
        <if type="bill legal_case legislation treaty" match="any">
          <!-- Legal items have different orders and delimiters -->
          <choose>
            <if variable="DOI URL" match="any">
              <text macro="legal-cites"/>
            </if>
            <else>
              <text macro="legal-cites" suffix="."/>
            </else>
          </choose>
        </if>
        <else>
          <group delimiter=" ">
            <group delimiter=". " suffix=".">
              <text macro="author-bib"/>
              <text macro="date-bib"/>
              <text macro="title-and-descriptions"/>
              <text macro="container"/>
              <text macro="event"/>
              <text macro="publisher"/>
            </group>
            <text macro="access"/>
            <text macro="publication-history"/>
          </group>
        </else>
      </choose>
    </layout>
  </bibliography>
</style>


================================================
FILE: paper/JOSS_files/paper.Rmd
================================================
---
title: "datawizard: An R Package for Easy Data Preparation and Statistical Transformations"
tags:
  - R
  - easystats
authors:
- affiliation: 1
  name: Indrajeet Patil
  orcid: 0000-0003-1995-6531
- affiliation: 2
  name: Dominique Makowski
  orcid: 0000-0001-5375-9967
- affiliation: 3
  name: Mattan S. Ben-Shachar
  orcid: 0000-0002-4287-4801
- affiliation: 4
  name: Brenton M. Wiernik^[Brenton Wiernik is currently an independent researcher and Research Scientist at Meta, Demography and Survey Science. The current work was done in an independent capacity.]
  orcid: 0000-0001-9560-6336
- affiliation: 5
  name: Etienne Bacher
  orcid: 0000-0002-9271-5075 
- affiliation: 6
  name: Daniel Lüdecke
  orcid: 0000-0002-8895-3206
  
affiliations:
- index: 1
  name: cynkra Analytics GmbH, Germany
- index: 2
  name: Nanyang Technological University, Singapore
- index: 3
  name: Ben-Gurion University of the Negev, Israel
- index: 4
  name: Independent Researcher
- index: 5
  name: Luxembourg Institute of Socio-Economic Research (LISER), Luxembourg
- index: 6
  name: University Medical Center Hamburg-Eppendorf, Germany
    
date: "`r Sys.Date()`"
bibliography: paper.bib
output: rticles::joss_article
csl: apa.csl
journal: JOSS
link-citations: yes
---

```{r, warning=FALSE, message=FALSE, echo=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  out.width = "100%",
  dpi = 300,
  comment = "#>",
  message = FALSE,
  warning = FALSE
)

library(datawizard)
set.seed(2016)
```

# Summary

The `{datawizard}` package for the R programming language [@base2021] provides a lightweight toolbox to assist in key steps involved in any data analysis workflow: (1) wrangling the raw data to get it in the needed form, (2) applying preprocessing steps and statistical transformations, and (3) compute statistical summaries of data properties and distributions. Therefore, it can be a valuable tool for R users and developers looking for a lightweight option for data preparation.

# Statement of Need

The `{datawizard}` package is part of `{easystats}`, a collection of R packages designed to make statistical analysis easier (@Ben-Shachar2020, @Lüdecke2020parameters, @Lüdecke2020performance, @Lüdecke2021see, @Lüdecke2019, @Makowski2019, @Makowski2020). As this ecosystem follows a "0-external-hard-dependency" policy, a data manipulation package that relies only on base R needed to be created. In effect, `{datawizard}` provides a data processing backend for this entire ecosystem. 
In addition to its usefulness to the `{easystats}` ecosystem, it also provides *an* option for R users and package developers if they wish to keep their (recursive) dependency weight to a minimum (for other options, see @Dowle2021, @Eastwood2021).

Because `{datawizard}` is also meant to be used and adopted easily by a wide range of users, its workflow and syntax are designed to be similar to `{tidyverse}` [@Wickham2019], a widely used ecosystem of R packages. Thus, users familiar with the `{tidyverse}` can easily translate their knowledge and make full use of `{datawizard}`.

In addition to being a lightweight solution to clean messy data, `{datawizard}` also provides helpers for the other important step of data analysis: applying statistical transformations to the cleaned data while setting up statistical models. This includes various types of data standardization, normalization, rank-transformation, and adjustment. These transformations, although widely used, are not currently collectively implemented in a package in the R ecosystem, so `{datawizard}` can help new R users in finding the transformation they need.

Lastly, `{datawizard}` also provides a toolbox to create detailed summaries of data properties and distributions (e.g., tables of descriptive statistics for each variable). This is a common step in data analysis, but it is not available in base R or many modeling packages, so its inclusion makes `{datawizard}` a one-stop-shop for data preparation tasks.

# Features

## Data Preparation

The raw data is rarely in a state that it can be directly fed into a statistical model. It often needs to be modified in various ways. For example, columns need to be renamed or reshaped, certain portions of the data need to be filtered out, data scattered across multiple tables needs to be joined, etc. 

`{datawizard}` provides various functions for cleaning and preparing data (see Table 1).

| Function         | Operation                             |
| :--------------- | :------------------------------------ |
| `data_filter()`  | to select only certain *observations* |
| `data_select()`  | to select only certain *variables*    |
| `data_extract()` | to extract a single *variable*        |
| `data_rename()`  | to rename variables                   |
| `data_to_long()` | to convert data from wide to long     |
| `data_to_wide()` | to convert data from long to wide     |
| `data_join()`    | to join two data frames               |
| ...              | ...                                   |

Table: The table below lists a few key functions offered by `{datawizard}` for data wrangling. To see the full list, see the package website: <https://easystats.github.io/datawizard/>

We will look at one example function that converts data in wide format to tidy/long format:

```{r}
stocks <- data.frame(
  time = as.Date("2009-01-01") + 0:4,
  X = rnorm(5, 0, 1),
  Y = rnorm(5, 0, 2)
)

stocks

data_to_long(
  stocks,
  select = -c("time"),
  names_to = "stock",
  values_to = "price"
)
```

## Statistical Transformations

Even after getting the raw data in the needed format, we may need to transform certain variables further to meet requirements imposed by a statistical test.

`{datawizard}` provides a rich collection of such functions for transforming variables (see Table 2).

| Function          | Operation                                    |
| :---------------- | :------------------------------------------- |
| `standardize()`   | to center and scale data                     |
| `normalize()`     | to scale variables to 0-1 range              |
| `adjust()`        | to adjust data for effect of other variables |
| `slide()`         | to shift numeric value range                 |
| `ranktransform()` | to convert numeric values to integer ranks   |
| ...               | ...                                          |

Table: The table below lists a few key functions offered by `{datawizard}` for data transformations. To see the full list, see the package website: <https://easystats.github.io/datawizard/>

We will look at one example function that standardizes (i.e. centers and scales) data so that it can be expressed in terms of standard deviation:

```{r}
d <- data.frame(
  a = c(-2, -1, 0, 1, 2),
  b = c(3, 4, 5, 6, 7)
)

standardize(d, center = c(3, 4), scale = c(2, 4))
```

## Summaries of Data Properties and Distributions

The workhorse function to get a comprehensive summary of data properties is `describe_distribution()`, which combines a set of indices (e.g., measures of centrality, dispersion, range, skewness, kurtosis, etc.) computed by other functions in `{datawizard}`.

```{r eval=FALSE}
describe_distribution(mtcars)
```

```{r echo=FALSE, eval=TRUE, results="asis"}
library(kableExtra)
options(digits = 2)
kbl(describe_distribution(mtcars), format = "latex", booktabs = TRUE, linesep = "") 
```

# Licensing and Availability

`{datawizard}` is licensed under the GNU General Public License (v3.0), with all source code openly developed and stored on GitHub (<https://github.com/easystats/datawizard>), along with a corresponding issue tracker for bug reporting and feature enhancements. In the spirit of honest and open science, we encourage requests, tips for fixes, feature updates, as well as general questions and concerns via direct interaction with contributors and developers.

# Acknowledgments

`{datawizard}` is part of the collaborative [*easystats*](https://easystats.github.io/easystats/) ecosystem. Thus, we thank the [members of easystats](https://github.com/orgs/easystats/people) as well as the users.

# References


================================================
FILE: paper/JOSS_files/paper.bib
================================================
@Article{Ben-Shachar2020,
    title = {{e}ffectsize: Estimation of Effect Size Indices and Standardized Parameters},
    author = {Mattan S. Ben-Shachar and Daniel Lüdecke and Dominique Makowski},
    year = {2020},
    journal = {Journal of Open Source Software},
    volume = {5},
    number = {56},
    pages = {2815},
    publisher = {The Open Journal},
    doi = {10.21105/joss.02815},
    url = {https://doi.org/10.21105/joss.02815},
  }


@Article{Lüdecke2020parameters,
    title = {Extracting, Computing and Exploring the Parameters of Statistical Models using {R}.},
    volume = {5},
    doi = {10.21105/joss.02445},
    number = {53},
    journal = {Journal of Open Source Software},
    author = {Daniel Lüdecke and Mattan S. Ben-Shachar and Indrajeet Patil and Dominique Makowski},
    year = {2020},
    pages = {2445},
  }

 @Article{Lüdecke2021see,
    title = {{see}: An {R} Package for Visualizing Statistical Models},
    author = {Daniel Lüdecke and Indrajeet Patil and Mattan S. Ben-Shachar and Brenton M. Wiernik and Philip Waggoner and Dominique Makowski},
    journal = {Journal of Open Source Software},
    year = {2021},
    volume = {6},
    number = {64},
    pages = {3393},
    doi = {10.21105/joss.03393},
  }

@Article{Lüdecke2020performance,
    title = {{performance}: An {R} Package for Assessment, Comparison and Testing of Statistical Models},
    author = {Daniel Lüdecke and Mattan S. Ben-Shachar and Indrajeet Patil and Philip Waggoner and Dominique Makowski},
    year = {2021},
    journal = {Journal of Open Source Software},
    volume = {6},
    number = {60},
    pages = {3139},
    doi = {10.21105/joss.03139},
  }


@Article{Lüdecke2019,
    title = {{insight}: A Unified Interface to Access Information from Model Objects in {R}.},
    volume = {4},
    doi = {10.21105/joss.01412},
    number = {38},
    journal = {Journal of Open Source Software},
    author = {Daniel Lüdecke and Philip Waggoner and Dominique Makowski},
    year = {2019},
    pages = {1412},
  }


@Article{Makowski2020,
    title = {Methods and Algorithms for Correlation Analysis in {R}.},
    author = {Dominique Makowski and Mattan S. Ben-Shachar and Indrajeet Patil and Daniel Lüdecke},
    doi = {10.21105/joss.02306},
    year = {2020},
    journal = {Journal of Open Source Software},
    number = {51},
    volume = {5},
    pages = {2306},
    url = {https://joss.theoj.org/papers/10.21105/joss.02306},
  }

@Article{Patil2021,
    doi = {10.21105/joss.03167},
    url = {https://doi.org/10.21105/joss.03167},
    year = {2021},
    publisher = {{The Open Journal}},
    volume = {6},
    number = {61},
    pages = {3167},
    author = {Indrajeet Patil},
    title = {{Visualizations with statistical details: The {'ggstatsplot'} approach}},
    journal = {{Journal of Open Source Software}},
  }

@Article{Makowski2019,
    title = {{bayestestR}: Describing Effects and their Uncertainty, Existence and Significance within the {B}ayesian Framework.},
    author = {Dominique Makowski and Mattan S. Ben-Shachar and Daniel Lüdecke},
    journal = {Journal of Open Source Software},
    doi = {10.21105/joss.01541},
    year = {2019},
    number = {40},
    volume = {4},
    pages = {1541},
    url = {https://joss.theoj.org/papers/10.21105/joss.01541},
  }

@Article{Wickham2019,
    title = {Welcome to the {tidyverse}},
    author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
    year = {2019},
    journal = {Journal of Open Source Software},
    volume = {4},
    number = {43},
    pages = {1686},
    doi = {10.21105/joss.01686},
  }

@Article{Makowski2020modelbased,
    title = {Estimation of Model-Based Predictions, Contrasts and Means.},
    author = {Dominique Makowski and Mattan S. Ben-Shachar and Indrajeet Patil and Daniel Lüdecke},
    journal = {CRAN},
    year = {2020},
    url = {https://github.com/easystats/modelbased},
  }

@Manual{base2021,
    title = {{R}: A Language and Environment for Statistical Computing},
    author = {{R Core Team}},
    organization = {R Foundation for Statistical Computing},
    address = {Vienna, Austria},
    year = {2021},
    url = {https://www.R-project.org/},
  }

  @Manual{Eastwood2021,
    title = {poorman: A Poor Man's Dependency Free Recreation of 'dplyr'},
    author = {Nathan Eastwood},
    year = {2021},
    note = {R package version 0.2.5},
    url = {https://CRAN.R-project.org/package=poorman},
  }

@Manual{Dowle2021,
    title = {data.table: Extension of `data.frame`},
    author = {Matt Dowle and Arun Srinivasan},
    year = {2021},
    note = {R package version 1.14.2},
    url = {https://CRAN.R-project.org/package=data.table},
  }


================================================
FILE: paper/JOSS_files/paper.log
================================================
This is XeTeX, Version 3.141592653-2.6-0.999994 (TeX Live 2022) (preloaded format=xelatex 2022.9.27)  4 OCT 2022 17:54
entering extended mode
 restricted \write18 enabled.
 %&-line parsing enabled.
**paper.tex
(./paper.tex
LaTeX2e <2022-06-01> patch level 5
L3 programming layer <2022-08-30> (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/article.cls
Document Class: article 2021/10/04 v1.4n Standard LaTeX document class
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/size10.clo
File: size10.clo 2021/10/04 v1.4n Standard LaTeX file (size option)
)
\c@part=\count181
\c@section=\count182
\c@subsection=\count183
\c@subsubsection=\count184
\c@paragraph=\count185
\c@subparagraph=\count186
\c@figure=\count187
\c@table=\count188
\abovecaptionskip=\skip47
\belowcaptionskip=\skip48
\bibindent=\dimen138
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/marginnote/marginnote.sty
Package: marginnote 2018/08/09 v1.4b non floating margin notes for LaTeX
\c@mn@abspage=\count189
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/graphicx.sty
Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/keyval.sty
Package: keyval 2022/05/29 v1.15 key=value parser (DPC)
\KV@toks@=\toks16
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/graphics.sty
Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/trig.sty
Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics-cfg/graphics.cfg
File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
)
Package graphics Info: Driver file: xetex.def on input line 107.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics-def/xetex.def
File: xetex.def 2022/09/22 v5.0n Graphics/color driver for xetex
\stockwidth=\dimen139
\stockheight=\dimen140
))
\Gin@req@height=\dimen141
\Gin@req@width=\dimen142
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/xcolor/xcolor.sty
Package: xcolor 2022/06/12 v2.14 LaTeX color extensions (UK)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics-cfg/color.cfg
File: color.cfg 2016/01/02 v1.6 sample color configuration
)
Package xcolor Info: Driver file: xetex.def on input line 227.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/mathcolor.ltx)
Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1353.
Package xcolor Info: Model `RGB' extended on input line 1369.
Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1371.
Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1372.
Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1373.
Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1374.
Package xcolor Info: Model `Gray' substituted by `gray' on input line 1375.
Package xcolor Info: Model `wave' substituted by `hsb' on input line 1376.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/preprint/authblk.sty
Package: authblk 2001/02/27 1.3 (PWD)
\affilsep=\skip49
\@affilsep=\skip50
\c@Maxaffil=\count190
\c@authors=\count191
\c@affil=\count192
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/etoolbox/etoolbox.sty
Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW)
\etb@tempcnta=\count193
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/titlesec/titlesec.sty
Package: titlesec 2021/07/05 v2.14 Sectioning titles
\ttl@box=\box51
\beforetitleunit=\skip51
\aftertitleunit=\skip52
\ttl@plus=\dimen143
\ttl@minus=\dimen144
\ttl@toksa=\toks17
\titlewidth=\dimen145
\titlewidthlast=\dimen146
\titlewidthfirst=\dimen147
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/calc.sty
Package: calc 2017/05/25 v4.3 Infix arithmetic (KKT,FJ)
\calc@Acount=\count194
\calc@Bcount=\count195
\calc@Adimen=\dimen148
\calc@Bdimen=\dimen149
\calc@Askip=\skip53
\calc@Bskip=\skip54
LaTeX Info: Redefining \setlength on input line 80.
LaTeX Info: Redefining \addtolength on input line 81.
\calc@Ccount=\count196
\calc@Cskip=\skip55
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex
\pgfutil@everybye=\toks18
\pgfutil@tempdima=\dimen150
\pgfutil@tempdimb=\dimen151
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-common-lists.tex)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def
\pgfutil@abb=\box52
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/pgf.revision.tex)
Package: pgfrcs 2021/05/15 v3.1.9a (3.1.9a)
))
Package: pgf 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex
Package: pgfsys 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex
\pgfkeys@pathtoks=\toks19
\pgfkeys@temptoks=\toks20
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeysfiltered.code.tex
\pgfkeys@tmptoks=\toks21
))
\pgf@x=\dimen152
\pgf@y=\dimen153
\pgf@xa=\dimen154
\pgf@ya=\dimen155
\pgf@xb=\dimen156
\pgf@yb=\dimen157
\pgf@xc=\dimen158
\pgf@yc=\dimen159
\pgf@xd=\dimen160
\pgf@yd=\dimen161
\w@pgf@writea=\write3
\r@pgf@reada=\read2
\c@pgf@counta=\count197
\c@pgf@countb=\count198
\c@pgf@countc=\count199
\c@pgf@countd=\count266
\t@pgf@toka=\toks22
\t@pgf@tokb=\toks23
\t@pgf@tokc=\toks24
\pgf@sys@id@count=\count267
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg
File: pgf.cfg 2021/05/15 v3.1.9a (3.1.9a)
)
Driver file for pgf: pgfsys-xetex.def
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-xetex.def
File: pgfsys-xetex.def 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-dvipdfmx.def
File: pgfsys-dvipdfmx.def 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def
File: pgfsys-common-pdf.def 2021/05/15 v3.1.9a (3.1.9a)
)
\pgfsys@objnum=\count268
))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex
File: pgfsyssoftpath.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgfsyssoftpath@smallbuffer@items=\count269
\pgfsyssoftpath@bigbuffer@items=\count270
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex
File: pgfsysprotocol.code.tex 2021/05/15 v3.1.9a (3.1.9a)
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex
Package: pgfcore 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex
\pgfmath@dimen=\dimen162
\pgfmath@count=\count271
\pgfmath@box=\box53
\pgfmath@toks=\toks25
\pgfmath@stack@operand=\toks26
\pgfmath@stack@operation=\toks27
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex
\c@pgfmathroundto@lastzeros=\count272
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex
File: pgfcorepoints.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgf@picminx=\dimen163
\pgf@picmaxx=\dimen164
\pgf@picminy=\dimen165
\pgf@picmaxy=\dimen166
\pgf@pathminx=\dimen167
\pgf@pathmaxx=\dimen168
\pgf@pathminy=\dimen169
\pgf@pathmaxy=\dimen170
\pgf@xx=\dimen171
\pgf@xy=\dimen172
\pgf@yx=\dimen173
\pgf@yy=\dimen174
\pgf@zx=\dimen175
\pgf@zy=\dimen176
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex
File: pgfcorepathconstruct.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgf@path@lastx=\dimen177
\pgf@path@lasty=\dimen178
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex
File: pgfcorepathusage.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgf@shorten@end@additional=\dimen179
\pgf@shorten@start@additional=\dimen180
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex
File: pgfcorescopes.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgfpic=\box54
\pgf@hbox=\box55
\pgf@layerbox@main=\box56
\pgf@picture@serial@count=\count273
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex
File: pgfcoregraphicstate.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgflinewidth=\dimen181
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex
File: pgfcoretransformations.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgf@pt@x=\dimen182
\pgf@pt@y=\dimen183
\pgf@pt@temp=\dimen184
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex
File: pgfcorequick.code.tex 2021/05/15 v3.1.9a (3.1.9a)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex
File: pgfcoreobjects.code.tex 2021/05/15 v3.1.9a (3.1.9a)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex
File: pgfcorepathprocessing.code.tex 2021/05/15 v3.1.9a (3.1.9a)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex
File: pgfcorearrows.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgfarrowsep=\dimen185
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex
File: pgfcoreshade.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgf@max=\dimen186
\pgf@sys@shading@range@num=\count274
\pgf@shadingcount=\count275
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex
File: pgfcoreimage.code.tex 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex
File: pgfcoreexternal.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgfexternal@startupbox=\box57
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex
File: pgfcorelayers.code.tex 2021/05/15 v3.1.9a (3.1.9a)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex
File: pgfcoretransparency.code.tex 2021/05/15 v3.1.9a (3.1.9a)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex
File: pgfcorepatterns.code.tex 2021/05/15 v3.1.9a (3.1.9a)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex
File: pgfcorerdf.code.tex 2021/05/15 v3.1.9a (3.1.9a)
))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex
File: pgfmoduleshapes.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgfnodeparttextbox=\box58
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex
File: pgfmoduleplot.code.tex 2021/05/15 v3.1.9a (3.1.9a)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty
Package: pgfcomp-version-0-65 2021/05/15 v3.1.9a (3.1.9a)
\pgf@nodesepstart=\dimen187
\pgf@nodesepend=\dimen188
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty
Package: pgfcomp-version-1-18 2021/05/15 v3.1.9a (3.1.9a)
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgffor.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pgf/math/pgfmath.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex
Package: pgffor 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex)
\pgffor@iter=\dimen189
\pgffor@skip=\dimen190
\pgffor@stack=\toks28
\pgffor@toks=\toks29
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex
Package: tikz 2021/05/15 v3.1.9a (3.1.9a)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex
File: pgflibraryplothandlers.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgf@plot@mark@count=\count276
\pgfplotmarksize=\dimen191
)
\tikz@lastx=\dimen192
\tikz@lasty=\dimen193
\tikz@lastxsaved=\dimen194
\tikz@lastysaved=\dimen195
\tikz@lastmovetox=\dimen196
\tikz@lastmovetoy=\dimen197
\tikzleveldistance=\dimen198
\tikzsiblingdistance=\dimen199
\tikz@figbox=\box59
\tikz@figbox@bg=\box60
\tikz@tempbox=\box61
\tikz@tempbox@bg=\box62
\tikztreelevel=\count277
\tikznumberofchildren=\count278
\tikznumberofcurrentchild=\count279
\tikz@fig@count=\count280
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex
File: pgfmodulematrix.code.tex 2021/05/15 v3.1.9a (3.1.9a)
\pgfmatrixcurrentrow=\count281
\pgfmatrixcurrentcolumn=\count282
\pgf@matrix@numberofcolumns=\count283
)
\tikz@expandcount=\count284
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex
File: tikzlibrarytopaths.code.tex 2021/05/15 v3.1.9a (3.1.9a)
))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/hyperref.sty
Package: hyperref 2022-09-22 v7.00t Hypertext links for LaTeX
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty
Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/iftex.sty
Package: iftex 2022/02/03 v1.0f TeX engine tests
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty
Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/infwarerr/infwarerr.sty
Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO)
)
Package pdftexcmds Info: \pdf@primitive is available.
Package pdftexcmds Info: \pdf@ifprimitive is available.
Package pdftexcmds Info: \pdfdraftmode not found.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/kvsetkeys/kvsetkeys.sty
Package: kvsetkeys 2019/12/15 v1.18 Key value parser (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty
Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/pdfescape/pdfescape.sty
Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hycolor/hycolor.sty
Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty
Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/auxhook/auxhook.sty
Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/nameref.sty
Package: nameref 2022-05-17 v2.50 Cross-referencing by name of section
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/refcount/refcount.sty
Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty
Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/kvoptions/kvoptions.sty
Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO)
))
\c@section@level=\count285
)
\@linkdim=\dimen256
\Hy@linkcounter=\count286
\Hy@pagecounter=\count287
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/pd1enc.def
File: pd1enc.def 2022-09-22 v7.00t Hyperref: PDFDocEncoding definition (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/intcalc/intcalc.sty
Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO)
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/etexcmds/etexcmds.sty
Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO)
)
\Hy@SavedSpaceFactor=\count288
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/puenc.def
File: puenc.def 2022-09-22 v7.00t Hyperref: PDF Unicode definition (HO)
)
Package hyperref Info: Hyper figures OFF on input line 4162.
Package hyperref Info: Link nesting OFF on input line 4167.
Package hyperref Info: Hyper index ON on input line 4170.
Package hyperref Info: Plain pages OFF on input line 4177.
Package hyperref Info: Backreferencing OFF on input line 4182.
Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
Package hyperref Info: Bookmarks ON on input line 4410.
\c@Hy@tempcnt=\count289
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/url/url.sty
\Urlmuskip=\muskip16
Package: url 2013/09/16  ver 3.4  Verb mode for urls, etc.
)
LaTeX Info: Redefining \url on input line 4748.
\XeTeXLinkMargin=\dimen257
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/bitset/bitset.sty
Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty
Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO)
))
\Fld@menulength=\count290
\Field@Width=\dimen258
\Fld@charsize=\dimen259
Package hyperref Info: Hyper figures OFF on input line 6027.
Package hyperref Info: Link nesting OFF on input line 6032.
Package hyperref Info: Hyper index ON on input line 6035.
Package hyperref Info: backreferencing OFF on input line 6042.
Package hyperref Info: Link coloring OFF on input line 6047.
Package hyperref Info: Link coloring with OCG OFF on input line 6052.
Package hyperref Info: PDF/A mode OFF on input line 6057.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/atbegshi-ltx.sty
Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi
package with kernel methods
)
\Hy@abspage=\count291
\c@Item=\count292
\c@Hfootnote=\count293
)
Package hyperref Info: Driver (autodetected): hxetex.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/hyperref/hxetex.def
File: hxetex.def 2022-09-22 v7.00t Hyperref driver for XeTeX
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/stringenc/stringenc.sty
Package: stringenc 2019/11/29 v1.12 Convert strings between diff. encodings (HO)
)
\pdfm@box=\box63
\c@Hy@AnnotLevel=\count294
\HyField@AnnotCount=\count295
\Fld@listcount=\count296
\c@bookmark@seq@number=\count297
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty
Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/atveryend-ltx.sty
Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend package
with kernel methods
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty
Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO)
)
Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 285.
)
\Hy@SectionHShift=\skip56
)
Package hyperref Info: Option `colorlinks' set `true' on input line 12.
Package hyperref Info: Option `breaklinks' set `true' on input line 12.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/caption/caption.sty
Package: caption 2022/03/01 v3.6b Customizing captions (AR)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/caption/caption3.sty
Package: caption3 2022/03/17 v2.3b caption3 kernel (AR)
\caption@tempdima=\dimen260
\captionmargin=\dimen261
\caption@leftmargin=\dimen262
\caption@rightmargin=\dimen263
\caption@width=\dimen264
\caption@indent=\dimen265
\caption@parindent=\dimen266
\caption@hangindent=\dimen267
Package caption Info: Standard document class detected.
)
\c@caption@flags=\count298
\c@continuedfloat=\count299
Package caption Info: hyperref package is loaded.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tcolorbox/tcolorbox.sty
Package: tcolorbox 2022/06/24 version 5.1.1 text color boxes
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/verbatim.sty
Package: verbatim 2020-07-07 v1.5u LaTeX2e package for verbatim enhancements
\every@verbatim=\toks30
\verbatim@line=\toks31
\verbatim@in@stream=\read3
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/environ/environ.sty
Package: environ 2014/05/04 v0.3 A new way to define environments
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/trimspaces/trimspaces.sty
Package: trimspaces 2009/09/17 v1.1 Trim spaces around a token list
)
\@envbody=\toks32
)
\tcb@titlebox=\box64
\tcb@upperbox=\box65
\tcb@lowerbox=\box66
\tcb@phantombox=\box67
\c@tcbbreakpart=\count300
\c@tcblayer=\count301
\c@tcolorbox@number=\count302
\tcb@temp=\box68
\tcb@temp=\box69
\tcb@temp=\box70
\tcb@temp=\box71
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/amssymb.sty
Package: amssymb 2013/01/14 v3.01 AMS font symbols
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/amsfonts.sty
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
\@emptytoks=\toks33
\symAMSa=\mathgroup4
\symAMSb=\mathgroup5
LaTeX Font Info:    Redeclaring math symbol \hbar on input line 98.
LaTeX Font Info:    Overwriting math alphabet `\mathfrak' in version `bold'
(Font)                  U/euf/m/n --> U/euf/b/n on input line 106.
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsmath.sty
Package: amsmath 2022/04/08 v2.17n AMS math features
\@mathmargin=\skip57
For additional information on amsmath, use the `?' option.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amstext.sty
Package: amstext 2021/08/26 v2.01 AMS text
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsgen.sty
File: amsgen.sty 1999/11/30 v2.0 generic functions
\@emptytoks=\toks34
\ex@=\dimen268
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsbsy.sty
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
\pmbraise@=\dimen269
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsmath/amsopn.sty
Package: amsopn 2022/04/08 v2.04 operator names
)
\inf@bad=\count303
LaTeX Info: Redefining \frac on input line 234.
\uproot@=\count304
\leftroot@=\count305
LaTeX Info: Redefining \overline on input line 399.
LaTeX Info: Redefining \colon on input line 410.
\classnum@=\count306
\DOTSCASE@=\count307
LaTeX Info: Redefining \ldots on input line 496.
LaTeX Info: Redefining \dots on input line 499.
LaTeX Info: Redefining \cdots on input line 620.
\Mathstrutbox@=\box72
\strutbox@=\box73
LaTeX Info: Redefining \big on input line 722.
LaTeX Info: Redefining \Big on input line 723.
LaTeX Info: Redefining \bigg on input line 724.
LaTeX Info: Redefining \Bigg on input line 725.
\big@size=\dimen270
LaTeX Font Info:    Redeclaring font encoding OML on input line 743.
LaTeX Font Info:    Redeclaring font encoding OMS on input line 744.
\macc@depth=\count308
LaTeX Info: Redefining \bmod on input line 905.
LaTeX Info: Redefining \pmod on input line 910.
LaTeX Info: Redefining \smash on input line 940.
LaTeX Info: Redefining \relbar on input line 970.
LaTeX Info: Redefining \Relbar on input line 971.
\c@MaxMatrixCols=\count309
\dotsspace@=\muskip17
\c@parentequation=\count310
\dspbrk@lvl=\count311
\tag@help=\toks35
\row@=\count312
\column@=\count313
\maxfields@=\count314
\andhelp@=\toks36
\eqnshift@=\dimen271
\alignsep@=\dimen272
\tagshift@=\dimen273
\tagwidth@=\dimen274
\totwidth@=\dimen275
\lineht@=\dimen276
\@envbody=\toks37
\multlinegap=\skip58
\multlinetaggap=\skip59
\mathdisplay@stack=\toks38
LaTeX Info: Redefining \[ on input line 2953.
LaTeX Info: Redefining \] on input line 2954.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifxetex.sty
Package: ifxetex 2019/10/25 v0.7 ifxetex legacy package. Use iftex instead.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifluatex.sty
Package: ifluatex 2019/10/25 v1.5 ifluatex legacy package. Use iftex instead.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/seqsplit/seqsplit.sty
Package: seqsplit 2006/08/07 v0.1 Splitting long sequences (DNA, RNA, proteins, etc.) 
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/fixltx2e.sty
Package: fixltx2e 2016/12/29 v2.1a fixes to LaTeX (obsolete)
Applying: [2015/01/01] Old fixltx2e package on input line 46.

Package fixltx2e Warning: fixltx2e is not required with releases after 2015
(fixltx2e)                All fixes are now in the LaTeX kernel.
(fixltx2e)                See the latexrelease package for details.

Already applied: [0000/00/00] Old fixltx2e package on input line 53.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.sty
Package: biblatex 2022/07/12 v3.18b programmable bibliographies (PK/MW)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/logreq/logreq.sty
Package: logreq 2010/08/04 v1.0 xml request logger
\lrq@indent=\count315
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/logreq/logreq.def
File: logreq.def 2010/08/04 v1.0 logreq spec v1.0
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/ifthen.sty
Package: ifthen 2022/04/13 v1.1d Standard LaTeX ifthen package (DPC)
)
\c@tabx@nest=\count316
\c@listtotal=\count317
\c@listcount=\count318
\c@liststart=\count319
\c@liststop=\count320
\c@citecount=\count321
\c@citetotal=\count322
\c@multicitecount=\count323
\c@multicitetotal=\count324
\c@instcount=\count325
\c@maxnames=\count326
\c@minnames=\count327
\c@maxitems=\count328
\c@minitems=\count329
\c@citecounter=\count330
\c@maxcitecounter=\count331
\c@savedcitecounter=\count332
\c@uniquelist=\count333
\c@uniquename=\count334
\c@refsection=\count335
\c@refsegment=\count336
\c@maxextratitle=\count337
\c@maxextratitleyear=\count338
\c@maxextraname=\count339
\c@maxextradate=\count340
\c@maxextraalpha=\count341
\c@abbrvpenalty=\count342
\c@highnamepenalty=\count343
\c@lownamepenalty=\count344
\c@maxparens=\count345
\c@parenlevel=\count346
\blx@tempcnta=\count347
\blx@tempcntb=\count348
\blx@tempcntc=\count349
\c@blx@maxsection=\count350
\blx@maxsegment@0=\count351
\blx@notetype=\count352
\blx@parenlevel@text=\count353
\blx@parenlevel@foot=\count354
\blx@sectionciteorder@0=\count355
\blx@sectionciteorderinternal@0=\count356
\blx@entrysetcounter=\count357
\blx@biblioinstance=\count358
\labelnumberwidth=\skip60
\labelalphawidth=\skip61
\biblabelsep=\skip62
\bibitemsep=\skip63
\bibnamesep=\skip64
\bibinitsep=\skip65
\bibparsep=\skip66
\bibhang=\skip67
\blx@bcfin=\read4
\blx@bcfout=\write4
\blx@langwohyphens=\language3
\c@mincomprange=\count359
\c@maxcomprange=\count360
\c@mincompwidth=\count361
Package biblatex Info: Trying to load biblatex default data model...
Package biblatex Info: ... file 'blx-dm.def' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-dm.def
File: blx-dm.def 2022/07/12 v3.18b biblatex localization (PK/MW)
)
Package biblatex Info: Trying to load biblatex custom data model...
Package biblatex Info: ... file 'biblatex-dm.cfg' not found.
\c@afterword=\count362
\c@savedafterword=\count363
\c@annotator=\count364
\c@savedannotator=\count365
\c@author=\count366
\c@savedauthor=\count367
\c@bookauthor=\count368
\c@savedbookauthor=\count369
\c@commentator=\count370
\c@savedcommentator=\count371
\c@editor=\count372
\c@savededitor=\count373
\c@editora=\count374
\c@savededitora=\count375
\c@editorb=\count376
\c@savededitorb=\count377
\c@editorc=\count378
\c@savededitorc=\count379
\c@foreword=\count380
\c@savedforeword=\count381
\c@holder=\count382
\c@savedholder=\count383
\c@introduction=\count384
\c@savedintroduction=\count385
\c@namea=\count386
\c@savednamea=\count387
\c@nameb=\count388
\c@savednameb=\count389
\c@namec=\count390
\c@savednamec=\count391
\c@translator=\count392
\c@savedtranslator=\count393
\c@shortauthor=\count394
\c@savedshortauthor=\count395
\c@shorteditor=\count396
\c@savedshorteditor=\count397
\c@labelname=\count398
\c@savedlabelname=\count399
\c@institution=\count400
\c@savedinstitution=\count401
\c@lista=\count402
\c@savedlista=\count403
\c@listb=\count404
\c@savedlistb=\count405
\c@listc=\count406
\c@savedlistc=\count407
\c@listd=\count408
\c@savedlistd=\count409
\c@liste=\count410
\c@savedliste=\count411
\c@listf=\count412
\c@savedlistf=\count413
\c@location=\count414
\c@savedlocation=\count415
\c@organization=\count416
\c@savedorganization=\count417
\c@origlocation=\count418
\c@savedoriglocation=\count419
\c@origpublisher=\count420
\c@savedorigpublisher=\count421
\c@publisher=\count422
\c@savedpublisher=\count423
\c@language=\count424
\c@savedlanguage=\count425
\c@origlanguage=\count426
\c@savedoriglanguage=\count427
\c@pageref=\count428
\c@savedpageref=\count429
\shorthandwidth=\skip68
\shortjournalwidth=\skip69
\shortserieswidth=\skip70
\shorttitlewidth=\skip71
\shortauthorwidth=\skip72
\shorteditorwidth=\skip73
\locallabelnumberwidth=\skip74
\locallabelalphawidth=\skip75
\localshorthandwidth=\skip76
\localshortjournalwidth=\skip77
\localshortserieswidth=\skip78
\localshorttitlewidth=\skip79
\localshortauthorwidth=\skip80
\localshorteditorwidth=\skip81
Package biblatex Info: Trying to load enhanced support for Unicode engines...
Package biblatex Info: ... file 'blx-unicode.def' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-unicode.def)
Package biblatex Info: Trying to load compatibility code...
Package biblatex Info: ... file 'blx-compat.def' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-compat.def
File: blx-compat.def 2022/07/12 v3.18b biblatex compatibility (PK/MW)
)
Package biblatex Info: Trying to load generic definitions...
Package biblatex Info: ... file 'biblatex.def' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.def
File: biblatex.def 2022/07/12 v3.18b biblatex compatibility (PK/MW)
\c@textcitecount=\count430
\c@textcitetotal=\count431
\c@textcitemaxnames=\count432
\c@biburlbigbreakpenalty=\count433
\c@biburlbreakpenalty=\count434
\c@biburlnumpenalty=\count435
\c@biburlucpenalty=\count436
\c@biburllcpenalty=\count437
\biburlbigskip=\muskip18
\biburlnumskip=\muskip19
\biburlucskip=\muskip20
\biburllcskip=\muskip21
\c@smartand=\count438
)
Package biblatex Info: Trying to load bibliography style 'numeric'...
Package biblatex Info: ... file 'numeric.bbx' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/bbx/numeric.bbx
File: numeric.bbx 2022/07/12 v3.18b biblatex bibliography style (PK/MW)
Package biblatex Info: Trying to load bibliography style 'standard'...
Package biblatex Info: ... file 'standard.bbx' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/bbx/standard.bbx
File: standard.bbx 2022/07/12 v3.18b biblatex bibliography style (PK/MW)
\c@bbx:relatedcount=\count439
\c@bbx:relatedtotal=\count440
))
Package biblatex Info: Trying to load citation style 'numeric'...
Package biblatex Info: ... file 'numeric.cbx' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/cbx/numeric.cbx
File: numeric.cbx 2022/07/12 v3.18b biblatex citation style (PK/MW)
Package biblatex Info: Redefining '\cite'.
Package biblatex Info: Redefining '\parencite'.
Package biblatex Info: Redefining '\footcite'.
Package biblatex Info: Redefining '\footcitetext'.
Package biblatex Info: Redefining '\smartcite'.
Package biblatex Info: Redefining '\supercite'.
Package biblatex Info: Redefining '\textcite'.
Package biblatex Info: Redefining '\textcites'.
Package biblatex Info: Redefining '\cites'.
Package biblatex Info: Redefining '\parencites'.
Package biblatex Info: Redefining '\smartcites'.
)
Package biblatex Info: Trying to load configuration file...
Package biblatex Info: ... file 'biblatex.cfg' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.cfg
File: biblatex.cfg 
)
Package biblatex Info: XeTeX detected.
(biblatex)             Assuming input encoding 'utf8'.
Package biblatex Info: Document encoding is UTF8 ....
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/l3kernel/expl3.sty
Package: expl3 2022-08-30 L3 programming layer (loader) 
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/l3backend/l3backend-xetex.def
File: l3backend-xetex.def 2022-08-30 L3 backend support: XeTeX
\g__graphics_track_int=\count441
\l__pdf_internal_box=\box74
\g__pdf_backend_object_int=\count442
\g__pdf_backend_annotation_int=\count443
\g__pdf_backend_link_int=\count444
))
Package biblatex Info: ... and expl3
(biblatex)             2022-08-30 L3 programming layer (loader) 
(biblatex)             is new enough (at least 2020/04/06),
(biblatex)             setting 'casechanger=expl3'.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/blx-case-expl3.sty (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/l3packages/xparse/xparse.sty
Package: xparse 2022-06-22 L3 Experimental document command parser
)
Package: blx-case-expl3 2022/07/12 v3.18b expl3 case changing code for biblatex
)) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/geometry/geometry.sty
Package: geometry 2020/01/02 v5.9 Page Geometry
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifvtex.sty
Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead.
)
\Gm@cnth=\count445
\Gm@cntv=\count446
\c@Gm@tempcnt=\count447
\Gm@bindingoffset=\dimen277
\Gm@wd@mp=\dimen278
\Gm@odd@mp=\dimen279
\Gm@even@mp=\dimen280
\Gm@layoutwidth=\dimen281
\Gm@layoutheight=\dimen282
\Gm@layouthoffset=\dimen283
\Gm@layoutvoffset=\dimen284
\Gm@dimlist=\toks39
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fancyhdr/fancyhdr.sty
Package: fancyhdr 2022/05/18 v4.0.3 Extensive control of page headers and footers
\f@nch@headwidth=\skip82
\f@nch@O@elh=\skip83
\f@nch@O@erh=\skip84
\f@nch@O@olh=\skip85
\f@nch@O@orh=\skip86
\f@nch@O@elf=\skip87
\f@nch@O@erf=\skip88
\f@nch@O@olf=\skip89
\f@nch@O@orf=\skip90
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/xelatex/mathspec/mathspec.sty
Package: mathspec 2016/12/22 v0.2b LaTeX Package (Mathematics font selection for XeLaTeX)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec.sty
Package: fontspec 2022/01/15 v2.8a Font selection for XeLaTeX and LuaLaTeX
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty
Package: fontspec-xetex 2022/01/15 v2.8a Font selection for XeLaTeX and LuaLaTeX
\l__fontspec_script_int=\count448
\l__fontspec_language_int=\count449
\l__fontspec_strnum_int=\count450
\l__fontspec_tmp_int=\count451
\l__fontspec_tmpa_int=\count452
\l__fontspec_tmpb_int=\count453
\l__fontspec_tmpc_int=\count454
\l__fontspec_em_int=\count455
\l__fontspec_emdef_int=\count456
\l__fontspec_strong_int=\count457
\l__fontspec_strongdef_int=\count458
\l__fontspec_tmpa_dim=\dimen285
\l__fontspec_tmpb_dim=\dimen286
\l__fontspec_tmpc_dim=\dimen287
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/fontenc.sty
Package: fontenc 2021/04/29 v2.0v Standard LaTeX package
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/xkeyval/xkeyval.sty
Package: xkeyval 2022/06/16 v2.9 package option processing (HA)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/xkeyval/xkeyval.tex (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/xkeyval/xkvutils.tex
\XKV@toks=\toks40
\XKV@tempa@toks=\toks41
)
\XKV@depth=\count459
File: xkeyval.tex 2014/12/03 v2.7a key=value parser (HA)
))
\c@eu@=\count460
\c@eu@i=\count461
\c@mkern=\count462
)
Package hyperref Info: Option `unicode' set `true' on input line 151.
Package hyperref Info: Option `breaklinks' set `true' on input line 151.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/grffile/grffile.sty
Package: grffile 2019/11/11 v2.1 Extended file name support for graphics (legacy)
Package grffile Info: This package is an empty stub for compatibility on input line 40.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/fancyvrb/fancyvrb.sty
Package: fancyvrb 2022/06/06 4.5 verbatim text (tvz,hv)
\FV@CodeLineNo=\count463
\FV@InFile=\read5
\FV@TabBox=\box75
\c@FancyVerbLine=\count464
\FV@StepNumber=\count465
\FV@OutFile=\write5
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/framed/framed.sty
Package: framed 2011/10/22 v 0.96: framed or shaded text with page breaks
\OuterFrameSep=\skip91
\fb@frw=\dimen288
\fb@frh=\dimen289
\FrameRule=\dimen290
\FrameSep=\dimen291
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/longtable.sty
Package: longtable 2021-09-01 v4.17 Multi-page Table package (DPC)
\LTleft=\skip92
\LTright=\skip93
\LTpre=\skip94
\LTpost=\skip95
\LTchunksize=\count466
\LTcapwidth=\dimen292
\LT@head=\box76
\LT@firsthead=\box77
\LT@foot=\box78
\LT@lastfoot=\box79
\LT@gbox=\box80
\LT@cols=\count467
\LT@rows=\count468
\c@LT@tables=\count469
\c@LT@chunks=\count470
\LT@p@ftn=\toks42
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/booktabs/booktabs.sty
Package: booktabs 2020/01/12 v1.61803398 Publication quality tables
\heavyrulewidth=\dimen293
\lightrulewidth=\dimen294
\cmidrulewidth=\dimen295
\belowrulesep=\dimen296
\belowbottomsep=\dimen297
\aboverulesep=\dimen298
\abovetopsep=\dimen299
\cmidrulesep=\dimen300
\cmidrulekern=\dimen301
\defaultaddspace=\dimen302
\@cmidla=\count471
\@cmidlb=\count472
\@aboverulesep=\dimen303
\@belowrulesep=\dimen304
\@thisruleclass=\count473
\@lastruleclass=\count474
\@thisrulewidth=\dimen305
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tools/array.sty
Package: array 2022/03/10 v2.5f Tabular extension package (FMi)
\col@sep=\dimen306
\ar@mcellbox=\box81
\extrarowheight=\dimen307
\NC@list=\toks43
\extratabsurround=\skip96
\backup@length=\skip97
\ar@cellbox=\box82
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/mdwtools/footnote.sty
Package: footnote 1997/01/28 1.13 Save footnotes around boxes
\fn@notes=\box83
\fn@width=\dimen308
)
\cslhangindent=\skip98
\csllabelwidth=\skip99
\cslentryspacingunit=\skip100
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/multirow/multirow.sty
Package: multirow 2021/03/15 v2.8 Span multiple rows of a table
\multirow@colwidth=\skip101
\multirow@cntb=\count475
\multirow@dima=\skip102
\bigstrutjot=\dimen309
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/wrapfig/wrapfig.sty
\wrapoverhang=\dimen310
\WF@size=\dimen311
\c@WF@wrappedlines=\count476
\WF@box=\box84
\WF@everypar=\toks44
Package: wrapfig 2003/01/31  v 3.6
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/float/float.sty
Package: float 2001/11/08 v1.3d Float enhancements (AL)
\c@float@type=\count477
\float@exts=\toks45
\float@box=\box85
\@float@everytoks=\toks46
\@floatcapt=\box86
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/colortbl/colortbl.sty
Package: colortbl 2022/06/20 v1.0f Color table columns (DPC)
\everycr=\toks47
\minrowclearance=\skip103
\rownum=\count478
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/pdflscape/pdflscape.sty
Package: pdflscape 2019/12/05 v0.12 Display of landscape pages in PDF (HO)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/graphics/lscape.sty
Package: lscape 2020/05/28 v3.02 Landscape Pages (DPC)
)
Package pdflscape Info: Auto-detected driver: dvipdfm (xetex) on input line 98.
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/tabu/tabu.sty
Package: tabu 2019/01/11 v2.9 - flexible LaTeX tabulars (FC+tabu-fixed)
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/varwidth/varwidth.sty
Package: varwidth 2009/03/30 ver 0.92;  Variable-width minipages
\@vwid@box=\box87
\sift@deathcycles=\count479
\@vwid@loff=\dimen312
\@vwid@roff=\dimen313
)
\c@taburow=\count480
\tabu@nbcols=\count481
\tabu@cnt=\count482
\tabu@Xcol=\count483
\tabu@alloc=\count484
\tabu@nested=\count485
\tabu@target=\dimen314
\tabu@spreadtarget=\dimen315
\tabu@naturalX=\dimen316
\tabucolX=\dimen317
\tabu@Xsum=\dimen318
\extrarowdepth=\dimen319
\abovetabulinesep=\dimen320
\belowtabulinesep=\dimen321
\tabustrutrule=\dimen322
\tabu@thebody=\toks48
\tabu@footnotes=\toks49
\tabu@box=\box88
\tabu@arstrutbox=\box89
\tabu@hleads=\box90
\tabu@vleads=\box91
\tabu@cellskip=\skip104
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/threeparttable/threeparttable.sty
Package: threeparttable 2003/06/13  v 3.0
\@tempboxb=\box92
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/threeparttablex/threeparttablex.sty
Package: threeparttablex 2013/07/23 v0.3 by daleif
\TPTL@width=\skip105
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/generic/ulem/ulem.sty
\UL@box=\box93
\UL@hyphenbox=\box94
\UL@skip=\skip106
\UL@hook=\toks50
\UL@height=\dimen323
\UL@pe=\count486
\UL@pixel=\dimen324
\ULC@box=\box95
Package: ulem 2019/11/18
\ULdepth=\dimen325
) (/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/makecell/makecell.sty
Package: makecell 2009/08/03 V0.1e Managing of Tab Column Heads and Cells
\rotheadsize=\dimen326
\c@nlinenum=\count487
\TeXr@lab=\toks51
)
\@quotelevel=\count488
\@quotereset=\count489
(./paper.aux)
\openout1 = `paper.aux'.

LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for TS1/cmr/m/n on input line 305.
LaTeX Font Info:    Trying to load font information for TS1+cmr on input line 305.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/base/ts1cmr.fd
File: ts1cmr.fd 2019/12/16 v2.5j Standard LaTeX font definitions
)
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for TU/lmr/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for PD1/pdf/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
LaTeX Font Info:    Checking defaults for PU/pdf/m/n on input line 305.
LaTeX Font Info:    ... okay on input line 305.
Package hyperref Info: Link coloring ON on input line 305.
(./paper.out) (./paper.out)
\@outlinefile=\write6
\openout6 = `paper.out'.

Package caption Info: Begin \AtBeginDocument code.
Package caption Info: float package is loaded.
Package caption Info: longtable package is loaded.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/caption/ltcaption.sty
Package: ltcaption 2021/01/08 v1.4c longtable captions (AR)
)
Package caption Info: threeparttable package is loaded.
Package caption Info: wrapfig package is loaded.
Package caption Info: End \AtBeginDocument code.
Package biblatex Info: Trying to load language 'english'...
Package biblatex Info: ... file 'english.lbx' found.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/biblatex/lbx/english.lbx
File: english.lbx 2022/07/12 v3.18b biblatex localization (PK/MW)
)
Package biblatex Info: XeTeX detected.
(biblatex)             Assuming input encoding 'utf8'.
Package biblatex Info: Automatic encoding selection.
(biblatex)             Assuming data encoding 'utf8'.
\openout4 = `paper.bcf'.

Package biblatex Info: Trying to load bibliographic data...
Package biblatex Info: ... file 'paper.bbl' not found.
No file paper.bbl.
Package biblatex Info: Reference section=0 on input line 305.
Package biblatex Info: Reference segment=0 on input line 305.
*geometry* driver: auto-detecting
*geometry* detected driver: xetex
*geometry* verbose mode - [ preamble ] result:
* driver: xetex
* paper: a4paper
* layout: <same size as paper>
* layoutoffset:(h,v)=(0.0pt,0.0pt)
* modes: includemp 
* h-part:(L,W,R)=(28.45274pt, 526.376pt, 42.67912pt)
* v-part:(T,H,B)=(99.58464pt, 660.10394pt, 85.35826pt)
* \paperwidth=597.50787pt
* \paperheight=845.04684pt
* \textwidth=387.33861pt
* \textheight=660.10394pt
* \oddsidemargin=95.22015pt
* \evensidemargin=95.22015pt
* \topmargin=-60.28131pt
* \headheight=62.59596pt
* \headsep=25.0pt
* \topskip=10.0pt
* \footskip=30.0pt
* \marginparwidth=128.0374pt
* \marginparsep=11.0pt
* \columnsep=10.0pt
* \skip\footins=9.0pt plus 4.0pt minus 2.0pt
* \hoffset=0.0pt
* \voffset=0.0pt
* \mag=1000
* \@twocolumnfalse
* \@twosidefalse
* \@mparswitchfalse
* \@reversemargintrue
* (1in=72.27pt=25.4mm, 1cm=28.453pt)

LaTeX Font Info:    Trying to load font information for U+msa on input line 306.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/umsa.fd
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
)
LaTeX Font Info:    Trying to load font information for U+msb on input line 306.
(/Users/indrajeetpatil/Library/TinyTeX/texmf-dist/tex/latex/amsfonts/umsb.fd
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
)

Package hyperref Warning: Suppressing link with empty target on input line 332.


Package hyperref Warning: Suppressing link with empty target on input line 332.


Package hyperref Warning: Suppressing link with empty target on input line 332.

File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp)
</Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png>

Package fancyhdr Warning: \headheight is too small (62.59596pt): 
(fancyhdr)                Make it at least 63.55022pt, for example:
(fancyhdr)                \setlength{\headheight}{63.55022pt}.
(fancyhdr)                You might also make \topmargin smaller to compensate:
(fancyhdr)                \addtolength{\topmargin}{-0.95425pt}.

LaTeX Font Info:    Font shape `TU/lmss/m/it' in size <8> not available
(Font)              Font shape `TU/lmss/m/sl' tried instead on input line 393.
[1

]
File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp)
</Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png>

Package fancyhdr Warning: \headheight is too small (62.59596pt): 
(fancyhdr)                Make it at least 63.55022pt, for example:
(fancyhdr)                \setlength{\headheight}{63.55022pt}.
(fancyhdr)                You might also make \topmargin smaller to compensate:
(fancyhdr)                \addtolength{\topmargin}{-0.95425pt}.

[2]
Underfull \hbox (badness 1448) in paragraph at lines 529--533
\TU/lmr/m/n/10 The workhorse function to get a comprehensive summary of data properties is
 []

File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp)
</Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png>

Package fancyhdr Warning: \headheight is too small (62.59596pt): 
(fancyhdr)                Make it at least 63.55022pt, for example:
(fancyhdr)                \setlength{\headheight}{63.55022pt}.
(fancyhdr)                You might also make \topmargin smaller to compensate:
(fancyhdr)                \addtolength{\topmargin}{-0.95425pt}.

[3]
Overfull \hbox (25.8514pt too wide) in paragraph at lines 540--557
[][] 
 []

File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp)
</Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png>

Package fancyhdr Warning: \headheight is too small (62.59596pt): 
(fancyhdr)                Make it at least 63.55022pt, for example:
(fancyhdr)                \setlength{\headheight}{63.55022pt}.
(fancyhdr)                You might also make \topmargin smaller to compensate:
(fancyhdr)                \addtolength{\topmargin}{-0.95425pt}.

[4]
File: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png Graphic file (type bmp)
</Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/rticles/rmarkdown/templates/joss/resources/JOSS-logo.png>

Package fancyhdr Warning: \headheight is too small (62.59596pt): 
(fancyhdr)                Make it at least 63.55022pt, for example:
(fancyhdr)                \setlength{\headheight}{63.55022pt}.
(fancyhdr)                You might also make \topmargin smaller to compensate:
(fancyhdr)                \addtolength{\topmargin}{-0.95425pt}.

[5] (./paper.aux)
Package rerunfilecheck Info: File `paper.out' has not changed.
(rerunfilecheck)             Checksum: BFFBA38CB5FAA9461C3119C411944BB9;1328.
Package logreq Info: Writing requests to 'paper.run.xml'.
\openout1 = `paper.run.xml'.

 ) 
Here is how much of TeX's memory you used:
 36661 strings out of 477747
 751496 string characters out of 5842604
 1543570 words of memory out of 5000000
 57125 multiletter control sequences out of 15000+600000
 477790 words of font info for 84 fonts, out of 8000000 for 9000
 14 hyphenation exceptions out of 8191
 84i,13n,81p,678b,848s stack positions out of 10000i,1000n,20000p,200000b,200000s

Output written on paper.pdf (5 pages).


================================================
FILE: paper/JOSS_files/paper.md
================================================
---
title: "datawizard: An R Package for Easy Data Preparation and Statistical Transformations"
tags:
  - R
  - easystats
authors:
- affiliation: 1
  name: Indrajeet Patil
  orcid: 0000-0003-1995-6531
- affiliation: 2
  name: Dominique Makowski
  orcid: 0000-0001-5375-9967
- affiliation: 3
  name: Mattan S. Ben-Shachar
  orcid: 0000-0002-4287-4801
- affiliation: 4
  name: Brenton M. Wiernik^[Brenton Wiernik is currently an independent researcher and Research Scientist at Meta, Demography and Survey Science. The current work was done in an independent capacity.]
  orcid: 0000-0001-9560-6336
- affiliation: 5
  name: Etienne Bacher
  orcid: 0000-0002-9271-5075 
- affiliation: 6
  name: Daniel Lüdecke
  orcid: 0000-0002-8895-3206
  
affiliations:
- index: 1
  name: cynkra Analytics GmbH, Germany
- index: 2
  name: Nanyang Technological University, Singapore
- index: 3
  name: Ben-Gurion University of the Negev, Israel
- index: 4
  name: Independent Researcher
- index: 5
  name: Luxembourg Institute of Socio-Economic Research (LISER), Luxembourg
- index: 6
  name: University Medical Center Hamburg-Eppendorf, Germany
    
date: "2022-10-04"
bibliography: paper.bib
output: rticles::joss_article
csl: apa.csl
journal: JOSS
link-citations: yes
---


# Summary

The `{datawizard}` package for the R programming language [@base2021] provides a lightweight toolbox to assist in key steps involved in any data analysis workflow: (1) wrangling the raw data to get it in the needed form, (2) applying preprocessing steps and statistical transformations, and (3) compute statistical summaries of data properties and distributions. Therefore, it can be a valuable tool for R users and developers looking for a lightweight option for data preparation.

# Statement of Need

The `{datawizard}` package is part of `{easystats}`, a collection of R packages designed to make statistical analysis easier (@Ben-Shachar2020, @Lüdecke2020parameters, @Lüdecke2020performance, @Lüdecke2021see, @Lüdecke2019, @Makowski2019, @Makowski2020). As this ecosystem follows a "0-external-hard-dependency" policy, a data manipulation package that relies only on base R needed to be created. In effect, `{datawizard}` provides a data processing backend for this entire ecosystem. 
In addition to its usefulness to the `{easystats}` ecosystem, it also provides *an* option for R users and package developers if they wish to keep their (recursive) dependency weight to a minimum (for other options, see @Dowle2021, @Eastwood2021).

Because `{datawizard}` is also meant to be used and adopted easily by a wide range of users, its workflow and syntax are designed to be similar to `{tidyverse}` [@Wickham2019], a widely used ecosystem of R packages. Thus, users familiar with the `{tidyverse}` can easily translate their knowledge and make full use of `{datawizard}`.

In addition to being a lightweight solution to clean messy data, `{datawizard}` also provides helpers for the other important step of data analysis: applying statistical transformations to the cleaned data while setting up statistical models. This includes various types of data standardization, normalization, rank-transformation, and adjustment. These transformations, although widely used, are not currently collectively implemented in a package in the R ecosystem, so `{datawizard}` can help new R users in finding the transformation they need.

Lastly, `{datawizard}` also provides a toolbox to create detailed summaries of data properties and distributions (e.g., tables of descriptive statistics for each variable). This is a common step in data analysis, but it is not available in base R or many modeling packages, so its inclusion makes `{datawizard}` a one-stop-shop for data preparation tasks.

# Features

## Data Preparation

The raw data is rarely in a state that it can be directly fed into a statistical model. It often needs to be modified in various ways. For example, columns need to be renamed or reshaped, certain portions of the data need to be filtered out, data scattered across multiple tables needs to be joined, etc. 

`{datawizard}` provides various functions for cleaning and preparing data (see Table 1).

| Function         | Operation                             |
| :--------------- | :------------------------------------ |
| `data_filter()`  | to select only certain *observations* |
| `data_select()`  | to select only certain *variables*    |
| `data_extract()` | to extract a single *variable*        |
| `data_rename()`  | to rename variables                   |
| `data_to_long()` | to convert data from wide to long     |
| `data_to_wide()` | to convert data from long to wide     |
| `data_join()`    | to join two data frames               |
| ...              | ...                                   |

Table: The table below lists a few key functions offered by `{datawizard}` for data wrangling. To see the full list, see the package website: <https://easystats.github.io/datawizard/>

We will look at one example function that converts data in wide format to tidy/long format:


```r
stocks <- data.frame(
  time = as.Date("2009-01-01") + 0:4,
  X = rnorm(5, 0, 1),
  Y = rnorm(5, 0, 2)
)

stocks
#>         time           X          Y
#> 1 2009-01-01 -0.91474184 -0.5654808
#> 2 2009-01-02  1.00124785 -1.5270177
#> 3 2009-01-03 -0.05642291 -1.3700199
#> 4 2009-01-04  0.29664516  0.7341479
#> 5 2009-01-05 -2.79147086  0.3659937

data_to_long(
  stocks,
  select = -c("time"),
  names_to = "stock",
  values_to = "price"
)
#>          time stock       price
#> 1  2009-01-01     X -0.91474184
#> 2  2009-01-01     Y -0.56548082
#> 3  2009-01-02     X  1.00124785
#> 4  2009-01-02     Y -1.52701766
#> 5  2009-01-03     X -0.05642291
#> 6  2009-01-03     Y -1.37001987
#> 7  2009-01-04     X  0.29664516
#> 8  2009-01-04     Y  0.73414790
#> 9  2009-01-05     X -2.79147086
#> 10 2009-01-05     Y  0.36599370
```

## Statistical Transformations

Even after getting the raw data in the needed format, we may need to transform certain variables further to meet requirements imposed by a statistical test.

`{datawizard}` provides a rich collection of such functions for transforming variables (see Table 2).

| Function          | Operation                                    |
| :---------------- | :------------------------------------------- |
| `standardize()`   | to center and scale data                     |
| `normalize()`     | to scale variables to 0-1 range              |
| `adjust()`        | to adjust data for effect of other variables |
| `slide()`         | to shift numeric value range                 |
| `ranktransform()` | to convert numeric values to integer ranks   |
| ...               | ...                                          |

Table: The table below lists a few key functions offered by `{datawizard}` for data transformations. To see the full list, see the package website: <https://easystats.github.io/datawizard/>

We will look at one example function that standardizes (i.e. centers and scales) data so that it can be expressed in terms of standard deviation:


```r
d <- data.frame(
  a = c(-2, -1, 0, 1, 2),
  b = c(3, 4, 5, 6, 7)
)

standardize(d, center = c(3, 4), scale = c(2, 4))
#>      a     b
#> 1 -2.5 -0.25
#> 2 -2.0  0.00
#> 3 -1.5  0.25
#> 4 -1.0  0.50
#> 5 -0.5  0.75
```

## Summaries of Data Properties and Distributions

The workhorse function to get a comprehensive summary of data properties is `describe_distribution()`, which combines a set of indices (e.g., measures of centrality, dispersion, range, skewness, kurtosis, etc.) computed by other functions in `{datawizard}`.


```r
describe_distribution(mtcars)
```


\begin{tabular}[t]{lrrrrrrrrr}
\toprule
Variable & Mean & SD & IQR & Min & Max & Skewness & Kurtosis & n & n\_Missing\\
\midrule
mpg & 20.09 & 6.03 & 7.53 & 10.4 & 33.9 & 0.67 & -0.02 & 32 & 0\\
cyl & 6.19 & 1.79 & 4.00 & 4.0 & 8.0 & -0.19 & -1.76 & 32 & 0\\
disp & 230.72 & 123.94 & 221.52 & 71.1 & 472.0 & 0.42 & -1.07 & 32 & 0\\
hp & 146.69 & 68.56 & 84.50 & 52.0 & 335.0 & 0.80 & 0.28 & 32 & 0\\
drat & 3.60 & 0.53 & 0.84 & 2.8 & 4.9 & 0.29 & -0.45 & 32 & 0\\
wt & 3.22 & 0.98 & 1.19 & 1.5 & 5.4 & 0.47 & 0.42 & 32 & 0\\
qsec & 17.85 & 1.79 & 2.02 & 14.5 & 22.9 & 0.41 & 0.86 & 32 & 0\\
vs & 0.44 & 0.50 & 1.00 & 0.0 & 1.0 & 0.26 & -2.06 & 32 & 0\\
am & 0.41 & 0.50 & 1.00 & 0.0 & 1.0 & 0.40 & -1.97 & 32 & 0\\
gear & 3.69 & 0.74 & 1.00 & 3.0 & 5.0 & 0.58 & -0.90 & 32 & 0\\
carb & 2.81 & 1.62 & 2.00 & 1.0 & 8.0 & 1.16 & 2.02 & 32 & 0\\
\bottomrule
\end{tabular}

# Licensing and Availability

`{datawizard}` is licensed under the GNU General Public License (v3.0), with all source code openly developed and stored on GitHub (<https://github.com/easystats/datawizard>), along with a corresponding issue tracker for bug reporting and feature enhancements. In the spirit of honest and open science, we encourage requests, tips for fixes, feature updates, as well as general questions and concerns via direct interaction with contributors and developers.

# Acknowledgments

`{datawizard}` is part of the collaborative [*easystats*](https://easystats.github.io/easystats/) ecosystem. Thus, we thank the [members of easystats](https://github.com/orgs/easystats/people) as well as the users.

# References


================================================
FILE: pkgdown/_pkgdown.yaml
================================================
url: https://easystats.github.io/datawizard/

template:
  bootstrap: 5
  package: easystatstemplate

reference:
  - title: Data Preparation
    desc: |
      Main functions for cleaning and preparing data
    contents:
      - data_to_long
      - data_to_wide
      - data_extract
      - data_filter
      - data_select
      - data_reorder
      - data_arrange
      - data_merge
      - data_partition
      - data_rotate
      - data_group
      - data_replicate
      - data_duplicated
      - data_unique

  - title: Data and Variable Transformations
  - subtitle: Statistical Transformations
    desc: |
      Functions for transforming variables
    contents:
      - data_modify
      - data_separate
      - data_unite
      - categorize
      - recode_into
      - recode_values
      - adjust
      - demean
      - ranktransform
      - rescale_weights
      - winsorize
  - subtitle: "Linear Transformers"
    desc: |
      Convenient functions for common linear transformations
    contents:
      - center
      - slide
      - standardize
      - standardize.default
      - reverse
      - rescale
      - normalize
      - unstandardize
      - makepredictcall.dw_transformer
  - subtitle: "Others"
    contents:
    - contr.deviation

  - title: Data Properties
    desc: |
      Functions to compute statistical summaries of data properties and distributions
    contents:
      - as.prop.table
      - data_codebook
      - data_summary
      - data_tabulate
      - data_peek
      - data_seek
      - means_by_group
      - contains("distribution")
      - kurtosis
      - smoothness
      - skewness
      - row_count
      - row_means
      - weighted_mean
      - mean_sd

  - title: Convert and Replace Data
    desc: |
      Helpers for data replacements
    contents:
      - assign_labels
      - labels_to_levels
      - contains("to_numeric")
      - to_factor
      - starts_with("replace_")
      - starts_with("convert_")

  - title: Import data
    desc: |
      Helpers for importing data
    contents:
      - data_read

  - title: Helpers for Data Preparation
    desc: |
      Primarily useful in the context of other 'easystats' packages
    contents:
      - reshape_ci
      - data_rename
      - data_addprefix
      - remove_empty
      - contains("rownames")
      - rowid_as_column
      - contains("colnames")
      - extract_column_names
      - data_restoretype

  - title: Helpers for Text Formatting
    desc: |
      Primarily useful for 'report' package
    contents:
      - starts_with("text_")

  - title: Visualization helpers
    desc: |
      Primarily useful in the context of other 'easystats' packages
    contents:
      - visualisation_recipe

  - title: Data
    desc: |
      Datasets useful for examples and tests
    contents:
      - efc
      - nhanes_sample

articles:
  - title: Overview of vignettes
    navbar: ~
    contents:
      - overview_of_vignettes

  - title: Data Preparation
    desc: |
      Articles explaining utility of 'datawizard' for data wrangling
    navbar: ~
    contents:
      - tidyverse_translation
      - selection_syntax

  - title: Statistical Transformations
    desc: |
      Articles describing use of 'datawizard' functions for tranforming data
      to use in regression models
    navbar: ~
    contents:
      - standardize_data


================================================
FILE: tests/testthat/_snaps/categorize.md
================================================
# categorize labelling ranged

    Code
      categorize(mtcars$mpg, "equal_length", n_groups = 5)
    Output
       [1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3

---

    Code
      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
    Output
       [1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8)
       [7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8)
      [13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9]
      [19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1)
      [25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8)
      [31] [10.4,15.1) [19.8,24.5)
      Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9]

---

    Code
      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
    Output
       [1] (21-24.4)   (21-24.4)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7)
       [7] (10.4-15)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7) (15.2-19.7)
      [13] (15.2-19.7) (15.2-19.7) (10.4-15)   (10.4-15)   (10.4-15)   (30.4-33.9)
      [19] (30.4-33.9) (30.4-33.9) (21-24.4)   (15.2-19.7) (15.2-19.7) (10.4-15)  
      [25] (15.2-19.7) (26-27.3)   (26-27.3)   (30.4-33.9) (15.2-19.7) (15.2-19.7)
      [31] (10.4-15)   (21-24.4)  
      Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)

# categorize breaks

    Code
      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive")
    Output
       [1] (19.8,24.5] (19.8,24.5] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8]
       [7] [10.4,15.1] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] (15.1,19.8]
      [13] (15.1,19.8] (15.1,19.8] [10.4,15.1] [10.4,15.1] [10.4,15.1] (29.2,33.9]
      [19] (29.2,33.9] (29.2,33.9] (19.8,24.5] (15.1,19.8] (15.1,19.8] [10.4,15.1]
      [25] (15.1,19.8] (24.5,29.2] (24.5,29.2] (29.2,33.9] (15.1,19.8] (15.1,19.8]
      [31] [10.4,15.1] (19.8,24.5]
      Levels: [10.4,15.1] (15.1,19.8] (19.8,24.5] (24.5,29.2] (29.2,33.9]


================================================
FILE: tests/testthat/_snaps/contr.deviation.md
================================================
# contr.deviation | snapshot

    Code
      solve(c.deviation)
    Output
                         4         6         8
      Intercept  0.3333333 0.3333333 0.3333333
      6         -1.0000000 1.0000000 0.0000000
      8         -1.0000000 0.0000000 1.0000000

---

    Code
      solve(mm)
    Output
                    cyl4.am0   cyl4.am1   cyl6.am0  cyl6.am1   cyl8.am0  cyl8.am1
      (Intercept)  0.3333333  0.0000000  0.3333333 0.0000000  0.3333333 0.0000000
      cyl6        -1.0000000  0.0000000  1.0000000 0.0000000  0.0000000 0.0000000
      cyl8        -1.0000000  0.0000000  0.0000000 0.0000000  1.0000000 0.0000000
      am1         -0.3333333  0.3333333 -0.3333333 0.3333333 -0.3333333 0.3333333
      cyl6:am1     1.0000000 -1.0000000 -1.0000000 1.0000000  0.0000000 0.0000000
      cyl8:am1     1.0000000 -1.0000000  0.0000000 0.0000000 -1.0000000 1.0000000


================================================
FILE: tests/testthat/_snaps/data_codebook.md
================================================
# data_codebook iris

    Code
      data_codebook(iris)
    Output
      iris (150 rows and 5 variables, 5 shown)
      
      ID | Name         | Type        | Missings |     Values |          N
      ---+--------------+-------------+----------+------------+-----------
      1  | Sepal.Length | numeric     | 0 (0.0%) | [4.3, 7.9] |        150
      ---+--------------+-------------+----------+------------+-----------
      2  | Sepal.Width  | numeric     | 0 (0.0%) |   [2, 4.4] |        150
      ---+--------------+-------------+----------+------------+-----------
      3  | Petal.Length | numeric     | 0 (0.0%) |   [1, 6.9] |        150
      ---+--------------+-------------+----------+------------+-----------
      4  | Petal.Width  | numeric     | 0 (0.0%) | [0.1, 2.5] |        150
      ---+--------------+-------------+----------+------------+-----------
      5  | Species      | categorical | 0 (0.0%) |     setosa | 50 (33.3%)
         |              |             |          | versicolor | 50 (33.3%)
         |              |             |          |  virginica | 50 (33.3%)
      --------------------------------------------------------------------

# data_codebook iris, reordered

    Code
      data_codebook(iris[c(1, 2, 5, 3, 4)])
    Output
      iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown)
      
      ID | Name         | Type        | Missings |     Values |          N
      ---+--------------+-------------+----------+------------+-----------
      1  | Sepal.Length | numeric     | 0 (0.0%) | [4.3, 7.9] |        150
      ---+--------------+-------------+----------+------------+-----------
      2  | Sepal.Width  | numeric     | 0 (0.0%) |   [2, 4.4] |        150
      ---+--------------+-------------+----------+------------+-----------
      3  | Species      | categorical | 0 (0.0%) |     setosa | 50 (33.3%)
         |              |             |          | versicolor | 50 (33.3%)
         |              |             |          |  virginica | 50 (33.3%)
      ---+--------------+-------------+----------+------------+-----------
      4  | Petal.Length | numeric     | 0 (0.0%) |   [1, 6.9] |        150
      ---+--------------+-------------+----------+------------+-----------
      5  | Petal.Width  | numeric     | 0 (0.0%) | [0.1, 2.5] |        150
      --------------------------------------------------------------------

# data_codebook NaN and Inf

    Code
      data_codebook(d)
    Output
      d (9 rows and 1 variables, 1 shown)
      
      ID | Name | Type    |  Missings | Values |         N
      ---+------+---------+-----------+--------+----------
      1  | x    | numeric | 2 (22.2%) |      1 | 3 (42.9%)
         |      |         |           |      2 | 1 (14.3%)
         |      |         |           |      4 | 2 (28.6%)
         |      |         |           |    Inf | 1 (14.3%)
      ----------------------------------------------------

---

    Code
      data_codebook(d)
    Output
      d (102 rows and 1 variables, 1 shown)
      
      ID | Name | Type    | Missings |  Values |           N
      ---+------+---------+----------+---------+------------
      1  | x    | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%)
         |      |         |          |     Inf |   2 ( 1.9%)
      ------------------------------------------------------

---

    Code
      data_codebook(d, range_at = 100)
    Output
      d (102 rows and 1 variables, 1 shown)
      
      ID | Name | Type    | Missings | Values |          N
      ---+------+---------+----------+--------+-----------
      1  | x    | numeric | 0 (0.0%) |      1 |  4 ( 4.0%)
         |      |         |          |      2 |  5 ( 5.0%)
         |      |         |          |      3 |  6 ( 6.0%)
         |      |         |          |      4 |  5 ( 5.0%)
         |      |         |          |      5 |  8 ( 8.0%)
         |      |         |          |      6 | 10 (10.0%)
         |      |         |          |      7 |  6 ( 6.0%)
         |      |         |          |      8 |  3 ( 3.0%)
         |      |         |          |      9 | 13 (13.0%)
         |      |         |          |     10 |  7 ( 7.0%)
         |      |         |          |  (...) |           
      ----------------------------------------------------

---

    Code
      data_codebook(d, range_at = 100, max_values = 4)
    Output
      d (102 rows and 1 variables, 1 shown)
      
      ID | Name | Type    | Missings | Values |        N
      ---+------+---------+----------+--------+---------
      1  | x    | numeric | 0 (0.0%) |      1 | 4 (4.0%)
         |      |         |          |      2 | 5 (5.0%)
         |      |         |          |      3 | 6 (6.0%)
         |      |         |          |      4 | 5 (5.0%)
         |      |         |          |  (...) |         
      --------------------------------------------------

# data_codebook, tinytable

    Code
      display(data_codebook(d), format = "tt")
    Output
      
      +----+------+---------+-----------+--------+-----------+
      | ID | Name | Type    | Missings  | Values | N         |
      +====+======+=========+===========+========+===========+
      | 1  | x    | numeric | 2 (22.2%) | 1      | 3 (42.9%) |
      +----+------+---------+-----------+--------+-----------+
      |    |      |         |           | 2      | 1 (14.3%) |
      +----+------+---------+-----------+--------+-----------+
      |    |      |         |           | 4      | 2 (28.6%) |
      +----+------+---------+-----------+--------+-----------+
      |    |      |         |           | Inf    | 1 (14.3%) |
      +----+------+---------+-----------+--------+-----------+
      
      Table: d (9 rows and 1 variables, 1 shown) 

---

    Code
      display(data_codebook(d), format = "tt")
    Output
      
      +----+------+---------+----------+---------+-------------+
      | ID | Name | Type    | Missings | Values  | N           |
      +====+======+=========+==========+=========+=============+
      | 1  | x    | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%) |
      +----+------+---------+----------+---------+-------------+
      |    |      |         |          | Inf     | 2 (1.9%)    |
      +----+------+---------+----------+---------+-------------+
      
      Table: d (102 rows and 1 variables, 1 shown) 

---

    Code
      display(data_codebook(d, range_at = 100), format = "tt")
    Output
      
      +----+------+---------+----------+--------+------------+
      | ID | Name | Type    | Missings | Values | N          |
      +====+======+=========+==========+========+============+
      | 1  | x    | numeric | 0 (0.0%) | 1      | 4 (4.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 2      | 5 (5.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 3      | 6 (6.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 4      | 5 (5.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 5      | 8 (8.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 6      | 10 (10.0%) |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 7      | 6 (6.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 8      | 3 (3.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 9      | 13 (13.0%) |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | 10     | 7 (7.0%)   |
      +----+------+---------+----------+--------+------------+
      |    |      |         |          | (...)  |            |
      +----+------+---------+----------+--------+------------+
      
      Table: d (102 rows and 1 variables, 1 shown) 

---

    Code
      display(data_codebook(d, range_at = 100, max_values = 4), format = "tt")
    Output
      
      +----+------+---------+----------+--------+----------+
      | ID | Name | Type    | Missings | Values | N        |
      +====+======+=========+==========+========+==========+
      | 1  | x    | numeric | 0 (0.0%) | 1      | 4 (4.0%) |
      +----+------+---------+----------+--------+----------+
      |    |      |         |          | 2      | 5 (5.0%) |
      +----+------+---------+----------+--------+----------+
      |    |      |         |          | 3      | 6 (6.0%) |
      +----+------+---------+----------+--------+----------+
      |    |      |         |          | 4      | 5 (5.0%) |
      +----+------+---------+----------+--------+----------+
      |    |      |         |          | (...)  |          |
      +----+------+---------+----------+--------+----------+
      
      Table: d (102 rows and 1 variables, 1 shown) 

---

    Code
      display(data_codebook(iris[c(1, 2, 5, 3, 4)]), format = "tt")
    Output
      
      +----+--------------+-------------+----------+------------+------------+
      | ID | Name         | Type        | Missings | Values     | N          |
      +====+==============+=============+==========+============+============+
      | 1  | Sepal.Length | numeric     | 0 (0.0%) | [4.3, 7.9] | 150        |
      +----+--------------+-------------+----------+------------+------------+
      | 2  | Sepal.Width  | numeric     | 0 (0.0%) | [2, 4.4]   | 150        |
      +----+--------------+-------------+----------+------------+------------+
      | 3  | Species      | categorical | 0 (0.0%) | setosa     | 50 (33.3%) |
      +----+--------------+-------------+----------+------------+------------+
      |    |              |             |          | versicolor | 50 (33.3%) |
      +----+--------------+-------------+----------+------------+------------+
      |    |              |             |          | virginica  | 50 (33.3%) |
      +----+--------------+-------------+----------+------------+------------+
      | 4  | Petal.Length | numeric     | 0 (0.0%) | [1, 6.9]   | 150        |
      +----+--------------+-------------+----------+------------+------------+
      | 5  | Petal.Width  | numeric     | 0 (0.0%) | [0.1, 2.5] | 150        |
      +----+--------------+-------------+----------+------------+------------+
      
      Table: iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown) 

# data_codebook iris, select

    Code
      data_codebook(iris, select = starts_with("Sepal"))
    Output
      iris (150 rows and 5 variables, 2 shown)
      
      ID | Name         | Type    | Missings |     Values |   N
      ---+--------------+---------+----------+------------+----
      1  | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150
      ---+--------------+---------+----------+------------+----
      2  | Sepal.Width  | numeric | 0 (0.0%) |   [2, 4.4] | 150
      ---------------------------------------------------------

# data_codebook iris, select, ID

    Code
      data_codebook(iris, select = starts_with("Petal"))
    Output
      iris (150 rows and 5 variables, 2 shown)
      
      ID | Name         | Type    | Missings |     Values |   N
      ---+--------------+---------+----------+------------+----
      3  | Petal.Length | numeric | 0 (0.0%) |   [1, 6.9] | 150
      ---+--------------+---------+----------+------------+----
      4  | Petal.Width  | numeric | 0 (0.0%) | [0.1, 2.5] | 150
      ---------------------------------------------------------

# data_codebook efc

    Code
      print(data_codebook(efc), table_width = Inf)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                                    | Type        |   Missings |   Values | Value Labels                    |          N
      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
      1  | c12hour  | average number of hours of care per week | numeric     |   2 (2.0%) | [5, 168] |                                 |         98
      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
      2  | e16sex   | elder's gender                           | numeric     |   0 (0.0%) |        1 | male                            | 46 (46.0%)
         |          |                                          |             |            |        2 | female                          | 54 (54.0%)
      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
      3  | e42dep   | elder's dependency                       | categorical |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
         |          |                                          |             |            |        2 | slightly dependent              |  4 ( 4.1%)
         |          |                                          |             |            |        3 | moderately dependent            | 28 (28.9%)
         |          |                                          |             |            |        4 | severely dependent              | 63 (64.9%)
      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
      4  | c172code | carer's level of education               | numeric     | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
         |          |                                          |             |            |        2 | intermediate level of education | 66 (73.3%)
         |          |                                          |             |            |        3 | high level of education         | 16 (17.8%)
      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
      5  | neg_c_7  | Negative impact with 7 items             | numeric     |   3 (3.0%) |  [7, 28] |                                 |         97
      ---------------------------------------------------------------------------------------------------------------------------------------------

---

    Code
      print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                                    | Type       
      ---+----------+------------------------------------------+------------
      1  | c12hour  | average number of hours of care per week | numeric    
      ---+----------+------------------------------------------+------------
      2  | e16sex   | elder's gender                           | numeric    
         |          |                                          |            
      ---+----------+------------------------------------------+------------
      3  | e42dep   | elder's dependency                       | categorical
         |          |                                          |            
         |          |                                          |            
         |          |                                          |            
      ---+----------+------------------------------------------+------------
      4  | c172code | carer's level of education               | numeric    
         |          |                                          |            
         |          |                                          |            
      ---+----------+------------------------------------------+------------
      5  | neg_c_7  | Negative impact with 7 items             | numeric    
      ----------------------------------------------------------------------
      
      ID |   Missings |   Values | Value Labels                    |          N
      ---+------------+----------+---------------------------------+-----------
      1  |   2 (2.0%) | [5, 168] |                                 |         98
      ---+------------+----------+---------------------------------+-----------
      2  |   0 (0.0%) |        1 | male                            | 46 (46.0%)
         |            |        2 | female                          | 54 (54.0%)
      ---+------------+----------+---------------------------------+-----------
      3  |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
         |            |        2 | slightly dependent              |  4 ( 4.1%)
         |            |        3 | moderately dependent            | 28 (28.9%)
         |            |        4 | severely dependent              | 63 (64.9%)
      ---+------------+----------+---------------------------------+-----------
      4  | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
         |            |        2 | intermediate level of education | 66 (73.3%)
         |            |        3 | high level of education         | 16 (17.8%)
      ---+------------+----------+---------------------------------+-----------
      5  |   3 (3.0%) |  [7, 28] |                                 |         97
      -------------------------------------------------------------------------

---

    Code
      print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                                    | Type       
      ---+----------+------------------------------------------+------------
      1  | c12hour  | average number of hours of care per week | numeric    
      ---+----------+------------------------------------------+------------
      2  | e16sex   | elder's gender                           | numeric    
      ---+----------+------------------------------------------+------------
      3  | e42dep   | elder's dependency                       | categorical
      ---+----------+------------------------------------------+------------
      4  | c172code | carer's level of education               | numeric    
      ---+----------+------------------------------------------+------------
      5  | neg_c_7  | Negative impact with 7 items             | numeric    
      ----------------------------------------------------------------------
      
      ID |   Missings |   Values | Value Labels                    |          N
      ---+------------+----------+---------------------------------+-----------
      1  |   2 (2.0%) | [5, 168] |                                 |         98
      ---+------------+----------+---------------------------------+-----------
      2  |   0 (0.0%) |        1 | male                            | 46 (46.0%)
         |            |        2 | female                          | 54 (54.0%)
      ---+------------+----------+---------------------------------+-----------
      3  |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
         |            |        2 | slightly dependent              |  4 ( 4.1%)
         |            |        3 | moderately dependent            | 28 (28.9%)
         |            |        4 | severely dependent              | 63 (64.9%)
      ---+------------+----------+---------------------------------+-----------
      4  | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
         |            |        2 | intermediate level of education | 66 (73.3%)
         |            |        3 | high level of education         | 16 (17.8%)
      ---+------------+----------+---------------------------------+-----------
      5  |   3 (3.0%) |  [7, 28] |                                 |         97
      -------------------------------------------------------------------------

# data_codebook efc, variable_label_width

    Code
      print(out, table_width = Inf)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                        | Type        |   Missings |   Values | Value Labels                    |          N
      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%) | [5, 168] |                                 |         98
         |          | care per week                |             |            |          |                                 |           
      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%) |        1 | male                            | 46 (46.0%)
         |          |                              |             |            |        2 | female                          | 54 (54.0%)
      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
         |          |                              |             |            |        2 | slightly dependent              |  4 ( 4.1%)
         |          |                              |             |            |        3 | moderately dependent            | 28 (28.9%)
         |          |                              |             |            |        4 | severely dependent              | 63 (64.9%)
      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
      4  | c172code | carer's level of education   | numeric     | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
         |          |                              |             |            |        2 | intermediate level of education | 66 (73.3%)
         |          |                              |             |            |        3 | high level of education         | 16 (17.8%)
      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%) |  [7, 28] |                                 |         97
      ---------------------------------------------------------------------------------------------------------------------------------

---

    Code
      print(out, table_width = "auto", remove_duplicates = FALSE)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                        | Type        |   Missings
      ---+----------+------------------------------+-------------+-----------
      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
         |          | care per week                |             |           
      ---+----------+------------------------------+-------------+-----------
      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
         |          |                              |             |           
      ---+----------+------------------------------+-------------+-----------
      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
         |          |                              |             |           
         |          |                              |             |           
         |          |                              |             |           
      ---+----------+------------------------------+-------------+-----------
      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
         |          |                              |             |           
         |          |                              |             |           
      ---+----------+------------------------------+-------------+-----------
      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
      -----------------------------------------------------------------------
      
      ID |   Values | Value Labels                    |          N
      ---+----------+---------------------------------+-----------
      1  | [5, 168] |                                 |         98
         |          |                                 |           
      ---+----------+---------------------------------+-----------
      2  |        1 | male                            | 46 (46.0%)
         |        2 | female                          | 54 (54.0%)
      ---+----------+---------------------------------+-----------
      3  |        1 | independent                     |  2 ( 2.1%)
         |        2 | slightly dependent              |  4 ( 4.1%)
         |        3 | moderately dependent            | 28 (28.9%)
         |        4 | severely dependent              | 63 (64.9%)
      ---+----------+---------------------------------+-----------
      4  |        1 | low level of education          |  8 ( 8.9%)
         |        2 | intermediate level of education | 66 (73.3%)
         |        3 | high level of education         | 16 (17.8%)
      ---+----------+---------------------------------+-----------
      5  |  [7, 28] |                                 |         97
      ------------------------------------------------------------

---

    Code
      print(out, table_width = "auto", remove_duplicates = TRUE)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                        | Type        |   Missings
      ---+----------+------------------------------+-------------+-----------
      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
         |          | care per week                |             |           
      ---+----------+------------------------------+-------------+-----------
      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
      ---+----------+------------------------------+-------------+-----------
      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
      ---+----------+------------------------------+-------------+-----------
      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
      ---+----------+------------------------------+-------------+-----------
      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
      -----------------------------------------------------------------------
      
      ID |   Values | Value Labels                    |          N
      ---+----------+---------------------------------+-----------
      1  | [5, 168] |                                 |         98
      ---+----------+---------------------------------+-----------
      2  |        1 | male                            | 46 (46.0%)
         |        2 | female                          | 54 (54.0%)
      ---+----------+---------------------------------+-----------
      3  |        1 | independent                     |  2 ( 2.1%)
         |        2 | slightly dependent              |  4 ( 4.1%)
         |        3 | moderately dependent            | 28 (28.9%)
         |        4 | severely dependent              | 63 (64.9%)
      ---+----------+---------------------------------+-----------
      4  |        1 | low level of education          |  8 ( 8.9%)
         |        2 | intermediate level of education | 66 (73.3%)
         |        3 | high level of education         | 16 (17.8%)
      ---+----------+---------------------------------+-----------
      5  |  [7, 28] |                                 |         97
      ------------------------------------------------------------

# data_codebook efc, value_label_width

    Code
      print(out, table_width = Inf)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                        | Type        |   Missings |   Values | Value Labels     |          N
      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%) | [5, 168] |                  |         98
         |          | care per week                |             |            |          |                  |           
      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%) |        1 | male             | 46 (46.0%)
         |          |                              |             |            |        2 | female           | 54 (54.0%)
      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%) |        1 | independent      |  2 ( 2.1%)
         |          |                              |             |            |        2 | slightly...      |  4 ( 4.1%)
         |          |                              |             |            |        3 | moderately...    | 28 (28.9%)
         |          |                              |             |            |        4 | severely...      | 63 (64.9%)
      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
      4  | c172code | carer's level of education   | numeric     | 10 (10.0%) |        1 | low level of...  |  8 ( 8.9%)
         |          |                              |             |            |        2 | intermediate...  | 66 (73.3%)
         |          |                              |             |            |        3 | high level of... | 16 (17.8%)
      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%) |  [7, 28] |                  |         97
      ------------------------------------------------------------------------------------------------------------------

---

    Code
      print(out, table_width = "auto", remove_duplicates = FALSE)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                        | Type        |   Missings
      ---+----------+------------------------------+-------------+-----------
      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
         |          | care per week                |             |           
      ---+----------+------------------------------+-------------+-----------
      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
         |          |                              |             |           
      ---+----------+------------------------------+-------------+-----------
      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
         |          |                              |             |           
         |          |                              |             |           
         |          |                              |             |           
      ---+----------+------------------------------+-------------+-----------
      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
         |          |                              |             |           
         |          |                              |             |           
      ---+----------+------------------------------+-------------+-----------
      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
      -----------------------------------------------------------------------
      
      ID |   Values | Value Labels     |          N
      ---+----------+------------------+-----------
      1  | [5, 168] |                  |         98
         |          |                  |           
      ---+----------+------------------+-----------
      2  |        1 | male             | 46 (46.0%)
         |        2 | female           | 54 (54.0%)
      ---+----------+------------------+-----------
      3  |        1 | independent      |  2 ( 2.1%)
         |        2 | slightly...      |  4 ( 4.1%)
         |        3 | moderately...    | 28 (28.9%)
         |        4 | severely...      | 63 (64.9%)
      ---+----------+------------------+-----------
      4  |        1 | low level of...  |  8 ( 8.9%)
         |        2 | intermediate...  | 66 (73.3%)
         |        3 | high level of... | 16 (17.8%)
      ---+----------+------------------+-----------
      5  |  [7, 28] |                  |         97
      ---------------------------------------------

---

    Code
      print(out, table_width = "auto", remove_duplicates = TRUE)
    Output
      efc (100 rows and 5 variables, 5 shown)
      
      ID | Name     | Label                        | Type        |   Missings
      ---+----------+------------------------------+-------------+-----------
      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
         |          | care per week                |             |           
      ---+----------+------------------------------+-------------+-----------
      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
      ---+----------+------------------------------+-------------+-----------
      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
      ---+----------+------------------------------+-------------+-----------
      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
      ---+----------+------------------------------+-------------+-----------
      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
      -----------------------------------------------------------------------
      
      ID |   Values | Value Labels     |          N
      ---+----------+------------------+-----------
      1  | [5, 168] |                  |         98
      ---+----------+------------------+-----------
      2  |        1 | male             | 46 (46.0%)
         |        2 | female           | 54 (54.0%)
      ---+----------+------------------+-----------
      3  |        1 | independent      |  2 ( 2.1%)
         |        2 | slightly...      |  4 ( 4.1%)
         |        3 | moderately...    | 28 (28.9%)
         |        4 | severely...      | 63 (64.9%)
      ---+----------+------------------+-----------
      4  |        1 | low level of...  |  8 ( 8.9%)
         |        2 | intermediate...  | 66 (73.3%)
         |        3 | high level of... | 16 (17.8%)
      ---+----------+------------------+-----------
      5  |  [7, 28] |                  |         97
      ---------------------------------------------

# data_codebook truncated data

    Code
      data_codebook(d, max_values = 5)
    Output
      d (100 rows and 2 variables, 2 shown)
      
      ID | Name | Type      | Missings |  Values |        N
      ---+------+-----------+----------+---------+---------
      1  | a    | integer   | 0 (0.0%) | [1, 15] |      100
      ---+------+-----------+----------+---------+---------
      2  | b    | character | 0 (0.0%) |       a | 4 (4.0%)
         |      |           |          |       b | 3 (3.0%)
         |      |           |          |       c | 5 (5.0%)
         |      |           |          |       d | 4 (4.0%)
         |      |           |          |       e | 3 (3.0%)
         |      |           |          |   (...) |         
      -----------------------------------------------------

# data_codebook mixed numeric lengths

    Code
      data_codebook(d)
    Output
      d (100 rows and 2 variables, 2 shown)
      
      ID | Name | Type    | Missings |  Values |          N
      ---+------+---------+----------+---------+-----------
      1  | a    | integer | 0 (0.0%) |       1 | 28 (28.0%)
         |      |         |          |       2 | 26 (26.0%)
         |      |         |          |       3 | 29 (29.0%)
         |      |         |          |       4 | 17 (17.0%)
      ---+------+---------+----------+---------+-----------
      2  | b    | integer | 0 (0.0%) | [5, 15] |        100
      -----------------------------------------------------

# data_codebook mixed range_at

    Code
      data_codebook(d, range_at = 3)
    Output
      d (100 rows and 2 variables, 2 shown)
      
      ID | Name | Type    | Missings |  Values |   N
      ---+------+---------+----------+---------+----
      1  | a    | integer | 0 (0.0%) |  [1, 4] | 100
      ---+------+---------+----------+---------+----
      2  | b    | integer | 0 (0.0%) | [5, 15] | 100
      ----------------------------------------------

# data_codebook logicals

    Code
      data_codebook(d)
    Output
      d (100 rows and 3 variables, 3 shown)
      
      ID | Name | Type      | Missings |  Values |          N
      ---+------+-----------+----------+---------+-----------
      1  | a    | integer   | 0 (0.0%) | [1, 15] |        100
      ---+------+-----------+----------+---------+-----------
      2  | b    | character | 0 (0.0%) |       a | 26 (26.0%)
         |      |           |          |       b | 38 (38.0%)
         |      |           |          |       c | 36 (36.0%)
      ---+------+-----------+----------+---------+-----------
      3  | c    | logical   | 0 (0.0%) |   FALSE | 42 (42.0%)
         |      |           |          |    TRUE | 58 (58.0%)
      -------------------------------------------------------

# data_codebook labelled data exceptions

    Code
      data_codebook(d)
    Output
      d (100 rows and 3 variables, 3 shown)
      
      ID | Name | Type    |   Missings | Values | Value Labels |          N
      ---+------+---------+------------+--------+--------------+-----------
      1  | f1   | integer | 17 (17.0%) |      1 | One          | 21 (25.3%)
         |      |         |            |      2 | Two          | 20 (24.1%)
         |      |         |            |      3 | Three        | 23 (27.7%)
         |      |         |            |      5 | Five         | 19 (22.9%)
      ---+------+---------+------------+--------+--------------+-----------
      2  | f2   | integer |   0 (0.0%) |      1 | One          | 25 (25.0%)
         |      |         |            |      2 | Two          | 20 (20.0%)
         |      |         |            |      3 | Three        | 14 (14.0%)
         |      |         |            |      4 | 4            | 17 (17.0%)
         |      |         |            |      5 | Five         | 24 (24.0%)
      ---+------+---------+------------+--------+--------------+-----------
      3  | f3   | integer |   0 (0.0%) |      1 | One          | 21 (21.0%)
         |      |         |            |      2 | Two          | 24 (24.0%)
         |      |         |            |      3 | Three        | 16 (16.0%)
         |      |         |            |      4 | Four         | 14 (14.0%)
         |      |         |            |      5 | Five         | 25 (25.0%)
      ---------------------------------------------------------------------

# data_codebook labelled data factors

    Code
      data_codebook(d)
    Output
      d (100 rows and 3 variables, 3 shown)
      
      ID | Name | Type        | Missings | Values | Value Labels |          N
      ---+------+-------------+----------+--------+--------------+-----------
      1  | f1   | categorical | 0 (0.0%) |      a | A            | 35 (35.0%)
         |      |             |          |      b | Bee          | 32 (32.0%)
         |      |             |          |      c | Cee          | 33 (33.0%)
      ---+------+-------------+----------+--------+--------------+-----------
      2  | f2   | categorical | 0 (0.0%) |      a | A            | 30 (30.0%)
         |      |             |          |      b | Bee          | 38 (38.0%)
         |      |             |          |      c | Cee          | 32 (32.0%)
      ---+------+-------------+----------+--------+--------------+-----------
      3  | f3   | categorical | 0 (0.0%) |      a | A            | 23 (23.0%)
         |      |             |          |      b | Bee          | 28 (28.0%)
         |      |             |          |      c | Cee          | 49 (49.0%)
      -----------------------------------------------------------------------

# data_codebook works with numbers < 1

    Code
      data_codebook(d)
    Output
      d (6 rows and 2 variables, 2 shown)
      
      ID | Name | Type    | Missings | Values |         N
      ---+------+---------+----------+--------+----------
      1  | a    | numeric | 0 (0.0%) |      1 | 2 (33.3%)
         |      |         |          |      2 | 2 (33.3%)
         |      |         |          |      3 | 2 (33.3%)
      ---+------+---------+----------+--------+----------
      2  | b    | numeric | 0 (0.0%) |      0 | 3 (50.0%)
         |      |         |          |      1 | 2 (33.3%)
         |      |         |          |      2 | 1 (16.7%)
      ---------------------------------------------------

# data_codebook, big marks

    Code
      data_codebook(d)
    Output
      d (1,000,000 rows and 2 variables, 2 shown)
      
      ID | Name | Type        | Missings | Values |               N
      ---+------+-------------+----------+--------+----------------
      1  | f1   | categorical | 0 (0.0%) |      a | 333,238 (33.3%)
         |      |             |          |      b | 332,910 (33.3%)
         |      |             |          |      c | 333,852 (33.4%)
      ---+------+-------------+----------+--------+----------------
      2  | f2   | categorical | 0 (0.0%) |      1 | 333,285 (33.3%)
         |      |             |          |      2 | 333,358 (33.3%)
         |      |             |          |      3 | 333,357 (33.3%)
      -------------------------------------------------------------

# data_codebook, tagged NA

    Code
      data_codebook(data.frame(x))
    Output
      data.frame(x) (26 rows and 1 variables, 1 shown)
      
      ID | Name | Type    |   Missings | Values | Value Labels |         N
      ---+------+---------+------------+--------+--------------+----------
      1  | x    | numeric | 12 (46.2%) |      1 | Agreement    | 4 (15.4%)
         |      |         |            |      2 | 2            | 4 (15.4%)
         |      |         |            |      3 | 3            | 4 (15.4%)
         |      |         |            |      4 | Disagreement | 2 ( 7.7%)
         |      |         |            |  NA(a) | Refused      | 4 (15.4%)
         |      |         |            |  NA(c) | First        | 5 (19.2%)
         |      |         |            |  NA(z) | Not home     | 3 (11.5%)
      --------------------------------------------------------------------

---

    Code
      data_codebook(data.frame(x))
    Output
      data.frame(x) (23 rows and 1 variables, 1 shown)
      
      ID | Name | Type    |  Missings | Values | Value Labels |         N
      ---+------+---------+-----------+--------+--------------+----------
      1  | x    | numeric | 9 (39.1%) |      1 | Agreement    | 4 (17.4%)
         |      |         |           |      2 | 2            | 4 (17.4%)
         |      |         |           |      3 | 3            | 4 (17.4%)
         |      |         |           |      4 | Disagreement | 2 ( 8.7%)
         |      |         |           |  NA(a) | Refused      | 4 (17.4%)
         |      |         |           |  NA(c) | First        | 5 (21.7%)
      -------------------------------------------------------------------

# data_codebook, negative label values #334

    Code
      data_codebook(data.frame(x1, x2))
    Output
      data.frame(x1, x2) (4 rows and 2 variables, 2 shown)
      
      ID | Name | Type    | Missings | Values | Value Labels |         N
      ---+------+---------+----------+--------+--------------+----------
      1  | x1   | integer | 0 (0.0%) |      1 | Agreement    | 1 (25.0%)
         |      |         |          |      2 | 2            | 1 (25.0%)
         |      |         |          |      3 | 3            | 1 (25.0%)
         |      |         |          |      4 | Disagreement | 1 (25.0%)
      ---+------+---------+----------+--------+--------------+----------
      2  | x2   | numeric | 0 (0.0%) |     -9 | Missing      | 1 (25.0%)
         |      |         |          |      1 | Agreement    | 1 (25.0%)
         |      |         |          |      2 | 2            | 1 (25.0%)
         |      |         |          |      3 | 3            | 1 (25.0%)
      ------------------------------------------------------------------


================================================
FILE: tests/testthat/_snaps/data_modify.md
================================================
# data_modify message about recycling values

    Code
      head(data_modify(iris, Sepal.Width = 1))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1          5.1           1          1.4         0.2  setosa
      2          4.9           1          1.4         0.2  setosa
      3          4.7           1          1.3         0.2  setosa
      4          4.6           1          1.5         0.2  setosa
      5          5.0           1          1.4         0.2  setosa
      6          5.4           1          1.7         0.4  setosa

---

    Code
      head(data_modify(iris, Sepal.Width = 1:2))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1          5.1           1          1.4         0.2  setosa
      2          4.9           2          1.4         0.2  setosa
      3          4.7           1          1.3         0.2  setosa
      4          4.6           2          1.5         0.2  setosa
      5          5.0           1          1.4         0.2  setosa
      6          5.4           2          1.7         0.4  setosa

---

    Code
      head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1          5.1           1            1         0.2  setosa
      2          4.9           1            1         0.2  setosa
      3          4.7           1            1         0.2  setosa
      4          4.6           1            1         0.2  setosa
      5          5.0           1            1         0.2  setosa
      6          5.4           1            1         0.4  setosa

---

    Code
      head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1:2))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1          5.1           1            1         0.2  setosa
      2          4.9           2            1         0.2  setosa
      3          4.7           1            1         0.2  setosa
      4          4.6           2            1         0.2  setosa
      5          5.0           1            1         0.2  setosa
      6          5.4           2            1         0.4  setosa

---

    Code
      head(data_modify(iris, Petal.Length = 2, Sepal.Width = 2))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1          5.1           2            2         0.2  setosa
      2          4.9           2            2         0.2  setosa
      3          4.7           2            2         0.2  setosa
      4          4.6           2            2         0.2  setosa
      5          5.0           2            2         0.2  setosa
      6          5.4           2            2         0.4  setosa

# data_modify message about modified variables

    Code
      head(data_modify(iris, Sepal.Width = 2 * Sepal.Width))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1          5.1         7.0          1.4         0.2  setosa
      2          4.9         6.0          1.4         0.2  setosa
      3          4.7         6.4          1.3         0.2  setosa
      4          4.6         6.2          1.5         0.2  setosa
      5          5.0         7.2          1.4         0.2  setosa
      6          5.4         7.8          1.7         0.4  setosa

---

    Code
      head(data_modify(iris, Petal.Length = Sepal.Length, Sepal.Width = Petal.Width))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1          5.1         0.2          5.1         0.2  setosa
      2          4.9         0.2          4.9         0.2  setosa
      3          4.7         0.2          4.7         0.2  setosa
      4          4.6         0.2          4.6         0.2  setosa
      5          5.0         0.2          5.0         0.2  setosa
      6          5.4         0.4          5.4         0.4  setosa

# data_modify works with new expressions, different use cases same results

    Code
      print(head(out_complex))
    Output
        Species sepwid seplen half_petal new_var new_num new_var2 new_num2
      1  setosa    7.0   25.5       0.70  string       1       ho        4
      2  setosa    6.0   24.5       0.70  string       2       ho        5
      3  setosa    6.4   23.5       0.65  string       3       ho        6
      4  setosa    6.2   23.0       0.75  string       4       ho        4
      5  setosa    7.2   25.0       0.70  string       5       ho        5
      6  setosa    7.8   27.0       0.85  string       1       ho        6


================================================
FILE: tests/testthat/_snaps/data_partition.md
================================================
# data_partition works as expected

    Code
      data_partition(letters, seed = 123)
    Output
      $p_0.7
         data .row_id
      1     c       3
      2     e       5
      3     h       8
      4     i       9
      5     j      10
      6     k      11
      7     l      12
      8     m      13
      9     n      14
      10    o      15
      11    p      16
      12    r      18
      13    s      19
      14    t      20
      15    u      21
      16    w      23
      17    x      24
      18    y      25
      
      $test
        data .row_id
      1    a       1
      2    b       2
      3    d       4
      4    f       6
      5    g       7
      6    q      17
      7    v      22
      8    z      26
      

---

    Code
      str(data_partition(iris, proportion = 0.7, seed = 123))
    Output
      List of 2
       $ p_0.7:'data.frame':	105 obs. of  6 variables:
        ..$ Sepal.Length: num [1:105] 4.6 5.4 4.6 5 4.4 4.9 4.8 4.8 4.3 5.8 ...
        ..$ Sepal.Width : num [1:105] 3.1 3.9 3.4 3.4 2.9 3.1 3.4 3 3 4 ...
        ..$ Petal.Length: num [1:105] 1.5 1.7 1.4 1.5 1.4 1.5 1.6 1.4 1.1 1.2 ...
        ..$ Petal.Width : num [1:105] 0.2 0.4 0.3 0.2 0.2 0.1 0.2 0.1 0.1 0.2 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:105] 4 6 7 8 9 10 12 13 14 15 ...
       $ test :'data.frame':	45 obs. of  6 variables:
        ..$ Sepal.Length: num [1:45] 5.1 4.9 4.7 5 5.4 5.1 5.7 5.2 5.2 5.2 ...
        ..$ Sepal.Width : num [1:45] 3.5 3 3.2 3.6 3.7 3.5 3.8 3.5 3.4 4.1 ...
        ..$ Petal.Length: num [1:45] 1.4 1.4 1.3 1.4 1.5 1.4 1.7 1.5 1.4 1.5 ...
        ..$ Petal.Width : num [1:45] 0.2 0.2 0.2 0.2 0.2 0.3 0.3 0.2 0.2 0.1 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:45] 1 2 3 5 11 18 19 28 29 33 ...

---

    Code
      str(data_partition(iris, proportion = c(0.2, 0.5), seed = 123))
    Output
      List of 3
       $ p_0.2:'data.frame':	30 obs. of  6 variables:
        ..$ Sepal.Length: num [1:30] 4.6 4.4 4.3 4.6 5 5 5.4 5 4.4 5 ...
        ..$ Sepal.Width : num [1:30] 3.4 2.9 3 3.6 3 3.4 3.4 3.5 3.2 3.3 ...
        ..$ Petal.Length: num [1:30] 1.4 1.4 1.1 1 1.6 1.6 1.5 1.3 1.3 1.4 ...
        ..$ Petal.Width : num [1:30] 0.3 0.2 0.1 0.2 0.2 0.4 0.4 0.3 0.2 0.2 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:30] 7 9 14 23 26 27 32 41 43 50 ...
       $ p_0.5:'data.frame':	75 obs. of  6 variables:
        ..$ Sepal.Length: num [1:75] 4.6 5.4 5 4.9 4.8 5.8 5.7 5.4 5.1 5.7 ...
        ..$ Sepal.Width : num [1:75] 3.1 3.9 3.4 3.1 3.4 4 4.4 3.9 3.5 3.8 ...
        ..$ Petal.Length: num [1:75] 1.5 1.7 1.5 1.5 1.6 1.2 1.5 1.3 1.4 1.7 ...
        ..$ Petal.Width : num [1:75] 0.2 0.4 0.2 0.1 0.2 0.2 0.4 0.4 0.3 0.3 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:75] 4 6 8 10 12 15 16 17 18 19 ...
       $ test :'data.frame':	45 obs. of  6 variables:
        ..$ Sepal.Length: num [1:45] 5.1 4.9 4.7 5 5.4 4.8 5.4 5.1 5.2 4.9 ...
        ..$ Sepal.Width : num [1:45] 3.5 3 3.2 3.6 3.7 3 3.4 3.7 4.1 3.1 ...
        ..$ Petal.Length: num [1:45] 1.4 1.4 1.3 1.4 1.5 1.4 1.7 1.5 1.5 1.5 ...
        ..$ Petal.Width : num [1:45] 0.2 0.2 0.2 0.2 0.2 0.1 0.2 0.4 0.1 0.2 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:45] 1 2 3 5 11 13 21 22 33 35 ...

---

    Code
      str(data_partition(iris, proportion = 0.7, by = "Species", seed = 123))
    Output
      List of 2
       $ p_0.7:'data.frame':	105 obs. of  6 variables:
        ..$ Sepal.Length: num [1:105] 4.7 4.6 5 4.6 5 4.4 4.9 5.4 4.8 4.8 ...
        ..$ Sepal.Width : num [1:105] 3.2 3.1 3.6 3.4 3.4 2.9 3.1 3.7 3.4 3 ...
        ..$ Petal.Length: num [1:105] 1.3 1.5 1.4 1.4 1.5 1.4 1.5 1.5 1.6 1.4 ...
        ..$ Petal.Width : num [1:105] 0.2 0.2 0.2 0.3 0.2 0.2 0.1 0.2 0.2 0.1 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:105] 3 4 5 7 8 9 10 11 12 13 ...
       $ test :'data.frame':	45 obs. of  6 variables:
        ..$ Sepal.Length: num [1:45] 5.1 4.9 5.4 5.7 5.1 5.1 5.1 4.6 5.5 4.9 ...
        ..$ Sepal.Width : num [1:45] 3.5 3 3.9 4.4 3.5 3.8 3.7 3.6 4.2 3.1 ...
        ..$ Petal.Length: num [1:45] 1.4 1.4 1.7 1.5 1.4 1.5 1.5 1 1.4 1.5 ...
        ..$ Petal.Width : num [1:45] 0.2 0.2 0.4 0.4 0.3 0.3 0.4 0.2 0.2 0.2 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:45] 1 2 6 16 18 20 22 23 34 35 ...

---

    Code
      str(data_partition(iris, proportion = c(0.2, 0.5), by = "Species", seed = 123))
    Output
      List of 3
       $ p_0.2:'data.frame':	30 obs. of  6 variables:
        ..$ Sepal.Length: num [1:30] 4.7 4.3 5.8 4.8 5 4.8 5.5 4.5 4.4 4.6 ...
        ..$ Sepal.Width : num [1:30] 3.2 3 4 3.4 3 3.1 3.5 2.3 3.2 3.2 ...
        ..$ Petal.Length: num [1:30] 1.3 1.1 1.2 1.9 1.6 1.6 1.3 1.3 1.3 1.4 ...
        ..$ Petal.Width : num [1:30] 0.2 0.1 0.2 0.2 0.2 0.2 0.2 0.3 0.2 0.2 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:30] 3 14 15 25 26 31 37 42 43 48 ...
       $ p_0.5:'data.frame':	75 obs. of  6 variables:
        ..$ Sepal.Length: num [1:75] 5 5.4 5 4.4 4.9 5.4 4.8 4.8 5.7 5.4 ...
        ..$ Sepal.Width : num [1:75] 3.6 3.9 3.4 2.9 3.1 3.7 3.4 3 4.4 3.9 ...
        ..$ Petal.Length: num [1:75] 1.4 1.7 1.5 1.4 1.5 1.5 1.6 1.4 1.5 1.3 ...
        ..$ Petal.Width : num [1:75] 0.2 0.4 0.2 0.2 0.1 0.2 0.2 0.1 0.4 0.4 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:75] 5 6 8 9 10 11 12 13 16 17 ...
       $ test :'data.frame':	45 obs. of  6 variables:
        ..$ Sepal.Length: num [1:45] 5.1 4.9 4.6 4.6 5.7 5.4 4.6 5 5.2 4.7 ...
        ..$ Sepal.Width : num [1:45] 3.5 3 3.1 3.4 3.8 3.4 3.6 3.4 3.5 3.2 ...
        ..$ Petal.Length: num [1:45] 1.4 1.4 1.5 1.4 1.7 1.7 1 1.6 1.5 1.6 ...
        ..$ Petal.Width : num [1:45] 0.2 0.2 0.2 0.3 0.3 0.2 0.2 0.4 0.2 0.2 ...
        ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
        ..$ .row_id     : int [1:45] 1 2 4 7 19 21 23 27 28 30 ...


================================================
FILE: tests/testthat/_snaps/data_peek.md
================================================
# data_peek snapshots look as expected

    Code
      data_peek(iris)
    Output
      Data frame with 150 rows and 5 variables
      
      Variable     | Type    | Values                                        
      -----------------------------------------------------------------------
      Sepal.Length | numeric | 5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, ...  
      Sepal.Width  | numeric | 3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, ...
      Petal.Length | numeric | 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, ...   
      Petal.Width  | numeric | 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, ...   
      Species      | factor  | setosa, setosa, setosa, setosa, setosa, ...   

---

    Code
      data_peek(iris, select = 1:3)
    Output
      Data frame with 150 rows and 5 variables
      
      Variable     | Type    | Values                                        
      -----------------------------------------------------------------------
      Sepal.Length | numeric | 5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, ...  
      Sepal.Width  | numeric | 3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, ...
      Petal.Length | numeric | 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, ...   

---

    Code
      data_peek(iris, width = 130)
    Output
      Data frame with 150 rows and 5 variables
      
      Variable     | Type    | Values                                                                                                  
      ---------------------------------------------------------------------------------------------------------------------------------
      Sepal.Length | numeric | 5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, ...
      Sepal.Width  | numeric | 3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3, 3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, ...    
      Petal.Length | numeric | 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, ... 
      Petal.Width  | numeric | 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, ... 
      Species      | factor  | setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, ...     


================================================
FILE: tests/testthat/_snaps/data_read.md
================================================
# data_read, convert many labels correctly

    Code
      data_tabulate(d$selv1)
    Output
      d$selv1 <categorical>
      # total N=2413 valid N=2413
      
      Value                                              |   N | Raw % | Valid % | Cumulative %
      ---------------------------------------------------+-----+-------+---------+-------------
      Vignette 1 weiblich (Gülsen E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |         6.22
      Vignette 2 weiblich (Gülsen E. Anwältin B)         | 150 |  6.22 |    6.22 |        12.43
      Vignette 3 weiblich (Monika E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |        18.65
      Vignette 4 weiblich (Monika E. Anwältin B)         | 151 |  6.26 |    6.26 |        24.91
      Vignette 5 männlich (Hasan E. Reinigungskraft B)   | 151 |  6.26 |    6.26 |        31.16
      Vignette 6 männlich (Hasan E. Anwalt B)            | 153 |  6.34 |    6.34 |        37.51
      Vignette 7 männlich (Martin E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |        43.72
      Vignette 8 männlich (Martin E. Anwalt B)           | 150 |  6.22 |    6.22 |        49.94
      Vignette 9 weiblich (Gülsen E. Reinigungskraft E)  | 151 |  6.26 |    6.26 |        56.20
      Vignette 10 weiblich (Gülsen E. Anwältin E)        | 150 |  6.22 |    6.22 |        62.41
      Vignette 11 weiblich (Monika E. Reinigungskraft E) | 150 |  6.22 |    6.22 |        68.63
      Vignette 12 weiblich (Monika E. Anwältin E)        | 151 |  6.26 |    6.26 |        74.89
      Vignette 13 männlich (Hasan E. Reinigungskraft E)  | 155 |  6.42 |    6.42 |        81.31
      Vignette 14 männlich (Hasan E. Anwalt E)           | 150 |  6.22 |    6.22 |        87.53
      Vignette 15 männlich (Martin E. Reinigungskraft E) | 150 |  6.22 |    6.22 |        93.74
      Vignette 16 männlich (Martin E. Anwalt E)          | 151 |  6.26 |    6.26 |       100.00
      <NA>                                               |   0 |  0.00 |    <NA> |         <NA>

---

    Code
      data_tabulate(d$c12)
    Output
      Sind oder waren Sie schon einmal selbst von solchen Beschwerden betroffen? (d$c12) <categorical>
      # total N=2413 valid N=2413
      
      Value        |    N | Raw % | Valid % | Cumulative %
      -------------+------+-------+---------+-------------
      ja           |  786 | 32.57 |   32.57 |        32.57
      nein         | 1616 | 66.97 |   66.97 |        99.54
      keine Angabe |   11 |  0.46 |    0.46 |       100.00
      <NA>         |    0 |  0.00 |    <NA> |         <NA>

---

    Code
      data_tabulate(d$c12a)
    Output
      Haben Sie deswegen Behandlung(en) in Anspruch genommen? (d$c12a) <categorical>
      # total N=2413 valid N=2413
      
      Value        |    N | Raw % | Valid % | Cumulative %
      -------------+------+-------+---------+-------------
      Filter       | 1627 | 67.43 |   67.43 |        67.43
      ja           |  500 | 20.72 |   20.72 |        88.15
      nein         |  285 | 11.81 |   11.81 |        99.96
      keine Angabe |    1 |  0.04 |    0.04 |       100.00
      <NA>         |    0 |  0.00 |    <NA> |         <NA>

---

    Code
      data_tabulate(d$c12c)
    Output
      Wie sehr haben diese Behandlung(en) Ihre Beeinträchtigung durch die Beschwerden verbessert? (d$c12c) <categorical>
      # total N=2413 valid N=2413
      
      Value                     |    N | Raw % | Valid % | Cumulative %
      --------------------------+------+-------+---------+-------------
      Filter                    | 1913 | 79.28 |   79.28 |        79.28
      0 = keine                 |   34 |  1.41 |    1.41 |        80.69
      1                         |    2 |  0.08 |    0.08 |        80.77
      2                         |   11 |  0.46 |    0.46 |        81.23
      3                         |   14 |  0.58 |    0.58 |        81.81
      4                         |   19 |  0.79 |    0.79 |        82.59
      5                         |   61 |  2.53 |    2.53 |        85.12
      6                         |   42 |  1.74 |    1.74 |        86.86
      7                         |   63 |  2.61 |    2.61 |        89.47
      8                         |   97 |  4.02 |    4.02 |        93.49
      9                         |   53 |  2.20 |    2.20 |        95.69
      10 = sehr starke          |   99 |  4.10 |    4.10 |        99.79
      weiß nicht / keine Angabe |    5 |  0.21 |    0.21 |       100.00
      <NA>                      |    0 |  0.00 |    <NA> |         <NA>

---

    Code
      table(d$selv1)
    Output
      
        1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
      150 150 150 151 151 153 150 150 151 150 150 151 155 150 150 151 

---

    Code
      table(d$c12)
    Output
      
         1    2   99 
       786 1616   11 

---

    Code
      table(d$c12a)
    Output
      
        -2    1    2   99 
      1627  500  285    1 

---

    Code
      table(d$c12c)
    Output
      
        -2    0    1    2    3    4    5    6    7    8    9   10   99 
      1913   34    2   11   14   19   61   42   63   97   53   99    5 


================================================
FILE: tests/testthat/_snaps/data_rescale.md
================================================
# rescale works as expected

    Code
      head(rescale(iris, to = c(0, 1)))
    Message
      Variables of class `factor` can't be rescaled and remain unchanged.
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1   0.22222222   0.6250000   0.06779661  0.04166667  setosa
      2   0.16666667   0.4166667   0.06779661  0.04166667  setosa
      3   0.11111111   0.5000000   0.05084746  0.04166667  setosa
      4   0.08333333   0.4583333   0.08474576  0.04166667  setosa
      5   0.19444444   0.6666667   0.06779661  0.04166667  setosa
      6   0.30555556   0.7916667   0.11864407  0.12500000  setosa

---

    Code
      head(rescale(iris, to = c(0, 1), select = "Sepal.Length"))
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1   0.22222222         3.5          1.4         0.2  setosa
      2   0.16666667         3.0          1.4         0.2  setosa
      3   0.11111111         3.2          1.3         0.2  setosa
      4   0.08333333         3.1          1.5         0.2  setosa
      5   0.19444444         3.6          1.4         0.2  setosa
      6   0.30555556         3.9          1.7         0.4  setosa

---

    Code
      head(rescale(iris, to = list(Sepal.Length = c(0, 1), Petal.Length = c(-1, 0))))
    Message
      Variables of class `factor` can't be rescaled and remain unchanged.
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species
      1   0.22222222         3.5   -0.9322034         0.2  setosa
      2   0.16666667         3.0   -0.9322034         0.2  setosa
      3   0.11111111         3.2   -0.9491525         0.2  setosa
      4   0.08333333         3.1   -0.9152542         0.2  setosa
      5   0.19444444         3.6   -0.9322034         0.2  setosa
      6   0.30555556         3.9   -0.8813559         0.4  setosa


================================================
FILE: tests/testthat/_snaps/data_seek.md
================================================
# data_seek - print

    Code
      data_seek(iris, "Length")
    Output
      index |       column |       labels
      -----------------------------------
          1 | Sepal.Length | Sepal.Length
          3 | Petal.Length | Petal.Length

---

    Code
      data_seek(iris, "abc")
    Output
      No matches found.


================================================
FILE: tests/testthat/_snaps/data_separate.md
================================================
# data_separate: multiple columns

    Code
      out
    Output
        x_1 x_2  x_3  y_1  y_2  y_3
      1   1   a    6    m    n   99
      2   2   b    7   77    f    g
      3   3   c    8   44    9 <NA>
      4   5   j <NA> <NA> <NA> <NA>

---

    Code
      out
    Output
        x_1 x_2  x_3  y_1  y_2  y_3
      1   1   a    6    m    n   99
      2   2   b  7 d   77    f    g
      3   3   c    8   44    9 <NA>
      4   5   j <NA> <NA> <NA> <NA>

---

    Code
      out
    Output
        x_A x_B  x_C  y_A  y_B  y_C
      1   1   a    6    m    n   99
      2   2   b  7 d   77    f    g
      3   3   c    8   44    9 <NA>
      4   5   j <NA> <NA> <NA> <NA>

---

    Code
      out
    Output
              x      y x_A x_B  x_C  y_A  y_B  y_C
      1   1.a.6 m.n.99   1   a    6    m    n   99
      2 2.b.7.d 77.f.g   2   b  7 d   77    f    g
      3   3.c.8   44.9   3   c    8   44    9 <NA>
      4     5.j   <NA>   5   j <NA> <NA> <NA> <NA>

---

    Code
      out
    Output
        x_1 x_2  x_3  y_1  y_2  y_3
      1   1   a    6    m    n   99
      2   b   7    d   77    f    g
      3   3   c    8   44    9 <NA>
      4   5   j <NA> <NA> <NA> <NA>

---

    Code
      out
    Output
              x      y x_A x_B x_C  y_A  y_B  y_C
      1   1.a.6 m.n.99   1   a   6    m    n   99
      2 2.b.7.d 77.f.g   2   b 7 d   77    f    g
      3   3.c.8   44.9   3   c   8   44    9    9
      4     5.j   <NA>   5   j   j <NA> <NA> <NA>

---

    Code
      out
    Output
              x      y   A   B   C
      1   1.a.6 m.n.99  1m  an 699
      2 2.b.7.d 77.f.g 277  bf 7dg
      3   3.c.8   44.9 344  c9  89
      4     5.j   <NA> 5NA jNA jNA

---

    Code
      out
    Output
              x      y   A   B    C
      1   1.a.6 m.n.99  1m  an  699
      2 2.b.7.d 77.f.g 277  bf   7g
      3   3.c.8   44.9 344  c9  8NA
      4     5.j   <NA> 5NA jNA NANA

---

    Code
      out
    Output
        x_1 x_2 x_3  y_1  y_2  y_3
      1   1   a   6    m    n   99
      2   2   b   7   77    f    g
      3   3   c   8   44   44    9
      4   5   5   j <NA> <NA> <NA>

# data_separate: multiple columns, different lengths

    Code
      out
    Output
        A B    C   EE   FF   GG
      1 1 a    6    m    n   99
      2 2 b    7   77    f    g
      3 3 c    8   44    9 <NA>
      4 5 j <NA> <NA> <NA> <NA>

---

    Code
      out
    Output
        A B    C   EE   FF   GG   HH
      1 1 a    6    m    n   99   22
      2 2 b    7   77    f    g   34
      3 3 c    8   44    9 <NA> <NA>
      4 5 j <NA> <NA> <NA> <NA> <NA>

# data_separate: fail if invalid column selected

    Code
      data_separate(d_sep, guess_columns = "mode", select = NULL)
    Message
      Column `x` had different number of values after splitting. Variable was
        split into 3 columns.
      `x` returned more columns than expected after splitting. Right-most
        columns have been dropped.
      `x`returned fewer columns than expected after splitting. Right-most
        columns were filled with `NA`.
      Column `y` had different number of values after splitting. Variable was
        split into 3 columns.
      `y`returned fewer columns than expected after splitting. Right-most
        columns were filled with `NA`.
    Output
        x_1 x_2  x_3  y_1  y_2  y_3
      1   1   a    6    m    n   99
      2   2   b    7   77    f    g
      3   3   c    8   44    9 <NA>
      4   5   j <NA> <NA> <NA> <NA>

# data_separate: numeric column

    Code
      out
    Output
              y x_1 x_2 x_3 x_4
      V1 m.n.99  15 435 352   3
      V2 77.f.g  53 554 353   2
      V3   44.9  12 342 422    
      V4   <NA>  15 454 334 535


================================================
FILE: tests/testthat/_snaps/data_summary.md
================================================
# data_summary, print

    Code
      print(out)
    Output
      am | gear |    MW |   SD
      ------------------------
       0 |    3 | 16.11 | 3.37
       0 |    4 | 21.05 | 3.07
       1 |    4 | 26.27 | 5.41
       1 |    5 | 21.38 | 6.66

# data_summary, with NA

    Code
      print(out)
    Output
      c172code |    MW
      ----------------
             1 | 87.12
             2 | 94.05
             3 | 75.00
          <NA> | 47.80

---

    Code
      print(out)
    Output
      c172code |    MW
      ----------------
             1 | 87.12
             2 | 94.05
             3 | 75.00

---

    Code
      print(out)
    Output
      e42dep | c172code |     MW
      --------------------------
      1      |        2 |  17.00
      2      |        2 |  34.25
      3      |        1 |  39.50
      3      |        2 |  52.44
      3      |        3 |  52.00
      3      |     <NA> |  84.00
      4      |        1 | 134.75
      4      |        2 | 119.26
      4      |        3 |  88.80
      4      |     <NA> |  43.29
      <NA>   |        2 |   <NA>
      <NA>   |     <NA> |   7.00

# data_summary, bayestestR::ci

    Code
      out
    Output
      am | gear | mean_value |         95% CI
      ---------------------------------------
       0 |    3 |      16.11 | [10.40, 21.46]
       0 |    4 |      21.05 | [17.91, 24.28]
       1 |    4 |      26.27 | [21.00, 33.64]
       1 |    5 |      21.38 | [15.08, 29.96]


================================================
FILE: tests/testthat/_snaps/data_tabulate.md
================================================
# data_tabulate, tinytable

    Code
      display(data_tabulate(efc$c172code), format = "tt")
    Output
      
      +-------+----+-------+---------+--------------+
      | Value | N  | Raw % | Valid % | Cumulative % |
      +=======+====+=======+=========+==============+
      | 1     | 8  | 8     | 8.89    | 8.89         |
      +-------+----+-------+---------+--------------+
      | 2     | 66 | 66    | 73.33   | 82.22        |
      +-------+----+-------+---------+--------------+
      | 3     | 16 | 16    | 17.78   | 100.00       |
      +-------+----+-------+---------+--------------+
      | (NA)  | 10 | 10    | (NA)    | (NA)         |
      +=======+====+=======+=========+==============+
      | total N=100 valid N=90                      |
      +=======+====+=======+=========+==============+
      Table: carer's level of education (efc$c172code) (numeric) 

---

    Code
      display(data_tabulate(efc, "c172code"), format = "tt")
    Output
      
      +-------+----+-------+---------+--------------+
      | Value | N  | Raw % | Valid % | Cumulative % |
      +=======+====+=======+=========+==============+
      | 1     | 8  | 8     | 8.89    | 8.89         |
      +-------+----+-------+---------+--------------+
      | 2     | 66 | 66    | 73.33   | 82.22        |
      +-------+----+-------+---------+--------------+
      | 3     | 16 | 16    | 17.78   | 100.00       |
      +-------+----+-------+---------+--------------+
      | (NA)  | 10 | 10    | (NA)    | (NA)         |
      +=======+====+=======+=========+==============+
      | total N=100 valid N=90                      |
      +=======+====+=======+=========+==============+
      Table: carer's level of education (c172code) (numeric) 

# data_tabulate, weights

    Code
      print(data_tabulate(efc$e42dep, weights = efc$weights))
    Output
      elder's dependency (efc$e42dep) <categorical>
      # total N=105 valid N=100 (weighted)
      
      Value |  N | Raw % | Valid % | Cumulative %
      ------+----+-------+---------+-------------
      1     |  3 |  2.86 |       3 |            3
      2     |  4 |  3.81 |       4 |            7
      3     | 26 | 24.76 |      26 |           33
      4     | 67 | 63.81 |      67 |          100
      <NA>  |  5 |  4.76 |    <NA> |         <NA>

---

    Code
      print_md(data_tabulate(efc$e42dep, weights = efc$weights))
    Output
      
      
      Table: elder's dependency (efc$e42dep) (categorical)
      
      |Value |  N| Raw %| Valid %| Cumulative %|
      |:-----|--:|-----:|-------:|------------:|
      |1     |  3|  2.86|       3|            3|
      |2     |  4|  3.81|       4|            7|
      |3     | 26| 24.76|      26|           33|
      |4     | 67| 63.81|      67|          100|
      |(NA)  |  5|  4.76|    (NA)|         (NA)|
      total N=105 valid N=100 (weighted)
      
      
---

    Code
      display(data_tabulate(efc$e42dep, weights = efc$weights))
    Output
      
      
      Table: elder's dependency (efc$e42dep) (categorical)
      
      |Value |  N| Raw %| Valid %| Cumulative %|
      |:-----|--:|-----:|-------:|------------:|
      |1     |  3|  2.86|       3|            3|
      |2     |  4|  3.81|       4|            7|
      |3     | 26| 24.76|      26|           33|
      |4     | 67| 63.81|      67|          100|
      |(NA)  |  5|  4.76|    (NA)|         (NA)|
      total N=105 valid N=100 (weighted)
      
      
---

    Code
      print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$
        weights))
    Output
      # Frequency Table (weighted)
      
      Variable | Value |  N | Raw % | Valid % | Cumulative %
      ---------+-------+----+-------+---------+-------------
      e42dep   |     1 |  3 |  2.86 |       3 |            3
               |     2 |  4 |  3.81 |       4 |            7
               |     3 | 26 | 24.76 |      26 |           33
               |     4 | 67 | 63.81 |      67 |          100
               |  <NA> |  5 |  4.76 |    <NA> |         <NA>
      ---------+-------+----+-------+---------+-------------
      e16sex   |     1 | 50 | 47.62 |   47.62 |        47.62
               |     2 | 55 | 52.38 |   52.38 |       100.00
               |  <NA> |  0 |  0.00 |    <NA> |         <NA>
      ------------------------------------------------------

---

    Code
      print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights))
    Output
      
      
      Table: Frequency Table (weighted)
      
      |Variable | Value|  N| Raw %| Valid %| Cumulative %|
      |:--------|-----:|--:|-----:|-------:|------------:|
      |e42dep   |     1|  3|  2.86|       3|            3|
      |         |     2|  4|  3.81|       4|            7|
      |         |     3| 26| 24.76|      26|           33|
      |         |     4| 67| 63.81|      67|          100|
      |         |  (NA)|  5|  4.76|    (NA)|         (NA)|
      |         |      |   |      |        |             |
      |e16sex   |     1| 50| 47.62|   47.62|        47.62|
      |         |     2| 55| 52.38|   52.38|       100.00|
      |         |  (NA)|  0|  0.00|    (NA)|         (NA)|
      |         |      |   |      |        |             |

---

    Code
      display(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights))
    Output
      
      
      Table: Frequency Table (weighted)
      
      |Variable | Value|  N| Raw %| Valid %| Cumulative %|
      |:--------|-----:|--:|-----:|-------:|------------:|
      |e42dep   |     1|  3|  2.86|       3|            3|
      |         |     2|  4|  3.81|       4|            7|
      |         |     3| 26| 24.76|      26|           33|
      |         |     4| 67| 63.81|      67|          100|
      |         |  (NA)|  5|  4.76|    (NA)|         (NA)|
      |         |      |   |      |        |             |
      |e16sex   |     1| 50| 47.62|   47.62|        47.62|
      |         |     2| 55| 52.38|   52.38|       100.00|
      |         |  (NA)|  0|  0.00|    (NA)|         (NA)|
      |         |      |   |      |        |             |

# data_tabulate print

    Code
      data_tabulate(efc$e42dep)
    Output
      elder's dependency (efc$e42dep) <categorical>
      # total N=100 valid N=97
      
      Value |  N | Raw % | Valid % | Cumulative %
      ------+----+-------+---------+-------------
      1     |  2 |     2 |    2.06 |         2.06
      2     |  4 |     4 |    4.12 |         6.19
      3     | 28 |    28 |   28.87 |        35.05
      4     | 63 |    63 |   64.95 |       100.00
      <NA>  |  3 |     3 |    <NA> |         <NA>

# data_tabulate print multiple

    Code
      data_tabulate(efc, c("c172code", "e16sex"))
    Output
      carer's level of education (c172code) <numeric>
      # total N=100 valid N=90
      
      Value |  N | Raw % | Valid % | Cumulative %
      ------+----+-------+---------+-------------
      1     |  8 |     8 |    8.89 |         8.89
      2     | 66 |    66 |   73.33 |        82.22
      3     | 16 |    16 |   17.78 |       100.00
      <NA>  | 10 |    10 |    <NA> |         <NA>
      
      elder's gender (e16sex) <numeric>
      # total N=100 valid N=100
      
      Value |  N | Raw % | Valid % | Cumulative %
      ------+----+-------+---------+-------------
      1     | 46 |    46 |      46 |           46
      2     | 54 |    54 |      54 |          100
      <NA>  |  0 |     0 |    <NA> |         <NA>

# data_tabulate big numbers

    Code
      data_tabulate(x)
    Output
      x <integer>
      # total N=10,000,000 valid N=10,000,000
      
      Value |         N | Raw % | Valid % | Cumulative %
      ------+-----------+-------+---------+-------------
      1     | 1,998,318 | 19.98 |   19.98 |        19.98
      2     | 1,998,338 | 19.98 |   19.98 |        39.97
      3     | 2,001,814 | 20.02 |   20.02 |        59.98
      4     | 1,999,423 | 19.99 |   19.99 |        79.98
      5     | 2,002,107 | 20.02 |   20.02 |       100.00
      <NA>  |         0 |  0.00 |    <NA> |         <NA>

---

    Code
      print(data_tabulate(x), big_mark = "-")
    Output
      x <integer>
      # total N=10-000-000 valid N=10-000-000
      
      Value |         N | Raw % | Valid % | Cumulative %
      ------+-----------+-------+---------+-------------
      1     | 1-998-318 | 19.98 |   19.98 |        19.98
      2     | 1-998-338 | 19.98 |   19.98 |        39.97
      3     | 2-001-814 | 20.02 |   20.02 |        59.98
      4     | 1-999-423 | 19.99 |   19.99 |        79.98
      5     | 2-002-107 | 20.02 |   20.02 |       100.00
      <NA>  |         0 |  0.00 |    <NA> |         <NA>

---

    Code
      print(data_tabulate(x), big_mark = "")
    Output
      x <integer>
      # total N=10000000 valid N=10000000
      
      Value |       N | Raw % | Valid % | Cumulative %
      ------+---------+-------+---------+-------------
      1     | 1998318 | 19.98 |   19.98 |        19.98
      2     | 1998338 | 19.98 |   19.98 |        39.97
      3     | 2001814 | 20.02 |   20.02 |        59.98
      4     | 1999423 | 19.99 |   19.99 |        79.98
      5     | 2002107 | 20.02 |   20.02 |       100.00
      <NA>  |       0 |  0.00 |    <NA> |         <NA>

# data_tabulate print multiple, collapse

    Code
      data_tabulate(efc, c("c172code", "e16sex"), collapse = TRUE)
    Output
      # Frequency Table
      
      Variable | Value |  N | Raw % | Valid % | Cumulative %
      ---------+-------+----+-------+---------+-------------
      c172code |     1 |  8 |     8 |    8.89 |         8.89
               |     2 | 66 |    66 |   73.33 |        82.22
               |     3 | 16 |    16 |   17.78 |       100.00
               |  <NA> | 10 |    10 |    <NA> |         <NA>
      ---------+-------+----+-------+---------+-------------
      e16sex   |     1 | 46 |    46 |      46 |           46
               |     2 | 54 |    54 |      54 |          100
               |  <NA> |  0 |     0 |    <NA> |         <NA>
      ------------------------------------------------------

# data_tabulate print grouped data

    Code
      data_tabulate(poorman::group_by(efc, e16sex), "c172code")
    Output
      carer's level of education (c172code) <numeric>
      Grouped by e16sex (1)
      # total N=46 valid N=41
      
      Value |  N | Raw % | Valid % | Cumulative %
      ------+----+-------+---------+-------------
      1     |  5 | 10.87 |   12.20 |        12.20
      2     | 32 | 69.57 |   78.05 |        90.24
      3     |  4 |  8.70 |    9.76 |       100.00
      <NA>  |  5 | 10.87 |    <NA> |         <NA>
      
      carer's level of education (c172code) <numeric>
      Grouped by e16sex (2)
      # total N=54 valid N=49
      
      Value |  N | Raw % | Valid % | Cumulative %
      ------+----+-------+---------+-------------
      1     |  3 |  5.56 |    6.12 |         6.12
      2     | 34 | 62.96 |   69.39 |        75.51
      3     | 12 | 22.22 |   24.49 |       100.00
      <NA>  |  5 |  9.26 |    <NA> |         <NA>

# data_tabulate print, collapse groups

    Code
      data_tabulate(poorman::group_by(efc, e16sex), "c172code", collapse = TRUE)
    Output
      # Frequency Table
      
      Variable |      Group | Value |  N | Raw % | Valid % | Cumulative %
      ---------+------------+-------+----+-------+---------+-------------
      c172code | e16sex (1) |     1 |  5 | 10.87 |   12.20 |        12.20
               |            |     2 | 32 | 69.57 |   78.05 |        90.24
               |            |     3 |  4 |  8.70 |    9.76 |       100.00
               |            |  <NA> |  5 | 10.87 |    <NA> |         <NA>
      ---------+------------+-------+----+-------+---------+-------------
      c172code | e16sex (2) |     1 |  3 |  5.56 |    6.12 |         6.12
               |            |     2 | 34 | 62.96 |   69.39 |        75.51
               |            |     3 | 12 | 22.22 |   24.49 |       100.00
               |            |  <NA> |  5 |  9.26 |    <NA> |         <NA>
      -------------------------------------------------------------------

# data_tabulate print, collapse groups, drop levels

    Code
      data_tabulate(poorman::group_by(efc, e16sex), "e42dep", collapse = TRUE,
      drop_levels = TRUE)
    Output
      # Frequency Table
      
      Variable |      Group | Value |  N | Raw % | Valid % | Cumulative %
      ---------+------------+-------+----+-------+---------+-------------
      e42dep   | e16sex (1) |     1 |  2 |  4.35 |    4.44 |         4.44
               |            |     2 |  2 |  4.35 |    4.44 |         8.89
               |            |     3 |  8 | 17.39 |   17.78 |        26.67
               |            |     4 | 33 | 71.74 |   73.33 |       100.00
               |            |  <NA> |  1 |  2.17 |    <NA> |         <NA>
      ---------+------------+-------+----+-------+---------+-------------
      e42dep   | e16sex (2) |     2 |  2 |  3.70 |    3.85 |         3.85
               |            |     3 | 20 | 37.04 |   38.46 |        42.31
               |            |     4 | 30 | 55.56 |   57.69 |       100.00
               |            |  <NA> |  2 |  3.70 |    <NA> |         <NA>
      -------------------------------------------------------------------

# data_tabulate, cross tables

    Code
      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))
    Output
      efc$c172code |       male |     female |     <NA> | Total
      -------------+------------+------------+----------+------
      1            |  5  (5.0%) |  2  (2.0%) | 1 (1.0%) |     8
      2            | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) |    66
      3            |  4  (4.0%) | 11 (11.0%) | 1 (1.0%) |    16
      <NA>         |  5  (5.0%) |  4  (4.0%) | 1 (1.0%) |    10
      -------------+------------+------------+----------+------
      Total        |         45 |         50 |        5 |   100

---

    Code
      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      remove_na = TRUE))
    Output
      efc$c172code |       male |     female | Total
      -------------+------------+------------+------
      1            |  5  (5.8%) |  2  (2.3%) |     7
      2            | 31 (36.0%) | 33 (38.4%) |    64
      3            |  4  (4.7%) | 11 (12.8%) |    15
      -------------+------------+------------+------
      Total        |         40 |         46 |    86

---

    Code
      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      weights = efc$weights))
    Output
      efc$c172code |       male |     female |     <NA> | Total
      -------------+------------+------------+----------+------
      1            |  5  (4.8%) |  3  (2.9%) | 2 (1.9%) |    10
      2            | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) |    67
      3            |  3  (2.9%) | 11 (10.5%) | 1 (1.0%) |    15
      <NA>         |  8  (7.6%) |  5  (4.8%) | 1 (1.0%) |    14
      -------------+------------+------------+----------+------
      Total        |         48 |         51 |        7 |   105

---

    Code
      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      remove_na = TRUE, weights = efc$weights))
    Output
      efc$c172code |       male |     female | Total
      -------------+------------+------------+------
      1            |  5  (5.8%) |  3  (3.5%) |     8
      2            | 32 (37.2%) | 32 (37.2%) |    64
      3            |  3  (3.5%) | 11 (12.8%) |    14
      -------------+------------+------------+------
      Total        |         40 |         46 |    86

---

    Code
      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))
    Output
      c172code |       male |     female |      <NA> | Total
      ---------+------------+------------+-----------+------
      1        |  5 (62.5%) |  2 (25.0%) | 1 (12.5%) |     8
      2        | 31 (47.0%) | 33 (50.0%) | 2  (3.0%) |    66
      3        |  4 (25.0%) | 11 (68.8%) | 1  (6.2%) |    16
      <NA>     |  5 (50.0%) |  4 (40.0%) | 1 (10.0%) |    10
      ---------+------------+------------+-----------+------
      Total    |         45 |         50 |         5 |   100

---

    Code
      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
      remove_na = TRUE))
    Output
      c172code |       male |     female | Total
      ---------+------------+------------+------
      1        |  5 (71.4%) |  2 (28.6%) |     7
      2        | 31 (48.4%) | 33 (51.6%) |    64
      3        |  4 (26.7%) | 11 (73.3%) |    15
      ---------+------------+------------+------
      Total    |         40 |         46 |    86

---

    Code
      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
      weights = efc$weights))
    Output
      c172code |       male |     female |      <NA> | Total
      ---------+------------+------------+-----------+------
      1        |  5 (50.0%) |  3 (30.0%) | 2 (20.0%) |    10
      2        | 32 (47.8%) | 32 (47.8%) | 3  (4.5%) |    67
      3        |  3 (20.0%) | 11 (73.3%) | 1  (6.7%) |    15
      <NA>     |  8 (57.1%) |  5 (35.7%) | 1  (7.1%) |    14
      ---------+------------+------------+-----------+------
      Total    |         48 |         51 |         7 |   105

---

    Code
      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
      remove_na = TRUE, weights = efc$weights))
    Output
      c172code |       male |     female | Total
      ---------+------------+------------+------
      1        |  5 (62.5%) |  3 (37.5%) |     8
      2        | 32 (50.0%) | 32 (50.0%) |    64
      3        |  3 (21.4%) | 11 (78.6%) |    14
      ---------+------------+------------+------
      Total    |         40 |         46 |    86

---

    Code
      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))
    Output
      c172code |       male |     female |      <NA> | Total
      ---------+------------+------------+-----------+------
      1        |  5 (11.1%) |  2  (4.0%) | 1 (20.0%) |     8
      2        | 31 (68.9%) | 33 (66.0%) | 2 (40.0%) |    66
      3        |  4  (8.9%) | 11 (22.0%) | 1 (20.0%) |    16
      <NA>     |  5 (11.1%) |  4  (8.0%) | 1 (20.0%) |    10
      ---------+------------+------------+-----------+------
      Total    |         45 |         50 |         5 |   100

---

    Code
      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
        remove_na = TRUE))
    Output
      c172code |       male |     female | Total
      ---------+------------+------------+------
      1        |  5 (12.5%) |  2  (4.3%) |     7
      2        | 31 (77.5%) | 33 (71.7%) |    64
      3        |  4 (10.0%) | 11 (23.9%) |    15
      ---------+------------+------------+------
      Total    |         40 |         46 |    86

---

    Code
      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
        weights = "weights"))
    Output
      c172code |       male |     female |      <NA> | Total
      ---------+------------+------------+-----------+------
      1        |  5 (10.4%) |  3  (5.9%) | 2 (28.6%) |    10
      2        | 32 (66.7%) | 32 (62.7%) | 3 (42.9%) |    67
      3        |  3  (6.2%) | 11 (21.6%) | 1 (14.3%) |    15
      <NA>     |  8 (16.7%) |  5  (9.8%) | 1 (14.3%) |    14
      ---------+------------+------------+-----------+------
      Total    |         48 |         51 |         7 |   105

---

    Code
      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
        remove_na = TRUE, weights = "weights"))
    Output
      c172code |       male |     female | Total
      ---------+------------+------------+------
      1        |  5 (12.5%) |  3  (6.5%) |     8
      2        | 32 (80.0%) | 32 (69.6%) |    64
      3        |  3  (7.5%) | 11 (23.9%) |    14
      ---------+------------+------------+------
      Total    |         40 |         46 |    86

---

    Code
      print(data_tabulate(efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row"))
    Output
      Variable | Value |        male |     female |      <NA> | Total
      ---------+-------+-------------+------------+-----------+------
      c172code |     1 |   5 (62.5%) |  2 (25.0%) | 1 (12.5%) |     8
      c172code |     2 |  31 (47.0%) | 33 (50.0%) | 2  (3.0%) |    66
      c172code |     3 |   4 (25.0%) | 11 (68.8%) | 1  (6.2%) |    16
      c172code |  <NA> |   5 (50.0%) |  4 (40.0%) | 1 (10.0%) |    10
      e42dep   |     1 |  2 (100.0%) |  0  (0.0%) |  0 (0.0%) |     2
      e42dep   |     2 |  2  (50.0%) |  2 (50.0%) |  0 (0.0%) |     4
      e42dep   |     3 |  8  (28.6%) | 18 (64.3%) |  2 (7.1%) |    28
      e42dep   |     4 | 32  (50.8%) | 28 (44.4%) |  3 (4.8%) |    63
      e42dep   |  <NA> |  1  (33.3%) |  2 (66.7%) |  0 (0.0%) |     3

# data_tabulate, cross tables, tinytable

    Code
      display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"),
      format = "tt")
    Output
      
      +--------------+------------+------------+----------+-------+
      | efc$c172code | male       | female     | (NA)     | Total |
      +==============+============+============+==========+=======+
      | 1            | 5  (5.0%)  | 2  (2.0%)  | 1 (1.0%) | 8     |
      +--------------+------------+------------+----------+-------+
      | 2            | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) | 66    |
      +--------------+------------+------------+----------+-------+
      | 3            | 4  (4.0%)  | 11 (11.0%) | 1 (1.0%) | 16    |
      +--------------+------------+------------+----------+-------+
      | (NA)         | 5  (5.0%)  | 4  (4.0%)  | 1 (1.0%) | 10    |
      +--------------+------------+------------+----------+-------+
      | Total        | 45         | 50         | 5        | 100   |
      +--------------+------------+------------+----------+-------+ 

---

    Code
      display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      remove_na = TRUE), format = "tt")
    Output
      
      +--------------+------------+------------+-------+
      | efc$c172code | male       | female     | Total |
      +==============+============+============+=======+
      | 1            | 5  (5.8%)  | 2  (2.3%)  | 7     |
      +--------------+------------+------------+-------+
      | 2            | 31 (36.0%) | 33 (38.4%) | 64    |
      +--------------+------------+------------+-------+
      | 3            | 4  (4.7%)  | 11 (12.8%) | 15    |
      +--------------+------------+------------+-------+
      | Total        | 40         | 46         | 86    |
      +--------------+------------+------------+-------+ 

---

    Code
      display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      weights = efc$weights), format = "tt")
    Output
      
      +--------------+------------+------------+----------+-------+
      | efc$c172code | male       | female     | (NA)     | Total |
      +==============+============+============+==========+=======+
      | 1            | 5  (4.8%)  | 3  (2.9%)  | 2 (1.9%) | 10    |
      +--------------+------------+------------+----------+-------+
      | 2            | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) | 67    |
      +--------------+------------+------------+----------+-------+
      | 3            | 3  (2.9%)  | 11 (10.5%) | 1 (1.0%) | 15    |
      +--------------+------------+------------+----------+-------+
      | (NA)         | 8  (7.6%)  | 5  (4.8%)  | 1 (1.0%) | 14    |
      +--------------+------------+------------+----------+-------+
      | Total        | 48         | 51         | 7        | 105   |
      +--------------+------------+------------+----------+-------+ 

---

    Code
      display(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      remove_na = TRUE, weights = efc$weights), format = "tt")
    Output
      
      +--------------+------------+------------+-------+
      | efc$c172code | male       | female     | Total |
      +==============+============+============+=======+
      | 1            | 5  (5.8%)  | 3  (3.5%)  | 8     |
      +--------------+------------+------------+-------+
      | 2            | 32 (37.2%) | 32 (37.2%) | 64    |
      +--------------+------------+------------+-------+
      | 3            | 3  (3.5%)  | 11 (12.8%) | 14    |
      +--------------+------------+------------+-------+
      | Total        | 40         | 46         | 86    |
      +--------------+------------+------------+-------+ 

---

    Code
      display(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"),
      format = "tt")
    Output
      
      +----------+------------+------------+-----------+-------+
      | c172code | male       | female     | (NA)      | Total |
      +==========+============+============+===========+=======+
      | 1        | 5 (62.5%)  | 2 (25.0%)  | 1 (12.5%) | 8     |
      +----------+------------+------------+-----------+-------+
      | 2        | 31 (47.0%) | 33 (50.0%) | 2  (3.0%) | 66    |
      +----------+------------+------------+-----------+-------+
      | 3        | 4 (25.0%)  | 11 (68.8%) | 1  (6.2%) | 16    |
      +----------+------------+------------+-----------+-------+
      | (NA)     | 5 (50.0%)  | 4 (40.0%)  | 1 (10.0%) | 10    |
      +----------+------------+------------+-----------+-------+
      | Total    | 45         | 50         | 5         | 100   |
      +----------+------------+------------+-----------+-------+ 

---

    Code
      display(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
      remove_na = TRUE, weights = efc$weights), format = "tt")
    Output
      
      +----------+------------+------------+-------+
      | c172code | male       | female     | Total |
      +==========+============+============+=======+
      | 1        | 5 (62.5%)  | 3 (37.5%)  | 8     |
      +----------+------------+------------+-------+
      | 2        | 32 (50.0%) | 32 (50.0%) | 64    |
      +----------+------------+------------+-------+
      | 3        | 3 (21.4%)  | 11 (78.6%) | 14    |
      +----------+------------+------------+-------+
      | Total    | 40         | 46         | 86    |
      +----------+------------+------------+-------+ 

# data_tabulate, cross tables, grouped df

    Code
      print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row"))
    Output
      Grouped by e42dep (1)
      
      Variable | Value |       male | female |     <NA> | Total
      ---------+-------+------------+--------+----------+------
      c172code |     2 | 2 (100.0%) |   <NA> | 0 (0.0%) |     2
               |  <NA> | 0   (0.0%) |   <NA> | 0 (0.0%) |     0
      
      Grouped by e42dep (2)
      
      Variable | Value |      male |    female |     <NA> | Total
      ---------+-------+-----------+-----------+----------+------
      c172code |     2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) |     4
               |  <NA> | 0  (0.0%) | 0  (0.0%) | 0 (0.0%) |     0
      
      Grouped by e42dep (3)
      
      Variable | Value |      male |     female |      <NA> | Total
      ---------+-------+-----------+------------+-----------+------
      c172code |     1 | 2 (50.0%) |  2 (50.0%) | 0  (0.0%) |     4
               |     2 | 4 (25.0%) | 11 (68.8%) | 1  (6.2%) |    16
               |     3 | 1 (16.7%) |  5 (83.3%) | 0  (0.0%) |     6
               |  <NA> | 1 (50.0%) |  0  (0.0%) | 1 (50.0%) |     2
      
      Grouped by e42dep (4)
      
      Variable | Value |       male |     female |      <NA> | Total
      ---------+-------+------------+------------+-----------+------
      c172code |     1 |  3 (75.0%) |  0  (0.0%) | 1 (25.0%) |     4
               |     2 | 23 (54.8%) | 18 (42.9%) | 1  (2.4%) |    42
               |     3 |  3 (30.0%) |  6 (60.0%) | 1 (10.0%) |    10
               |  <NA> |  3 (42.9%) |  4 (57.1%) | 0  (0.0%) |     7
      
      Grouped by e42dep (NA)
      
      Variable | Value |       male |     female |     <NA> | Total
      ---------+-------+------------+------------+----------+------
      c172code |     2 | 0   (0.0%) | 2 (100.0%) | 0 (0.0%) |     2
               |  <NA> | 1 (100.0%) | 0   (0.0%) | 0 (0.0%) |     1

# data_tabulate, cross tables, print/format works

    Code
      print(x)
    Output
      Variable | Value |  3 | 4 | 5 | <NA> | Total
      ---------+-------+----+---+---+------+------
      cyl      |     4 |  1 | 8 | 2 |    0 |    11
      cyl      |     6 |  2 | 4 | 1 |    0 |     7
      cyl      |     8 | 12 | 0 | 2 |    0 |    14
      cyl      |  <NA> |  0 | 0 | 0 |    0 |     0
      am       |     0 | 15 | 4 | 0 |    0 |    19
      am       |     1 |  0 | 8 | 5 |    0 |    13
      am       |  <NA> |  0 | 0 | 0 |    0 |     0

# data_tabulate, cross tables, markdown

    Code
      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))
    Output
      
      
      |efc$c172code |       male|     female|    (NA) | Total|
      |:------------|----------:|----------:|:--------|-----:|
      |1            |  5  (5.0%)|  2  (2.0%)|1 (1.0%) |     8|
      |2            | 31 (31.0%)| 33 (33.0%)|2 (2.0%) |    66|
      |3            |  4  (4.0%)| 11 (11.0%)|1 (1.0%) |    16|
      |(NA)         |  5  (5.0%)|  4  (4.0%)|1 (1.0%) |    10|
      |             |           |           |         |      |
      |Total        |         45|         50|       5 |   100|

---

    Code
      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      remove_na = TRUE))
    Output
      
      
      |efc$c172code |       male|     female| Total|
      |:------------|----------:|----------:|-----:|
      |1            |  5  (5.8%)|  2  (2.3%)|     7|
      |2            | 31 (36.0%)| 33 (38.4%)|    64|
      |3            |  4  (4.7%)| 11 (12.8%)|    15|
      |             |           |           |      |
      |Total        |         40|         46|    86|

---

    Code
      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      weights = efc$weights))
    Output
      
      
      |efc$c172code |       male|     female|    (NA) | Total|
      |:------------|----------:|----------:|:--------|-----:|
      |1            |  5  (4.8%)|  3  (2.9%)|2 (1.9%) |    10|
      |2            | 32 (30.5%)| 32 (30.5%)|3 (2.9%) |    67|
      |3            |  3  (2.9%)| 11 (10.5%)|1 (1.0%) |    15|
      |(NA)         |  8  (7.6%)|  5  (4.8%)|1 (1.0%) |    14|
      |             |           |           |         |      |
      |Total        |         48|         51|       7 |   105|

---

    Code
      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
      remove_na = TRUE, weights = efc$weights))
    Output
      
      
      |efc$c172code |       male|     female| Total|
      |:------------|----------:|----------:|-----:|
      |1            |  5  (5.8%)|  3  (3.5%)|     8|
      |2            | 32 (37.2%)| 32 (37.2%)|    64|
      |3            |  3  (3.5%)| 11 (12.8%)|    14|
      |             |           |           |      |
      |Total        |         40|         46|    86|

---

    Code
      print_md(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
        remove_na = TRUE, weights = "weights"))
    Output
      
      
      |c172code |       male|     female| Total|
      |:--------|----------:|----------:|-----:|
      |1        |  5 (12.5%)|  3  (6.5%)|     8|
      |2        | 32 (80.0%)| 32 (69.6%)|    64|
      |3        |  3  (7.5%)| 11 (23.9%)|    14|
      |         |           |           |      |
      |Total    |         40|         46|    86|

---

    Code
      print_md(data_tabulate(efc, c("c172code", "e42dep"), by = "e16sex",
      proportions = "row"))
    Output
      
      
      |Variable | Value|        male|     female|     (NA) | Total|
      |:--------|-----:|-----------:|----------:|:---------|-----:|
      |c172code |     1|   5 (62.5%)|  2 (25.0%)|1 (12.5%) |     8|
      |c172code |     2|  31 (47.0%)| 33 (50.0%)|2  (3.0%) |    66|
      |c172code |     3|   4 (25.0%)| 11 (68.8%)|1  (6.2%) |    16|
      |c172code |  (NA)|   5 (50.0%)|  4 (40.0%)|1 (10.0%) |    10|
      |e42dep   |     1|  2 (100.0%)|  0  (0.0%)| 0 (0.0%) |     2|
      |e42dep   |     2|  2  (50.0%)|  2 (50.0%)| 0 (0.0%) |     4|
      |e42dep   |     3|  8  (28.6%)| 18 (64.3%)| 2 (7.1%) |    28|
      |e42dep   |     4| 32  (50.8%)| 28 (44.4%)| 3 (4.8%) |    63|
      |e42dep   |  (NA)|  1  (33.3%)|  2 (66.7%)| 0 (0.0%) |     3|

---

    Code
      display(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
        remove_na = TRUE, weights = "weights"))
    Output
      
      
      |c172code |       male|     female| Total|
      |:--------|----------:|----------:|-----:|
      |1        |  5 (12.5%)|  3  (6.5%)|     8|
      |2        | 32 (80.0%)| 32 (69.6%)|    64|
      |3        |  3  (7.5%)| 11 (23.9%)|    14|
      |         |           |           |      |
      |Total    |         40|         46|    86|

---

    Code
      display(data_tabulate(efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row"))
    Output
      
      
      |Variable | Value|        male|     female|     (NA) | Total|
      |:--------|-----:|-----------:|----------:|:---------|-----:|
      |c172code |     1|   5 (62.5%)|  2 (25.0%)|1 (12.5%) |     8|
      |c172code |     2|  31 (47.0%)| 33 (50.0%)|2  (3.0%) |    66|
      |c172code |     3|   4 (25.0%)| 11 (68.8%)|1  (6.2%) |    16|
      |c172code |  (NA)|   5 (50.0%)|  4 (40.0%)|1 (10.0%) |    10|
      |e42dep   |     1|  2 (100.0%)|  0  (0.0%)| 0 (0.0%) |     2|
      |e42dep   |     2|  2  (50.0%)|  2 (50.0%)| 0 (0.0%) |     4|
      |e42dep   |     3|  8  (28.6%)| 18 (64.3%)| 2 (7.1%) |    28|
      |e42dep   |     4| 32  (50.8%)| 28 (44.4%)| 3 (4.8%) |    63|
      |e42dep   |  (NA)|  1  (33.3%)|  2 (66.7%)| 0 (0.0%) |     3|

# data_tabulate, correct 0% for proportions

    Code
      print(out[[1]])
    Output
      c172code |       male |     female |     <NA> | Total
      ---------+------------+------------+----------+------
      1        |  5 (10.9%) |  3  (5.6%) | 0 (0.0%) |     8
      2        | 32 (69.6%) | 34 (63.0%) | 0 (0.0%) |    66
      3        |  4  (8.7%) | 12 (22.2%) | 0 (0.0%) |    16
      <NA>     |  5 (10.9%) |  5  (9.3%) | 0 (0.0%) |    10
      ---------+------------+------------+----------+------
      Total    |         46 |         54 |        0 |   100

# data_tabulate, table methods

    Code
      as.table(x)
    Output
      [[1]]
       4  6  8 
      11  7 14 
      

---

    Code
      as.table(x)
    Output
      [[1]]
       4  6  8 
      11  7 14 
      

---

    Code
      as.table(x, remove_na = FALSE)
    Output
      [[1]]
         4    6    8 <NA> 
        11    7   14    0 
      

---

    Code
      as.table(x)
    Output
      [[1]]
       4  6  8 
      11  7 14 
      
      [[2]]
       3  4  5 
      15 12  5 
      

---

    Code
      as.table(x)
    Output
      [[1]]
         3  4  5
      4  1  8  2
      6  2  4  1
      8 12  0  2
      

---

    Code
      as.table(x, simplify = TRUE)
    Output
         3  4  5
      4  1  8  2
      6  2  4  1
      8 12  0  2

---

    Code
      as.table(x)
    Output
      [[1]]
         3  4  5
      4  1  8  2
      6  2  4  1
      8 12  0  2
      

---

    Code
      as.table(x, simplify = TRUE)
    Output
         3  4  5
      4  1  8  2
      6  2  4  1
      8 12  0  2

---

    Code
      as.table(x)
    Output
      [[1]]
         3  4  5
      0 15  4  0
      1  0  8  5
      
      [[2]]
         3  4  5
      4  1  8  2
      6  2  4  1
      8 12  0  2
      

---

    Code
      as.table(x)
    Output
      $`am (0)`
         3  4
      4  1  2
      6  2  2
      8 12  0
      
      $`am (1)`
        4 5
      4 6 2
      6 2 1
      8 0 2
      

================================================
FILE: tests/testthat/_snaps/data_to_factor.md
================================================
# data_read, convert many labels correctly

    Code
      data_tabulate(to_factor(d$selv1))
    Output
      to_factor(d$selv1) <categorical>
      # total N=2413 valid N=2413
      
      Value                                              |   N | Raw % | Valid % | Cumulative %
      ---------------------------------------------------+-----+-------+---------+-------------
      Vignette 1 weiblich (Gülsen E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |         6.22
      Vignette 2 weiblich (Gülsen E. Anwältin B)         | 150 |  6.22 |    6.22 |        12.43
      Vignette 3 weiblich (Monika E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |        18.65
      Vignette 4 weiblich (Monika E. Anwältin B)         | 151 |  6.26 |    6.26 |        24.91
      Vignette 5 männlich (Hasan E. Reinigungskraft B)   | 151 |  6.26 |    6.26 |        31.16
      Vignette 6 männlich (Hasan E. Anwalt B)            | 153 |  6.34 |    6.34 |        37.51
      Vignette 7 männlich (Martin E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |        43.72
      Vignette 8 männlich (Martin E. Anwalt B)           | 150 |  6.22 |    6.22 |        49.94
      Vignette 9 weiblich (Gülsen E. Reinigungskraft E)  | 151 |  6.26 |    6.26 |        56.20
      Vignette 10 weiblich (Gülsen E. Anwältin E)        | 150 |  6.22 |    6.22 |        62.41
      Vignette 11 weiblich (Monika E. Reinigungskraft E) | 150 |  6.22 |    6.22 |        68.63
      Vignette 12 weiblich (Monika E. Anwältin E)        | 151 |  6.26 |    6.26 |        74.89
      Vignette 13 männlich (Hasan E. Reinigungskraft E)  | 155 |  6.42 |    6.42 |        81.31
      Vignette 14 männlich (Hasan E. Anwalt E)           | 150 |  6.22 |    6.22 |        87.53
      Vignette 15 männlich (Martin E. Reinigungskraft E) | 150 |  6.22 |    6.22 |        93.74
      Vignette 16 männlich (Martin E. Anwalt E)          | 151 |  6.26 |    6.26 |       100.00
      <NA>                                               |   0 |  0.00 |    <NA> |         <NA>

---

    Code
      data_tabulate(to_factor(d$c12))
    Output
      Sind oder waren Sie schon einmal selbst von solchen Beschwerden betroffen? (to_factor(d$c12)) <categorical>
      # total N=2413 valid N=2413
      
      Value        |    N | Raw % | Valid % | Cumulative %
      -------------+------+-------+---------+-------------
      ja           |  786 | 32.57 |   32.57 |        32.57
      nein         | 1616 | 66.97 |   66.97 |        99.54
      keine Angabe |   11 |  0.46 |    0.46 |       100.00
      <NA>         |    0 |  0.00 |    <NA> |         <NA>

---

    Code
      data_tabulate(to_factor(d$c12a))
    Output
      Haben Sie deswegen Behandlung(en) in Anspruch genommen? (to_factor(d$c12a)) <categorical>
      # total N=2413 valid N=2413
      
      Value        |    N | Raw % | Valid % | Cumulative %
      -------------+------+-------+---------+-------------
      Filter       | 1627 | 67.43 |   67.43 |        67.43
      ja           |  500 | 20.72 |   20.72 |        88.15
      nein         |  285 | 11.81 |   11.81 |        99.96
      keine Angabe |    1 |  0.04 |    0.04 |       100.00
      <NA>         |    0 |  0.00 |    <NA> |         <NA>

---

    Code
      data_tabulate(to_factor(d$c12c))
    Output
      Wie sehr haben diese Behandlung(en) Ihre Beeinträchtigung durch die Beschwerden verbessert? (to_factor(d$c12c)) <categorical>
      # total N=2413 valid N=2413
      
      Value                     |    N | Raw % | Valid % | Cumulative %
      --------------------------+------+-------+---------+-------------
      Filter                    | 1913 | 79.28 |   79.28 |        79.28
      0 = keine                 |   34 |  1.41 |    1.41 |        80.69
      1                         |    2 |  0.08 |    0.08 |        80.77
      2                         |   11 |  0.46 |    0.46 |        81.23
      3                         |   14 |  0.58 |    0.58 |        81.81
      4                         |   19 |  0.79 |    0.79 |        82.59
      5                         |   61 |  2.53 |    2.53 |        85.12
      6                         |   42 |  1.74 |    1.74 |        86.86
      7                         |   63 |  2.61 |    2.61 |        89.47
      8                         |   97 |  4.02 |    4.02 |        93.49
      9                         |   53 |  2.20 |    2.20 |        95.69
      10 = sehr starke          |   99 |  4.10 |    4.10 |        99.79
      weiß nicht / keine Angabe |    5 |  0.21 |    0.21 |       100.00
      <NA>                      |    0 |  0.00 |    <NA> |         <NA>


================================================
FILE: tests/testthat/_snaps/data_to_long.md
================================================
# data_to_long works - complex dataset

    Code
      str(long)
    Output
      'data.frame':	70000 obs. of  6 variables:
       $ gender     : int  1 1 1 1 1 1 1 1 1 1 ...
       $ education  : int  NA NA NA NA NA NA NA NA NA NA ...
       $ age        : int  16 16 16 16 16 16 16 16 16 16 ...
       $ Participant: num  61617 61617 61617 61617 61617 ...
       $ Item       : chr  "A1" "A2" "A3" "A4" ...
       $ Score      : int  2 4 3 4 4 2 3 3 4 4 ...

# don't convert factors to integer

    Code
      print(mtcars_long)
    Output
         cyl  hp drat    wt vs am gear carb am_f cyl_f id    g  value
      1    4  93 3.85 2.320  1  1    4    1    1     4  3  mpg  22.80
      2    4  93 3.85 2.320  1  1    4    1    1     4  3 qsec  18.61
      3    4  93 3.85 2.320  1  1    4    1    1     4  3 disp 108.00
      4    8 245 3.21 3.570  0  0    3    4    0     8  7  mpg  14.30
      5    8 245 3.21 3.570  0  0    3    4    0     8  7 qsec  15.84
      6    8 245 3.21 3.570  0  0    3    4    0     8  7 disp 360.00
      7    4  66 4.08 2.200  1  1    4    1    1     4 10  mpg  32.40
      8    4  66 4.08 2.200  1  1    4    1    1     4 10 qsec  19.47
      9    4  66 4.08 2.200  1  1    4    1    1     4 10 disp  78.70
      10   8 264 4.22 3.170  0  1    5    4    1     8 11  mpg  15.80
      11   8 264 4.22 3.170  0  1    5    4    1     8 11 qsec  14.50
      12   8 264 4.22 3.170  0  1    5    4    1     8 11 disp 351.00
      13   6 110 3.08 3.215  1  0    3    1    0     6  4  mpg  21.40
      14   6 110 3.08 3.215  1  0    3    1    0     6  4 qsec  19.44
      15   6 110 3.08 3.215  1  0    3    1    0     6  4 disp 258.00
      16   8 175 3.15 3.440  0  0    3    2    0     8  5  mpg  18.70
      17   8 175 3.15 3.440  0  0    3    2    0     8  5 qsec  17.02
      18   8 175 3.15 3.440  0  0    3    2    0     8  5 disp 360.00
      19   8 335 3.54 3.570  0  1    5    8    1     8 12  mpg  15.00
      20   8 335 3.54 3.570  0  1    5    8    1     8 12 qsec  14.60
      21   8 335 3.54 3.570  0  1    5    8    1     8 12 disp 301.00
      22   6 110 3.90 2.620  0  1    4    4    1     6  1  mpg  21.00
      23   6 110 3.90 2.620  0  1    4    4    1     6  1 qsec  16.46
      24   6 110 3.90 2.620  0  1    4    4    1     6  1 disp 160.00
      25   6 110 3.90 2.875  0  1    4    4    1     6  2  mpg  21.00
      26   6 110 3.90 2.875  0  1    4    4    1     6  2 qsec  17.02
      27   6 110 3.90 2.875  0  1    4    4    1     6  2 disp 160.00
      28   4  95 3.92 3.150  1  0    4    2    0     4  9  mpg  22.80
      29   4  95 3.92 3.150  1  0    4    2    0     4  9 qsec  22.90
      30   4  95 3.92 3.150  1  0    4    2    0     4  9 disp 140.80
      31   4  62 3.69 3.190  1  0    4    2    0     4  8  mpg  24.40
      32   4  62 3.69 3.190  1  0    4    2    0     4  8 qsec  20.00
      33   4  62 3.69 3.190  1  0    4    2    0     4  8 disp 146.70
      34   6 105 2.76 3.460  1  0    3    1    0     6  6  mpg  18.10
      35   6 105 2.76 3.460  1  0    3    1    0     6  6 qsec  20.22
      36   6 105 2.76 3.460  1  0    3    1    0     6  6 disp 225.00


================================================
FILE: tests/testthat/_snaps/data_to_numeric.md
================================================
# convert data frame to numeric

    Code
      to_numeric(head(ToothGrowth), dummy_factors = TRUE)
    Output
         len supp.OJ supp.VC dose
      1  4.2       0       1  0.5
      2 11.5       0       1  0.5
      3  7.3       0       1  0.5
      4  5.8       0       1  0.5
      5  6.4       0       1  0.5
      6 10.0       0       1  0.5

---

    Code
      to_numeric(head(ToothGrowth), dummy_factors = FALSE)
    Output
         len supp dose
      1  4.2    2  0.5
      2 11.5    2  0.5
      3  7.3    2  0.5
      4  5.8    2  0.5
      5  6.4    2  0.5
      6 10.0    2  0.5

# convert factor to numeric

    Code
      to_numeric(f, dummy_factors = TRUE)
    Output
         a c i s t
      1  0 0 0 1 0
      2  0 0 0 0 1
      3  1 0 0 0 0
      4  0 0 0 0 1
      5  0 0 1 0 0
      6  0 0 0 1 0
      7  0 0 0 0 1
      8  0 0 1 0 0
      9  0 1 0 0 0
      10 0 0 0 1 0


================================================
FILE: tests/testthat/_snaps/demean.md
================================================
# demean works

    Code
      head(x)
    Output
        Sepal.Length_between Petal.Length_between Sepal.Length_within
      1             5.925000             3.527500          -0.8250000
      2             5.925000             3.527500          -1.0250000
      3             5.925000             3.527500          -1.2250000
      4             5.862222             3.951111          -1.2622222
      5             5.925000             3.527500          -0.9250000
      6             5.862222             3.951111          -0.4622222
        Petal.Length_within
      1           -2.127500
      2           -2.127500
      3           -2.227500
      4           -2.451111
      5           -2.127500
      6           -2.251111

---

    Code
      head(x)
    Output
        Sepal.Length_between binary_between Species_between Species_setosa_between
      1             5.925000          0.375        0.850000              0.4250000
      2             5.925000          0.375        0.850000              0.4250000
      3             5.925000          0.375        0.850000              0.4250000
      4             5.862222          0.400        1.133333              0.2888889
      5             5.925000          0.375        0.850000              0.4250000
      6             5.862222          0.400        1.133333              0.2888889
        Species_versicolor_between Species_virginica_between Sepal.Length_within
      1                  0.3000000                 0.2750000          -0.8250000
      2                  0.3000000                 0.2750000          -1.0250000
      3                  0.3000000                 0.2750000          -1.2250000
      4                  0.2888889                 0.4222222          -1.2622222
      5                  0.3000000                 0.2750000          -0.9250000
      6                  0.2888889                 0.4222222          -0.4622222
        binary_within Species_within Species_setosa_within Species_versicolor_within
      1        -0.375      -0.850000             0.5750000                -0.3000000
      2         0.625      -0.850000             0.5750000                -0.3000000
      3        -0.375      -0.850000             0.5750000                -0.3000000
      4         0.600      -1.133333             0.7111111                -0.2888889
      5         0.625      -0.850000             0.5750000                -0.3000000
      6        -0.400      -1.133333             0.7111111                -0.2888889
        Species_virginica_within
      1               -0.2750000
      2               -0.2750000
      3               -0.2750000
      4               -0.4222222
      5               -0.2750000
      6               -0.4222222

---

    Code
      head(x)
    Output
        Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID binary
      1          5.1         3.5          1.4         0.2  setosa  3      0
      2          4.9         3.0          1.4         0.2  setosa  3      1
      3          4.7         3.2          1.3         0.2  setosa  3      0
      4          4.6         3.1          1.5         0.2  setosa  2      1
      5          5.0         3.6          1.4         0.2  setosa  3      1
      6          5.4         3.9          1.7         0.4  setosa  2      0
        Sepal.Length_between Petal.Length_between Sepal.Length_within
      1             5.925000             3.527500          -0.8250000
      2             5.925000             3.527500          -1.0250000
      3             5.925000             3.527500          -1.2250000
      4             5.862222             3.951111          -1.2622222
      5             5.925000             3.527500          -0.9250000
      6             5.862222             3.951111          -0.4622222
        Petal.Length_within
      1           -2.127500
      2           -2.127500
      3           -2.227500
      4           -2.451111
      5           -2.127500
      6           -2.251111

# demean interaction term

    Code
      demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE)
    Output
        a_between x_y_between   a_within x_y_within
      1  2.666667    4.666667 -1.6666667 -0.6666667
      2  2.333333    4.000000 -0.3333333  2.0000000
      3  2.500000    4.500000  0.5000000 -1.5000000
      4  2.666667    4.666667  1.3333333  3.3333333
      5  2.333333    4.000000 -1.3333333  0.0000000
      6  2.500000    4.500000 -0.5000000  1.5000000
      7  2.666667    4.666667  0.3333333 -2.6666667
      8  2.333333    4.000000  1.6666667 -2.0000000


================================================
FILE: tests/testthat/_snaps/describe_distribution.md
================================================
# describe_distribution - factor

    Code
      describe_distribution(factor(substring("statistics", 1:10, 1:10)))
    Output
      Mean | SD |  Range | Skewness | Kurtosis |  n | n_Missing
      ---------------------------------------------------------
           |    | [a, t] |    -0.77 |    -0.13 | 10 |         0

# describe_distribution - character

    Code
      describe_distribution(as.character(ToothGrowth$supp))
    Output
      Mean | SD |    Range | Skewness | Kurtosis |  n | n_Missing
      -----------------------------------------------------------
           |    | [VC, OJ] |        0 |    -2.07 | 60 |         0

# describe_distribution - grouped df

    Code
      out
    Output
      Species    |     Variable | Mean |   SD |  IQR |        Range | Skewness
      ------------------------------------------------------------------------
      setosa     | Petal.Length | 1.46 | 0.17 | 0.20 | [1.00, 1.90] |     0.11
      setosa     |  Petal.Width | 0.25 | 0.11 | 0.10 | [0.10, 0.60] |     1.25
      versicolor | Petal.Length | 4.26 | 0.47 | 0.60 | [3.00, 5.10] |    -0.61
      versicolor |  Petal.Width | 1.33 | 0.20 | 0.30 | [1.00, 1.80] |    -0.03
      virginica  | Petal.Length | 5.55 | 0.55 | 0.80 | [4.50, 6.90] |     0.55
      virginica  |  Petal.Width | 2.03 | 0.27 | 0.50 | [1.40, 2.50] |    -0.13
      
      Species    | Kurtosis |  n | n_Missing
      --------------------------------------
      setosa     |     1.02 | 50 |         0
      setosa     |     1.72 | 50 |         0
      versicolor |     0.05 | 50 |         0
      versicolor |    -0.41 | 50 |         0
      virginica  |    -0.15 | 50 |         0
      virginica  |    -0.60 | 50 |         0

# describe_distribution - grouped df and multiple groups

    Code
      describe_distribution(x)
    Output
      grp1 | grp2 | Variable |  Mean |    SD |   IQR |          Range | Skewness
      --------------------------------------------------------------------------
      a    |    a |   values | 10.00 |  6.48 | 12.00 |  [1.00, 19.00] |     0.00
      b    |    a |   values | 13.86 | 10.92 | 21.00 |  [1.00, 28.00] |     0.23
      c    |    a |   values | 20.50 |  5.61 | 10.50 | [13.00, 28.00] |     0.00
      a    |    b |   values | 11.00 |  6.48 | 12.00 |  [2.00, 20.00] |     0.00
      b    |    b |   values | 15.50 | 11.81 | 22.50 |  [2.00, 29.00] |     0.00
      c    |    b |   values | 20.00 |  6.48 | 12.00 | [11.00, 29.00] |     0.00
      a    |    c |   values | 10.50 |  5.61 | 10.50 |  [3.00, 18.00] |     0.00
      b    |    c |   values | 17.14 | 10.92 | 21.00 |  [3.00, 30.00] |    -0.23
      c    |    c |   values | 21.00 |  6.48 | 12.00 | [12.00, 30.00] |     0.00
      
      grp1 | Kurtosis | n | n_Missing
      -------------------------------
      a    |    -1.20 | 7 |         0
      b    |    -2.14 | 7 |         0
      c    |    -1.20 | 6 |         0
      a    |    -1.20 | 7 |         0
      b    |    -2.76 | 6 |         0
      c    |    -1.20 | 7 |         0
      a    |    -1.20 | 6 |         0
      b    |    -2.14 | 7 |         0
      c    |    -1.20 | 7 |         0

# describe_distribution formatting

    Code
      format(x)
    Output
      Mean |   SD |  IQR |        Range |  Quartiles | Skewness | Kurtosis |   n | n_Missing
      --------------------------------------------------------------------------------------
      3.06 | 0.44 | 0.52 | [2.00, 4.40] | 2.80, 3.30 |     0.32 |     0.23 | 150 |         0

# (multiple) centralities with CIs

    Code
      print(out, table_width = Inf)
    Output
      Median | 95% CI (Median) |  MAD | Mean | 95% CI (Mean) |   SD |  MAP | 95% CI (MAP) |  IQR |        Range | Skewness | Kurtosis |   n | n_Missing
      -------------------------------------------------------------------------------------------------------------------------------------------------
           3 |    [3.00, 3.10] | 0.44 | 3.06 |  [3.00, 3.13] | 0.44 | 3.00 | [2.94, 3.07] | 0.52 | [2.00, 4.40] |     0.32 |     0.23 | 150 |         0

---

    Code
      print(out, table_width = Inf)
    Output
      Mean | 95% CI (Mean) |   SD |  IQR |        Range | Skewness | Kurtosis |   n | n_Missing
      -----------------------------------------------------------------------------------------
      3.06 |  [3.00, 3.13] | 0.44 | 0.52 | [2.00, 4.40] |     0.32 |     0.23 | 150 |         0

---

    Code
      print(out, table_width = Inf)
    Output
      Median | 95% CI (Median) |  MAD |  MAP | 95% CI (MAP) |  IQR |        Range | Skewness | Kurtosis |   n | n_Missing
      -------------------------------------------------------------------------------------------------------------------
           3 |    [3.00, 3.10] | 0.44 | 3.00 | [2.91, 3.09] | 0.52 | [2.00, 4.40] |     0.32 |     0.23 | 150 |         0

# display() method exports to markdown

    Code
      display(out)
    Output
      
      
      |Variable     | Mean |   SD |  IQR |        Range | Skewness | Kurtosis |   n | n_Missing |
      |:------------|:----:|:----:|:----:|:------------:|:--------:|:--------:|:---:|:---------:|
      |Sepal.Length | 5.84 | 0.83 | 1.30 | (4.30, 7.90) |     0.31 |    -0.55 | 150 |         0 |
      |Sepal.Width  | 3.06 | 0.44 | 0.52 | (2.00, 4.40) |     0.32 |     0.23 | 150 |         0 |
      |Petal.Length | 3.76 | 1.77 | 3.52 | (1.00, 6.90) |    -0.27 |    -1.40 | 150 |         0 |
      |Petal.Width  | 1.20 | 0.76 | 1.50 | (0.10, 2.50) |    -0.10 |    -1.34 | 150 |         0 |

# display() method exports to tinytable

    Code
      display(out, format = "tt")
    Output
      
      +--------------+------+------+------+--------------+----------+----------+-----+-----------+
      | Variable     | Mean | SD   | IQR  | Range        | Skewness | Kurtosis | n   | n_Missing |
      +==============+======+======+======+==============+==========+==========+=====+===========+
      | Sepal.Length | 5.84 | 0.83 | 1.30 | (4.30, 7.90) | 0.31     | -0.55    | 150 | 0         |
      +--------------+------+------+------+--------------+----------+----------+-----+-----------+
      | Sepal.Width  | 3.06 | 0.44 | 0.52 | (2.00, 4.40) | 0.32     | 0.23     | 150 | 0         |
      +--------------+------+------+------+--------------+----------+----------+-----+-----------+
      | Petal.Length | 3.76 | 1.77 | 3.52 | (1.00, 6.90) | -0.27    | -1.40    | 150 | 0         |
      +--------------+------+------+------+--------------+----------+----------+-----+-----------+
      | Petal.Width  | 1.20 | 0.76 | 1.50 | (0.10, 2.50) | -0.10    | -1.34    | 150 | 0         |
      +--------------+------+------+------+--------------+----------+----------+-----+-----------+ 


================================================
FILE: tests/testthat/_snaps/empty-dataframe.md
================================================
# remove empty with character

    Code
      remove_empty_columns(tmp)
    Output
         a  b  d
      1  1  1  1
      2  2 NA NA
      3  3  3  3
      4 NA NA NA
      5  5  5  5

---

    Code
      remove_empty_rows(tmp)
    Output
        a  b  c  d
      1 1  1 NA  1
      2 2 NA NA NA
      3 3  3 NA  3
      5 5  5 NA  5

---

    Code
      remove_empty(tmp)
    Output
        a  b  d
      1 1  1  1
      2 2 NA NA
      3 3  3  3
      5 5  5  5


================================================
FILE: tests/testthat/_snaps/means_by_group.md
================================================
# mean_by_group

    Code
      means_by_group(efc, "c12hour", "e42dep")
    Output
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |           95% CI |      p
      ----------------------------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | [-68.46, 102.46] | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | [-26.18,  94.68] | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | [ 29.91,  75.59] | > .999
      severely dependent   | 106.97 | 63 | 65.88 | [ 91.74, 122.19] | 0.001 
      Total                |  86.46 | 97 | 66.40 |                  |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001

---

    Code
      means_by_group(efc, "c12hour", "e42dep", ci = 0.99)
    Output
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |           99% CI |      p
      ----------------------------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | [-96.17, 130.17] | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | [-45.77, 114.27] | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | [ 22.50,  83.00] | > .999
      severely dependent   | 106.97 | 63 | 65.88 | [ 86.80, 127.13] | 0.001 
      Total                |  86.46 | 97 | 66.40 |                  |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001

---

    Code
      means_by_group(efc, "c12hour", "e42dep", ci = NA)
    Output
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |      p
      ---------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | > .999
      severely dependent   | 106.97 | 63 | 65.88 | 0.001 
      Total                |  86.46 | 97 | 66.40 |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001

---

    Code
      means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep")
    Output
      # Mean of Negative impact with 7 items by elder's dependency
      
      Category             |  Mean |  N |   SD |         95% CI |     p
      -----------------------------------------------------------------
      independent          | 11.00 |  2 | 0.00 | [ 5.00, 17.00] | 0.567
      slightly dependent   | 10.00 |  4 | 3.16 | [ 5.76, 14.24] | 0.296
      moderately dependent | 13.71 | 28 | 3.14 | [12.11, 15.32] | 0.296
      severely dependent   | 14.67 | 60 | 4.78 | [13.57, 15.76] | 0.108
      Total                | 14.11 | 94 | 4.34 |                |      
      
      Anova: R2=0.063; adj.R2=0.032; F=2.009; p=0.118
      
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |           95% CI |      p
      ----------------------------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | [-68.46, 102.46] | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | [-26.18,  94.68] | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | [ 29.91,  75.59] | > .999
      severely dependent   | 106.97 | 63 | 65.88 | [ 91.74, 122.19] | 0.001 
      Total                |  86.46 | 97 | 66.40 |                  |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001

---

    Code
      means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep", ci = NA)
    Output
      # Mean of Negative impact with 7 items by elder's dependency
      
      Category             |  Mean |  N |   SD |     p
      ------------------------------------------------
      independent          | 11.00 |  2 | 0.00 | 0.567
      slightly dependent   | 10.00 |  4 | 3.16 | 0.296
      moderately dependent | 13.71 | 28 | 3.14 | 0.296
      severely dependent   | 14.67 | 60 | 4.78 | 0.108
      Total                | 14.11 | 94 | 4.34 |      
      
      Anova: R2=0.063; adj.R2=0.032; F=2.009; p=0.118
      
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |      p
      ---------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | > .999
      severely dependent   | 106.97 | 63 | 65.88 | 0.001 
      Total                |  86.46 | 97 | 66.40 |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001

---

    Code
      means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep", ci = 0.99)
    Output
      # Mean of Negative impact with 7 items by elder's dependency
      
      Category             |  Mean |  N |   SD |         99% CI |     p
      -----------------------------------------------------------------
      independent          | 11.00 |  2 | 0.00 | [ 3.05, 18.95] | 0.567
      slightly dependent   | 10.00 |  4 | 3.16 | [ 4.38, 15.62] | 0.296
      moderately dependent | 13.71 | 28 | 3.14 | [11.59, 15.84] | 0.296
      severely dependent   | 14.67 | 60 | 4.78 | [13.22, 16.12] | 0.108
      Total                | 14.11 | 94 | 4.34 |                |      
      
      Anova: R2=0.063; adj.R2=0.032; F=2.009; p=0.118
      
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |           99% CI |      p
      ----------------------------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | [-96.17, 130.17] | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | [-45.77, 114.27] | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | [ 22.50,  83.00] | > .999
      severely dependent   | 106.97 | 63 | 65.88 | [ 86.80, 127.13] | 0.001 
      Total                |  86.46 | 97 | 66.40 |                  |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001

---

    Code
      means_by_group(efc$c12hour, efc$e42dep)
    Output
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |           95% CI |      p
      ----------------------------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | [-68.46, 102.46] | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | [-26.18,  94.68] | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | [ 29.91,  75.59] | > .999
      severely dependent   | 106.97 | 63 | 65.88 | [ 91.74, 122.19] | 0.001 
      Total                |  86.46 | 97 | 66.40 |                  |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001

---

    Code
      means_by_group(efc$c12hour, efc$e42dep, ci = NA)
    Output
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |      p
      ---------------------------------------------------
      independent          |  17.00 |  2 | 11.31 | 0.573 
      slightly dependent   |  34.25 |  4 | 29.97 | 0.626 
      moderately dependent |  52.75 | 28 | 51.83 | > .999
      severely dependent   | 106.97 | 63 | 65.88 | 0.001 
      Total                |  86.46 | 97 | 66.40 |       
      
      Anova: R2=0.186; adj.R2=0.160; F=7.098; p<.001


================================================
FILE: tests/testthat/_snaps/normalize.md
================================================
# normalize work as expected

    Code
      head(normalize(trees))
    Output
             Girth     Height      Volume
      1 0.00000000 0.29166667 0.001497006
      2 0.02439024 0.08333333 0.001497006
      3 0.04065041 0.00000000 0.000000000
      4 0.17886179 0.37500000 0.092814371
      5 0.19512195 0.75000000 0.128742515
      6 0.20325203 0.83333333 0.142215569


================================================
FILE: tests/testthat/_snaps/print.dw_transformer.md
================================================
# print.dw_transformer

    Code
      rescale(iris$Sepal.Length)
    Output
        [1]  22.222222  16.666667  11.111111   8.333333  19.444444  30.555556
        [7]   8.333333  19.444444   2.777778  16.666667  30.555556  13.888889
       [13]  13.888889   0.000000  41.666667  38.888889  30.555556  22.222222
       [19]  38.888889  22.222222  30.555556  22.222222   8.333333  22.222222
       [25]  13.888889  19.444444  19.444444  25.000000  25.000000  11.111111
       [31]  13.888889  30.555556  25.000000  33.333333  16.666667  19.444444
       [37]  33.333333  16.666667   2.777778  22.222222  19.444444   5.555556
       [43]   2.777778  19.444444  22.222222  13.888889  22.222222   8.333333
       [49]  27.777778  19.444444  75.000000  58.333333  72.222222  33.333333
       [55]  61.111111  38.888889  55.555556  16.666667  63.888889  25.000000
       [61]  19.444444  44.444444  47.222222  50.000000  36.111111  66.666667
       [67]  36.111111  41.666667  52.777778  36.111111  44.444444  50.000000
       [73]  55.555556  50.000000  58.333333  63.888889  69.444444  66.666667
       [79]  47.222222  38.888889  33.333333  33.333333  41.666667  47.222222
       [85]  30.555556  47.222222  66.666667  55.555556  36.111111  33.333333
       [91]  33.333333  50.000000  41.666667  19.444444  36.111111  38.888889
       [97]  38.888889  52.777778  22.222222  38.888889  55.555556  41.666667
      [103]  77.777778  55.555556  61.111111  91.666667  16.666667  83.333333
      [109]  66.666667  80.555556  61.111111  58.333333  69.444444  38.888889
      [115]  41.666667  58.333333  61.111111  94.444444  94.444444  47.222222
      [121]  72.222222  36.111111  94.444444  55.555556  66.666667  80.555556
      [127]  52.777778  50.000000  58.333333  80.555556  86.111111 100.000000
      [133]  58.333333  55.555556  50.000000  94.444444  55.555556  58.333333
      [139]  47.222222  72.222222  66.666667  72.222222  41.666667  69.444444
      [145]  66.666667  66.666667  55.555556  61.111111  52.777778  44.444444
      (original range = 4.3 to 7.9)

---

    Code
      normalize(iris$Sepal.Length)
    Output
        [1] 0.22222222 0.16666667 0.11111111 0.08333333 0.19444444 0.30555556
        [7] 0.08333333 0.19444444 0.02777778 0.16666667 0.30555556 0.13888889
       [13] 0.13888889 0.00000000 0.41666667 0.38888889 0.30555556 0.22222222
       [19] 0.38888889 0.22222222 0.30555556 0.22222222 0.08333333 0.22222222
       [25] 0.13888889 0.19444444 0.19444444 0.25000000 0.25000000 0.11111111
       [31] 0.13888889 0.30555556 0.25000000 0.33333333 0.16666667 0.19444444
       [37] 0.33333333 0.16666667 0.02777778 0.22222222 0.19444444 0.05555556
       [43] 0.02777778 0.19444444 0.22222222 0.13888889 0.22222222 0.08333333
       [49] 0.27777778 0.19444444 0.75000000 0.58333333 0.72222222 0.33333333
       [55] 0.61111111 0.38888889 0.55555556 0.16666667 0.63888889 0.25000000
       [61] 0.19444444 0.44444444 0.47222222 0.50000000 0.36111111 0.66666667
       [67] 0.36111111 0.41666667 0.52777778 0.36111111 0.44444444 0.50000000
       [73] 0.55555556 0.50000000 0.58333333 0.63888889 0.69444444 0.66666667
       [79] 0.47222222 0.38888889 0.33333333 0.33333333 0.41666667 0.47222222
       [85] 0.30555556 0.47222222 0.66666667 0.55555556 0.36111111 0.33333333
       [91] 0.33333333 0.50000000 0.41666667 0.19444444 0.36111111 0.38888889
       [97] 0.38888889 0.52777778 0.22222222 0.38888889 0.55555556 0.41666667
      [103] 0.77777778 0.55555556 0.61111111 0.91666667 0.16666667 0.83333333
      [109] 0.66666667 0.80555556 0.61111111 0.58333333 0.69444444 0.38888889
      [115] 0.41666667 0.58333333 0.61111111 0.94444444 0.94444444 0.47222222
      [121] 0.72222222 0.36111111 0.94444444 0.55555556 0.66666667 0.80555556
      [127] 0.52777778 0.50000000 0.58333333 0.80555556 0.86111111 1.00000000
      [133] 0.58333333 0.55555556 0.50000000 0.94444444 0.55555556 0.58333333
      [139] 0.47222222 0.72222222 0.66666667 0.72222222 0.41666667 0.69444444
      [145] 0.66666667 0.66666667 0.55555556 0.61111111 0.52777778 0.44444444
      (original range = 4.3 to 7.9)

---

    Code
      center(iris$Sepal.Length)
    Output
        [1] -0.74333333 -0.94333333 -1.14333333 -1.24333333 -0.84333333 -0.44333333
        [7] -1.24333333 -0.84333333 -1.44333333 -0.94333333 -0.44333333 -1.04333333
       [13] -1.04333333 -1.54333333 -0.04333333 -0.14333333 -0.44333333 -0.74333333
       [19] -0.14333333 -0.74333333 -0.44333333 -0.74333333 -1.24333333 -0.74333333
       [25] -1.04333333 -0.84333333 -0.84333333 -0.64333333 -0.64333333 -1.14333333
       [31] -1.04333333 -0.44333333 -0.64333333 -0.34333333 -0.94333333 -0.84333333
       [37] -0.34333333 -0.94333333 -1.44333333 -0.74333333 -0.84333333 -1.34333333
       [43] -1.44333333 -0.84333333 -0.74333333 -1.04333333 -0.74333333 -1.24333333
       [49] -0.54333333 -0.84333333  1.15666667  0.55666667  1.05666667 -0.34333333
       [55]  0.65666667 -0.14333333  0.45666667 -0.94333333  0.75666667 -0.64333333
       [61] -0.84333333  0.05666667  0.15666667  0.25666667 -0.24333333  0.85666667
       [67] -0.24333333 -0.04333333  0.35666667 -0.24333333  0.05666667  0.25666667
       [73]  0.45666667  0.25666667  0.55666667  0.75666667  0.95666667  0.85666667
       [79]  0.15666667 -0.14333333 -0.34333333 -0.34333333 -0.04333333  0.15666667
       [85] -0.44333333  0.15666667  0.85666667  0.45666667 -0.24333333 -0.34333333
       [91] -0.34333333  0.25666667 -0.04333333 -0.84333333 -0.24333333 -0.14333333
       [97] -0.14333333  0.35666667 -0.74333333 -0.14333333  0.45666667 -0.04333333
      [103]  1.25666667  0.45666667  0.65666667  1.75666667 -0.94333333  1.45666667
      [109]  0.85666667  1.35666667  0.65666667  0.55666667  0.95666667 -0.14333333
      [115] -0.04333333  0.55666667  0.65666667  1.85666667  1.85666667  0.15666667
      [121]  1.05666667 -0.24333333  1.85666667  0.45666667  0.85666667  1.35666667
      [127]  0.35666667  0.25666667  0.55666667  1.35666667  1.55666667  2.05666667
      [133]  0.55666667  0.45666667  0.25666667  1.85666667  0.45666667  0.55666667
      [139]  0.15666667  1.05666667  0.85666667  1.05666667 -0.04333333  0.95666667
      [145]  0.85666667  0.85666667  0.45666667  0.65666667  0.35666667  0.05666667
      (center: 5.8, scale = 1)

---

    Code
      standardize(iris$Sepal.Length)
    Output
        [1] -0.89767388 -1.13920048 -1.38072709 -1.50149039 -1.01843718 -0.53538397
        [7] -1.50149039 -1.01843718 -1.74301699 -1.13920048 -0.53538397 -1.25996379
       [13] -1.25996379 -1.86378030 -0.05233076 -0.17309407 -0.53538397 -0.89767388
       [19] -0.17309407 -0.89767388 -0.53538397 -0.89767388 -1.50149039 -0.89767388
       [25] -1.25996379 -1.01843718 -1.01843718 -0.77691058 -0.77691058 -1.38072709
       [31] -1.25996379 -0.53538397 -0.77691058 -0.41462067 -1.13920048 -1.01843718
       [37] -0.41462067 -1.13920048 -1.74301699 -0.89767388 -1.01843718 -1.62225369
       [43] -1.74301699 -1.01843718 -0.89767388 -1.25996379 -0.89767388 -1.50149039
       [49] -0.65614727 -1.01843718  1.39682886  0.67224905  1.27606556 -0.41462067
       [55]  0.79301235 -0.17309407  0.55148575 -1.13920048  0.91377565 -0.77691058
       [61] -1.01843718  0.06843254  0.18919584  0.30995914 -0.29385737  1.03453895
       [67] -0.29385737 -0.05233076  0.43072244 -0.29385737  0.06843254  0.30995914
       [73]  0.55148575  0.30995914  0.67224905  0.91377565  1.15530226  1.03453895
       [79]  0.18919584 -0.17309407 -0.41462067 -0.41462067 -0.05233076  0.18919584
       [85] -0.53538397  0.18919584  1.03453895  0.55148575 -0.29385737 -0.41462067
       [91] -0.41462067  0.30995914 -0.05233076 -1.01843718 -0.29385737 -0.17309407
       [97] -0.17309407  0.43072244 -0.89767388 -0.17309407  0.55148575 -0.05233076
      [103]  1.51759216  0.55148575  0.79301235  2.12140867 -1.13920048  1.75911877
      [109]  1.03453895  1.63835547  0.79301235  0.67224905  1.15530226 -0.17309407
      [115] -0.05233076  0.67224905  0.79301235  2.24217198  2.24217198  0.18919584
      [121]  1.27606556 -0.29385737  2.24217198  0.55148575  1.03453895  1.63835547
      [127]  0.43072244  0.30995914  0.67224905  1.63835547  1.87988207  2.48369858
      [133]  0.67224905  0.55148575  0.30995914  2.24217198  0.55148575  0.67224905
      [139]  0.18919584  1.27606556  1.03453895  1.27606556 -0.05233076  1.15530226
      [145]  1.03453895  1.03453895  0.55148575  0.79301235  0.43072244  0.06843254
      (center: 5.8, scale = 0.83)


================================================
FILE: tests/testthat/_snaps/ranktransform.md
================================================
# ranktransform works with data frames

    Code
      ranktransform(BOD)
    Output
        Time demand
      1    1      1
      2    2      2
      3    3      5
      4    4      4
      5    5      3
      6    6      6


================================================
FILE: tests/testthat/_snaps/rescale_weights.md
================================================
# rescale_weights works as expected

    Code
      head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA"))
    Output
        total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a
      1     1 2.20        1        3       2       31 97593.68          1.5733612
      2     7 2.08        2        3       1       29 39599.36          0.6231745
      3     3 1.48        2        1       2       42 26619.83          0.8976966
      4     4 1.32        2        4       2       33 34998.53          0.7083628
      5     1 2.00        2        1       1       41 14746.45          0.4217782
      6     6 2.20        2        4       1       38 28232.10          0.6877550
        rescaled_weights_b
      1          1.2005159
      2          0.5246593
      3          0.5439111
      4          0.5498944
      5          0.3119698
      6          0.5155503

---

    Code
      head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU")))
    Output
        total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweight_a_SDMVSTRA
      1     1 2.20        1        3       2       31 97593.68          1.5733612
      2     7 2.08        2        3       1       29 39599.36          0.6231745
      3     3 1.48        2        1       2       42 26619.83          0.8976966
      4     4 1.32        2        4       2       33 34998.53          0.7083628
      5     1 2.00        2        1       1       41 14746.45          0.4217782
      6     6 2.20        2        4       1       38 28232.10          0.6877550
        pweight_b_SDMVSTRA pweight_a_SDMVPSU pweight_b_SDMVPSU
      1          1.2005159         1.8458164         1.3699952
      2          0.5246593         0.8217570         0.5780808
      3          0.5439111         0.5034683         0.3736824
      4          0.5498944         0.6619369         0.4913004
      5          0.3119698         0.3060151         0.2152722
      6          0.5155503         0.5858662         0.4121388

---

    Code
      head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish"))
    Output
        total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights
      1     1 2.20        1        3       2       31 97593.68        1.3952529
      2     7 2.08        2        3       1       29 39599.36        0.5661343
      3     3 1.48        2        1       2       42 26619.83        0.3805718
      4     4 1.32        2        4       2       33 34998.53        0.5003582
      5     1 2.00        2        1       1       41 14746.45        0.2108234
      6     6 2.20        2        4       1       38 28232.10        0.4036216

---

    Code
      rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")
    Output
         total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a
      1      1 2.20        1        3       2       31 97593.68          1.0000000
      2      7 2.08        2        3       1       29 39599.36          0.5819119
      3      3 1.48        2        1       2       42       NA                 NA
      4      4 1.32        2        4       2       33 34998.53          0.6766764
      5      1 2.00        2        1       1       41 14746.45          0.7471696
      6      6 2.20        2        4       1       38 28232.10          1.0000000
      7    350 1.60        1        3       2       33 93162.43          1.8012419
      8     NA 1.48        2        3       1       29 82275.99          1.2090441
      9      3 2.28        2        4       1       41 24726.39          1.2528304
      10    30 0.84        1        3       2       35       NA                 NA
      11    70 1.24        1        4       2       33 27002.70          0.5220817
      12     5 1.68        2        1       2       39 18792.03          1.0000000
      13    60 2.20        1        3       2       30 76894.56          1.0000000
      14     2 1.48        2        3       1       29       NA                 NA
      15     8 2.36        2        3       2       39       NA                 NA
      16     3 2.04        2        3       2       36 98200.91          1.0000000
      17     1 2.08        1        3       1       40 87786.09          1.0000000
      18     7 1.00        1        3       2       32 90803.16          1.0000000
      19     9 2.28        2        3       2       34       NA                 NA
      20     2 1.24        2        3       1       29 82275.99          1.2090441
         rescaled_weights_b
      1           1.0000000
      2           0.5351412
      3                  NA
      4           0.5107078
      5           0.7022777
      6           1.0000000
      7           1.3594509
      8           1.1118681
      9           1.1775572
      10                 NA
      11          0.3940306
      12          1.0000000
      13          1.0000000
      14                 NA
      15                 NA
      16          1.0000000
      17          1.0000000
      18          1.0000000
      19                 NA
      20          1.1118681

---

    Code
      rescale_weights(nhanes_sample, "WTINT2YR", method = "kish")
    Output
         total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights
      1      1 2.20        1        3       2       31 97593.68        1.2734329
      2      7 2.08        2        3       1       29 39599.36        0.5167049
      3      3 1.48        2        1       2       42       NA               NA
      4      4 1.32        2        4       2       33 34998.53        0.4566718
      5      1 2.00        2        1       1       41 14746.45        0.1924164
      6      6 2.20        2        4       1       38 28232.10        0.3683813
      7    350 1.60        1        3       2       33 93162.43        1.2156126
      8     NA 1.48        2        3       1       29 82275.99        1.0735629
      9      3 2.28        2        4       1       41 24726.39        0.3226377
      10    30 0.84        1        3       2       35       NA               NA
      11    70 1.24        1        4       2       33 27002.70        0.3523397
      12     5 1.68        2        1       2       39 18792.03        0.2452044
      13    60 2.20        1        3       2       30 76894.56        1.0033444
      14     2 1.48        2        3       1       29       NA               NA
      15     8 2.36        2        3       2       39       NA               NA
      16     3 2.04        2        3       2       36 98200.91        1.2813563
      17     1 2.08        1        3       1       40 87786.09        1.1454605
      18     7 1.00        1        3       2       32 90803.16        1.1848281
      19     9 2.28        2        3       2       34       NA               NA
      20     2 1.24        2        3       1       29 82275.99        1.0735629

# rescale_weights nested works as expected

    Code
      rescale_weights(data = head(nhanes_sample, n = 30), by = c("SDMVSTRA",
        "SDMVPSU"), probability_weights = "WTINT2YR", nest = TRUE)
    Output
         total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA  WTINT2YR rescaled_weights_a
      1      1 2.20        1        3       2       31 97593.679          1.0000000
      2      7 2.08        2        3       1       29 39599.363          0.5502486
      3      3 1.48        2        1       2       42 26619.834          0.9512543
      4      4 1.32        2        4       2       33 34998.530          0.6766764
      5      1 2.00        2        1       1       41 14746.454          0.7147710
      6      6 2.20        2        4       1       38 28232.100          1.0000000
      7    350 1.60        1        3       2       33 93162.431          1.8012419
      8     NA 1.48        2        3       1       29 82275.986          1.1432570
      9      3 2.28        2        4       1       41 24726.391          1.1985056
      10    30 0.84        1        3       2       35 39895.048          1.0000000
      11    70 1.24        1        4       2       33 27002.703          0.5220817
      12     5 1.68        2        1       2       39 18792.034          0.3866720
      13    60 2.20        1        3       2       30 76894.563          1.0000000
      14     2 1.48        2        3       1       29 82275.986          1.1432570
      15     8 2.36        2        3       2       39 78406.811          1.6133280
      16     3 2.04        2        3       2       36 98200.912          1.0000000
      17     1 2.08        1        3       1       40 87786.091          1.0000000
      18     7 1.00        1        3       2       32 90803.158          1.2693642
      19     9 2.28        2        3       2       34 45002.917          1.0000000
      20     2 1.24        2        3       1       29 82275.986          1.1432570
      21     4 2.28        2        3       1       34 91437.145          1.4088525
      22     3 1.04        1        1       2       42 29348.027          1.0487457
      23     4 1.12        1        1       1       34 38366.567          0.5911475
      24     1 1.52        2        1       1       42  6622.334          1.0000000
      25    22 2.24        1        4       1       41 22420.209          1.0867233
      26     7 1.00        2        3       2       41 65529.204          1.0000000
      27     5 0.92        2        4       1       30 27089.745          1.0000000
      28    15 1.04        1        3       2       32 52265.570          0.7306358
      29     3 0.80        1        3       1       33 64789.307          1.0000000
      30     1 1.00        1        3       1       29 73404.222          1.0199804
         rescaled_weights_b
      1           1.0000000
      2           0.5226284
      3           0.9489993
      4           0.5107078
      5           0.6854605
      6           1.0000000
      7           1.3594509
      8           1.0858702
      9           1.1493587
      10          1.0000000
      11          0.3940306
      12          0.2809766
      13          1.0000000
      14          1.0858702
      15          1.1723308
      16          1.0000000
      17          1.0000000
      18          1.1834934
      19          1.0000000
      20          1.0858702
      21          1.2070771
      22          1.0462596
      23          0.5064835
      24          1.0000000
      25          1.0421602
      26          1.0000000
      27          1.0000000
      28          0.6812093
      29          1.0000000
      30          0.9687816


================================================
FILE: tests/testthat/_snaps/reshape_ci.md
================================================
# reshape_ci with single CI level

    Code
      df_reshape
    Output
        Parameter CI_low CI_high  CI
      1    Term 1    0.2     0.5 0.8

# reshape_ci with multiple CI levels

    Code
      reshape_ci(x)
    Output
        Parameter CI_low_0.8 CI_high_0.8 CI_low_0.9 CI_high_0.9
      1    Term 1        0.2         0.5       0.10        0.80
      2    Term 2        0.3         0.6       0.15        0.85

---

    Code
      reshape_ci(reshape_ci(x))
    Output
        Parameter  CI CI_low CI_high
      1    Term 1 0.8   0.20    0.50
      2    Term 1 0.9   0.10    0.80
      3    Term 2 0.8   0.30    0.60
      4    Term 2 0.9   0.15    0.85


================================================
FILE: tests/testthat/_snaps/skewness-kurtosis.md
================================================
# skewness works with data frames

    Code
      skewness(iris[, 1:4])
    Output
      Parameter    | Skewness |    SE
      -------------------------------
      Sepal.Length |    0.315 | 0.196
      Sepal.Width  |    0.319 | 0.196
      Petal.Length |   -0.275 | 0.196
      Petal.Width  |   -0.103 | 0.196

---

    Code
      skewness(iris[, 1:4], iterations = 100)
    Output
      Parameter    | Skewness |    SE
      -------------------------------
      Sepal.Length |    0.315 | 0.126
      Sepal.Width  |    0.319 | 0.175
      Petal.Length |   -0.275 | 0.137
      Petal.Width  |   -0.103 | 0.134

# kurtosis works with data frames

    Code
      kurtosis(iris[, 1:4])
    Output
      Parameter    | Kurtosis |    SE
      -------------------------------
      Sepal.Length |   -0.552 | 0.381
      Sepal.Width  |    0.228 | 0.381
      Petal.Length |   -1.402 | 0.381
      Petal.Width  |   -1.341 | 0.381

---

    Code
      kurtosis(iris[, 1:4], iterations = 100)
    Output
      Parameter    | Kurtosis |    SE
      -------------------------------
      Sepal.Length |   -0.552 | 0.188
      Sepal.Width  |    0.228 | 0.351
      Petal.Length |   -1.402 | 0.167
      Petal.Width  |   -1.341 | 0.115

# skewness works with matrices

    Code
      skewness(as.matrix(iris[, 1:4]))
    Output
      Parameter    | Skewness |    SE
      -------------------------------
      Sepal.Length |    0.315 | 0.196
      Sepal.Width  |    0.319 | 0.196
      Petal.Length |   -0.275 | 0.196
      Petal.Width  |   -0.103 | 0.196

---

    Code
      skewness(as.matrix(iris[, 1:4]), iterations = 100)
    Output
      Parameter    | Skewness |    SE
      -------------------------------
      Sepal.Length |    0.315 | 0.126
      Sepal.Width  |    0.319 | 0.175
      Petal.Length |   -0.275 | 0.137
      Petal.Width  |   -0.103 | 0.134

# kurtosis works with matrices

    Code
      kurtosis(as.matrix(iris[, 1:4]))
    Output
      Parameter    | Kurtosis |    SE
      -------------------------------
      Sepal.Length |   -0.552 | 0.381
      Sepal.Width  |    0.228 | 0.381
      Petal.Length |   -1.402 | 0.381
      Petal.Width  |   -1.341 | 0.381

---

    Code
      kurtosis(as.matrix(iris[, 1:4]), iterations = 100)
    Output
      Parameter    | Kurtosis |    SE
      -------------------------------
      Sepal.Length |   -0.552 | 0.188
      Sepal.Width  |    0.228 | 0.351
      Petal.Length |   -1.402 | 0.167
      Petal.Width  |   -1.341 | 0.115


================================================
FILE: tests/testthat/_snaps/smoothness.md
================================================
# smoothness works with data frames

    Code
      smoothness(BOD)
    Output
             Parameter                    
      Time   "Time"    "0.986393923832144"
      demand "demand"  "0.406270770677043"
      attr(,"class")
      [1] "parameters_smoothness" "matrix"                "array"                


================================================
FILE: tests/testthat/_snaps/text_format.md
================================================
# text formatting helpers work as expected

    Code
      text_format(c("A very long First", "Some similar long Second", "Shorter Third",
        "More or less long Fourth", "And finally the Last"), width = 20)
    Output
      [1] "A very long First,\nSome similar long\nSecond, Shorter\nThird, More or less\nlong Fourth and And\nfinally the Last\n"

---

    Code
      text_format(c("A very long First", "Some similar long Second", "Shorter Third",
        "More or less long Fourth", "And finally the Last"), last = " or ", enclose = "`",
      width = 20)
    Output
      [1] "`A very long\nFirst`, `Some\nsimilar long\nSecond`, `Shorter\nThird`, `More or\nless long Fourth`\nor `And finally the\nLast`\n"

# text formatters respect `width` argument

    Code
      long_text <- strrep("abc ", 100)
      cat(text_format(long_text, width = 50))
    Output
       abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc
    Code
      cat(text_format(long_text, width = 80))
    Output
       abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc abc
    Code
      withr::with_options(list(width = 50), code = {
        cat(text_format(long_text))
      })
    Output
       abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc abc abc abc abc abc abc abc abc
      abc abc abc abc


================================================
FILE: tests/testthat/_snaps/windows/means_by_group.md
================================================
# mean_by_group, weighted

    Code
      means_by_group(efc, "c12hour", "e42dep", weights = "weight")
    Output
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |           95% CI |      p
      ----------------------------------------------------------------------
      independent          |  16.92 |  3 | 11.31 | [-60.82,  94.66] | 0.486 
      slightly dependent   |  33.56 |  4 | 29.75 | [-26.93,  94.05] | 0.593 
      moderately dependent |  52.74 | 26 | 54.44 | [ 28.71,  76.76] | 0.996 
      severely dependent   | 108.08 | 67 | 65.40 | [ 93.01, 123.16] | < .001
      Total                |  88.11 | 97 | 67.01 |                  |       
      
      Anova: R2=0.191; adj.R2=0.165; F=7.329; p<.001

---

    Code
      means_by_group(efc, "c12hour", "e42dep", weights = "weight", ci = NA)
    Output
      # Mean of average number of hours of care per week by elder's dependency
      
      Category             |   Mean |  N |    SD |      p
      ---------------------------------------------------
      independent          |  16.92 |  3 | 11.31 | 0.486 
      slightly dependent   |  33.56 |  4 | 29.75 | 0.593 
      moderately dependent |  52.74 | 26 | 54.44 | 0.996 
      severely dependent   | 108.08 | 67 | 65.40 | < .001
      Total                |  88.11 | 97 | 67.01 |       
      
      Anova: R2=0.191; adj.R2=0.165; F=7.329; p<.001


================================================
FILE: tests/testthat/_snaps/winsorization.md
================================================
# with missing values

    Code
      suppressWarnings(head(winsorize(na.omit(ggplot2::msleep$brainwt))))
    Output
      [1] 0.0155 0.0024 0.1750 0.0700 0.0982 0.1150


================================================
FILE: tests/testthat/helper-state.R
================================================
testthat::set_state_inspector(function() {
  # sometimes a dependency might add a custom option, so we need to
  # make sure we don't fail because of such additions
  options <- options()

  # Result of `dput(names(options()))`
  base_options <- c(
    "add.smooth",
    "askpass",
    "asksecret",
    "bitmapType",
    "browser",
    "browserNLdisabled",
    "buildtools.check",
    "buildtools.with",
    "callr.condition_handler_cli_message",
    "CBoundsCheck",
    "check.bounds",
    "citation.bibtex.max",
    "connectionObserver",
    "continue",
    "contrasts",
    "defaultPackages",
    "demo.ask",
    "deparse.cutoff",
    "deparse.max.lines",
    "device",
    "device.ask.default",
    "digits",
    "download.file.method",
    "dvipscmd",
    "echo",
    "editor",
    "encoding",
    "example.ask",
    "expressions",
    "ggvis.renderer",
    "help_type",
    "help.search.types",
    "help.try.all.packages",
    "HTTPUserAgent",
    "install.packages.compile.from.source",
    "internet.info",
    "keep.parse.data",
    "keep.parse.data.pkgs",
    "keep.source",
    "keep.source.pkgs",
    "locatorBell",
    "mailer",
    "matprod",
    "max.contour.segments",
    "max.print",
    "menu.graphics",
    "na.action",
    "nwarnings",
    "OutDec",
    "page_viewer",
    "pager",
    "papersize",
    "PCRE_limit_recursion",
    "PCRE_study",
    "PCRE_use_JIT",
    "pdfviewer",
    "pkgType",
    "plumber.docs.callback",
    "plumber.swagger.url",
    "printcmd",
    "profvis.keep_output",
    "profvis.print",
    "profvis.prof_extension",
    "profvis.prof_output",
    "prompt",
    "repos",
    "restart",
    "reticulate.initialized",
    "reticulate.repl.busy",
    "reticulate.repl.hook",
    "reticulate.repl.initialize",
    "reticulate.repl.teardown",
    "rl_word_breaks",
    "rsconnect.check.certificate",
    "rstudio.notebook.executing",
    "RStudioGD.antialias",
    "RStudioGD.backend",
    "scipen",
    "shiny.launch.browser",
    "shinygadgets.showdialog",
    "show.coef.Pvalues",
    "show.error.messages",
    "show.signif.stars",
    "showErrorCalls",
    "showNCalls",
    "showWarnCalls",
    "str",
    "str.dendrogram.last",
    "terminal.manager",
    "texi2dvi",
    "timeout",
    "ts.eps",
    "ts.S.compat",
    "unzip",
    "useFancyQuotes",
    "verbose",
    "viewer",
    "warn",
    "warning.length",
    "warnPartialMatchArgs",
    "warnPartialMatchAttr",
    "warnPartialMatchDollar",
    "width"
  )
  options <- options[base_options]

  list(
    attached = search(),
    connections = nrow(showConnections()),
    cwd = getwd(),
    envvars = Sys.getenv(),
    libpaths = .libPaths(),
    locale = Sys.getlocale(),
    options = options,
    packages = .packages(all.available = TRUE),
    NULL
  )
})


================================================
FILE: tests/testthat/helper.R
================================================
if (insight::check_if_installed("poorman", stop = FALSE)) {
  `%>%` <- poorman::`%>%`
}


================================================
FILE: tests/testthat/test-adjust.R
================================================
test_that("adjust multilevel", {
  skip_if_not_installed("lme4")
  adj <- adjust(
    iris[c("Sepal.Length", "Species")],
    multilevel = TRUE,
    bayesian = FALSE
  )
  # High tolerance to avoid issues on some R CMD check specification, see #592
  expect_equal(
    head(adj$Sepal.Length),
    c(0.08698, -0.11302, -0.31302, -0.41302, -0.01302, 0.38698),
    tolerance = 1e-1
  )
})

test_that("adjust", {
  adj <- adjust(
    iris[c("Sepal.Length", "Species")],
    multilevel = FALSE,
    bayesian = FALSE
  )
  expect_equal(
    head(adj$Sepal.Length),
    c(0.094, -0.106, -0.306, -0.406, -0.006, 0.394),
    tolerance = 1e-3
  )
})

# select helpers ------------------------------
test_that("adjust regex", {
  expect_identical(
    adjust(mtcars, select = "pg", regex = TRUE),
    adjust(mtcars, select = "mpg")
  )
  expect_identical(
    adjust(mtcars, select = "pg$", regex = TRUE),
    adjust(mtcars, select = "mpg")
  )
})

# select helpers ------------------------------
test_that("adjust, invalid column names", {
  data(iris)
  colnames(iris)[1] <- "I am"
  expect_error(
    adjust(iris[c("I am", "Species")], multilevel = FALSE, bayesian = FALSE),
    regex = "Bad column names"
  )
})


================================================
FILE: tests/testthat/test-assign_labels.R
================================================
test_that("assign_labels, unnamed values", {
  x <- 1:3
  # labelling by providing required number of labels
  out <- assign_labels(
    x,
    variable = "My x",
    values = c("one", "two", "three")
  )
  expect_identical(attributes(out)$label, "My x")
  expect_identical(
    attributes(out)$labels,
    structure(1:3, names = c("one", "two", "three"))
  )
})

test_that("assign_labels, named values", {
  # labelling using named vectors
  x <- factor(letters[1:3])
  out <- assign_labels(
    x,
    variable = "Labelled factor",
    values = c(a = "low", b = "mid", c = "high")
  )
  expect_identical(attributes(out)$label, "Labelled factor")
  expect_identical(attributes(out)$labels, c(low = "a", mid = "b", high = "c"))
})

test_that("assign_labels, partially named values", {
  x <- 1:5
  out <- assign_labels(
    x,
    variable = "My x",
    values = c(`1` = "lowest", `5` = "highest"),
    verbose = FALSE
  )
  expect_identical(attributes(out)$label, "My x")
  expect_identical(attributes(out)$labels, c(lowest = 1, highest = 5))
})

test_that("assign_labels, errors", {
  x <- 1:5
  expect_error(assign_labels(x, values = c(`1` = "lowest", `6` = "highest")))
  expect_error(assign_labels(
    x,
    variable = 1,
    values = c(`1` = "lowest", `6` = "highest")
  ))
  expect_error(assign_labels(x, values = c("a", "b", "c")))
})

test_that("assign_labels, data frame", {
  data(iris)
  out <- assign_labels(iris, "Species", values = c("a", "b", "c"))
  expect_identical(
    attributes(out$Species)$labels,
    c(a = "setosa", b = "versicolor", c = "virginica")
  )

  data(mtcars)
  out <- assign_labels(
    mtcars,
    select = c("am", "vs"),
    values = c("low", "high")
  )
  expect_identical(attributes(out$am)$labels, c(low = 0, high = 1))
  expect_identical(attributes(out$vs)$labels, c(low = 0, high = 1))
  expect_null(attributes(out$gear)$labels)
  expect_null(attributes(out$cyl)$labels)
})


================================================
FILE: tests/testthat/test-attributes-grouped-df.R
================================================
# data_arrange -----------------------------------

test_that("data_arrange, attributes preserved", {
  # if dplyr:::`[.grouped_df` in the environment it destroys the attributes
  # (only occurs when we run tests in random order)
  skip_if("[.grouped_df" %in% methods(`[`))
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_arrange(data_group(x, "cyl"), "hp")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# rescale -----------------------------------

test_that("rescale, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- rescale(data_group(x, "Species"), 1:3)
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# center -----------------------------------

test_that("center, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- center(data_group(x, "Species"), "Sepal.Width")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# categorize -----------------------------------

test_that("categorize, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- categorize(data_group(x, "Species"), "Sepal.Width")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# standardize -----------------------------------

test_that("standardize, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- standardize(data_group(x, "Species"), "Sepal.Width")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})

# filter -----------------------------------

test_that("filter, attributes preserved", {
  # if dplyr:::`[.grouped_df` in the environment it destroys the attributes
  # (only occurs when we run tests in random order)
  skip_if("[.grouped_df" %in% methods(`[`))
  test <- data.frame(
    id = c(1, 1, 2, 2),
    x = c(0, 1, 3, 4)
  )
  attr(test, "myattri") <- "I'm here"
  test2 <- data_filter(data_group(test, "id"), x == min(x))
  expect_identical(attr(test2, "myattri", exact = TRUE), "I'm here")
})


================================================
FILE: tests/testthat/test-attributes.R
================================================
data(efc, package = "datawizard")

# data_filter -----------------------------------

test_that("data_filter, attributes preserved", {
  attr(efc, "myattri") <- "I'm here"
  x <- data_filter(efc, c172code == 1 & c12hour > 40)
  expect_identical(
    attr(x$e42dep, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE)
  )
  expect_identical(
    attr(x, "myattri", exact = TRUE),
    "I'm here"
  )
})


# data_arrange -----------------------------------

test_that("data_arrange, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_arrange(x, "hp")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# data_match -----------------------------------

test_that("data_match, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_match(x, data.frame(vs = 0, am = 1))
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# data_select -----------------------------------

test_that("data_select, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_select(x, "hp")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# data_group -----------------------------------

test_that("data_group, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_group(x, "cyl")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# data_relocate -----------------------------------

test_that("data_relocate, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_relocate(x, "am", "mpg")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# data_remove -----------------------------------

test_that("data_remove, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_remove(x, "am")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# data_reorder -----------------------------------

test_that("data_reorder, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_reorder(x, c("hp", "vs", "wt"))
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# data_to_long -----------------------------------

test_that("data_to_long, attributes preserved", {
  wide_data <- data.frame(replicate(5, rnorm(10)))
  attr(wide_data, "myattri") <- "I'm here"
  x2 <- data_to_long(wide_data)
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# to_numeric -----------------------------------

test_that("to_numeric, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- to_numeric(x, "Species")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# convert_to_na -----------------------------------

test_that("convert_to_na, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- convert_to_na(x, na = 2, verbose = FALSE)
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
  # label attribute is preserved
  attr(x$Species, "label") <- "Species Variable"
  x2 <- convert_to_na(x, na = "setosa", drop_levels = TRUE, verbose = FALSE)
  expect_identical(attributes(x$Species)$label, "Species Variable")
})


# data_rename -----------------------------------

test_that("data_rename, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- data_rename(x, select = "hp", replacement = "horsepower")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# rescale -----------------------------------

test_that("rescale, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- rescale(x, 1:3)
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# center -----------------------------------

test_that("center, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- center(x, "Sepal.Width")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# categorize -----------------------------------

test_that("categorize, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- categorize(x, "Sepal.Width")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# change_code -----------------------------------

test_that("recode_values, attributes preserved", {
  x <- mtcars
  attr(x, "myattri") <- "I'm here"
  x2 <- recode_values(x, select = "am", recode = list(`5` = 0, `10` = 1))
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


# standardize -----------------------------------

test_that("standardize, attributes preserved", {
  x <- iris
  attr(x, "myattri") <- "I'm here"
  x2 <- standardize(x, "Sepal.Width")
  expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here")
})


================================================
FILE: tests/testthat/test-categorize.R
================================================
set.seed(123)
d <- sample.int(10, size = 500, replace = TRUE)

test_that("recode median", {
  expect_identical(categorize(d), ifelse(d >= median(d), 2, 1))
  expect_identical(categorize(d, lowest = 0), as.numeric(d >= median(d)))
})

test_that("recode mean", {
  expect_identical(categorize(d, split = "mean"), ifelse(d >= mean(d), 2, 1))
  expect_identical(
    categorize(d, split = "mean", lowest = 0),
    as.numeric(d >= mean(d))
  )
})

test_that("recode quantile", {
  expect_error(categorize(d, split = "quantile"))

  q <- quantile(d, probs = c(1 / 3, 2 / 3, 1))
  f <- cut(
    d,
    breaks = unique(c(min(d), q, max(d))),
    include.lowest = TRUE,
    right = FALSE
  )
  levels(f) <- 1:nlevels(f)
  expect_identical(
    categorize(d, split = "quantile", n_groups = 3),
    as.numeric(f)
  )
  expect_identical(
    categorize(d, split = "quantile", n_groups = 3, lowest = 0),
    as.numeric(f) - 1
  )
})

set.seed(123)
d <- sample.int(100, size = 1000, replace = TRUE)

test_that("recode range", {
  expect_error(categorize(d, split = "range"))
  d2 <- d
  d2[d <= 20] <- 1
  d2[d > 20 & d <= 40] <- 2
  d2[d > 40 & d <= 60] <- 3
  d2[d > 60 & d <= 80] <- 4
  d2[d > 80] <- 5
  expect_equal(
    table(categorize(d, split = "equal_range", range = 20)),
    table(d2),
    ignore_attr = TRUE
  )
  expect_equal(
    table(categorize(
      d,
      split = "equal_range",
      range = 20,
      lowest = 1
    )),
    table(d2),
    ignore_attr = TRUE
  )

  d2 <- d
  d2[d < 20] <- 0
  d2[d >= 20 & d < 40] <- 1
  d2[d >= 40 & d < 60] <- 2
  d2[d >= 60 & d < 80] <- 3
  d2[d >= 80] <- 4
  expect_equal(
    table(categorize(
      d,
      split = "equal_range",
      range = 20,
      lowest = 0
    )),
    table(d2),
    ignore_attr = TRUE
  )
})

test_that("recode length", {
  expect_error(categorize(d, split = "equal_length"))
  d2 <- d
  d2[d <= 20] <- 1
  d2[d > 20 & d <= 40] <- 2
  d2[d > 40 & d <= 60] <- 3
  d2[d > 60 & d <= 80] <- 4
  d2[d > 80] <- 5
  expect_equal(
    table(categorize(d, split = "equal_length", n_groups = 5)),
    table(d2),
    ignore_attr = TRUE
  )
  expect_equal(
    table(categorize(
      d,
      split = "equal_length",
      n_groups = 5,
      lowest = 1
    )),
    table(d2),
    ignore_attr = TRUE
  )
})

set.seed(123)
x <- sample.int(10, size = 30, replace = TRUE)
test_that("recode factor labels", {
  expect_type(categorize(x, "equal_length", n_groups = 3), "double")
  expect_s3_class(
    categorize(
      x,
      "equal_length",
      n_groups = 3,
      labels = c("low", "mid", "high")
    ),
    "factor"
  )
  expect_identical(
    levels(categorize(
      x,
      "equal_length",
      n_groups = 3,
      labels = c("low", "mid", "high")
    )),
    c("low", "mid", "high")
  )
  t1 <- table(categorize(x, "equal_length", n_groups = 3))
  t2 <- table(categorize(
    x,
    "equal_length",
    n_groups = 3,
    labels = c("low", "mid", "high")
  ))
  expect_equal(t1, t2, ignore_attr = TRUE)
})

test_that("recode data frame", {
  data(iris)
  x <- iris
  out <- categorize(
    x,
    split = "median",
    select = c("Sepal.Length", "Sepal.Width")
  )
  expect_s3_class(out, "data.frame")
  expect_identical(
    out$Sepal.Length,
    ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)
  )
  expect_identical(out$Petal.Length, iris$Petal.Length)

  out <- categorize(x, split = "median", select = starts_with("Sepal"))
  expect_s3_class(out, "data.frame")
  expect_identical(
    out$Sepal.Length,
    ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)
  )
  expect_identical(out$Petal.Length, iris$Petal.Length)

  out <- categorize(x, split = "median", select = ~ Sepal.Width + Sepal.Length)
  expect_s3_class(out, "data.frame")
  expect_identical(
    out$Sepal.Length,
    ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)
  )
  expect_identical(out$Petal.Length, iris$Petal.Length)

  out <- categorize(x, split = "median", select = Sepal.Length)
  expect_s3_class(out, "data.frame")
  expect_identical(
    out$Sepal.Length,
    ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)
  )
  expect_identical(out$Petal.Length, iris$Petal.Length)

  expect_warning(
    expect_warning(
      out <- categorize(
        x,
        split = "median",
        select = c("sepal.Length", "sepal.Width"),
        ignore_case = FALSE
      ),
      "not found"
    ),
    "not found"
  )
  expect_identical(out$Sepal.Length, iris$Sepal.Length)

  out <- categorize(
    x,
    split = "median",
    select = starts_with("sepal"),
    ignore_case = TRUE
  )
  expect_s3_class(out, "data.frame")
  expect_identical(
    out$Sepal.Length,
    ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)
  )
  expect_identical(out$Petal.Length, iris$Petal.Length)

  out <- categorize(
    x,
    split = "median",
    select = starts_with("sepal"),
    ignore_case = FALSE
  )
  expect_identical(out$Sepal.Length, iris$Sepal.Length)

  out <- categorize(
    x,
    split = "median",
    select = starts_with("sepal"),
    ignore_case = TRUE,
    append = "_r"
  )
  expect_identical(
    colnames(out),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "Sepal.Length_r",
      "Sepal.Width_r"
    )
  )

  out <- categorize(iris, split = "median", select = starts_with("Sepal"))
  expect_identical(
    out$Sepal.Length,
    c(
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      2,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      2,
      2,
      2,
      1,
      2,
      1,
      2,
      1,
      2,
      1,
      1,
      2,
      2,
      2,
      1,
      2,
      1,
      2,
      2,
      1,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      1,
      1,
      1,
      2,
      2,
      1,
      2,
      2,
      2,
      1,
      1,
      1,
      2,
      2,
      1,
      1,
      1,
      1,
      2,
      1,
      1,
      2,
      2,
      2,
      2,
      2,
      2,
      1,
      2,
      2,
      2,
      2,
      2,
      2,
      1,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      1,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2
    )
  )

  skip_if_not_installed("poorman")

  x <- poorman::group_by(iris, Species)
  out <- categorize(x, split = "median", select = starts_with("Sepal"))
  expect_identical(
    out$Sepal.Length,
    c(
      2,
      1,
      1,
      1,
      2,
      2,
      1,
      2,
      1,
      1,
      2,
      1,
      1,
      1,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      1,
      2,
      1,
      2,
      2,
      2,
      2,
      1,
      1,
      2,
      2,
      2,
      1,
      2,
      2,
      1,
      1,
      2,
      2,
      1,
      1,
      2,
      2,
      1,
      2,
      1,
      2,
      2,
      2,
      2,
      2,
      1,
      2,
      1,
      2,
      1,
      2,
      1,
      1,
      2,
      2,
      2,
      1,
      2,
      1,
      1,
      2,
      1,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      1,
      1,
      1,
      1,
      2,
      1,
      2,
      2,
      2,
      1,
      1,
      1,
      2,
      1,
      1,
      1,
      1,
      1,
      2,
      1,
      1,
      1,
      1,
      2,
      1,
      2,
      2,
      1,
      2,
      2,
      2,
      2,
      1,
      2,
      1,
      1,
      1,
      2,
      2,
      2,
      1,
      2,
      1,
      2,
      1,
      2,
      2,
      1,
      1,
      1,
      2,
      2,
      2,
      1,
      1,
      1,
      2,
      1,
      1,
      1,
      2,
      2,
      2,
      1,
      2,
      2,
      2,
      1,
      2,
      1,
      1
    )
  )
})


test_that("recode all NA", {
  x <- rep(NA, 10)
  expect_message(
    y <- categorize(x),
    "can't be recoded"
  )
  expect_identical(y, x)

  x <- rep(NA_real_, 10)
  expect_message(
    y <- categorize(x),
    "only missing values"
  )
  expect_identical(y, x)
})


test_that("recode numeric", {
  expect_identical(
    categorize(mtcars$hp, split = c(100, 150)),
    c(
      2,
      2,
      1,
      2,
      3,
      2,
      3,
      1,
      1,
      2,
      2,
      3,
      3,
      3,
      3,
      3,
      3,
      1,
      1,
      1,
      1,
      3,
      3,
      3,
      3,
      1,
      1,
      2,
      3,
      3,
      3,
      2
    )
  )
  x <- mtcars$hp
  x[mtcars$hp < 100] <- 1
  x[mtcars$hp >= 100 & mtcars$hp < 150] <- 2
  x[mtcars$hp >= 150] <- 3
  expect_identical(categorize(mtcars$hp, split = c(100, 150)), x)
  expect_identical(categorize(mtcars$hp, split = c(100, 150), lowest = NULL), x)

  expect_identical(
    categorize(mtcars$hp, split = "equal_range", range = 50, lowest = NULL),
    c(
      2,
      2,
      1,
      2,
      3,
      2,
      4,
      1,
      1,
      2,
      2,
      3,
      3,
      3,
      4,
      4,
      4,
      1,
      1,
      1,
      1,
      2,
      2,
      4,
      3,
      1,
      1,
      2,
      5,
      3,
      6,
      2
    )
  )
})

# select helpers ------------------------------
test_that("categorize regex", {
  expect_identical(
    categorize(mtcars, select = "pg", regex = TRUE),
    categorize(mtcars, select = "mpg")
  )
})


# labelling ranges ------------------------------
test_that("categorize labelling ranged", {
  data(mtcars)
  expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5))
  expect_snapshot(categorize(
    mtcars$mpg,
    "equal_length",
    n_groups = 5,
    labels = "range"
  ))
  expect_snapshot(categorize(
    mtcars$mpg,
    "equal_length",
    n_groups = 5,
    labels = "observed"
  ))
})

test_that("categorize breaks", {
  data(mtcars)
  expect_snapshot(categorize(
    mtcars$mpg,
    "equal_length",
    n_groups = 5,
    labels = "range",
    breaks = "inclusive"
  ))
  expect_error(
    categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"),
    regex = "should be one of"
  )
})


================================================
FILE: tests/testthat/test-center.R
================================================
test_that("center", {
  z <- center(iris$Sepal.Width)
  expect_equal(
    as.vector(z),
    iris$Sepal.Width - mean(iris$Sepal.Width),
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
})

test_that("center, robust", {
  z <- center(mtcars$hp, robust = TRUE)
  expect_equal(
    as.vector(z),
    mtcars$hp - median(mtcars$hp),
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
})

test_that("center, select", {
  z <- center(iris, select = "Sepal.Width")
  expect_equal(
    as.vector(z$Sepal.Width),
    iris$Sepal.Width - mean(iris$Sepal.Width),
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
  # check class attributes
  expect_identical(
    vapply(z, class, character(1)),
    c(
      Sepal.Length = "numeric",
      Sepal.Width = "numeric",
      Petal.Length = "numeric",
      Petal.Width = "numeric",
      Species = "factor"
    )
  )
})

test_that("center, factors", {
  z <- center(iris, select = "Species")
  expect_identical(z$Species, iris$Species)
})

test_that("center, force factors", {
  z <- center(iris, select = "Species", force = TRUE)
  v <- as.numeric(iris$Species)
  expect_equal(
    as.vector(z$Species),
    v - median(v),
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
})

test_that("center, all na", {
  z <- center(c(NA, NA, NA))
  expect_identical(z, c(NA, NA, NA))
})

test_that("center, with Inf", {
  z <- center(c(2, 4, Inf))
  expect_equal(z, c(-1, 1, NA), ignore_attr = TRUE)
})

test_that("center, all NA or Inf", {
  z <- center(c(NA, -Inf, Inf))
  expect_equal(z, c(NA, -Inf, Inf), ignore_attr = TRUE)
})

test_that("center works correctly with only one value", {
  expect_message(
    x <- center(100), # nolint
    "will be set to 0"
  )
  expect_equal(x, 0, ignore_attr = TRUE)

  expect_equal(center(100, center = 1), 99, ignore_attr = TRUE)
  expect_equal(
    center(100, reference = mtcars$mpg),
    100 - mean(mtcars$mpg),
    ignore_attr = TRUE
  )
})


# with grouped data -------------------------------------------

test_that("center (grouped data)", {
  skip_if_not_installed("poorman")

  datawizard <- iris %>%
    poorman::group_by(Species) %>%
    center(Sepal.Width) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  manual <- iris %>%
    poorman::group_by(Species) %>%
    poorman::mutate(Sepal.Width = Sepal.Width - mean(Sepal.Width)) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  expect_identical(datawizard, manual)
})

test_that("center (grouped data), with force = TRUE", {
  skip_if_not_installed("poorman")

  datawizard_c <- iris %>%
    poorman::group_by(Species) %>%
    center(force = TRUE) %>%
    poorman::ungroup()

  manual_c <- iris %>%
    poorman::group_by(Species) %>%
    poorman::mutate(
      Sepal.Length = Sepal.Length - mean(Sepal.Length),
      Sepal.Width = Sepal.Width - mean(Sepal.Width),
      Petal.Length = Petal.Length - mean(Petal.Length),
      Petal.Width = Petal.Width - mean(Petal.Width)
    ) %>%
    poorman::ungroup()

  expect_equal(datawizard_c, manual_c, ignore_attr = TRUE)
})

test_that("center, robust (grouped data)", {
  skip_if_not_installed("poorman")

  datawizard <- iris %>%
    poorman::group_by(Species) %>%
    center(Sepal.Width, robust = TRUE) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  manual <- iris %>%
    poorman::group_by(Species) %>%
    poorman::mutate(Sepal.Width = Sepal.Width - median(Sepal.Width)) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  expect_identical(datawizard, manual)
})

test_that("center, select (grouped data)", {
  skip_if_not_installed("poorman")

  datawizard <- iris %>%
    poorman::group_by(Species) %>%
    center(select = starts_with("Sepal\\.W")) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  manual <- iris %>%
    poorman::group_by(Species) %>%
    poorman::mutate(Sepal.Width = Sepal.Width - mean(Sepal.Width)) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  expect_identical(datawizard, manual)
})

test_that("center, factors (grouped data)", {
  skip_if_not_installed("poorman")

  datawizard <- iris %>%
    poorman::group_by(Species) %>%
    center(select = "Species") %>%
    poorman::ungroup() %>%
    poorman::pull(Species)

  manual <- poorman::pull(iris, Species)

  expect_identical(datawizard, manual)
})

# select helpers ------------------------------
test_that("center regex", {
  expect_equal(
    center(mtcars, select = "pg", regex = TRUE)$mpg,
    center(mtcars$mpg),
    ignore_attr = TRUE
  )
  expect_equal(
    center(mtcars, select = "pg$", regex = TRUE)$mpg,
    center(mtcars$mpg),
    ignore_attr = TRUE
  )
})

# no matches ------------------------------
test_that("center no match", {
  data(iris)
  expect_warning(center(iris, "Sepla.Length"))
})


================================================
FILE: tests/testthat/test-coef_var.R
================================================
test_that("coefficient of variation works", {
  expect_equal(coef_var(1:10), 0.5504818826)
  expect_equal(coef_var(1:10, method = "unbiased"), 0.5552700246)
  expect_equal(coef_var(c(1:10, 100), method = "median_mad"), 0.7413)
  expect_equal(coef_var(c(1:10, 100), method = "qcd"), 0.4166666667)
  expect_identical(coef_var(mu = 10, sigma = 20), 2)
  expect_equal(
    coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30),
    2.250614348
  )
  expect_equal(distribution_coef_var(1:10), 0.5504818826)
})


test_that("coef_var returns NULL if can't compute", {
  expect_warning(
    {
      x <- coef_var(as.Date("2022-10-31"))
    },
    "Can't compute"
  )
  expect_null(x)
})


test_that("coef_var: argument 'remove_na' works", {
  expect_identical(coef_var(c(1:10, NA)), NA_real_)

  expect_identical(
    coef_var(1:10),
    coef_var(c(1:10, NA), remove_na = TRUE)
  )
})

test_that("coef_var: method 'unbiased' needs argument 'n' when sigma and mu are provided", {
  expect_error(
    coef_var(1:10, method = "unbiased", mu = 10, sigma = 20),
    "A value for `n` must be provided"
  )
})


================================================
FILE: tests/testthat/test-contr.deviation.R
================================================
test_that("contr.deviation", {
  c.treatment <- solve(cbind(Intercept = 1, contr.treatment(3)))
  c.sum <- solve(cbind(Intercept = 1, contr.sum(3)))
  c.deviation <- solve(cbind(Intercept = 1, contr.deviation(3)))

  expect_equal(c.deviation[1, ], c.sum[1, ])
  expect_equal(c.deviation[-1, ], c.treatment[-1, ])
})

test_that("contr.deviation | snapshot", {
  skip_if_not_installed("base", "4.3")
  # IF THIS TESTS FAILS, UPDATE THE EXAMPLE

  data("mtcars")
  mtcars <- data_modify(mtcars, cyl = factor(cyl))
  mtcars <- data_modify(mtcars, am = factor(am))
  mtcars <- data_arrange(mtcars, select = c("cyl", "am"))

  contrasts(mtcars$cyl) <- contr.deviation
  c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
  expect_snapshot(solve(c.deviation))

  mm <- unique(model.matrix(~ cyl * am, data = mtcars))
  rownames(mm) <- c(
    "cyl4.am0",
    "cyl4.am1",
    "cyl6.am0",
    "cyl6.am1",
    "cyl8.am0",
    "cyl8.am1"
  )

  expect_snapshot(solve(mm))
})


================================================
FILE: tests/testthat/test-convert_na_to.R
================================================
# numeric --------------------------

test_that("convert_na_to - numeric: works", {
  expect_identical(
    convert_na_to(c(1, 2, 3, NA), replacement = 4),
    as.double(1:4)
  )

  expect_warning(
    expect_identical(
      convert_na_to(c(1, 2, 3, NA), replacement = NULL),
      c(1, 2, 3, NA)
    ),
    "needs to be a numeric"
  )
})

test_that("convert_na_to - numeric: arg 'replacement' can only be numeric", {
  expect_warning(
    convert_na_to(c(1, 2, 3, NA), replacement = "a"),
    regexp = "`replacement` needs to be a numeric vector."
  )
  expect_warning(
    convert_na_to(c(1, 2, 3, NA), replacement = factor(8)),
    regexp = "`replacement` needs to be a numeric vector."
  )
})

test_that("convert_na_to - numeric: arg 'replacement' must be of length one", {
  expect_warning(
    convert_na_to(c(1, 2, 3, NA), replacement = c(1, 2)),
    regexp = "`replacement` needs to be of length one."
  )
})


test_that("convert_na_to - numeric: returns original vector if 'replacement' not good", {
  expect_warning(
    expect_identical(
      convert_na_to(c(1, 2, 3, NA), replacement = "a"),
      c(1, 2, 3, NA)
    ),
    "needs to be a numeric"
  )
  expect_warning(
    expect_identical(
      convert_na_to(c(1, 2, 3, NA), replacement = factor(8)),
      c(1, 2, 3, NA)
    ),
    "needs to be a numeric"
  )
})


# character --------------------------

test_that("convert_na_to - character: works", {
  expect_identical(
    convert_na_to(c("a", "b", "c", NA), replacement = "d"),
    c("a", "b", "c", "d")
  )
  expect_warning(
    expect_identical(
      convert_na_to(c("a", "b", "c", NA), replacement = NULL),
      c("a", "b", "c", NA)
    ),
    "needs to be a character"
  )
})

test_that("convert_na_to - character: arg 'replacement' can only be character", {
  expect_warning(
    convert_na_to(c("a", "b", "c", NA), replacement = mtcars),
    regexp = "`replacement` needs to be a character or numeric vector."
  )
  expect_warning(
    convert_na_to(c("a", "b", "c", NA), replacement = factor(8)),
    regexp = "`replacement` needs to be a character or numeric vector."
  )
})

test_that("convert_na_to - numeric: arg 'replacement' must be of length one", {
  expect_warning(
    convert_na_to(c("a", "b", "c", NA), replacement = c("d", "e")),
    regexp = "`replacement` needs to be of length one."
  )
})

test_that("convert_na_to - character: returns original vector if 'replacement' not good", {
  expect_identical(
    convert_na_to(c("a", "b", "c", NA), replacement = 1),
    c("a", "b", "c", 1)
  )
  expect_warning(
    expect_identical(
      convert_na_to(c("a", "b", "c", NA), replacement = mtcars),
      c("a", "b", "c", NA)
    ),
    "needs to be a character or numeric vector"
  )
  expect_warning(
    expect_identical(
      convert_na_to(c("a", "b", "c", NA), replacement = factor(8)),
      c("a", "b", "c", NA)
    ),
    "needs to be a character or numeric vector"
  )
})


# factor --------------------------

test_that("convert_na_to - factor: works when 'replacement' is numeric ", {
  x <- convert_na_to(factor(c(1, 2, 3, NA)), replacement = 4)
  expect_identical(
    x,
    factor(1:4)
  )
  expect_identical(levels(x), as.character(1:4))
  expect_warning(
    expect_identical(
      convert_na_to(factor(c(1, 2, 3, NA)), replacement = NULL),
      factor(c(1, 2, 3, NA))
    ),
    "needs to be of length one"
  )
})

test_that("convert_na_to - factor: works when 'replacement' is character", {
  x <- convert_na_to(factor(c(1, 2, 3, NA)), replacement = "d")
  expect_identical(
    x,
    factor(c(1:3, "d"))
  )
  expect_identical(levels(x), as.character(c(1:3, "d")))
})


# data frame --------------------------

test <- data.frame(
  x = c(1, 2, NA),
  y = c("a", "b", NA),
  z = factor(c("a", "b", NA)),
  x2 = c(4, 5, NA),
  stringsAsFactors = FALSE
)

test_that("convert_na_to - data frame: works with replace_* args", {
  expect_identical(
    convert_na_to(test, replace_num = 4, replace_char = "e", replace_fac = 8),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", "e"),
      z = factor(c("a", "b", "8"), levels = c("a", "b", "8")),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )
})

test_that("convert_na_to - data frame: only modifies numeric if only numeric specified", {
  expect_identical(
    convert_na_to(test, replace_num = 4),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )
})

test_that("convert_na_to - data frame: only modifies character if only character specified", {
  expect_identical(
    convert_na_to(test, replace_char = "e"),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", "e"),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, NA),
      stringsAsFactors = FALSE
    )
  )
})

test_that("convert_na_to - data frame: only modifies factor if only factor specified", {
  expect_identical(
    convert_na_to(test, replace_fac = 8),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", NA),
      z = factor(c("a", "b", "8"), levels = c("a", "b", "8")),
      x2 = c(4, 5, NA),
      stringsAsFactors = FALSE
    )
  )
})

test_that("convert_na_to - data frame: arg 'select' works", {
  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = "x"
    ),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, NA),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = ~x
    ),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, NA),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = starts_with("x")
    ),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = ends_with("2")
    ),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = contains("x")
    ),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = 1:3
    ),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", "e"),
      z = factor(c("a", "b", "8"), levels = c("a", "b", "8")),
      x2 = c(4, 5, NA),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = regex("2$")
    ),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )
})


test_that("convert_na_to - data frame: arg 'exclude' works", {
  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      exclude = "x"
    ),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", "e"),
      z = factor(c("a", "b", "8"), levels = c("a", "b", "8")),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      exclude = ~x
    ),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", "e"),
      z = factor(c("a", "b", "8"), levels = c("a", "b", "8")),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(
      test,
      replace_num = 4,
      replace_char = "e",
      replace_fac = 8,
      select = starts_with("x"),
      exclude = ~x
    ),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )
})

test_that("convert_na_to - data frame: works when arg 'select' is a list", {
  # numeric
  expect_identical(
    convert_na_to(test, replace_num = 4, select = list(x = 0)),
    data.frame(
      x = c(1, 2, 0),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )

  # character
  expect_identical(
    convert_na_to(test, replace_char = "e", select = list(y = "d")),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", "d"),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, NA),
      stringsAsFactors = FALSE
    )
  )

  # only named list can override replace_*
  expect_identical(
    convert_na_to(test, replace_num = 4, select = list(0)),
    data.frame(
      x = c(1, 2, 4),
      y = c("a", "b", NA),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, 4),
      stringsAsFactors = FALSE
    )
  )

  expect_identical(
    convert_na_to(test, replace_char = "e", select = list("d")),
    data.frame(
      x = c(1, 2, NA),
      y = c("a", "b", "e"),
      z = factor(c("a", "b", NA)),
      x2 = c(4, 5, NA),
      stringsAsFactors = FALSE
    )
  )

  # no problem if put a variable that doesn't exist in list
  expect_warning(
    expect_identical(
      convert_na_to(test, replace_num = 4, select = list(x = 0, foo = 5)),
      data.frame(
        x = c(1, 2, 0),
        y = c("a", "b", NA),
        z = factor(c("a", "b", NA)),
        x2 = c(4, 5, 4),
        stringsAsFactors = FALSE
      )
    ),
    "not found"
  )
})


# preserve attributes --------------------------

test_that("data_rename preserves attributes", {
  skip_if_not_installed("parameters")

  m <- lm(Sepal.Length ~ Species, data = iris)
  out <- parameters::parameters(m)
  a1 <- attributes(out)

  out2 <- convert_na_to(out, replace_num = 5)
  a2 <- attributes(out2)

  expect_identical(names(a1)[1:28], names(a2)[1:28])
})

# select helpers ------------------------------
test_that("convert_na_to regex", {
  expect_identical(
    convert_na_to(airquality, replacement = 0, select = "zone", regex = TRUE),
    convert_na_to(airquality, replacement = 0, select = "Ozone")
  )
  expect_identical(
    convert_na_to(airquality, replacement = 0, select = "zone$", regex = TRUE),
    convert_na_to(airquality, replacement = 0, select = "Ozone")
  )
})


================================================
FILE: tests/testthat/test-convert_to_na.R
================================================
data(iris)

test_that("convert_to_na-factor", {
  x <- convert_to_na(iris$Species, na = "versicolor")
  expect_identical(sum(is.na(x)), 50L)

  x <- convert_to_na(iris$Species, na = list(2, "versicolor"))
  expect_identical(sum(is.na(x)), 50L)

  x <- convert_to_na(
    iris$Species,
    na = list(2, "versicolor"),
    drop_levels = FALSE
  )
  expect_identical(levels(x), c("setosa", "versicolor", "virginica"))
  expect_identical(as.vector(table(x)), c(50L, 0L, 50L))

  x <- convert_to_na(
    iris$Species,
    na = list(2, "versicolor"),
    drop_levels = TRUE
  )
  expect_identical(levels(x), c("setosa", "virginica"))
  expect_identical(as.vector(table(x)), c(50L, 50L))

  expect_message(
    x <- convert_to_na(iris$Species, na = 2), # nolint
    "for a factor or character variable"
  )
  expect_identical(sum(is.na(x)), 0L)
})

test_that("convert_to_na-numeric", {
  x <- convert_to_na(iris$Sepal.Length, na = 5)
  expect_identical(sum(is.na(x)), sum(iris$Sepal.Length == 5))

  x <- convert_to_na(iris$Sepal.Length, na = list(5, "versicolor"))
  expect_identical(sum(is.na(x)), 10L)

  x <- convert_to_na(iris$Sepal.Width, na = "a", verbose = FALSE)
  expect_message(
    convert_to_na(iris$Sepal.Width, na = "a"),
    "needs to be a numeric vector"
  )
  expect_identical(sum(is.na(x)), 0L)
})

test_that("convert_to_na-df", {
  expect_message(
    x <- convert_to_na(iris, na = 5), # nolint
    "needs to be a character vector"
  )
  expect_identical(
    sum(is.na(x)),
    sum(vapply(iris, function(i) sum(i == 5), FUN.VALUE = integer(1L)))
  )

  x <- convert_to_na(iris, na = list(5, "versicolor"))
  expect_identical(sum(is.na(x)), 64L)

  data(iris)
  expect_message(
    x <- convert_to_na(iris, na = 3), # nolint
    "needs to be a character vector"
  )
  expect_identical(
    sum(is.na(x)),
    sum(vapply(
      iris,
      function(i) {
        if (is.numeric(i)) {
          sum(i == 3)
        } else {
          0L
        }
      },
      FUN.VALUE = integer(1L)
    ))
  )

  x <- convert_to_na(iris, na = list(3, "3"))
  expect_identical(sum(is.na(x)), 27L)
})


test_that("convert_to_na other classes", {
  d <- data.frame(
    a = 1:5,
    b = factor(letters[1:5]),
    c = as.Date(c(
      "2022-03-22",
      "2022-01-02",
      "2022-02-02",
      "2021-04-02",
      "2020-01-19"
    )),
    d = c(TRUE, TRUE, FALSE, FALSE, TRUE),
    e = as.complex(1:5)
  )

  x <- convert_to_na(d$a, na = 3)
  expect_equal(x, c(1, 2, NA, 4, 5), tolerance = 1e-3, ignore_attr = TRUE)
  expect_message(
    x <- convert_to_na(d$a, na = "c"),
    "needs to be a numeric vector"
  ) # nolint
  expect_equal(x, 1:5, tolerance = 1e-3, ignore_attr = TRUE)

  x <- convert_to_na(d$b, na = "c")
  expect_equal(
    x,
    structure(
      c(1L, 2L, NA, 4L, 5L),
      .Label = c("a", "b", "c", "d", "e"),
      class = "factor"
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  x <- convert_to_na(d$b, na = "c", drop_levels = TRUE)
  expect_equal(
    x,
    structure(
      c(1L, 2L, NA, 3L, 4L),
      .Label = c("a", "b", "d", "e"),
      class = "factor"
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  expect_message(
    convert_to_na(d$c, na = "2022-03-22"),
    "of class 'Date'"
  )
  x <- convert_to_na(d$c, na = as.Date("2022-03-22"))
  expect_equal(
    x,
    structure(c(NA, 18994, 19025, 18719, 18280), class = "Date"),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  x <- convert_to_na(d$d, na = TRUE)
  expect_equal(
    x,
    c(NA, NA, FALSE, FALSE, NA),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_message(
    x <- convert_to_na(d$e, na = as.complex(4)), # nolint
    "variables of class `complex`"
  )
  expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE)

  out <- data.frame(
    a = c(1, 2, NA, 4, 5),
    b = factor(c("a", "b", NA, "d", "e"), levels = letters[1:5]),
    c = as.Date(c(
      "2022-03-22",
      "2022-01-02",
      "2022-02-02",
      "2021-04-02",
      "2020-01-19"
    )),
    d = c(NA, NA, FALSE, FALSE, NA),
    e = as.complex(1:5)
  )
  convert_to_na(d, na = list(3, "c", TRUE, "2022-01-02"), verbose = FALSE)
  x <- convert_to_na(
    d,
    na = list(3, "c", TRUE, as.Date("2022-01-02")),
    verbose = FALSE
  )
  expect_equal(x, out, ignore_attr = TRUE, tolerance = 1e-3)
})

# select helpers ------------------------------
test_that("convert_to_na regex", {
  expect_identical(
    convert_to_na(mtcars, na = 4, select = "arb", regex = TRUE),
    convert_to_na(mtcars, na = 4, select = "carb")
  )
  expect_identical(
    convert_to_na(mtcars, na = 4, select = "arb$", regex = TRUE),
    convert_to_na(mtcars, na = 4, select = "carb")
  )
})


================================================
FILE: tests/testthat/test-data_addprefix.R
================================================
test_that("data_addprefix works as expected", {
  expect_equal(
    names(head(data_addprefix(iris, "NEW_"))),
    c(
      "NEW_Sepal.Length",
      "NEW_Sepal.Width",
      "NEW_Petal.Length",
      "NEW_Petal.Width",
      "NEW_Species"
    )
  )

  expect_equal(
    names(head(data_addsuffix(iris, "_OLD"))),
    c(
      "Sepal.Length_OLD",
      "Sepal.Width_OLD",
      "Petal.Length_OLD",
      "Petal.Width_OLD",
      "Species_OLD"
    )
  )

  expect_equal(
    names(head(data_addprefix(iris, "NEW_", select = starts_with("Sepal")))),
    c(
      "NEW_Sepal.Length",
      "NEW_Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species"
    )
  )

  expect_equal(
    names(head(data_addsuffix(iris, "_OLD", select = starts_with("Petal")))),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length_OLD",
      "Petal.Width_OLD",
      "Species"
    )
  )
})

# select helpers ------------------------------
test_that("data_addprefix regex", {
  expect_equal(
    data_addsuffix(mtcars, "_regex", select = "pg", regex = TRUE),
    data_addsuffix(mtcars, "_regex", select = "mpg")
  )
  expect_equal(
    data_addsuffix(mtcars, select = "pg$", "_regex", regex = TRUE),
    data_addsuffix(mtcars, select = "mpg", "_regex")
  )
})


================================================
FILE: tests/testthat/test-data_arrange.R
================================================
df <- head(mtcars)
df$character <- c("a", "b", "b", "c", "c", "a")

test_that("data_arrange works with one numeric column", {
  skip_if_not_installed("poorman")
  expect_identical(
    poorman::arrange(df, carb),
    data_arrange(df, "carb")
  )
  expect_identical(
    poorman::arrange(df, -carb),
    data_arrange(df, "-carb")
  )
})

test_that("data_arrange works with one character column", {
  skip_if_not_installed("poorman")
  expect_identical(
    poorman::arrange(df, character),
    data_arrange(df, "character")
  )
  expect_identical(
    poorman::arrange(df, desc(character)),
    data_arrange(df, "-character")
  )
})

test_that("data_arrange works with several columns", {
  skip_if_not_installed("poorman")
  expect_identical(
    poorman::arrange(df, carb, gear),
    data_arrange(df, c("carb", "gear"))
  )
  expect_identical(
    poorman::arrange(df, -carb, gear),
    data_arrange(df, c("-carb", "gear"))
  )
  expect_identical(
    poorman::arrange(df, -carb, desc(character)),
    data_arrange(df, c("-carb", "-character"))
  )
})

test_that("data_arrange works without columns", {
  expect_identical(data_arrange(df), df)
})

test_that("data_arrange ignores wrong names if safe = TRUE", {
  expect_warning(
    expect_identical(data_arrange(df, "foo"), df),
    regexp = "don't exist"
  )

  expect_warning(
    expect_identical(
      data_arrange(df, c("gear", "foo")),
      data_arrange(df, "gear")
    ),
    regexp = "don't exist"
  )
})

test_that("data_arrange errors if safe = FALSE", {
  expect_error(data_arrange(df, "foo", safe = FALSE))
})

test_that("data_arrange errors if not coercable to data frame", {
  expect_error(data_arrange(list(a = 1:5, b = letters[1:3]), select = "b"))
  expect_equal(
    data_arrange(list(a = 1:5, b = letters[5:1]), select = "b"),
    structure(
      list(a = 5:1, b = c("a", "b", "c", "d", "e")),
      row.names = 5:1,
      class = "data.frame"
    ),
    ignore_attr = TRUE
  )
})

test_that("data_arrange works with grouped df", {
  set.seed(123)
  x <- mtcars[
    sample(seq_len(nrow(mtcars)), 10, replace = TRUE),
    c("cyl", "mpg")
  ]
  g <- data_group(x, cyl)

  expected <- data.frame(
    cyl = c(4, 4, 4, 6, 6, 8, 8, 8, 8, 8),
    mpg = c(22.8, 30.4, 32.4, 17.8, 19.2, 10.4, 15, 15.2, 15.5, 18.7)
  )
  class(expected) <- c("grouped_df", "data.frame")
  rownames(expected) <- c(
    "Datsun 710",
    "Honda Civic",
    "Fiat 128",
    "Merc 280C",
    "Merc 280",
    "Cadillac Fleetwood",
    "Maserati Bora",
    "Merc 450SLC",
    "Dodge Challenger",
    "Hornet Sportabout"
  )
  attributes(expected)$groups <- attributes(g)$groups

  expect_identical(
    data_arrange(g, "mpg"),
    expected,
    ignore_attr = TRUE
  )
})

test_that("data_arrange works with NA", {
  # without groups

  tmp <- data.frame(
    a = c(1, 2, 2, 8, 1, 3),
    b = c(1, NA, 3, 3, NA, 5)
  )

  expect_identical(
    data_arrange(tmp, "a"),
    data.frame(
      a = c(1, 1, 2, 2, 3, 8),
      b = c(1, NA, NA, 3, 5, 3)
    )
  )

  # with groups

  g <- data_group(tmp, "b")

  expected <- data.frame(
    a = c(1, 2, 8, 3, 1, 2),
    b = c(1, 3, 3, 5, NA, NA)
  )
  class(expected) <- c("grouped_df", "data.frame")
  attributes(expected)$groups <- attributes(g)$groups

  expect_identical(
    data_arrange(g, "a"),
    expected,
    ignore_attr = TRUE
  )
})

test_that("data_arrange works one-column data frames (and does not drop dimensions)", {
  data(mtcars)
  expect_s3_class(data_arrange(mtcars["gear"], select = "gear"), "data.frame")
  expect_s3_class(
    data_arrange(mtcars[c("gear", "cyl")], select = "gear"),
    "data.frame"
  )
})


================================================
FILE: tests/testthat/test-data_codebook.R
================================================
data(efc)
data(iris)

test_that("data_codebook iris", {
  expect_snapshot(data_codebook(iris))
})


test_that("data_codebook iris, reordered", {
  expect_snapshot(data_codebook(iris[c(1, 2, 5, 3, 4)]))
})


test_that("data_codebook NaN and Inf", {
  d <- data.frame(
    x = c(1, 4, NA, Inf, 4, NaN, 2, 1, 1)
  )
  expect_snapshot(data_codebook(d))

  set.seed(123)
  d <- data.frame(
    x = c(sample.int(15, 100, TRUE), Inf, Inf)
  )
  expect_snapshot(data_codebook(d))
  expect_snapshot(data_codebook(d, range_at = 100))
  expect_snapshot(data_codebook(d, range_at = 100, max_values = 4))
})


test_that("data_codebook, tinytable", {
  skip_if_not_installed("tinytable")
  d <- data.frame(
    x = c(1, 4, NA, Inf, 4, NaN, 2, 1, 1)
  )
  expect_snapshot(display(data_codebook(d), format = "tt"))

  set.seed(123)
  d <- data.frame(
    x = c(sample.int(15, 100, TRUE), Inf, Inf)
  )
  expect_snapshot(display(data_codebook(d), format = "tt"))
  expect_snapshot(display(data_codebook(d, range_at = 100), format = "tt"))
  expect_snapshot(display(
    data_codebook(d, range_at = 100, max_values = 4),
    format = "tt"
  ))

  data(iris)
  expect_snapshot(display(data_codebook(iris[c(1, 2, 5, 3, 4)]), format = "tt"))
})


test_that("data_codebook iris, select", {
  expect_snapshot(data_codebook(iris, select = starts_with("Sepal")))
})


test_that("data_codebook iris, select, ID", {
  expect_snapshot(data_codebook(iris, select = starts_with("Petal")))
})


test_that("data_codebook efc", {
  expect_snapshot(print(data_codebook(efc), table_width = Inf))
  expect_snapshot(print(
    data_codebook(efc),
    table_width = "auto",
    remove_duplicates = FALSE
  ))
  expect_snapshot(print(
    data_codebook(efc),
    table_width = "auto",
    remove_duplicates = TRUE
  ))
})


test_that("data_codebook efc, variable_label_width", {
  out <- data_codebook(efc, variable_label_width = 30)
  expect_snapshot(print(out, table_width = Inf))
  expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE))
  expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE))
})


test_that("data_codebook efc, value_label_width", {
  out <- data_codebook(efc, variable_label_width = 30, value_label_width = 15)
  expect_snapshot(print(out, table_width = Inf))
  expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE))
  expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE))
})


test_that("data_codebook truncated data", {
  set.seed(123)
  d <- data.frame(
    a = sample.int(15, 100, TRUE),
    b = sample(letters[1:18], 100, TRUE),
    stringsAsFactors = FALSE
  )
  expect_snapshot(data_codebook(d, max_values = 5))
})


test_that("data_codebook mixed numeric lengths", {
  set.seed(123)
  d <- data.frame(
    a = sample.int(4, 100, TRUE),
    b = sample(5:15, 100, TRUE),
    stringsAsFactors = FALSE
  )
  expect_snapshot(data_codebook(d))
})

test_that("data_codebook mixed range_at", {
  set.seed(123)
  d <- data.frame(
    a = sample.int(4, 100, TRUE),
    b = sample(5:15, 100, TRUE),
    stringsAsFactors = FALSE
  )
  expect_snapshot(data_codebook(d, range_at = 3))
})


test_that("data_codebook logicals", {
  set.seed(123)
  d <- data.frame(
    a = sample.int(15, 100, TRUE),
    b = sample(letters[1:3], 100, TRUE),
    c = sample(c(TRUE, FALSE), 100, TRUE),
    stringsAsFactors = FALSE
  )
  expect_snapshot(data_codebook(d))
})


test_that("data_codebook labelled data exceptions", {
  set.seed(123)

  f1 <- sample.int(5, 100, TRUE)
  f1[f1 == 4] <- NA
  attr(f1, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five"))

  f2 <- sample.int(5, 100, TRUE)
  attr(f2, "labels") <- setNames(c(1:3, 5), c("One", "Two", "Three", "Five"))

  f3 <- sample.int(5, 100, TRUE)
  attr(f3, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five"))

  d <- data.frame(f1, f2, f3)
  expect_snapshot(data_codebook(d))
})


test_that("data_codebook labelled data factors", {
  set.seed(123)

  f1 <- factor(sample(c("c", "b", "a"), 100, TRUE))
  attr(f1, "labels") <- setNames(c("c", "b", "a"), c("Cee", "Bee", "A"))

  f2 <- factor(sample(c("a", "b", "c"), 100, TRUE))
  attr(f2, "labels") <- setNames(c("c", "b", "a"), c("Cee", "Bee", "A"))

  f3 <- factor(sample(c("c", "b", "a"), 100, TRUE))
  attr(f3, "labels") <- setNames(c("a", "c", "b"), c("A", "Cee", "Bee"))

  d <- data.frame(f1, f2, f3)
  expect_snapshot(data_codebook(d))
})


test_that("data_codebook works with numbers < 1", {
  d <- data.frame(
    a = c(1, 1, 2, 2, 3, 3),
    b = c(0, 0, 0, 1, 1, 2)
  )
  expect_snapshot(data_codebook(d))
})


test_that("data_codebook, big marks", {
  set.seed(123)
  f1 <- factor(sample(c("c", "b", "a"), 1e6, TRUE))
  f2 <- factor(sample.int(3, 1e6, TRUE))
  d <- data.frame(f1, f2)
  expect_snapshot(data_codebook(d))
})


test_that("data_codebook, tagged NA", {
  skip_if_not_installed("haven")
  x <- haven::labelled(
    x = c(
      1:3,
      haven::tagged_na("a", "c", "z"),
      4:1,
      haven::tagged_na("a", "a", "c"),
      1:3,
      haven::tagged_na("z", "c", "c"),
      1:4,
      haven::tagged_na("a", "c", "z")
    ),
    labels = c(
      Agreement = 1,
      Disagreement = 4,
      First = haven::tagged_na("c"),
      Refused = haven::tagged_na("a"),
      `Not home` = haven::tagged_na("z")
    )
  )
  expect_snapshot(data_codebook(data.frame(x)))

  x <- haven::labelled(
    x = c(
      1:3,
      haven::tagged_na("a", "c"),
      4:1,
      haven::tagged_na("a", "a", "c"),
      1:3,
      haven::tagged_na("c", "c"),
      1:4,
      haven::tagged_na("a", "c")
    ),
    labels = c(
      Agreement = 1,
      Disagreement = 4,
      First = haven::tagged_na("c"),
      Refused = haven::tagged_na("a"),
      `Not home` = haven::tagged_na("z")
    )
  )
  expect_snapshot(data_codebook(data.frame(x)))
})


test_that("data_codebook, negative label values #334", {
  skip_if_not_installed("haven")
  x1 <- haven::labelled(
    x = 1:4,
    labels = c(Agreement = 1, Disagreement = 4, Missing = -9)
  )
  x2 <- haven::labelled(
    x = c(1:3, -9),
    labels = c(Agreement = 1, Disagreement = 4, Missing = -9)
  )
  expect_snapshot(data_codebook(data.frame(x1, x2)))
})


test_that("data_codebook, informative warning if no match", {
  data(iris)
  expect_warning(
    data_codebook(iris, select = starts_with("abc")),
    regex = "No column names that matched"
  )
})


================================================
FILE: tests/testthat/test-data_duplicated.R
================================================
# Preparations

df1 <- data.frame(
  id = c(1, 2, 3, 1, 3),
  year = c(2022, 2022, 2022, 2022, 2000),
  item1 = c(NA, 1, 1, 2, 3),
  item2 = c(NA, 1, 1, 2, 3),
  item3 = c(NA, 1, 1, 2, 3)
)

expected1 <- data.frame(
  Row = c(1, 4, 3, 5),
  id = c(1, 1, 3, 3),
  year = c(2022, 2022, 2022, 2000),
  item1 = c(NA, 2, 1, 3),
  item2 = c(NA, 2, 1, 3),
  item3 = c(NA, 2, 1, 3),
  count_na = c(3, 0, 0, 0)
)

expected2 <- data.frame(
  Row = c(1, 4),
  id = c(1, 1),
  year = c(2022, 2022),
  item1 = c(NA, 2),
  item2 = c(NA, 2),
  item3 = c(NA, 2),
  count_na = c(3, 0)
)

# Testing

test_that("data_duplicated basic", {
  x <- data_duplicated(df1, select = "id")
  rownames(x) <- NULL
  expect_equal(
    x,
    expected1
  )
})

test_that("data_duplicated unquoted", {
  x <- data_duplicated(df1, select = id)
  rownames(x) <- NULL
  expect_equal(
    x,
    expected1
  )
})

test_that("data_duplicated vector", {
  x <- data_duplicated(df1, select = 1)
  rownames(x) <- NULL
  expect_equal(
    x,
    expected1
  )
})

test_that("data_duplicated select-helper", {
  x <- data_duplicated(df1, select = contains("id"))
  rownames(x) <- NULL
  expect_equal(
    x,
    expected1
  )
})

test_that("data_duplicated multiple IDs", {
  x <- data_duplicated(df1, select = c("id", "year"))
  rownames(x) <- NULL
  expect_equal(
    x,
    expected2
  )
})

test_that("data_duplicated multiple IDs formula", {
  x <- data_duplicated(df1, select = ~ id + year)
  rownames(x) <- NULL
  expect_equal(
    x,
    expected2
  )
})

test_that("data_duplicated multiple IDs vector", {
  x <- data_duplicated(df1, select = 1:2)
  rownames(x) <- NULL
  expect_equal(
    x,
    expected2
  )
})

test_that("data_unique works with groups", {
  df <- data.frame(
    g = c(1, 1, 2, 2),
    x = c(1, 1, 2, 1)
  )
  df <- data_group(df, "g")

  expected <- data.frame(
    Row = 1:2,
    g = c(1, 1),
    x = c(1, 1),
    count_na = c(0, 0)
  )
  expected <- data_group(expected, "g")

  expect_identical(data_duplicated(df, "x"), expected, ignore_attr = TRUE)
})


================================================
FILE: tests/testthat/test-data_extract.R
================================================
data(efc)

test_that("data_extract works with select-length > 1", {
  # works with multiple selects
  expect_s3_class(
    data_extract(efc, select = c("e42dep", "c172code")),
    "data.frame"
  )

  # colnames properly set
  expect_named(
    data_extract(efc, select = c("e42dep", "c172code")),
    c("e42dep", "c172code")
  )

  # properly extract vector, w/o naming
  expect_identical(data_extract(efc, select = "e42dep"), efc$e42dep)

  # properly extract vector, with naming
  x <- data_extract(efc, select = "e42dep", name = "c172code")
  expect_named(x, as.character(efc$c172code))
})


test_that("data_extract works with select-helpers", {
  expect_identical(
    data_extract(iris, starts_with("Sepal")),
    iris[c("Sepal.Length", "Sepal.Width")]
  )

  expect_identical(
    data_extract(iris, 1:3),
    iris[1:3]
  )

  expect_identical(
    data_extract(iris, "Species"),
    iris$Species
  )

  expect_identical(
    data_extract(iris, contains("Wid")),
    iris[c("Sepal.Width", "Petal.Width")]
  )

  expect_identical(
    data_extract(iris, Sepal.Width),
    iris$Sepal.Width
  )
})


test_that("data_extract works with formulas", {
  expect_identical(
    data_extract(iris, ~ Sepal.Width + Species),
    iris[c("Sepal.Width", "Species")]
  )
})


test_that("data_extract from other functions", {
  test_fun <- function(data, i) {
    data_extract(data, select = i)
  }
  expect_identical(
    test_fun(iris, c("Sepal.Length", "Sepal.Width")),
    iris[c("Sepal.Length", "Sepal.Width")]
  )
})


test_that("data_extract extract, pull", {
  expect_identical(
    data_extract(iris, starts_with("Sepal")),
    iris[c("Sepal.Length", "Sepal.Width")]
  )

  expect_identical(
    data_extract(iris, starts_with("Sepal"), extract = "first"),
    iris$Sepal.Length
  )

  expect_identical(
    data_extract(iris, starts_with("Sepal"), extract = "last"),
    iris$Sepal.Width
  )

  expect_identical(
    data_extract(
      iris,
      starts_with("Sepal"),
      extract = "last",
      as_data_frame = TRUE
    ),
    iris["Sepal.Width"]
  )

  expect_identical(
    colnames(data_extract(mtcars, contains("a"))),
    c("drat", "am", "gear", "carb")
  )

  expect_identical(
    colnames(data_extract(mtcars, contains("a"), extract = "odd")),
    c("drat", "gear")
  )

  expect_identical(
    colnames(data_extract(mtcars, contains("a"), extract = "even")),
    c("am", "carb")
  )

  expect_identical(
    colnames(data_extract(mtcars, cyl:drat)),
    c("cyl", "disp", "hp", "drat")
  )

  expect_error(colnames(data_extract(mtcars, Cyl:Drat)))
  expect_identical(
    colnames(data_extract(mtcars, Cyl:Drat, ignore_case = TRUE)),
    c("cyl", "disp", "hp", "drat")
  )

  expect_identical(
    colnames(data_extract(iris, contains("Sep"))),
    c("Sepal.Length", "Sepal.Width")
  )

  expect_null(colnames(data_extract(iris, contains("sep"))))

  expect_identical(
    colnames(data_extract(iris, contains("sep"), ignore_case = TRUE)),
    c("Sepal.Length", "Sepal.Width")
  )

  expect_identical(
    colnames(data_extract(iris, c(1:2, 5))),
    c("Sepal.Length", "Sepal.Width", "Species")
  )
})

# select helpers ------------------------------
test_that("data_extract regex", {
  expect_identical(
    data_extract(mtcars, select = "pg", regex = TRUE),
    data_extract(mtcars, select = "mpg")
  )
  expect_identical(
    data_extract(mtcars, select = "pg$", regex = TRUE),
    data_extract(mtcars, select = "mpg")
  )
})


test_that("data_extract: 'name' is numeric", {
  expect_identical(
    data_extract(mtcars, "gear", 1),
    data_extract(mtcars, "gear", "mpg")
  )
  expect_identical(
    data_extract(mtcars, "gear", -2),
    data_extract(mtcars, "gear", "gear")
  )
  expect_identical(
    data_extract(mtcars, "gear", 0),
    data_extract(mtcars, "gear", "row.names")
  )
})


================================================
FILE: tests/testthat/test-data_group.R
================================================
data(efc)

test_that("data_group attributes", {
  x <- data_group(efc, "c172code")
  expect_identical(
    attributes(x)$groups,
    structure(
      list(
        c172code = c(1, 2, 3, NA),
        .rows = list(
          c(3L, 14L, 30L, 32L, 36L, 77L, 91L, 99L),
          c(
            1L,
            2L,
            4L,
            5L,
            6L,
            7L,
            8L,
            10L,
            11L,
            12L,
            16L,
            17L,
            18L,
            21L,
            22L,
            23L,
            24L,
            25L,
            26L,
            28L,
            29L,
            31L,
            33L,
            34L,
            35L,
            37L,
            38L,
            39L,
            40L,
            42L,
            44L,
            45L,
            46L,
            47L,
            50L,
            51L,
            52L,
            53L,
            54L,
            56L,
            57L,
            59L,
            60L,
            62L,
            65L,
            68L,
            69L,
            71L,
            72L,
            73L,
            76L,
            78L,
            80L,
            81L,
            82L,
            83L,
            84L,
            85L,
            86L,
            87L,
            88L,
            90L,
            92L,
            93L,
            96L,
            100L
          ),
          c(
            13L,
            15L,
            19L,
            20L,
            27L,
            41L,
            43L,
            55L,
            58L,
            64L,
            66L,
            67L,
            74L,
            75L,
            79L,
            89L
          ),
          c(9L, 48L, 49L, 61L, 63L, 70L, 94L, 95L, 97L, 98L)
        )
      ),
      row.names = c(2L, 1L, 4L, 3L),
      class = "data.frame",
      .drop = TRUE
    )
  )
  expect_s3_class(x, "grouped_df")
})


test_that("data_group attributes", {
  skip_if_not_installed("poorman")
  x <- data_group(efc, "c172code")
  out <- poorman::summarise(x, mw = mean(c12hour, na.rm = TRUE))
  expect_equal(out$mw, c(87.125, 94.046875, 75), tolerance = 1e-3)
})

# select helpers ------------------------------
test_that("data_group regex", {
  expect_identical(
    attributes(data_group(mtcars, select = "yl", regex = TRUE))$groups[[1]],
    sort(unique(mtcars$cyl))
  )
})


test_that("data_ungroup works", {
  x <- data_group(efc, "c172code")
  attr(x, "foo") <- TRUE

  ungrouped <- data_ungroup(x)
  expect_false(inherits(ungrouped, "grouped_df"))
  expect_true(attributes(x)$foo)
})


================================================
FILE: tests/testthat/test-data_match.R
================================================
data(efc, package = "datawizard")

test_that("data_match works as expected", {
  matching_rows <- data_match(
    mtcars,
    data.frame(vs = 0, am = 1),
    return_indices = TRUE
  )
  df1 <- mtcars[matching_rows, ]
  expect_identical(unique(df1$vs), 0)
  expect_identical(unique(df1$am), 1)

  matching_rows <- data_match(
    mtcars,
    data.frame(vs = 0, am = c(0, 1)),
    return_indices = TRUE
  )
  df2 <- mtcars[matching_rows, ]
  expect_identical(unique(df2$vs), 0)
  expect_identical(unique(df2$am), c(1, 0))
})


test_that("data_match works with missing data", {
  skip_if_not_installed("poorman")

  # "OR" works
  x1 <- length(data_match(
    efc,
    data.frame(c172code = 1, e16sex = 2),
    match = "or",
    return_indices = TRUE
  ))
  x2 <- nrow(poorman::filter(efc, c172code == 1 | e16sex == 2))
  expect_identical(x1, x2)

  # "AND" works
  x1 <- length(data_match(
    efc,
    data.frame(c172code = 1, e16sex = 2),
    match = "and",
    return_indices = TRUE
  ))
  x2 <- nrow(poorman::filter(efc, c172code == 1, e16sex == 2))
  expect_identical(x1, x2)

  # "NOT" works
  x1 <- length(data_match(
    efc,
    data.frame(c172code = 1, e16sex = 2),
    match = "not",
    return_indices = TRUE
  ))
  x2 <- nrow(poorman::filter(efc, c172code != 1, e16sex != 2))
  expect_identical(x1, x2)

  # remove NA
  x1 <- length(data_match(
    efc,
    data.frame(c172code = 1, e16sex = 2),
    match = "not",
    return_indices = TRUE,
    remove_na = FALSE
  ))
  expect_identical(x1, 41L)
  x1 <- length(data_match(
    efc,
    data.frame(c172code = 1, e16sex = 2),
    match = "not",
    return_indices = TRUE,
    remove_na = TRUE
  ))
  expect_identical(x1, 36L)
})


test_that("data_match and data_filter work similar", {
  out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "not")
  out2 <- data_filter(mtcars, vs != 0 & am != 1)
  expect_equal(out1, out2, ignore_attr = TRUE)

  # using a data frame re-orders rows!
  out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "or")
  out2 <- data_filter(mtcars, vs == 0 | am == 1)
  expect_equal(
    out1[order(out1$vs, out1$am), ],
    out2[order(out2$vs, out2$am), ],
    ignore_attr = TRUE
  )

  # string representation is working
  out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "or")
  out2 <- data_filter(mtcars, "vs == 0 | am == 1")
  expect_equal(
    out1[order(out1$vs, out1$am), ],
    out2[order(out2$vs, out2$am), ],
    ignore_attr = TRUE
  )
})


test_that("data_filter works", {
  out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "not")
  out2 <- data_filter(mtcars, vs != 0 & am != 1)
  out3 <- subset(mtcars, vs != 0 & am != 1)
  out4 <- data_filter(mtcars, vs != 0, am != 1)
  expect_equal(out1, out2, ignore_attr = TRUE)
  expect_equal(out1, out3, ignore_attr = TRUE)
  expect_equal(out2, out4, ignore_attr = TRUE)
})


test_that("data_filter works with string representation", {
  out1 <- data_match(mtcars, data.frame(vs = 0, am = 1), match = "not")
  out2 <- data_filter(mtcars, "vs != 0 & am != 1")
  out3 <- subset(mtcars, vs != 0 & am != 1)
  out4 <- data_filter(mtcars, c("vs != 0", "am != 1"))
  expect_equal(out1, out2, ignore_attr = TRUE)
  expect_equal(out1, out3, ignore_attr = TRUE)
  expect_equal(out2, out3, ignore_attr = TRUE)
  expect_equal(out2, out4, ignore_attr = TRUE)
})


test_that("data_filter works like slice", {
  out <- data_filter(mtcars, 5:10)
  expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE)
  out <- data_filter(mtcars, "5:10")
  expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE)
  slc <- 5:10
  out <- data_filter(mtcars, slc)
  expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE)
  slc <- "5:10"
  out <- data_filter(mtcars, slc)
  expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE)
})


test_that("data_filter gives informative message on errors", {
  expect_error(
    data_filter(mtcars, mpg = 10),
    "`==`"
  )
  expect_error(
    data_filter(mtcars, "mpg > 10 || cyl = 4"),
    "`==`"
  )
  expect_error(
    data_filter(mtcars, mpg > 10 || cyl == 4),
    "`||`"
  )
  expect_error(
    data_filter(mtcars, mpg > 10 && cyl == 4),
    "`&&`"
  )
  ## TODO: need to check why this fails on R 4.1
  skip_if(getRversion() < "4.2.0")
  expect_error(
    data_filter(mtcars, mpg > 10?cyl == 4),
    "syntax"
  )
  expect_error(
    data_filter(mtcars, mgp > 10?cyl == 4),
    "Variable \"mgp\""
  )
})


test_that("data_filter gives informative message on errors", {
  data(mtcars)
  expect_error(
    data_filter(mtcars, cxl == 6),
    regex = "Variable \"cxl\""
  )
  expect_error(
    data_filter(mtcars, "cxl == 6"),
    regex = "Variable \"cxl\""
  )
})


test_that("data_filter works with >= or <=", {
  expect_identical(
    data_filter(mtcars, "mpg >= 30.4"),
    subset(mtcars, mpg >= 30.4)
  )
  expect_identical(
    data_filter(mtcars, mpg >= 30.4),
    subset(mtcars, mpg >= 30.4)
  )
  expect_identical(
    data_filter(mtcars, "mpg <= 30.4"),
    subset(mtcars, mpg <= 30.4)
  )
  expect_identical(
    data_filter(mtcars, mpg <= 30.4),
    subset(mtcars, mpg <= 30.4)
  )
  mpgl30 <- "mpg <= 30.4"
  expect_identical(
    data_filter(mtcars, mpgl30),
    subset(mtcars, mpg <= 30.4)
  )

  expect_identical(
    data_filter(mtcars, "mpg >= 30.4 & hp == 66"),
    subset(mtcars, mpg >= 30.4 & hp == 66)
  )
  expect_identical(
    data_filter(mtcars, mpg <= 30.4 & hp == 66),
    subset(mtcars, mpg <= 30.4 & hp == 66)
  )
  mpgl30hp66 <- "mpg >= 30.4 & hp == 66"
  expect_identical(
    data_filter(mtcars, mpgl30hp66),
    subset(mtcars, mpg >= 30.4 & hp == 66)
  )
})


test_that("programming with data_filter", {
  # One arg ------------

  foo <- function(var) {
    data_filter(mtcars, var)
  }
  expect_identical(
    foo("mpg >= 30"),
    data_filter(mtcars, "mpg >= 30")
  )

  foo2 <- function(data) {
    var2 <- "mpg >= 30"
    data_filter(data, var2)
  }
  expect_identical(
    foo2(mtcars),
    data_filter(mtcars, "mpg >= 30")
  )

  foo3 <- function(data) {
    var <- "mpg >= 30"
    data_filter(data, var)
  }
  expect_identical(
    foo3(mtcars),
    data_filter(mtcars, "mpg >= 30")
  )

  # Two args -----------

  foo4 <- function(data, var3) {
    data_filter(data, var3)
  }
  expect_identical(
    foo4(mtcars, "mpg >= 30 & hp <= 66"),
    data_filter(mtcars, "mpg >= 30 & hp <= 66")
  )
})


test_that("programming with data_filter with variables", {
  var4 <- "mpg >= 30 & hp <= 66"
  expect_identical(
    data_filter(mtcars, var4),
    data_filter(mtcars, "mpg >= 30 & hp <= 66")
  )
  var <- "mpg >= 30 & hp <= 66"
  expect_identical(
    data_filter(mtcars, var),
    data_filter(mtcars, "mpg >= 30 & hp <= 66")
  )
})


test_that("data_filter works with groups", {
  test <- data.frame(
    id = c(1, 1, 2, 2),
    x = c(0, 1, 3, 4),
    y = c(1, 2, 3, 4)
  )
  test <- data_group(test, "id")

  expected <- data.frame(id = c(1, 2), x = c(0, 3), y = c(1, 3))
  class(expected) <- c("grouped_df", "data.frame")
  attributes(expected)$groups <- attributes(test)$groups

  expect_equal(
    data_filter(test, x == min(x)),
    expected,
    ignore_attr = TRUE
  )
})


test_that("data_filter programming works with groups", {
  test <- data.frame(
    id = c(1, 1, 2, 2),
    x = c(0, 1, 3, 4),
    y = c(1, 2, 3, 4)
  )
  test <- data_group(test, "id")

  expected <- data.frame(id = c(1, 2), x = c(0, 3), y = c(1, 3))
  class(expected) <- c("grouped_df", "data.frame")
  attributes(expected)$groups <- attributes(test)$groups

  expect_equal(
    data_filter(test, "x == min(x)"),
    expected,
    ignore_attr = TRUE
  )

  foo_gr1 <- function(data, var) {
    data_filter(data, var)
  }
  out <- foo_gr1(test, "x == min(x)")
  expect_equal(out, expected, ignore_attr = TRUE)
})


test_that("data_filter with groups, different ways of dots", {
  grp <- data_group(mtcars, "cyl")
  fli <- "mpg <= 20"
  out1 <- data_filter(grp, mpg <= 20)
  out2 <- data_filter(grp, "mpg <= 20")
  out3 <- data_filter(grp, fli)
  expect_identical(out1, out2)
  expect_identical(out1, out3)
})


test_that("data_filter, slicing works with functions", {
  d <- data.frame(
    a = c("aa", "a1", "bb", "b1", "cc", "c1"),
    b = 1:6,
    stringsAsFactors = FALSE
  )

  rows <- grep("^[A-Za-z][0-9]$", x = d$a)
  out1 <- data_filter(d, rows)
  out2 <- data_filter(d, grep("^[A-Za-z][0-9]$", x = d$a))

  expect_identical(out1, out2)

  out3 <- data_filter(iris, (Sepal.Width == 3.0) & (Species == "setosa"))
  expect_identical(nrow(out3), 6L)

  # styler: off
  expect_error(
    data_filter(iris, (Sepal.Width = 3.0) & (Species = "setosa")), # nolint
    regex = "Filtering did not work"
  )
  # styler: on
})


test_that("data_filter works with tibbles", {
  skip_if_not_installed("tibble")
  skip_if_not_installed("dplyr")
  data(mtcars)

  # preserve class
  d <- tibble::as_tibble(mtcars)
  out <- data_filter(d, mpg > 15)
  expect_s3_class(out, "tbl_df")

  # preserve attributes
  d <- tibble::as_tibble(mtcars)
  d <- dplyr::group_by(d, cyl)
  out <- data_filter(d, mpg > 15)
  expect_s3_class(out, "tbl_df")
  expect_named(attr(out, "groups"), c("cyl", ".rows"))
})


================================================
FILE: tests/testthat/test-data_merge.R
================================================
data(mtcars)
x <- mtcars[3:5, 1:3]
y <- mtcars[30:32, c(1, 4:5)]
z <- mtcars[11:13, 6:8]

x$id <- 1:3
y$id <- 2:4
z$id <- 3:5

# left -----------------------

test_that("left-join", {
  skip_if_not_installed("poorman")

  out <- data_merge(x, y, join = "left")
  expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat"))
  expect_identical(dim(out), c(3L, 6L))
  expect_identical(out, suppressMessages(poorman::left_join(x, y)))

  out <- data_merge(x, y, join = "left", by = "id")
  expect_identical(
    colnames(out),
    c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y")
  )
  expect_identical(out$disp, poorman::left_join(x, y, by = "id")$disp)
  expect_identical(dim(out), c(3L, 7L))

  out <- data_merge(x, y, join = "left", by = "mpg")
  expect_identical(
    colnames(out),
    c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y")
  )
  expect_identical(out$disp, poorman::left_join(x, y, by = "mpg")$disp)
  expect_identical(out$mpg, poorman::left_join(x, y, by = "mpg")$mpg)
  expect_identical(dim(out), c(3L, 7L))
})


# semi/anti -----------------------

# errors
test_that("semi-anti-join", {
  expect_error(data_merge(x, y, join = "semi"))
  expect_error(data_merge(x, y, join = "anti"))
})


# right -----------------------

test_that("right-join", {
  skip_if_not_installed("poorman")

  out <- data_merge(x, y, join = "right")
  expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat"))
  expect_identical(dim(out), c(3L, 6L))
  # in data_merge(), we keep sorting from x, so do some preparation here
  poor_out <- suppressMessages(poorman::right_join(x, y))
  poor_out <- poor_out[order(poor_out$id), ]
  row.names(poor_out) <- seq_len(nrow(poor_out))
  expect_identical(out, poor_out)

  out <- data_merge(x, y, join = "right", by = "id")
  expect_identical(
    colnames(out),
    c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y")
  )
  # in data_merge(), we keep sorting from x, so do some preparation here
  poor_out <- suppressMessages(poorman::right_join(x, y, by = "id"))
  poor_out <- poor_out[order(poor_out$id), ]
  expect_identical(out$disp, poor_out$disp)
  expect_identical(dim(out), c(3L, 7L))

  out <- data_merge(x, y, join = "right", by = "mpg")
  expect_identical(
    colnames(out),
    c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y")
  )
  # in data_merge(), we keep sorting from x, so do some preparation here
  poor_out <- suppressMessages(poorman::right_join(x, y, by = "mpg"))
  poor_out <- poor_out[order(poor_out$id.y, decreasing = TRUE), ]
  out <- out[order(out$id.y, decreasing = TRUE), ]
  expect_identical(out$disp, poor_out$disp)
  expect_identical(out$mpg, poor_out$mpg)
  expect_identical(dim(out), c(3L, 7L))
})


# inner -----------------------

test_that("inner-join", {
  skip_if_not_installed("poorman")

  out <- data_merge(x, y, join = "inner")
  expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat"))
  expect_identical(dim(out), c(0L, 6L))

  out <- data_merge(x, y, join = "inner", by = "id")
  expect_identical(
    colnames(out),
    c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y")
  )
  expect_identical(out$disp, poorman::inner_join(x, y, by = "id")$disp)
  expect_identical(dim(out), c(2L, 7L))

  out <- data_merge(x, y, join = "inner", by = "mpg")
  expect_identical(
    colnames(out),
    c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y")
  )
  expect_identical(out$disp, poorman::inner_join(x, y, by = "mpg")$disp)
  expect_identical(dim(out), c(1L, 7L))
})


# full -----------------------

test_that("full-join", {
  out <- data_merge(x, y, join = "full")
  expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat"))
  expect_identical(dim(out), c(6L, 6L))
  expect_identical(
    out$mpg,
    c(22.8, 21.4, 18.7, 19.7, 15, 21.4),
    tolerance = 1e-2
  )
  expect_identical(out$id, c(1, 2, 3, 2, 3, 4), tolerance = 1e-2)

  out <- data_merge(x, y, join = "full", by = "id")
  expect_identical(
    colnames(out),
    c("cyl", "disp", "id", "hp", "drat", "mpg.x", "mpg.y")
  )
  expect_identical(dim(out), c(4L, 7L))
  expect_identical(out$mpg.x, c(22.8, 21.4, 18.7, NA), tolerance = 1e-2)
  expect_identical(out$id, 1:4, tolerance = 1e-2)

  out <- data_merge(x, y, join = "full", by = "mpg")
  expect_identical(
    colnames(out),
    c("mpg", "cyl", "disp", "hp", "drat", "id.x", "id.y")
  )
  expect_identical(dim(out), c(5L, 7L))
  expect_identical(out$mpg, c(22.8, 21.4, 18.7, 19.7, 15), tolerance = 1e-2)
  expect_identical(out$id.x, c(1, 2, 3, NA, NA), tolerance = 1e-2)

  out <- data_merge(x, y, join = "full", by = c("id", "mpg"))
  expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat"))
  expect_identical(dim(out), c(6L, 6L))
  expect_identical(
    out$mpg,
    c(22.8, 21.4, 18.7, 19.7, 15, 21.4),
    tolerance = 1e-2
  )
  expect_identical(out$id, c(1, 2, 3, 2, 3, 4), tolerance = 1e-2)
})


# bind -----------------------

test_that("bind-join", {
  skip_if_not_installed("poorman")

  out <- data_merge(x, y, join = "bind")
  poor_out <- poorman::bind_rows(x, y)
  row.names(poor_out) <- seq_len(nrow(poor_out))
  expect_identical(colnames(out), c("mpg", "cyl", "disp", "id", "hp", "drat"))
  expect_identical(dim(out), c(6L, 6L))
  expect_identical(out, poor_out)

  # by will be ignored
  out <- data_merge(x, y, join = "bind", by = "id")
  expect_identical(out, poor_out)

  # by will be ignored
  out <- data_merge(x, y, join = "bind", by = "mpg")
  expect_identical(out, poor_out)

  # by will be ignored
  out <- data_merge(x, y, join = "bind", by = c("id", "mpg"))
  expect_identical(out, poor_out)

  x <- mtcars[1, ]
  y <- mtcars[2, ]
  expect_warning(
    {
      out <- data_merge(x, y, join = "bind", id = "mpg")
    },
    regexp = "already exists"
  )
  expect_named(
    out,
    c(names(mtcars), "mpg_1")
  )
  expect_identical(out$mpg_1, c(1, 2))
})

# joins without common columns -----------------------

test_that("bind-join", {
  skip_if_not_installed("poorman")

  x2 <- mtcars[3:5, 1:3]
  y2 <- mtcars[30:32, 4:6]

  expect_warning(
    data_merge(x2, y2, join = "full"),
    "Found no matching columns in the data frames."
  )

  expect_identical(
    suppressWarnings(data_merge(x2, y2, join = "full")),
    suppressMessages(poorman::full_join(x2, y2)),
    ignore_attr = TRUE
  )

  expect_identical(
    data_merge(x2, y2, join = "bind"),
    poorman::bind_rows(x2, y2),
    ignore_attr = TRUE
  )
})

# joins without common columns -----------------------

test_that("compare bind and full joins", {
  x2 <- mtcars[3:5, 1:3]
  y2 <- mtcars[30:32, 3:6]
  expect_identical(
    data_merge(x2, y2, join = "full"),
    data_merge(x2, y2, join = "bind"),
    ignore_attr = TRUE
  )
})

# join data frames in a list -----------------------

test_that("join data frames in a list", {
  skip_if_not_installed("poorman")

  x <- mtcars[1:5, 1:3]
  y <- mtcars[28:31, 3:5]
  z <- mtcars[11:18, c(1, 3:4, 6:8)]
  x$id <- 1:5
  y$id <- 4:7
  z$id <- 3:10

  dat <- data_merge(list(x, y, z), by = "id", id = "df", join = "bind")

  expect_identical(
    remove_empty(subset(poorman::filter(dat, df == 1), select = -df)),
    x,
    ignore_attr = TRUE
  )

  expect_identical(
    remove_empty(subset(poorman::filter(dat, df == 2), select = -c(df, id))),
    subset(y, select = -id),
    ignore_attr = TRUE
  )

  expect_identical(
    remove_empty(subset(poorman::filter(dat, df == 3), select = -c(df, id))),
    subset(z, select = -id),
    ignore_attr = TRUE
  )

  x <- mtcars[1, ]
  y <- mtcars[2, ]
  expect_warning(
    {
      out <- data_merge(list(x, y), join = "bind", id = "mpg")
    },
    regexp = "already exists"
  )
  expect_named(
    out,
    c(names(mtcars), "mpg_1")
  )
  expect_identical(out$mpg_1, c(1, 2))
})


# join empty data frames -----------------------

x <- data.frame(x = character(), stringsAsFactors = FALSE)
y <- data.frame(x = character(), stringsAsFactors = FALSE)
z <- data.frame(y = character(), stringsAsFactors = FALSE)

test_that("join empty data frames", {
  expect_identical(dim(data_merge(x, y, join = "left")), c(0L, 1L))
  expect_identical(dim(data_merge(x, y, join = "full")), c(0L, 1L))
  expect_identical(dim(data_merge(x, y, join = "right")), c(0L, 1L))
  expect_identical(dim(data_merge(x, y, join = "bind")), c(0L, 1L))
  expect_identical(dim(data_merge(x, z, join = "bind")), c(0L, 2L))
})

# join when all "by" are not present ---------------------

test_that("join when all 'by' are not present", {
  x <- mtcars[, c("mpg", "drat", "cyl", "qsec")]
  y <- mtcars[, c("mpg", "hp", "cyl", "wt")]

  expect_error(
    {
      out <- data_merge(x, y, by = c("mpg", "drat", "qsec"))
    },
    regexp = "Not all columns"
  )
})

# no warning for tibble #404 ---------------------

test_that("no warning for tibble when checking if column exist", {
  skip_if_not_installed("tibble")
  d_tibble <- tibble::as_tibble(iris)
  expect_silent(data_merge(d_tibble, d_tibble[20:30, ], join = "bind"))
})


================================================
FILE: tests/testthat/test-data_modify.R
================================================
## styler: off

test_that("data_modify works", {
  data(iris)
  out <- data_modify(
    iris,
    Sepal_W_z = standardize(Sepal.Width),
    Sepal_Wz_double = 2 * Sepal_W_z
  )
  expect_equal(
    out$Sepal_W_z,
    as.vector(scale(iris$Sepal.Width)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    out$Sepal_Wz_double,
    2 * as.vector(scale(iris$Sepal.Width)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_modify works with strings", {
  data(iris)
  out <- data_modify(
    iris,
    as_expr("Sepal_W_z = standardize(Sepal.Width)")
  )
  expect_equal(
    out$Sepal_W_z,
    as.vector(scale(iris$Sepal.Width)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  out <- data_modify(
    iris,
    as_expr(c(
      "Sepal_W_z = standardize(Sepal.Width)",
      "Sepal_Wz_double = 2 * Sepal_W_z"
    ))
  )
  expect_equal(
    out$Sepal_Wz_double,
    2 * as.vector(scale(iris$Sepal.Width)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_modify preserves labels", {
  data(efc)
  out <- data_modify(
    efc,
    c12hour_c = center(c12hour),
    c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)
  )
  expect_identical(
    attributes(out$c12hour_c)$label,
    attributes(efc$c12hour)$label
  )
  expect_identical(
    attributes(out$c12hour_z)$label,
    attributes(efc$c12hour)$label
  )
  out <- data_modify(
    efc,
    as_expr(c(
      "c12hour_c = center(c12hour)",
      "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"
    ))
  )
  expect_identical(
    attributes(out$c12hour_c)$label,
    attributes(efc$c12hour)$label
  )
  expect_identical(
    attributes(out$c12hour_z)$label,
    attributes(efc$c12hour)$label
  )
})


test_that("data_modify recycling works", {
  data(iris)
  out <- data_modify(iris, x = 1)
  expect_equal(out$x, rep(1, nrow(iris)), ignore_attr = TRUE)
  out <- data_modify(iris, x = c(1, 2))
  expect_equal(out$x, rep(c(1, 2), nrow(iris) / 2), ignore_attr = TRUE)
  expect_error(data_modify(iris, x = 1:4), regex = "same length")
  out <- data_modify(iris, x = "a")
  expect_equal(out$x, rep("a", nrow(iris)), ignore_attr = TRUE)
})


test_that("data_modify recycling works with grouped df", {
  data(iris)
  d <- data_group(iris, "Species")
  expect_silent(data_modify(d, x = 1, test = 1:2))
})


test_that("data_modify expression in character vector-1", {
  data(iris)
  x <- "var_a = Sepal.Width"
  out <- data_modify(iris, as_expr(x))
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "var_a"
    )
  )
})


test_that("data_modify expression in character vector-2", {
  data(iris)
  foo <- function(data) {
    y <- "var_a = Sepal.Width"
    head(data_modify(data, as_expr(y)))
  }
  out <- foo(iris)
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "var_a"
    )
  )
  expect_identical(out$var_a, out$Sepal.Width)

  foo2 <- function(data, z) {
    head(data_modify(data, as_expr(z)))
  }
  out <- foo2(iris, "var_a = Sepal.Width")
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "var_a"
    )
  )
  expect_identical(out$var_a, out$Sepal.Width)
})


test_that("data_modify expression in character vector-3", {
  data(iris)
  aa <- "2 * Sepal.Width"
  out <- data_modify(iris, new_var = as_expr(aa))
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "new_var"
    )
  )
  expect_identical(out$new_var, 2 * out$Sepal.Width)

  aa <- "2 * Sepal.Width"
  out <- data_modify(iris, new_var = as_expr(aa))
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "new_var"
    )
  )
  expect_identical(out$new_var, 2 * out$Sepal.Width)

  foo_nv <- function(data, z) {
    head(data_modify(data, new_var = as_expr(z)))
  }
  out <- foo_nv(iris, "2 * Sepal.Width")
  expect_identical(
    colnames(out),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "new_var"
    )
  )
  expect_identical(out$new_var, 2 * out$Sepal.Width)
})


test_that("data_modify expression as character vector-4", {
  data(iris)
  x <- "var_a = Sepal.Width"
  y <- "Sepal_Wz_double = 2 * var_a"
  out <- data_modify(iris, as_expr(c(x, y)))
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "var_a",
      "Sepal_Wz_double"
    )
  )
  expect_identical(out$var_a, out$Sepal.Width)
  expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width)

  foo1 <- function(data) {
    x1 <- "var_a = Sepal.Width"
    y1 <- "Sepal_Wz_double = 2 * var_a"
    combined <- c(x1, y1)
    data_modify(iris, as_expr(combined))
  }
  out <- foo1(iris)
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "var_a",
      "Sepal_Wz_double"
    )
  )
  expect_identical(out$var_a, out$Sepal.Width)
  expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width)

  foo2 <- function(data, z3) {
    data_modify(data, as_expr(z3))
  }
  out <- foo2(iris, c("var_a = Sepal.Width", "Sepal_Wz_double = 2 * var_a"))
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "var_a",
      "Sepal_Wz_double"
    )
  )
  expect_identical(out$var_a, out$Sepal.Width)
  expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width)

  # works with separated strings
  data(iris)
  out <- data_modify(
    iris,
    as_expr("var_a = Sepal.Width"),
    as_expr("Sepal_Wz_double = 2 * var_a")
  )
  expect_identical(out$var_a, out$Sepal.Width)
  expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width)

  out <- data_modify(
    iris,
    as_expr(c("var_a = Sepal.Width", "Sepal_Wz_double = 2 * var_a"))
  )
  expect_identical(out$var_a, out$Sepal.Width)
  expect_identical(out$Sepal_Wz_double, 2 * out$Sepal.Width)
})


test_that("data_modify works with function as expression", {
  data(iris)
  out <- data_modify(iris, foo = grepl("a", Species)) # nolint
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, foo = as_expr("grepl(\"a\", Species)"))
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, as_expr("foo = grepl(\"a\", Species)"))
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, foo = as_expr("grepl('a', Species)"))
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, as_expr("foo = grepl('a', Species)"))
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, foo = as_expr('grepl(\'a\', Species)')) # nolint
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, as_expr('foo = grepl(\'a\', Species)')) # nolint
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, foo = as_expr('grepl(\"a\", Species)'))
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
  out <- data_modify(iris, as_expr('foo = grepl(\"a\", Species)'))
  expect_identical(out$foo, rep(c(TRUE, FALSE, TRUE), each = 50))
})


test_that("data_modify remove variables with NULL", {
  data(iris)
  out <- data_modify(iris, PL_new = 2 * Petal.Length, Petal.Length = NULL)
  expect_named(
    out,
    c("Sepal.Length", "Sepal.Width", "Petal.Width", "Species", "PL_new")
  )
  expect_identical(out$PL_new, 2 * iris$Petal.Length)

  out <- data_modify(iris, as_expr("Species = NULL"))
  expect_named(
    out,
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )
})


test_that("data_modify works on grouped data", {
  data(efc)
  grouped_efc <- data_group(efc, "c172code")
  out <- data_modify(
    grouped_efc,
    c12hour_c = center(c12hour),
    c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
    c12hour_z2 = standardize(c12hour)
  )
  out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector)
  expect_equal(
    na.omit(out$c12hour_z2[out$c172code == 1]),
    out2[[1]],
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_modify works on grouped data, with character vectors", {
  data(efc)
  grouped_efc <- data_group(efc, "c172code")
  out <- data_modify(
    grouped_efc,
    as_expr(c(
      "c12hour_c = center(c12hour)",
      "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
      "c12hour_z2 = standardize(c12hour)"
    ))
  )
  out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector)
  expect_equal(
    na.omit(out$c12hour_z2[out$c172code == 1]),
    out2[[1]],
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    na.omit(out$c12hour_z[out$c172code == 1]),
    out2[[1]],
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_modify works on grouped data, preserves attributes and labels", {
  data(efc)
  grouped_efc <- data_group(efc, "c172code")
  out <- data_modify(
    grouped_efc,
    c12hour_c = center(c12hour)
  )
  expect_identical(
    attributes(out$c12hour)$label,
    attributes(efc$c12hour)$label
  )
})


test_that("data_modify works on grouped data, inside functions", {
  data(efc)
  foo4 <- function(data) {
    data_modify(
      data,
      as_expr(c(
        "c12hour_c = center(c12hour)",
        "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
        "c12hour_z2 = standardize(c12hour)"
      ))
    )
  }
  out <- foo4(data_group(efc, "c172code"))
  out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector)
  expect_equal(
    na.omit(out$c12hour_z2[out$c172code == 1]),
    out2[[1]],
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    na.omit(out$c12hour_z[out$c172code == 1]),
    out2[[1]],
    ignore_attr = TRUE,
    tolerance = 1e-3
  )

  foo5 <- function(data, rec) {
    data_modify(data, as_expr(rec))
  }
  out <- foo5(
    data_group(efc, "c172code"),
    c(
      "c12hour_c = center(c12hour)",
      "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
      "c12hour_z2 = standardize(c12hour)"
    )
  )
  out2 <- lapply(by(efc["c12hour"], efc$c172code, scale), as.vector)
  expect_equal(
    na.omit(out$c12hour_z2[out$c172code == 1]),
    out2[[1]],
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    na.omit(out$c12hour_z[out$c172code == 1]),
    out2[[1]],
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_modify errors for non df", {
  expect_error(data_modify(
    iris$Sepal.Length,
    Sepal_W_z = standardize(Sepal.Width)
  ))
})


test_that("data_modify errors for empty data frames", {
  data(mtcars)
  x <- mtcars[1, ]
  expect_error(
    data_modify(x[-1, ], new_var = 5),
    regex = "empty data frame"
  )
})


test_that("data_modify errors for typos", {
  data(efc)
  a <- "center(c22hour)" # <---------------- error in variable name
  b <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
  expect_error(
    data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)),
    regex = "c22hour"
  )

  a <- "center(c12hour)"
  b <- "c12hour_c / sd(c21hour, na.rm = TRUE)" # <------ error in variable name
  expect_error(
    data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)),
    regex = "c12hour_c"
  )

  expect_error(
    data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)),
    regex = "second expression"
  )
})


test_that("data_modify message about recycling values", {
  expect_snapshot(head(data_modify(iris, Sepal.Width = 1)))
  expect_snapshot(head(data_modify(iris, Sepal.Width = 1:2)))
  expect_snapshot(head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1)))
  expect_snapshot(head(data_modify(iris, Petal.Length = 1, Sepal.Width = 1:2)))
  expect_snapshot(head(data_modify(iris, Petal.Length = 2, Sepal.Width = 2)))
})


test_that("data_modify message about modified variables", {
  expect_snapshot(head(data_modify(iris, Sepal.Width = 2 * Sepal.Width)))
  expect_snapshot(head(data_modify(
    iris,
    Petal.Length = Sepal.Length,
    Sepal.Width = Petal.Width
  )))
})


test_that("data_modify works with character variables, and inside functions", {
  data(efc)
  a <- "center(c12hour)"
  b <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
  d <- "standardize(c12hour)"
  out <- data_modify(
    efc,
    c12hour_c = as_expr(a),
    c12hour_z = as_expr(b),
    c12hour_z2 = as_expr(d)
  )
  expect_equal(
    out$c12hour_z2,
    as.vector(scale(efc$c12hour)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    out$c12hour_z,
    as.vector(scale(efc$c12hour)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )

  # when calling functions
  a1 <- "center(c12hour)"
  b1 <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
  d1 <- "standardize(c12hour)"
  foo <- function(data, x1, x2, x3) {
    data_modify(
      efc,
      c12hour_c = as_expr(x1),
      c12hour_z = as_expr(x2),
      c12hour_z2 = as_expr(x3)
    )
  }
  out <- foo(efc, a1, b1, d1)
  expect_equal(
    out$c12hour_z2,
    as.vector(scale(efc$c12hour)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    out$c12hour_z,
    as.vector(scale(efc$c12hour)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )

  # when calling functions, arguments inside function defined
  foo2 <- function(data) {
    a2 <- "center(c12hour)"
    b2 <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
    d2 <- "standardize(c12hour)"

    data_modify(
      efc,
      c12hour_c = as_expr(a2),
      c12hour_z = as_expr(b2),
      c12hour_z2 = as_expr(d2)
    )
  }
  out <- foo2(efc)
  expect_equal(
    out$c12hour_z2,
    as.vector(scale(efc$c12hour)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    out$c12hour_z,
    as.vector(scale(efc$c12hour)),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_modify works with grouped df when overwriting existing variables", {
  data(iris)
  iris_grp <- data_group(iris, "Species")
  out <- data_modify(iris_grp, Sepal.Length = normalize(Sepal.Length))
  expect_equal(
    head(out$Sepal.Length),
    c(0.53333, 0.4, 0.26667, 0.2, 0.46667, 0.73333),
    tolerance = 1e-3
  )

  out <- data_modify(
    iris_grp,
    Sepal.Length = normalize(Sepal.Length),
    Sepal.Length2 = 2 * Sepal.Length
  )
  expect_equal(
    head(out$Sepal.Length2),
    2 * c(0.53333, 0.4, 0.26667, 0.2, 0.46667, 0.73333),
    tolerance = 1e-3
  )
})


test_that("data_modify works with functions that return character vectors", {
  data(iris)
  set.seed(123)
  out <- data_modify(iris, grp = sample(letters[1:3], nrow(iris), TRUE))
  expect_identical(head(out$grp), c("a", "c", "b", "a", "c", "c"))
})


test_that("data_modify 1:n() and similar works in (grouped) data frames", {
  data(mtcars)
  out <- data_modify(mtcars, Trials = 1:n()) # nolint
  expect_identical(out$Trials, 1:32)
  x <- data_group(mtcars, "gear")
  out <- data_modify(x, Trials = 1:n()) # nolint
  expect_identical(out$Trials[out$gear == 3], 1:15)
  expect_identical(out$Trials[out$gear == 4], 1:12)
  out <- data_modify(x, Trials = 3:(n() + 2))
  expect_identical(out$Trials[out$gear == 3], 3:17)
  expect_identical(out$Trials[out$gear == 4], 3:14)
})


test_that("data_modify .if/.at arguments", {
  data(iris)
  d <- iris[1:5, ]
  # validate results
  out <- data_modify(d, .at = "Species", .modify = as.numeric)
  expect_identical(out$Species, c(1, 1, 1, 1, 1))
  out <- data_modify(d, .if = is.factor, .modify = as.numeric)
  expect_identical(out$Species, c(1, 1, 1, 1, 1))
  out <- data_modify(
    d,
    new_length = Petal.Length * 2,
    .at = "Species",
    .modify = as.numeric
  )
  expect_identical(out$Species, c(1, 1, 1, 1, 1))
  expect_named(
    out,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "new_length"
    )
  )
  # using other functions with `.at`
  out <- data_modify(
    d,
    .at = extract_column_names(d, select = starts_with("Sepal")),
    .modify = as.factor
  )
  expect_s3_class(out$Sepal.Length, "factor")
  expect_s3_class(out$Sepal.Width, "factor")

  # .at and .if cannot be used at same timne
  expect_error(
    data_modify(d, .at = "Species", .if = is.factor, .modify = as.numeric),
    regex = "You cannot use both"
  )
  # modify must be a function
  expect_error(
    data_modify(d, .at = "Species", .modify = "a"),
    regex = "`.modify` must"
  )
  # unknown variable
  expect_error(
    data_modify(d, .at = c("Species", "Test"), .modify = as.numeric),
    regex = "Variable \"Test\""
  )
  # unknown variables
  expect_error(
    data_modify(d, .at = c("Species", "Hi", "Test"), .modify = as.numeric),
    regex = "Variables \"Hi\" and \"Test\""
  )
  # one of .at or .if must be specified
  expect_error(
    data_modify(d, .modify = as.numeric),
    regex = "You need to specify"
  )
  # function not applicable to factors
  expect_error(
    data_modify(d, .at = "Species", .modify = function(x) 2 / y + x),
    regex = "Error in modifying variable"
  )
  # function not applicable to factors
  expect_error(
    data_modify(d, .at = "Species", .modify = function(x) 2 * x),
    regex = "Error in modifying variable"
  )
  # .modify needs to be specified
  expect_error(
    data_modify(d, .at = "Species", .if = is.factor),
    regex = "You need to specify"
  )
  # newly created variables are processed by if/at
  out <- data_modify(
    d,
    new_length = Petal.Length * 2,
    .if = is.numeric,
    .modify = round
  )
  expect_equal(out$new_length, c(3, 3, 3, 3, 3), ignore_attr = TRUE)
})


test_that("data_modify works with new expressions, different use cases same results", {
  data(iris)
  out1 <- data_modify(iris, as_expr("sepwid = 2 * Sepal.Width"))
  out2 <- data_modify(iris, sepwid = as_expr("2 * Sepal.Width"))
  e <- "sepwid = 2 * Sepal.Width"
  out3 <- data_modify(iris, as_expr(e))
  e <- "2 * Sepal.Width"
  out4 <- data_modify(iris, sepwid = as_expr(e))

  expect_equal(head(out1), head(out2), ignore_attr = TRUE, tolerance = 1e-4)
  expect_equal(head(out1), head(out3), ignore_attr = TRUE, tolerance = 1e-4)
  expect_equal(head(out1), head(out4), ignore_attr = TRUE, tolerance = 1e-4)

  out1b <- data_modify(
    iris,
    as_expr(c("sepwid = 2 * Sepal.Width", "seplen = 5 * Sepal.Length"))
  )
  out2b <- data_modify(
    iris,
    sepwid = as_expr("2 * Sepal.Width"),
    seplen = as_expr("5 * Sepal.Length")
  )
  e <- c("sepwid = 2 * Sepal.Width", "seplen = 5 * Sepal.Length")
  out3b <- data_modify(iris, as_expr(e))
  e <- "2 * Sepal.Width"
  out4b <- data_modify(iris, sepwid = as_expr(e), seplen = 5 * Sepal.Length)

  expect_equal(head(out1b), head(out2b), ignore_attr = TRUE, tolerance = 1e-4)
  expect_equal(head(out1b), head(out3b), ignore_attr = TRUE, tolerance = 1e-4)
  expect_equal(head(out1b), head(out4b), ignore_attr = TRUE, tolerance = 1e-4)

  # no expression
  out <- data_modify(iris, sepwid = "2 * Sepal.Widht")
  expect_identical(
    head(out$sepwid),
    c(
      "2 * Sepal.Widht",
      "2 * Sepal.Widht",
      "2 * Sepal.Widht",
      "2 * Sepal.Widht",
      "2 * Sepal.Widht",
      "2 * Sepal.Widht"
    )
  )

  # works with paste()
  to_standardize <- c("Petal.Length", "Sepal.Length")
  out <- data_modify(
    iris,
    as_expr(
      paste0(to_standardize, "_stand = standardize(", to_standardize, ")")
    )
  )
  expect_equal(
    head(out$Petal.Length_stand),
    c(-1.33575, -1.33575, -1.3924, -1.2791, -1.33575, -1.16581),
    tolerance = 1e-3
  )
  expect_equal(
    head(out$Sepal.Length_stand),
    c(-0.89767, -1.1392, -1.38073, -1.50149, -1.01844, -0.53538),
    tolerance = 1e-3
  )

  # complex example
  e <- "2 * Sepal.Width"
  f <- "half_petal = 0.5 * Petal.Length"
  a <- "string"
  num <- 1:5
  out_complex <- data_modify(
    iris,
    sepwid = as_expr(e),
    seplen = 5 * Sepal.Length,
    as_expr(f),
    new_var = a,
    new_num = num,
    new_var2 = "ho",
    new_num2 = 4:6,
    Sepal.Length = NULL,
    Petal.Length = NULL,
    Sepal.Width = NULL,
    Petal.Width = NULL
  )
  expect_snapshot(print(head(out_complex)))
})


test_that("data_modify works with new expressions, grouped_df, different use cases same results", {
  data(efc, package = "datawizard")
  grouped_efc <- data_group(efc, "c172code")
  new_efc1 <- data_modify(
    grouped_efc,
    c12hour_c = center(c12hour),
    c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
    c12hour_z2 = standardize(c12hour),
    id = 1:n() # nolint
  )

  new_efc2 <- data_modify(
    grouped_efc,
    as_expr("c12hour_c = center(c12hour)"),
    c12hour_z = as_expr("c12hour_c / sd(c12hour, na.rm = TRUE)"),
    c12hour_z2 = standardize(c12hour),
    id = 1:n() # nolint
  )
  expect_equal(
    head(new_efc1),
    head(new_efc2),
    ignore_attr = TRUE,
    tolerance = 1e-4
  )

  s <- c(
    "c12hour_c = center(c12hour)",
    "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
    "c12hour_z2 = standardize(c12hour)"
  )
  new_efc3 <- data_modify(
    grouped_efc,
    as_expr(s),
    id = 1:n() # nolint
  )
  expect_equal(
    head(new_efc1),
    head(new_efc3),
    ignore_attr = TRUE,
    tolerance = 1e-4
  )

  new_efc4 <- data_modify(
    grouped_efc,
    c12hour_c = center(c12hour),
    c12hour_z = as_expr("c12hour_c / sd(c12hour, na.rm = TRUE)"),
    c12hour_z2 = standardize(c12hour),
    id = 1:n() # nolint
  )
  expect_equal(
    head(new_efc1),
    head(new_efc4),
    ignore_attr = TRUE,
    tolerance = 1e-4
  )
})


test_that("data_modify errors with new expressions", {
  e <- "sepwid = 2 * Sepal.Widht"
  expect_error(
    data_modify(iris, as_expr(e)),
    regex = "in the first expression"
  )
  expect_error(
    data_modify(iris, as_expr(e)),
    regex = "Sepal.Widht"
  )

  expect_error(
    data_modify(iris, as_expr("sepwid = 2 * Sepal.Widht")),
    regex = "in the first expression"
  )
  expect_error(
    data_modify(iris, as_expr("sepwid = 2 * Sepal.Widht")),
    regex = "Sepal.Widht"
  )

  expect_error(
    data_modify(iris, sepwid = 2 * Sepal.Widht),
    regex = "in the first expression"
  )
  expect_error(
    data_modify(iris, sepwid = 2 * Sepal.Widht),
    regex = "Sepal.Widht"
  )

  expect_error(
    data_modify(iris, as_expr("2 * Sepal.Widht")),
    regex = "variable name"
  )

  e <- "2 * Sepal.Widht"
  expect_error(
    data_modify(iris, as_expr(e)),
    regex = "variable name"
  )

  data(efc, package = "datawizard")
  a <- "center(c22hour)" # <---------------- error in variable name
  b <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
  expect_error(
    data_modify(efc, c12hour_c = as_expr(a), c12hour_z = as_expr(b)),
    regex = "c22hour"
  )

  expect_error(
    data_modify(iris, a = as_expr(c("1 + 1", "2 + 2"))),
    regex = "Could not evaluate expression"
  )
})


skip_if_not_installed("withr")

withr::with_environment(
  new.env(),
  test_that("data_modify 1:n() and similar works in (grouped) data frames inside function calls", {
    data(mtcars)
    x <- data_group(mtcars, "gear")

    foo <- function(d) {
      out <- data_modify(d, Trials = 1:n()) # nolint
      out$Trials
    }
    expect_identical(
      foo(x),
      c(
        1L,
        2L,
        3L,
        1L,
        2L,
        3L,
        4L,
        4L,
        5L,
        6L,
        7L,
        5L,
        6L,
        7L,
        8L,
        9L,
        10L,
        8L,
        9L,
        10L,
        11L,
        12L,
        13L,
        14L,
        15L,
        11L,
        1L,
        2L,
        3L,
        4L,
        5L,
        12L
      )
    )
  })
)

test_that("data_modify errors on non-defined function", {
  expect_error(data_modify(iris, Species = foo()))
})


withr::with_environment(
  new.env(),
  test_that("data_modify correctly assigns values from variables", {
    d <- data.frame()
    for (param in letters[c(1, 2, 5)]) {
      out <- data.frame(x = as.numeric(as.factor(param)))
      out <- data_modify(out, Parameter = param)
      d <- rbind(out, d)
    }
    expect_named(d, c("x", "Parameter"))
    expect_identical(d$Parameter, c("e", "b", "a"))

    d <- data.frame()
    for (param in c("a 1", "b 2")) {
      out <- data.frame(x = as.numeric(as.factor(param)))
      out <- data_modify(out, Parameter = param)
      d <- rbind(out, d)
    }
    expect_named(d, c("x", "Parameter"))
    expect_identical(d$Parameter, c("b 2", "a 1"))

    # variable is not copied, values is used
    a <- "x"
    d <- data.frame(x = 1)
    out <- data_modify(d, y = a)
    expect_identical(out$y, "x")
  })
)

withr::with_environment(
  new.env(),
  test_that("data_modify passes expression syntax to function", {
    foo1 <- function(data, ...) {
      head(data_modify(data, ...))
    }
    out1 <- foo1(iris, SW_fraction = Sepal.Width / 10)
    out2 <- foo1(iris, as_expr("SW_fraction = Sepal.Width / 10"))
    expect_identical(out1, out2)
  })
)

## styler: on


================================================
FILE: tests/testthat/test-data_partition.R
================================================
test_that("data_partition works as expected", {
  # not supported

  expect_error(
    data_partition(new.env()),
    "`data` must be a data frame"
  )

  # to be coerced to data frames

  expect_snapshot(data_partition(letters, seed = 123))

  # validation checks

  expect_warning(
    data_partition(iris, 0.7, row_id = "Species"),
    "exists"
  )

  expect_warning(expect_warning(
    data_partition(iris, c(0.7, 0.3), row_id = "Species"),
    "generated"
  ))

  # values

  out <- data_partition(mtcars, proportion = 0.8, seed = 123)

  expect_identical(
    out$p_0.8$.row_id,
    c(
      1L,
      3L,
      4L,
      5L,
      7L,
      8L,
      9L,
      10L,
      11L,
      14L,
      15L,
      17L,
      18L,
      19L,
      20L,
      21L,
      22L,
      23L,
      24L,
      26L,
      27L,
      28L,
      29L,
      30L,
      31L,
      32L
    )
  )

  expect_identical(
    colnames(out$p_0.8),
    c(
      "mpg",
      "cyl",
      "disp",
      "hp",
      "drat",
      "wt",
      "qsec",
      "vs",
      "am",
      "gear",
      "carb",
      ".row_id"
    )
  )

  expect_identical(
    lapply(out, nrow),
    list(p_0.8 = 26L, test = 6L)
  )

  # data frames

  data(iris)
  expect_snapshot(str(data_partition(iris, proportion = 0.7, seed = 123)))
  expect_snapshot(str(data_partition(
    iris,
    proportion = c(0.2, 0.5),
    seed = 123
  )))
  expect_snapshot(str(data_partition(
    iris,
    proportion = 0.7,
    by = "Species",
    seed = 123
  )))
  expect_snapshot(str(data_partition(
    iris,
    proportion = c(0.2, 0.5),
    by = "Species",
    seed = 123
  )))
})

test_that("data_partition warns if no testing set", {
  expect_warning(
    data_partition(iris, proportion = 1),
    "sums up to 1"
  )
  expect_warning(
    data_partition(iris, proportion = c(0.5, 0.5)),
    "sums up to 1"
  )
})

test_that("data_partition errors if values in proportion not between 0 and 1", {
  expect_error(
    data_partition(iris, proportion = 1.3),
    "cannot be higher"
  )
  expect_error(
    data_partition(iris, proportion = c(0.5, 0.6)),
    "cannot be higher"
  )
  expect_error(
    data_partition(iris, proportion = c(1.3, -1)),
    "cannot be negative"
  )
  expect_error(
    data_partition(iris, proportion = -1),
    "cannot be negative"
  )
})

test_that("data_partition warns if row_id already exists", {
  iris2 <- iris

  iris2[[".row_id"]] <- "A"
  expect_warning(
    data_partition(iris2, proportion = 0.5),
    "already exists"
  )

  iris2[["foo"]] <- "A"
  expect_warning(
    data_partition(iris2, proportion = 0.5, row_id = "foo"),
    "already exists"
  )

  part1 <- data_partition(iris, proportion = 0.5, seed = 123)
  part2 <- suppressWarnings(data_partition(iris2, proportion = 0.5, seed = 123))

  expect_identical(
    part1$p_0.5[1:5],
    part2$p_0.5[1:5]
  )
})


================================================
FILE: tests/testthat/test-data_peek.R
================================================
test_that("data_peek works as expected", {
  out <- data_peek(iris)
  expect_named(out, c("Variable", "Type", "Values"))
  expect_identical(
    out$Variable,
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )
  expect_identical(dim(out), c(5L, 3L))
})

test_that("data_peek works as expected with select", {
  out <- data_peek(iris, select = 2:4)
  expect_named(out, c("Variable", "Type", "Values"))
  expect_identical(
    out$Variable,
    c("Sepal.Width", "Petal.Length", "Petal.Width")
  )
  expect_identical(dim(out), c(3L, 3L))
})

test_that("data_peek works as expetced with custom width", {
  out <- data_peek(iris, width = 130)
  expect_named(out, c("Variable", "Type", "Values"))
  expect_identical(
    out$Variable,
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )
  expect_identical(dim(out), c(5L, 3L))
})

test_that("data_peek snapshots look as expected", {
  expect_snapshot(data_peek(iris))
  expect_snapshot(data_peek(iris, select = 1:3))
  expect_snapshot(data_peek(iris, width = 130))
})


================================================
FILE: tests/testthat/test-data_read.R
================================================
skip_if_not_installed("httr")
skip_if_not_installed("readxl")
skip_if_not_installed("haven")
skip_if_not_installed("readr")
skip_if_not_installed("data.table")
skip_if_not_installed("rio")

skip_on_cran()

skip_if_not_installed("curl")
skip_if_offline()

# csv -------------------------

test_that("data_read - csv", {
  d <- data_read(
    "https://raw.githubusercontent.com/easystats/circus/main/data/bootstrapped.csv",
    verbose = FALSE
  )
  expect_identical(dim(d), c(10000L, 4L))
})


# csv -------------------------

test_that("data_read, skip_empty", {
  d <- data_read(
    "https://raw.githubusercontent.com/easystats/circus/main/data/test_skip_empty.csv",
    verbose = FALSE
  )
  expect_identical(ncol(d), 3L)
  expect_identical(colnames(d), c("Var1", "Var2", "Var3"))
})


# tsv -------------------------

test_that("data_read - tsv", {
  skip_if_not_installed("withr")

  withr::with_tempfile("temp_file", fileext = ".tsv", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/sample1.tsv"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )
    expect_identical(nrow(d), 3L)
    expect_identical(colnames(d), c("a", "b", "c"))
    expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 2L)
    expect_identical(sum(vapply(d, is.character, FUN.VALUE = logical(1L))), 1L)
  })
})


# excel -------------------------

test_that("data_read - excel", {
  skip_if_not_installed("withr")

  withr::with_tempfile("temp_file", fileext = ".xlsx", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/sample1.xlsx"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )

    expect_identical(nrow(d), 3L)
    expect_identical(colnames(d), c("a", "b", "c"))
    expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 2L)
    expect_identical(sum(vapply(d, is.character, FUN.VALUE = logical(1L))), 1L)
  })
})


# Stata file -----------------------------------

test_that("data_read - Stata file", {
  withr::with_tempfile("temp_file", fileext = ".dta", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/stata_test.dta"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )
    expect_identical(
      d,
      data.frame(
        mpg = c(21, 21, 22.8),
        cyl = c(6, 6, 4),
        disp = c(160, 160, 108)
      )
    )
  })
})


# SAS file -----------------------------------

test_that("data_read - SAS file", {
  withr::with_tempfile("temp_file", fileext = ".sas7bdat", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/sas_test.sas7bdat"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )
    expect_identical(
      d,
      data.frame(
        mpg = c(21, 21, 22.8),
        cyl = c(6, 6, 4),
        disp = c(160, 160, 108)
      )
    )
  })
})


# RDS file, matrix, coercible -----------------------------------

test_that("data_read - RDS file, matrix, coercible", {
  withr::with_tempfile("temp_file", fileext = ".rds", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/matrix_object.rds"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    expect_message({
      d <- data_read(
        temp_file,
        verbose = TRUE
      )
    })

    expect_s3_class(d, "data.frame")
    expect_identical(dim(d), c(2L, 5L))
  })
})


# RDS file, preserve class /types -----------------------------------

test_that("data_read - RDS file, preserve class", {
  withr::with_tempfile("temp_file", fileext = ".rds", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/hiv.rds"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(temp_file, verbose = FALSE)
    expect_s3_class(d, "data.frame")
    expect_identical(
      sapply(d, class),
      c(
        village = "integer",
        outcome = "integer",
        distance = "numeric",
        amount = "numeric",
        incentive = "integer",
        age = "integer",
        hiv2004 = "integer",
        agecat = "factor"
      )
    )
  })
})


# RData -----------------------------------

test_that("data_read - no warning for RData", {
  withr::with_tempfile("temp_file", fileext = ".RData", code = {
    data(mtcars)
    save(mtcars, file = temp_file)
    expect_silent(data_read(temp_file, verbose = FALSE))
  })
})


test_that("data_read - message for multiple objects in RData", {
  withr::with_tempfile("temp_file", fileext = ".RData", code = {
    data(mtcars)
    data(iris)
    save(mtcars, iris, file = temp_file)
    expect_message(
      expect_message(
        data_read(temp_file, verbose = TRUE),
        regex = "File contained more than one object"
      ),
      "Reading data"
    )
  })
})


# SPSS file -----------------------------------

test_that("data_read - SPSS file", {
  withr::with_tempfile("temp_file", fileext = ".sav", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/EFC.sav"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )
    expect_identical(sum(vapply(d, is.factor, FUN.VALUE = logical(1L))), 15L)
    expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 11L)
    expect_identical(
      levels(d$c172code),
      c(
        "low level of education",
        "intermediate level of education",
        "high level of education"
      )
    )
    expect_identical(
      attr(d$n4pstu, "labels"),
      c(
        `spouse/partner` = 1,
        child = 2,
        sibling = 3,
        `daughter or son -in-law` = 4
      )
    )
  })
})


# SPSS file 2 ---------------------------------

test_that("data_read - SPSS file 2", {
  withr::with_tempfile("temp_file", fileext = ".sav", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/spss_test.sav"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )

    expect_identical(
      d,
      structure(
        list(
          V1 = structure(
            1:4,
            levels = c(
              "Eins",
              "Zwei",
              "Drei",
              "Vier"
            ),
            class = "factor",
            converted_to_factor = TRUE,
            label = "Variable 1"
          ),
          V2 = structure(
            c(2, 3, 4, 1),
            labels = c(
              Eins = 1,
              Zwei = 2,
              Drei = 3
            ),
            label = "Variable 2"
          ),
          V3 = structure(
            c(
              3L,
              2L,
              1L,
              4L
            ),
            levels = c("Eins", "Zwei", "Drei", "Vier"),
            class = "factor",
            converted_to_factor = TRUE,
            label = "Variable 3"
          )
        ),
        row.names = c(NA, -4L),
        class = "data.frame"
      )
    )
  })
})


# zipped SPSS file -----------------------------------

test_that("data_read - zipped SPSS file", {
  withr::with_tempfile("temp_file", fileext = ".zip", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/EFC.zip"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )
    expect_identical(sum(vapply(d, is.factor, FUN.VALUE = logical(1L))), 15L)
    expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 11L)

    d <- data_read(
      temp_file,
      convert_factors = FALSE,
      verbose = FALSE
    )
    expect_identical(sum(vapply(d, is.factor, FUN.VALUE = logical(1L))), 0L)
    expect_identical(sum(vapply(d, is.numeric, FUN.VALUE = logical(1L))), 26L)
  })
})


# SPSS file, many value labels  -----------------------------------

test_that("data_read, convert many labels correctly", {
  # Output validated against SPSS output from original dataset

  withr::with_tempfile("temp_file", fileext = ".sav", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/spss_many_labels.sav"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(
      temp_file,
      verbose = FALSE
    )
    # all are factors by default
    expect_identical(
      vapply(d, class, character(1)),
      c(selv1 = "factor", c12 = "factor", c12a = "factor", c12c = "factor")
    )
    expect_identical(
      levels(d$selv1),
      c(
        "Vignette 1 weiblich (Gülsen E. Reinigungskraft B)",
        "Vignette 2 weiblich (Gülsen E. Anwältin B)",
        "Vignette 3 weiblich (Monika E. Reinigungskraft B)",
        "Vignette 4 weiblich (Monika E. Anwältin B)",
        "Vignette 5 männlich (Hasan E. Reinigungskraft B)",
        "Vignette 6 männlich (Hasan E. Anwalt B)",
        "Vignette 7 männlich (Martin E. Reinigungskraft B)",
        "Vignette 8 männlich (Martin E. Anwalt B)",
        "Vignette 9 weiblich (Gülsen E. Reinigungskraft E)",
        "Vignette 10 weiblich (Gülsen E. Anwältin E)",
        "Vignette 11 weiblich (Monika E. Reinigungskraft E)",
        "Vignette 12 weiblich (Monika E. Anwältin E)",
        "Vignette 13 männlich (Hasan E. Reinigungskraft E)",
        "Vignette 14 männlich (Hasan E. Anwalt E)",
        "Vignette 15 männlich (Martin E. Reinigungskraft E)",
        "Vignette 16 männlich (Martin E. Anwalt E)"
      )
    )
    expect_snapshot(data_tabulate(d$selv1))

    expect_identical(levels(d$c12), c("ja", "nein", "keine Angabe"))
    expect_snapshot(data_tabulate(d$c12))

    expect_identical(levels(d$c12a), c("Filter", "ja", "nein", "keine Angabe"))
    expect_snapshot(data_tabulate(d$c12a))
    expect_identical(
      levels(d$c12c),
      c(
        "Filter",
        "0 = keine",
        "1",
        "2",
        "3",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        "10 = sehr starke",
        "weiß nicht / keine Angabe"
      )
    )
    expect_snapshot(data_tabulate(d$c12c))

    expect_message(
      expect_message(
        expect_message(
          data_read(temp_file),
          regexp = "Reading"
        ),
        regexp = "Variables where all"
      ),
      regexp = "4 out of 4"
    )

    d <- data_read(
      temp_file,
      convert_factors = FALSE,
      verbose = FALSE
    )
    # all are factors by default
    expect_identical(
      vapply(d, class, character(1)),
      c(selv1 = "numeric", c12 = "numeric", c12a = "numeric", c12c = "numeric")
    )
    expect_snapshot(table(d$selv1))
    expect_identical(
      attributes(d$selv1)$labels,
      c(
        `Vignette 1 weiblich (Gülsen E. Reinigungskraft B)` = 1,
        `Vignette 2 weiblich (Gülsen E. Anwältin B)` = 2,
        `Vignette 3 weiblich (Monika E. Reinigungskraft B)` = 3,
        `Vignette 4 weiblich (Monika E. Anwältin B)` = 4,
        `Vignette 5 männlich (Hasan E. Reinigungskraft B)` = 5,
        `Vignette 6 männlich (Hasan E. Anwalt B)` = 6,
        `Vignette 7 männlich (Martin E. Reinigungskraft B)` = 7,
        `Vignette 8 männlich (Martin E. Anwalt B)` = 8,
        `Vignette 9 weiblich (Gülsen E. Reinigungskraft E)` = 9,
        `Vignette 10 weiblich (Gülsen E. Anwältin E)` = 10,
        `Vignette 11 weiblich (Monika E. Reinigungskraft E)` = 11,
        `Vignette 12 weiblich (Monika E. Anwältin E)` = 12,
        `Vignette 13 männlich (Hasan E. Reinigungskraft E)` = 13,
        `Vignette 14 männlich (Hasan E. Anwalt E)` = 14,
        `Vignette 15 männlich (Martin E. Reinigungskraft E)` = 15,
        `Vignette 16 männlich (Martin E. Anwalt E)` = 16,
        `99` = 99
      )
    )

    expect_snapshot(table(d$c12))
    expect_identical(
      attributes(d$c12)$labels,
      c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99)
    )

    expect_snapshot(table(d$c12a))
    expect_identical(
      attributes(d$c12a)$labels,
      c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99)
    )

    expect_snapshot(table(d$c12c))
    expect_identical(
      attributes(d$c12c)$labels,
      c(
        Filter = -2,
        `0 = keine` = 0,
        `1` = 1,
        `2` = 2,
        `3` = 3,
        `4` = 4,
        `5` = 5,
        `6` = 6,
        `7` = 7,
        `8` = 8,
        `9` = 9,
        `10 = sehr starke` = 10,
        `weiß nicht / keine Angabe` = 99
      )
    )
  })
})


# invalid file type -------------------------

test_that("data_read, no file extension", {
  expect_error(data_read("mytestfile"), regex = "extension")
  expect_error(data_read(NULL, regex = "extension"))
})


# file not exists -------------------------

test_that("data_read, file not exists", {
  expect_error(data_read("thisfileshouldnotexist.csv"), regex = "not exist")
  expect_error(
    suppressMessages(data_read("thisfileshouldnotexist.sav")),
    regex = "not exist"
  )
})

# RDS file, no data frame -----------------------------------

test_that("data_read - RDS file, no data frame", {
  skip_if_not_installed("withr")

  withr::with_tempfile("temp_file", fileext = ".rda", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/list_for_testing.rda"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    expect_message(
      expect_warning(
        d <- data_read(temp_file, verbose = TRUE), # nolint
        regex = "no data frame"
      ),
      "Reading data"
    )
    expect_type(d, "list")
  })
})

test_that("data_read - RDA file, model object", {
  skip_if_not_installed("withr")
  skip_if_not_installed("brms")

  withr::with_tempfile("temp_file", fileext = ".rds", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/model_object.rds"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    expect_message(
      expect_message(
        d <- data_read(temp_file, verbose = TRUE), # nolint
        regex = "Imported file is a regression"
      ),
      "Reading data"
    )
    expect_s3_class(d, "lm")
  })

  withr::with_tempfile("temp_file", fileext = ".rda", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/brms_1.rda"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    expect_message(
      expect_message(
        d <- data_read(temp_file, verbose = TRUE), # nolint
        regex = "Imported file is a regression"
      ),
      "Reading data"
    )
    expect_s3_class(d, "brmsfit")
  })
})


test_that("data_read - RDS file, from URL", {
  # works with URL
  request <- httr::GET(
    "https://raw.github.com/easystats/circus/main/data/model_object.rds"
  )
  httr::stop_for_status(request)
  expect_message(
    expect_message(
      d <- data_read(
        # nolint
        "https://raw.github.com/easystats/circus/main/data/model_object.rds",
        verbose = TRUE
      ),
      regex = "Imported file is a regression"
    ),
    "Reading data"
  )
})


test_that("data_read - nanoparquet", {
  skip_if_not_installed("withr")
  skip_if_not_installed("nanoparquet")

  withr::with_tempfile("temp_file", fileext = ".parquet", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/penguins.parquet"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- data_read(temp_file)
    expect_named(
      d,
      c(
        "species",
        "island",
        "bill_len",
        "bill_dep",
        "flipper_len",
        "body_mass",
        "sex",
        "year"
      )
    )
    expect_identical(dim(d), c(344L, 8L))
  })
})


================================================
FILE: tests/testthat/test-data_recode.R
================================================
# set recode pattern old=new --------------

options(data_recode_pattern = "old=new")


# numeric -----------------------

set.seed(123)
x <- sample(c(1:4, NA), 15, TRUE)

test_that("recode numeric", {
  out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2))
  expect_equal(
    out,
    c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
    ignore_attr = TRUE
  )
  out <- recode_values(
    x,
    list(`1` = 0, `2:3` = 1, `4` = 2, `NA` = 9),
    preserve_na = FALSE
  )
  expect_equal(
    out,
    c(1, 1, 1, 1, 1, 9, 2, 0, 1, 1, 9, 1, 1, 0, 2),
    ignore_attr = TRUE
  )
  out <- recode_values(x, list(`1` = 0, `2:3` = 1, `4` = 2, `NA` = 9))
  expect_equal(
    out,
    c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
    ignore_attr = TRUE
  )
  out <- recode_values(
    x,
    list(`1` = 0, `2` = 1),
    default = 99,
    preserve_na = FALSE
  )
  expect_equal(
    out,
    c(99, 99, 1, 1, 99, 99, 99, 0, 1, 99, 99, 99, 99, 0, 99),
    ignore_attr = TRUE
  )
  out <- recode_values(x, list(`1` = 0, `2` = 1), default = 99)
  expect_equal(
    out,
    c(99, 99, 1, 1, 99, NA, 99, 0, 1, 99, NA, 99, 99, 0, 99),
    ignore_attr = TRUE
  )
})


# Date -----------------------

set.seed(123)
x <- as.Date("2022-01-01")

test_that("recode date", {
  expect_message(recode_values(x))
})


# factor -----------------------

set.seed(123)
x <- as.factor(sample(c("a", "b", "c"), 15, TRUE))

test_that("recode factor", {
  out <- recode_values(x, list(a = "x", `b, c` = "y"))
  expect_equal(
    out,
    structure(
      c(
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        1L,
        2L,
        2L,
        1L,
        2L,
        2L
      ),
      .Label = c("x", "y"),
      class = "factor"
    ),
    ignore_attr = TRUE
  )
  out <- recode_values(x, list(a = "x", `b, c` = "y"))
  expect_equal(
    out,
    structure(
      c(
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        1L,
        2L,
        2L,
        1L,
        2L,
        2L
      ),
      .Label = c("x", "y"),
      class = "factor"
    ),
    ignore_attr = TRUE
  )
})

set.seed(123)
x <- as.factor(sample(c("a", "b", "c", NA_character_), 15, TRUE))

test_that("recode factor", {
  out <- recode_values(x, list(a = "x", `b, c` = "y"))
  expect_equal(
    as.character(out),
    c("y", "y", "y", "y", "y", "y", "y", "y", "y", "x", NA, "y", "y", "x", "y"),
    ignore_attr = TRUE
  )
  out <- recode_values(x, list(a = "x", b = NA))
  expect_equal(
    as.character(out),
    c("c", "c", "c", NA, "c", NA, NA, NA, "c", "x", NA, NA, NA, "x", NA),
    ignore_attr = TRUE
  )
  out <- recode_values(x, list(a = "x", b = "y"), default = "zz")
  expect_equal(
    as.character(out),
    c(
      "zz",
      "zz",
      "zz",
      "y",
      "zz",
      "y",
      "y",
      "y",
      "zz",
      "x",
      NA,
      "y",
      "y",
      "x",
      "y"
    ),
    ignore_attr = TRUE
  )
  out <- recode_values(
    x,
    list(a = "x", b = "y"),
    default = "zz",
    preserve_na = FALSE
  )
  expect_equal(
    as.character(out),
    c(
      "zz",
      "zz",
      "zz",
      "y",
      "zz",
      "y",
      "y",
      "y",
      "zz",
      "x",
      "zz",
      "y",
      "y",
      "x",
      "y"
    ),
    ignore_attr = TRUE
  )
})


# character -----------------------

set.seed(123)
x <- as.character(sample(c("a", "b", "c"), 15, TRUE))

test_that("recode character", {
  out <- recode_values(x, list(a = "x", `b, c` = "y"))
  expect_equal(
    out,
    c(
      "y",
      "y",
      "y",
      "y",
      "y",
      "y",
      "y",
      "y",
      "y",
      "x",
      "y",
      "y",
      "x",
      "y",
      "y"
    ),
    ignore_attr = TRUE
  )
})


# data frame -----------------------

set.seed(123)
d <- data.frame(
  x = sample(c(1:4, NA), 15, TRUE),
  y = as.factor(sample(c("a", "b", "c"), 15, TRUE)),
  stringsAsFactors = FALSE
)

test_that("recode data.frame", {
  out <- recode_values(
    d,
    recode = list(`1` = 0, `2:3` = 1, `4` = 2, a = "x", `b, c` = "y")
  )
  expect_equal(
    out,
    structure(
      list(
        x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
        y = structure(
          c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L),
          .Label = c("x", "y"),
          class = "factor"
        )
      ),
      row.names = c(NA, 15L),
      class = "data.frame"
    ),
    ignore_attr = TRUE
  )

  out <- recode_values(
    d,
    recode = list(`1` = 0, `2:3` = 1, `4` = 2, a = "x", `b, c` = "y")
  )
  expect_equal(
    out,
    structure(
      list(
        x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
        y = structure(
          c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L),
          .Label = c("x", "y"),
          class = "factor"
        )
      ),
      row.names = c(NA, 15L),
      class = "data.frame"
    ),
    ignore_attr = TRUE
  )

  out <- recode_values(
    d,
    recode = list(`1` = 0, `2:3` = 1, `4` = 2, a = "x", `b, c` = "y"),
    select = is.numeric()
  )
  expect_equal(
    out,
    structure(
      list(
        x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
        y = structure(
          c(1L, 1L, 1L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 2L, 1L, 3L, 3L, 1L),
          .Label = c("a", "b", "c"),
          class = "factor"
        )
      ),
      row.names = c(NA, 15L),
      class = "data.frame"
    ),
    ignore_attr = TRUE
  )
})


# set recode pattern back to default --------------

options(data_recode_pattern = NULL)

set.seed(123)
x <- sample(c(1:4, NA), 15, TRUE)

test_that("recode numeric", {
  out <- recode_values(x, list(`0` = 1, `1` = 2:3, `2` = 4))
  expect_equal(
    out,
    c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
    ignore_attr = TRUE
  )
  out <- recode_values(
    x,
    list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA),
    preserve_na = FALSE
  )
  expect_equal(
    out,
    c(1, 1, 1, 1, 1, 9, 2, 0, 1, 1, 9, 1, 1, 0, 2),
    ignore_attr = TRUE
  )
  out <- recode_values(
    x,
    list(`0` = 1, `1` = 2:3, `2` = 4, `9` = NA),
    preserve_na = TRUE
  )
  expect_equal(
    out,
    c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
    ignore_attr = TRUE
  )
})

test_that("recode, recode-arg is named list", {
  expect_warning(expect_identical(
    recode_values(x, recode = c(`0` = 1, `1` = 2:3, `2` = 4)),
    x
  ))
})


set.seed(123)
x <- as.factor(sample(c("a", "b", "c"), 15, TRUE))

test_that("recode factor", {
  out <- recode_values(x, list(x = "a", y = "b, c"))
  expect_equal(
    out,
    structure(
      c(
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        1L,
        2L,
        2L,
        1L,
        2L,
        2L
      ),
      .Label = c("x", "y"),
      class = "factor"
    ),
    ignore_attr = TRUE
  )
  out <- recode_values(x, list(x = "a", y = c("b", "c")))
  expect_equal(
    out,
    structure(
      c(
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        2L,
        1L,
        2L,
        2L,
        1L,
        2L,
        2L
      ),
      .Label = c("x", "y"),
      class = "factor"
    ),
    ignore_attr = TRUE
  )
})

test_that("recode, recode-arg is named list", {
  expect_warning(expect_identical(
    recode_values(x, recode = c(x = "a", y = "b, c")),
    x
  ))
})


set.seed(123)
d <- data.frame(
  x = sample(c(1:4, NA), 15, TRUE),
  y = as.factor(sample(c("a", "b", "c"), 15, TRUE)),
  stringsAsFactors = FALSE
)

test_that("recode data.frame", {
  out <- recode_values(
    d,
    recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = "b, c")
  )
  expect_equal(
    out,
    structure(
      list(
        x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
        y = structure(
          c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L),
          .Label = c("x", "y"),
          class = "factor"
        )
      ),
      row.names = c(NA, 15L),
      class = "data.frame"
    ),
    ignore_attr = TRUE
  )

  out <- recode_values(
    d,
    recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c"))
  )
  expect_equal(
    out,
    structure(
      list(
        x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
        y = structure(
          c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L),
          .Label = c("x", "y"),
          class = "factor"
        )
      ),
      row.names = c(NA, 15L),
      class = "data.frame"
    ),
    ignore_attr = TRUE
  )

  out <- recode_values(
    d,
    recode = list(`0` = 1, `1` = 2:3, `2` = 4, x = "a", y = c("b", "c")),
    select = is.numeric()
  )
  expect_equal(
    out,
    structure(
      list(
        x = c(1, 1, 1, 1, 1, NA, 2, 0, 1, 1, NA, 1, 1, 0, 2),
        y = structure(
          c(1L, 1L, 1L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 2L, 1L, 3L, 3L, 1L),
          .Label = c("a", "b", "c"),
          class = "factor"
        )
      ),
      row.names = c(NA, 15L),
      class = "data.frame"
    ),
    ignore_attr = TRUE
  )
})

# select helpers ------------------------------
test_that("recode_values regex", {
  expect_identical(
    recode_values(
      iris,
      select = "ies",
      regex = TRUE,
      recode = list(
        Group1 = "setosa",
        Group2 = "versicolor",
        Group3 = "virginica"
      )
    ),
    recode_values(
      iris,
      select = "Species",
      recode = list(
        Group1 = "setosa",
        Group2 = "versicolor",
        Group3 = "virginica"
      )
    )
  )
})


================================================
FILE: tests/testthat/test-data_relocate.R
================================================
test_that("data_relocate works as expected", {
  expect_error(
    data_relocate(iris, select = "Species", before = 2, after = 3),
    "You must supply only one of `before` or `after`."
  )

  expect_error(
    data_relocate(iris, select = "Species", before = 10),
    "No valid position defined in `before`."
  )

  expect_error(
    data_relocate(iris, select = "Species", after = 10),
    "No valid position defined in `after`."
  )

  expect_named(
    data_relocate(iris, select = "Species", before = "Sepal.Length"),
    c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )
  expect_named(
    data_relocate(iris, select = "Species", before = "Sepal.Width"),
    c("Sepal.Length", "Species", "Sepal.Width", "Petal.Length", "Petal.Width")
  )

  expect_named(
    data_relocate(iris, select = "Sepal.Width", after = "Species"),
    names(data_relocate(iris, select = "Sepal.Width", after = -1))
  )

  expect_named(
    data_relocate(
      iris,
      select = c("Species", "Petal.Length"),
      after = "Sepal.Width"
    ),
    names(data_relocate(iris, select = c("Species", "Petal.Length"), after = 2))
  )
})


test_that("data_relocate select-helpers", {
  expect_identical(
    colnames(data_relocate(iris, select = starts_with("Sepal"), after = 5)),
    colnames(iris[c(3:5, 1:2)])
  )
  expect_identical(
    colnames(data_relocate(iris, select = 1:2, after = 5)),
    colnames(iris[c(3:5, 1:2)])
  )
  expect_identical(
    colnames(data_relocate(iris, select = -1)),
    colnames(iris[c(2:5, 1)])
  )
  expect_identical(
    colnames(data_relocate(iris, select = Species, after = 1)),
    colnames(iris[c(1, 5, 2:4)])
  )
  expect_identical(
    colnames(data_relocate(iris, select = ~ Sepal.Width + Species)),
    colnames(iris[c(2, 5, 1, 3:4)])
  )
  expect_identical(
    colnames(data_relocate(iris, select = starts_with("sepal"), after = 5)),
    colnames(iris)
  )
  expect_identical(
    colnames(data_relocate(
      iris,
      select = starts_with("sepal"),
      after = 5,
      ignore_case = TRUE
    )),
    colnames(iris[c(3:5, 1:2)])
  )
})


# preserve attributes --------------------------

test_that("data_relocate preserves attributes", {
  skip_if_not_installed("parameters")

  m <- lm(Sepal.Length ~ Species, data = iris)
  out <- parameters::parameters(m)
  a1 <- attributes(out)

  out2 <- data_relocate(out, 4:6)
  a2 <- attributes(out2)

  # attributes may not be in the same order
  expect_true(all(names(a1) %in% names(a2)))
  expect_identical(length(a1), length(a2))
})


# select helpers ------------------------------
test_that("data_relocate regex", {
  expect_identical(
    names(data_relocate(mtcars, select = "pg", regex = TRUE, after = "carb"))[
      11
    ],
    "mpg"
  )
})


# fuzzy matching ------------------------------
out <- data.frame(
  Parameter = "Test",
  Median = 0.5,
  CI_low = 0.4,
  CI_high = 0.6,
  pd = 0.97,
  Rhat = 0.99,
  ESS = 1000,
  log_BF = 3,
  stringsAsFactors = FALSE
)

test_that("data_relocate misspelled", {
  # close match
  expect_error(
    data_relocate(out, "pd", before = "BF"),
    "log_BF"
  )
  # close multiple matches
  expect_error(
    data_relocate(out, "pd", before = "CIl"),
    "CI_low"
  )
  # not even close
  expect_error(
    data_relocate(out, "pd", before = "xyz"),
    "misspelled"
  )
})


================================================
FILE: tests/testthat/test-data_remove.R
================================================
test_that("data_remove works as expected", {
  expect_identical(
    data_remove(BOD, "Time"),
    structure(
      list(demand = c(8.3, 10.3, 19, 16, 15.6, 19.8)),
      class = "data.frame",
      row.names = c(NA, 6L),
      reference = "A1.4, p. 270"
    )
  )
})


test_that("data_remove works with NSE", {
  expect_named(
    data_remove(iris, starts_with("Sepal")),
    c("Petal.Length", "Petal.Width", "Species")
  )

  expect_named(
    data_remove(iris, "Sepal"),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )

  expect_named(
    data_remove(iris, c("Sepal.Length", "Sepal.Width")),
    c("Petal.Length", "Petal.Width", "Species")
  )

  expect_named(
    data_remove(iris, regex("\\.")),
    "Species"
  )

  expect_named(
    data_remove(iris, Sepal.Width:Petal.Width),
    c("Sepal.Length", "Species")
  )

  expect_named(
    data_remove(iris, contains("Sep")),
    c("Petal.Length", "Petal.Width", "Species")
  )

  expect_named(
    data_remove(iris, contains("sep")),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )

  expect_named(
    data_remove(iris, contains("sep"), ignore_case = TRUE),
    c("Petal.Length", "Petal.Width", "Species")
  )

  expect_named(
    data_remove(iris, 1:3),
    c("Petal.Width", "Species")
  )

  expect_identical(
    colnames(data_remove(iris, c(1, 5))),
    colnames(iris)[2:4]
  )

  expect_identical(
    colnames(data_remove(iris, -1:-2)),
    colnames(iris)[1:2]
  )

  expect_identical(
    colnames(data_remove(iris, c(1, 4:5))),
    colnames(iris)[2:3]
  )

  expect_identical(
    colnames(data_remove(iris, "abc")),
    colnames(iris)
  )

  expect_named(
    data_remove(iris, "Species"),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )

  expect_named(
    data_remove(iris, "species"),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )

  expect_named(
    data_remove(iris, "species", ignore_case = TRUE),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )
})


test_that("data_remove from other functions", {
  test_fun <- function(data, i) {
    data_remove(data, select = i)
  }
  expect_named(
    test_fun(iris, c("Sepal.Length", "Sepal.Width")),
    c("Petal.Length", "Petal.Width", "Species")
  )
})


# preserve attributes --------------------------

test_that("data_remove preserves attributes", {
  skip_if_not_installed("parameters")

  m <- lm(Sepal.Length ~ Species, data = iris)
  out <- parameters::parameters(m)
  a1 <- attributes(out)

  out2 <- data_remove(out, "SE")
  a2 <- attributes(out2)

  # attributes may not be in the same order
  expect_true(all(names(a1) %in% names(a2)))
  expect_identical(length(a1), length(a2))
})

# select helpers ------------------------------
test_that("data_remove regex", {
  expect_identical(
    names(data_remove(mtcars, select = "pg", regex = TRUE)),
    names(mtcars[-(1)])
  )
})


================================================
FILE: tests/testthat/test-data_rename.R
================================================
test <- head(iris)

# basic tests --------------

test_that("data_rename works with one or several replacements", {
  expect_named(
    data_rename(test, "Sepal.Length", "length"),
    c("length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )
  expect_named(
    data_rename(
      test,
      c("Sepal.Length", "Sepal.Width"),
      c("length", "width")
    ),
    c("length", "width", "Petal.Length", "Petal.Width", "Species")
  )
  expect_named(
    data_rename(test, c(length = "Sepal.Length", width = "Sepal.Width")),
    c("length", "width", "Petal.Length", "Petal.Width", "Species")
  )
})

test_that("data_rename cannot have a partially named vector", {
  expect_error(
    data_rename(test, c(length = "Sepal.Length", "Sepal.Width")),
    "all elements must"
  )
})

test_that("data_rename returns a data frame", {
  x <- data_rename(test, "Sepal.Length", "length")
  expect_s3_class(x, "data.frame")
})

test_that("data_rename: multiple selection types", {
  expect_named(
    data_rename(test, select = 1, "foo"),
    c("foo", names(iris)[2:5])
  )
  expect_named(
    data_rename(test, select = regex("tal"), c("foo1", "foo2")),
    c("Sepal.Length", "Sepal.Width", "foo1", "foo2", "Species")
  )
})

test_that("data_rename: replacement not allowed to have NA or empty strings", {
  expect_error(
    data_rename(
      test,
      select = c("Species", "Sepal.Length"),
      replacement = c("foo", NA_character_)
    ),
    regexp = "`replacement` is not allowed"
  )
})

# replacement -------------

test_that("data_rename errors when no replacement", {
  expect_error(
    data_rename(test, select = c("Sepal.Length", "Petal.Length")),
    "There are more names in `select` than in `replacement`"
  )
})

test_that("data_rename errors when too many names in 'replacement'", {
  expect_error(
    data_rename(test, replacement = paste0("foo", 1:6)),
    "There are more names in `replacement` than in `select`"
  )
})

test_that("data_rename works when not enough names in 'replacement'", {
  expect_error(
    data_rename(test, replacement = paste0("foo", 1:2)),
    "There are more names in `select` than in `replacement`"
  )
})


# no select --------------

test_that("data_rename errors when select = NULL", {
  expect_error(
    data_rename(test),
    "more names in `select`"
  )
})


# other --------------

test_that("data_rename deals correctly with duplicated replacement", {
  x <- data_rename(
    test,
    select = names(test)[1:4],
    replacement = c("foo", "bar", "foo", "bar")
  )
  expect_identical(dim(test), dim(x))
  expect_named(x[1:4], c("foo", "bar", "foo.2", "bar.2"))
})


# preserve attributes --------------------------

test_that("data_rename preserves attributes", {
  skip_if_not_installed("parameters")

  m <- lm(Sepal.Length ~ Species, data = iris)
  out <- parameters::parameters(m)
  a1 <- attributes(out)

  out2 <- data_rename(out, "p", "p-val")
  a2 <- attributes(out2)

  expect_named(a1, names(a2))
})


# glue-styled select --------------------------

test_that("data_rename glue-style", {
  data(mtcars)
  out <- data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}")
  expect_named(out, c("formerly_mpg", "formerly_cyl", "formerly_disp"))
  out <- data_rename(
    mtcars[1:3],
    c("mpg", "cyl", "disp"),
    "{col}_is_column_{n}"
  )
  expect_named(out, c("mpg_is_column_1", "cyl_is_column_2", "disp_is_column_3"))
  out <- data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}")
  expect_named(out, c("new_a", "new_b", "new_c"))
})

test_that("data_rename enough letters", {
  data(efc, package = "datawizard")
  data(mtcars)
  data(iris)
  data(ChickWeight)
  data(ToothGrowth)
  data(USArrests)
  data(airquality)
  x <- cbind(
    mtcars[1:5, ],
    iris[1:5, ],
    efc[1:5, ],
    ChickWeight[1:5, ],
    ToothGrowth[1:5, ],
    USArrests[1:5, ],
    airquality[1:5, ]
  )
  expect_named(
    data_rename(x, replacement = "long_letter_{letter}"),
    c(
      "long_letter_a1",
      "long_letter_b1",
      "long_letter_c1",
      "long_letter_d1",
      "long_letter_e1",
      "long_letter_f1",
      "long_letter_g1",
      "long_letter_h1",
      "long_letter_i1",
      "long_letter_j1",
      "long_letter_k1",
      "long_letter_l1",
      "long_letter_m1",
      "long_letter_n1",
      "long_letter_o1",
      "long_letter_p1",
      "long_letter_q1",
      "long_letter_r1",
      "long_letter_s1",
      "long_letter_t1",
      "long_letter_u1",
      "long_letter_v1",
      "long_letter_w1",
      "long_letter_x1",
      "long_letter_y1",
      "long_letter_z1",
      "long_letter_a2",
      "long_letter_b2",
      "long_letter_c2",
      "long_letter_d2",
      "long_letter_e2",
      "long_letter_f2",
      "long_letter_g2",
      "long_letter_h2",
      "long_letter_i2",
      "long_letter_j2",
      "long_letter_k2",
      "long_letter_l2"
    )
  )
})

skip_if_not_installed("withr")
withr::with_environment(
  new.env(),
  test_that("data_rename glue-style, environment", {
    data(mtcars)
    x <- c("hi", "there", "!")
    out <- data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}")
    expect_named(out, c("col_hi", "col_there", "col_!"))
    expect_error(
      data_rename(mtcars[1:3], c("mpg", "disp"), "col_{x}"),
      regex = "The number of values"
    )
  })
)

withr::with_environment(
  new.env(),
  test_that("data_rename glue-style, object not in environment", {
    data(mtcars)
    expect_error(
      data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}"),
      regex = "The object"
    )
  })
)

withr::with_environment(
  new.env(),
  test_that("data_rename glue-style, function in environment", {
    data(mtcars)
    my_fun <- function(cols_to_rename) {
      data_rename(head(mtcars)[, 1:6], cols_to_rename, "new_{col}")
    }
    expect_named(
      my_fun(c("mpg", "drat")),
      c("new_mpg", "cyl", "disp", "hp", "new_drat", "wt")
    )
    expect_named(
      my_fun("mpg"),
      c("new_mpg", "cyl", "disp", "hp", "drat", "wt")
    )
  })
)

test_that("works with lists", {
  result <- list(x = 1, y = 2)
  expect_error(
    data_rename(result, select = names(result), replacement = c("a", "b")),
    regex = "must be a data frame"
  )
})


================================================
FILE: tests/testthat/test-data_reorder.R
================================================
test_that("data_reorder works as expected", {
  expect_named(
    data_reorder(iris, c("Species", "Sepal.Length")),
    c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )

  expect_warning(expect_named(
    data_reorder(iris, c("Species", "dupa")),
    c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  ))
})


# preserve attributes --------------------------

test_that("data_reorder preserves attributes", {
  skip_if_not_installed("parameters")

  m <- lm(Sepal.Length ~ Species, data = iris)
  out <- parameters::parameters(m)
  a1 <- attributes(out)

  out2 <- data_reorder(out, 4:6)
  a2 <- attributes(out2)

  # attributes may not be in the same order
  expect_true(all(names(a1) %in% names(a2)))
  expect_length(a1, length(a2))
})


================================================
FILE: tests/testthat/test-data_replicate.R
================================================
test_that("data_replicate: simple use case", {
  data(mtcars)
  d <- head(mtcars)
  out <- data_replicate(d, "carb")
  expect_identical(dim(out), c(13L, 10L))
  expect_identical(
    out$disp,
    c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225)
  )
  expect_named(
    out,
    c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear")
  )

  out <- data_replicate(d, 11)
  expect_identical(dim(out), c(13L, 10L))
  expect_identical(
    out$disp,
    c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225)
  )
  expect_named(
    out,
    c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear")
  )

  d$mpg[5] <- NA
  out <- data_replicate(d, "carb")
  expect_identical(dim(out), c(13L, 10L))
  expect_identical(
    out$mpg,
    c(21, 21, 21, 21, 21, 21, 21, 21, 22.8, 21.4, NA, NA, 18.1)
  )
  expect_named(
    out,
    c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear")
  )

  d$carb[3] <- NA
  out <- data_replicate(d, "carb", remove_na = TRUE)
  expect_identical(dim(out), c(12L, 10L))
  expect_identical(
    out$mpg,
    c(21, 21, 21, 21, 21, 21, 21, 21, 21.4, NA, NA, 18.1)
  )
  expect_named(
    out,
    c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear")
  )

  out <- data_replicate(d, "carb", select = c("disp", "hp"), remove_na = TRUE)
  expect_identical(dim(out), c(12L, 2L))
  expect_identical(
    out$disp,
    c(160, 160, 160, 160, 160, 160, 160, 160, 258, 360, 360, 225)
  )
  expect_named(out, c("disp", "hp"))

  d <- data.frame(
    a = c("a", "b", "c"),
    b = 1:3,
    rep = c(3, 2, 4),
    stringsAsFactors = FALSE
  )
  out <- data_replicate(d, "rep")
  expect_identical(out$a, c("a", "a", "a", "b", "b", "c", "c", "c", "c"))
})


test_that("data_replicate: errors", {
  data(mtcars)
  d <- head(mtcars)
  expect_error(data_replicate(d), regex = "No column")
  expect_error(
    data_replicate(d, expand = c("mpg", "gear")),
    regex = "a single string"
  )
  expect_error(
    data_replicate(d, expand = "geas"),
    regex = "The column provided"
  )
  expect_error(
    data_replicate(d, expand = "qsec"),
    regex = "The column provided"
  )
  d$carb[3] <- NA
  expect_error(data_replicate(d, "carb"), regex = "missing values")
  d <- head(mtcars)
  d$carb[3] <- Inf
  expect_error(data_replicate(d, "carb"), regex = "infinite values")
})


test_that("data_replicate: don't simplify if only one column left", {
  a <- c(1, 2, 3, 4)
  b <- c(4, 3, 2, 1)
  nrtimes <- c(1, 2, 0, 1)

  d <- data.frame(a, b, nrtimes)
  out <- data_replicate(d, expand = "nrtimes")
  expect_identical(dim(out), c(4L, 2L))

  d <- data.frame(a, nrtimes)
  out <- data_replicate(d, expand = "nrtimes")
  expect_identical(dim(out), c(4L, 1L))
})


================================================
FILE: tests/testthat/test-data_rescale.R
================================================
test_that("rescale works as expected", {
  expect_equal(
    rescale(c(0, 1, 5, -5, -2), to = NULL),
    c(0, 1, 5, -5, -2),
    ignore_attr = TRUE
  )

  expect_equal(
    rescale(rep(NA_real_, 3)),
    rep(NA_real_, 3),
    ignore_attr = TRUE
  )

  expect_message(rescale(iris$Species))

  expect_equal(
    rescale(c(0, 1, 5, -5, -2)),
    c(50, 60, 100, 0, 30),
    ignore_attr = TRUE
  )

  expect_equal(
    rescale(c(0, 1, 5, -5, -2), to = c(-5, 5)),
    c(0, 1, 5, -5, -2),
    ignore_attr = TRUE
  )

  expect_equal(
    rescale(c(1, 3, 4), to = c(0, 40), range = c(0, 4)),
    c(10, 30, 40),
    ignore_attr = TRUE
  )

  expect_snapshot(head(rescale(iris, to = c(0, 1))))

  expect_snapshot(head(rescale(iris, to = c(0, 1), select = "Sepal.Length")))

  expect_snapshot(
    head(rescale(
      iris,
      to = list(
        Sepal.Length = c(0, 1),
        Petal.Length = c(-1, 0)
      )
    ))
  )
})


test_that("rescale works with select helpers", {
  out <- rescale(iris, to = c(0, 1), select = c("Sepal.Width", "Sepal.Length"))
  expect_equal(
    head(out$Sepal.Width),
    c(0.625, 0.41667, 0.5, 0.45833, 0.66667, 0.79167),
    tolerance = 1e-3
  )
  expect_equal(
    head(out$Petal.Length),
    head(iris$Petal.Length),
    tolerance = 1e-3
  )

  # check class attributes
  expect_identical(
    vapply(out, class, character(1)),
    c(
      Sepal.Length = "numeric",
      Sepal.Width = "numeric",
      Petal.Length = "numeric",
      Petal.Width = "numeric",
      Species = "factor"
    )
  )

  out <- rescale(iris, to = c(0, 1), select = starts_with("Sepal"))
  expect_equal(
    head(out$Sepal.Width),
    c(0.625, 0.41667, 0.5, 0.45833, 0.66667, 0.79167),
    tolerance = 1e-3
  )
  expect_equal(
    head(out$Petal.Length),
    head(iris$Petal.Length),
    tolerance = 1e-3
  )

  skip_if_not_installed("poorman")

  x <- poorman::group_by(iris, Species)
  out <- rescale(x, to = c(0, 1), select = starts_with("Sepal"))
  expect_equal(
    head(out$Sepal.Width),
    c(0.57143, 0.33333, 0.42857, 0.38095, 0.61905, 0.7619),
    tolerance = 1e-3
  )
  expect_equal(
    head(out$Petal.Length),
    head(iris$Petal.Length),
    tolerance = 1e-3
  )
})


# grouped df ------------------------------
test_that("rescale works grouped df and append", {
  out <- rescale(
    iris,
    to = c(0, 1),
    select = c("Sepal.Width", "Sepal.Length"),
    append = TRUE
  )
  expect_equal(
    head(out$Sepal.Width_r),
    c(0.625, 0.41667, 0.5, 0.45833, 0.66667, 0.79167),
    tolerance = 1e-3
  )
  expect_equal(
    head(out$Petal.Length),
    head(iris$Petal.Length),
    tolerance = 1e-3
  )
  expect_identical(
    colnames(out),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "Sepal.Width_r",
      "Sepal.Length_r"
    )
  )

  skip_if_not_installed("poorman")

  x <- poorman::group_by(iris, Species)
  out <- rescale(x, to = c(0, 1), select = starts_with("Sepal"), append = TRUE)
  expect_equal(
    head(out$Sepal.Width_r),
    c(0.57143, 0.33333, 0.42857, 0.38095, 0.61905, 0.7619),
    tolerance = 1e-3
  )
  expect_equal(
    head(out$Petal.Length),
    head(iris$Petal.Length),
    tolerance = 1e-3
  )
  expect_identical(
    colnames(out),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "Sepal.Length_r",
      "Sepal.Width_r"
    )
  )
})


# select helpers ------------------------------
test_that("data_rescale regex", {
  expect_equal(
    rescale(mtcars, select = "pg", regex = TRUE)$mpg,
    rescale(mtcars, select = "mpg")$mpg,
    ignore_attr = TRUE
  )
})


# expanding range ------------------------------
test_that("data_rescale can expand range", {
  # for vectors
  x <- 5:15
  expect_equal(
    rescale(x, multiply = 1.1),
    c(4.5, 5.6, 6.7, 7.8, 8.9, 10, 11.1, 12.2, 13.3, 14.4, 15.5),
    ignore_attr = TRUE
  )
  expect_equal(
    rescale(x, multiply = 1.1),
    rescale(x, add = 0.5),
    ignore_attr = TRUE
  )
  expect_error(rescale(x, multiply = 0.9, add = 1), regex = "Only one of")
  expect_error(rescale(x, multiply = c(1.2, 1.4)), regex = "The length of")

  # different values for add
  expect_equal(
    rescale(x, add = c(1, 3)),
    c(4, 5.4, 6.8, 8.2, 9.6, 11, 12.4, 13.8, 15.2, 16.6, 18),
    ignore_attr = TRUE
  )
  expect_error(rescale(x, add = 1:3), regex = "The length of")

  # works with NA
  expect_equal(
    rescale(rep(NA_real_, 3), multiply = 1.1),
    rep(NA_real_, 3),
    ignore_attr = TRUE
  )
  expect_equal(
    rescale(rep(NA_real_, 3), add = 2),
    rep(NA_real_, 3),
    ignore_attr = TRUE
  )

  # for data frames
  d <- data.frame(x = 5:15, y = 5:15)
  expect_equal(
    rescale(d, multiply = 1.1),
    rescale(d, add = 0.5),
    ignore_attr = TRUE
  )
  expect_equal(
    rescale(d, multiply = list(x = 1.1, y = 0.5)),
    rescale(d, add = list(x = 0.5, y = -2.5)),
    ignore_attr = TRUE
  )
  # data frames accept multiple add-values per column
  out <- rescale(d, add = list(x = c(1, 3), y = c(2, 4)))
  expect_equal(
    out$x,
    rescale(d$x, add = c(1, 3)),
    ignore_attr = TRUE
  )
  expect_equal(
    out$y,
    rescale(d$y, add = c(2, 4)),
    ignore_attr = TRUE
  )

  expect_error(rescale(d, multiply = 0.9, add = 1), regex = "Only one of")
  expect_error(
    rescale(d, multiply = list(x = 0.9, y = 2), add = list(y = 1)),
    regex = "Only one of"
  )
  expect_error(rescale(d, multiply = c(0.9, 1.5)), regex = "The length of")
})


================================================
FILE: tests/testthat/test-data_restoretype.R
================================================
test_that("data_restoretype works with reference", {
  data <- data.frame(
    Sepal.Length = c("1", "3", "2"),
    Species = c("setosa", "versicolor", "setosa"),
    New = c("1", "3", "4"),
    stringsAsFactors = FALSE
  )

  fixed <- data_restoretype(data, reference = iris)

  expect_equal(typeof(fixed$Species), typeof(iris$Species))
  expect_equal(typeof(fixed$Sepal.Length), typeof(iris$Sepal.Length))
  expect_equal(typeof(fixed$New), "character")
})


test_that("data_restoretype works without reference", {
  data <- data.frame(
    Sepal.Length = c("1", "3", "2"),
    Species = c("setosa", "versicolor", "setosa"),
    New = c("1", "3", "4"),
    stringsAsFactors = FALSE
  )

  expect_equal(
    data_restoretype(data, reference = NULL),
    data.frame(
      Sepal.Length = c(1, 3, 2),
      Species = c("setosa", "versicolor", "setosa"),
      New = c(1, 3, 4),
      stringsAsFactors = FALSE
    )
  )
})


================================================
FILE: tests/testthat/test-data_reverse.R
================================================
# explanation of how reverse works:
# https://github.com/easystats/datawizard/issues/106#issuecomment-1066628399

test_that("reverse works with numeric", {
  expect_identical(
    reverse(1:5),
    as.double(5:1)
  )
  expect_identical(
    reverse(-2:2),
    as.double(2:-2)
  )
})

test_that("reverse works with factor", {
  expect_identical(
    reverse(factor(1:5)),
    factor(5:1)
  )
  expect_identical(
    reverse(factor(-2:2)),
    factor(2:-2)
  )
})

test_that("reverse works with data frame", {
  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse(test, select = "x"),
    data.frame(
      x = as.double(5:1),
      y = c(3, 8, 2, 5, 1)
    )
  )
  expect_identical(
    reverse(test, exclude = "x"),
    data.frame(
      x = 1:5,
      y = c(6, 1, 7, 4, 8)
    )
  )
  expect_identical(
    reverse(test),
    data.frame(
      x = as.double(5:1),
      y = c(6, 1, 7, 4, 8)
    )
  )
})

test_that("reverse works with data frame and append", {
  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse(test, select = "x", append = TRUE),
    data.frame(
      x = 1:5,
      y = c(3, 8, 2, 5, 1),
      x_r = as.double(5:1)
    )
  )
  expect_identical(
    reverse(test, append = TRUE),
    data.frame(
      x = 1:5,
      y = c(3, 8, 2, 5, 1),
      x_r = as.double(5:1),
      y_r = c(6, 1, 7, 4, 8)
    )
  )
})

test_that("reverse: arg 'select' works with formula", {
  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse(test, select = ~x),
    data.frame(
      x = as.double(5:1),
      y = c(3, 8, 2, 5, 1)
    )
  )
  expect_identical(
    reverse(test, select = ~ x + y),
    data.frame(
      x = as.double(5:1),
      y = c(6, 1, 7, 4, 8)
    )
  )
})

test_that("reverse: arg 'exclude' works with formula", {
  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse(test, exclude = ~x),
    data.frame(
      x = 1:5,
      y = c(6, 1, 7, 4, 8)
    )
  )
  expect_identical(
    reverse(test, exclude = ~ x + y),
    test
  )
})

test_that("reverse: argument 'range' works", {
  expect_identical(
    reverse(c(1, 3, 4), range = c(0, 4)),
    c(3, 1, 0)
  )
  expect_identical(
    reverse(factor(c(1, 2, 3, 4, 5)), range = 0:10),
    factor(9:5, levels = 0:10)
  )

  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse(test, select = "x", range = c(0, 8)),
    data.frame(
      x = as.double(7:3),
      y = c(3, 8, 2, 5, 1)
    )
  )
  expect_identical(
    reverse(test, range = c(0, 8)),
    data.frame(
      x = as.double(7:3),
      y = c(5, 0, 6, 3, 7)
    )
  )
})

test_that("reverse ignores NA", {
  expect_identical(
    reverse(c(1, 2, 8, NA)),
    c(8, 7, 1, NA)
  )
})

test_that("reverse returns NA if only NA provided", {
  expect_identical(
    reverse(c(NA_real_, NA_real_)),
    c(NA_real_, NA_real_)
  )
  expect_identical(
    reverse(factor(c(NA, NA))),
    factor(c(NA, NA))
  )
})

test_that("reverse warns if single value to reverse", {
  expect_warning(
    reverse(1),
    regexp = "A `range` must be provided for data with only one unique value."
  )
  expect_warning(
    reverse(factor(1)),
    regexp = "A `range` must be provided for data with only one unique value."
  )
})

test_that("reverse msg for unsupported", {
  expect_message(reverse(as.Date(c("2022-04-24", "2022-04-23"))))
})


# Same tests with reverse_scale (alias) --------------------------

test_that("reverse_scale works with numeric", {
  expect_identical(
    reverse_scale(1:5),
    as.double(5:1)
  )
  expect_identical(
    reverse_scale(-2:2),
    as.double(2:-2)
  )
})

test_that("reverse_scale works with factor", {
  expect_identical(
    reverse_scale(factor(1:5)),
    factor(5:1)
  )
  expect_identical(
    reverse_scale(factor(-2:2)),
    factor(2:-2)
  )
})

test_that("reverse_scale works with data frame", {
  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse_scale(test, select = "x"),
    data.frame(
      x = as.double(5:1),
      y = c(3, 8, 2, 5, 1)
    )
  )
  expect_identical(
    reverse_scale(test, exclude = "x"),
    data.frame(
      x = 1:5,
      y = c(6, 1, 7, 4, 8)
    )
  )
  expect_identical(
    reverse_scale(test),
    data.frame(
      x = as.double(5:1),
      y = c(6, 1, 7, 4, 8)
    )
  )
})

test_that("reverse_scale: arg 'select' works with formula", {
  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse_scale(test, select = ~x),
    data.frame(
      x = as.double(5:1),
      y = c(3, 8, 2, 5, 1)
    )
  )
  expect_identical(
    reverse_scale(test, select = ~ x + y),
    data.frame(
      x = as.double(5:1),
      y = c(6, 1, 7, 4, 8)
    )
  )
})

test_that("reverse_scale: arg 'exclude' works with formula", {
  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse_scale(test, exclude = ~x),
    data.frame(
      x = 1:5,
      y = c(6, 1, 7, 4, 8)
    )
  )
  expect_identical(
    reverse_scale(test, exclude = ~ x + y),
    test
  )
})

test_that("reverse_scale: argument 'range' works", {
  expect_identical(
    reverse_scale(c(1, 3, 4), range = c(0, 4)),
    c(3, 1, 0)
  )
  expect_identical(
    reverse_scale(factor(c(1, 2, 3, 4, 5)), range = 0:10),
    factor(9:5, levels = 0:10)
  )

  test <- data.frame(
    x = 1:5,
    y = c(3, 8, 2, 5, 1)
  )
  expect_identical(
    reverse_scale(test, select = "x", range = c(0, 8)),
    data.frame(
      x = as.double(7:3),
      y = c(3, 8, 2, 5, 1)
    )
  )
  expect_identical(
    reverse_scale(test, range = c(0, 8)),
    data.frame(
      x = as.double(7:3),
      y = c(5, 0, 6, 3, 7)
    )
  )
})

test_that("reverse_scale ignores NA", {
  expect_identical(
    reverse_scale(c(1, 2, 8, NA)),
    c(8, 7, 1, NA)
  )
})

test_that("reverse_scale returns NA if only NA provided", {
  expect_identical(
    reverse_scale(c(NA_real_, NA_real_)),
    c(NA_real_, NA_real_)
  )
  expect_identical(
    reverse_scale(factor(c(NA, NA))),
    factor(c(NA, NA))
  )
})

test_that("reverse_scale warns if single value to reverse", {
  expect_warning(
    reverse_scale(1),
    regexp = "A `range` must be provided for data with only one unique value."
  )
  expect_warning(
    reverse_scale(factor(1)),
    regexp = "A `range` must be provided for data with only one unique value."
  )
})


test_that("reverse_scale select helpers", {
  data(iris)
  out <- rescale(
    iris,
    to = list(
      Sepal.Length = c(0, 1),
      Petal.Length = c(-1, 0)
    ),
    select = ends_with("length")
  )

  expect_identical(out$Sepal.Length, iris$Sepal.Length, tolerance = 1e-3)

  out <- rescale(
    iris,
    to = list(
      Sepal.Length = c(0, 1),
      Petal.Length = c(-1, 0)
    ),
    select = ends_with("length"),
    ignore_case = TRUE
  )

  expect_identical(
    head(out$Sepal.Length),
    c(0.22222, 0.16667, 0.11111, 0.08333, 0.19444, 0.30556),
    tolerance = 1e-3
  )
})


# with grouped data -------------------------------------------

set.seed(123)
value1 <- sample(1:10, 6, replace = TRUE)
set.seed(456)
value2 <- sample(1:10, 6, replace = TRUE)

test_df <- data.frame(
  id = rep(c("A", "B"), each = 3),
  value1 = value1,
  value2 = value2,
  stringsAsFactors = FALSE
)

test_that("reverse works with data frames (grouped data)", {
  skip_if_not_installed("poorman")

  expect_identical(
    test_df %>%
      poorman::group_by(id) %>%
      reverse(exclude = "id") %>%
      poorman::ungroup(),
    data.frame(
      id = rep(c("A", "B"), each = 3),
      value1 = c(10, 10, 3, 6, 2, 3),
      value2 = c(4, 6, 3, 10, 6, 5),
      stringsAsFactors = FALSE
    )
  )
})


test_that("reverse works with grouped data frames and append", {
  skip_if_not_installed("poorman")

  test <- data.frame(
    x = 1:6,
    y = c(3, 8, 2, 5, 1, 4),
    grp = rep(c("a", "b"), 3),
    stringsAsFactors = FALSE
  )
  expect_identical(
    test %>%
      poorman::group_by(grp) %>%
      reverse(append = TRUE) %>%
      poorman::ungroup(),
    data.frame(
      x = 1:6,
      y = c(3, 8, 2, 5, 1, 4),
      grp = rep(c("a", "b"), 3),
      x_r = as.double(c(5, 6, 3, 4, 1, 2)),
      y_r = as.double(c(1, 4, 2, 7, 3, 8)),
      stringsAsFactors = FALSE
    )
  )
})


set.seed(789)
value1 <- sample(c(1:10, NA), 6, replace = TRUE)
set.seed(10)
value2 <- sample(c(1:10, NA), 6, replace = TRUE)

test_df <- data.frame(
  id = rep(c("A", "B"), each = 3),
  value1 = value1,
  value2 = value2,
  stringsAsFactors = FALSE
)

test_that("reverse works with data frames containing NAs (grouped data)", {
  skip_if_not_installed("poorman")

  expect_identical(
    test_df %>%
      poorman::group_by(id) %>%
      reverse(exclude = "id") %>%
      poorman::ungroup(),
    data.frame(
      id = rep(c("A", "B"), each = 3),
      value1 = c(10, 4, 4, 5, 3, 4),
      value2 = c(NA, 10, 9, 7, 6, 8),
      stringsAsFactors = FALSE
    )
  )
})

# select helpers ------------------------------
test_that("reverse regex", {
  expect_identical(
    reverse(mtcars, select = "arb", regex = TRUE),
    reverse(mtcars, select = "carb")
  )
})


# work or give informative errors / warnings (#380) ------------------
test_that("reverse, larger range", {
  # works
  expect_identical(
    reverse(c(1, 3, 4), range = c(0, 4)),
    c(3, 1, 0)
  )
  expect_identical(
    reverse(factor(c(1, 3, 4)), range = 0:4),
    structure(
      c(4L, 2L, 1L),
      levels = c("0", "1", "2", "3", "4"),
      class = "factor"
    )
  )
  expect_identical(
    reverse(factor(c(1, 3, 4)), range = c(0, 4)),
    structure(
      c(4L, 2L, 1L),
      levels = c("0", "1", "2", "3", "4"),
      class = "factor"
    )
  )

  # errors on invalid input
  expect_error(reverse(c(1, 3, 4), range = 0:4))
  expect_error(reverse(factor(c(1, 3, 4, 5)), range = c(0, 2, 4)))
  # errors on invalid input (NA in range)
  expect_error(reverse(c(1, 3, 4), range = c(1, NA)), regex = "missing")
  expect_error(
    reverse(factor(letters[1:3]), range = c(1, NA)),
    regex = "missing"
  )

  # warns
  expect_warning(
    reverse(factor(c("a", "b", "c")), range = c(1, 3, 5, 7)),
    regex = "No current"
  )
  expect_warning(
    reverse(factor(c(9, 10, 11)), range = c(1, 3, 5, 7)),
    regex = "No current"
  )
  expect_warning(
    reverse(factor(c(1, 3, 11)), range = c(1, 3, 5, 7)),
    regex = "Not all"
  )

  # silent
  expect_silent(reverse(
    factor(c("a", "b", "c")),
    range = c(1, 3, 5, 7),
    verbose = FALSE
  ))
  expect_silent(reverse(
    factor(c(9, 10, 11)),
    range = c(1, 3, 5, 7),
    verbose = FALSE
  ))
  expect_silent(reverse(
    factor(c(1, 3, 11)),
    range = c(1, 3, 5, 7),
    verbose = FALSE
  ))

  # works as intended
  expect_identical(
    reverse(factor(c(1, 3, 11)), range = c(1, 3, 5, 7), verbose = FALSE),
    structure(c(4L, 3L, NA), levels = c("1", "3", "5", "7"), class = "factor")
  )
})


================================================
FILE: tests/testthat/test-data_rotate.R
================================================
test_that("rotate data works as expected", {
  df <- mtcars[1:3, 1:4]

  expect_equal(
    data_rotate(df),
    structure(
      list(
        `Mazda RX4` = c(21, 6, 160, 110),
        `Mazda RX4 Wag` = c(21, 6, 160, 110),
        `Datsun 710` = c(22.8, 4, 108, 93)
      ),
      class = "data.frame",
      row.names = c("mpg", "cyl", "disp", "hp")
    )
  )

  expect_equal(
    data_rotate(df, rownames = "property"),
    structure(
      list(
        property = c("mpg", "cyl", "disp", "hp"),
        `Mazda RX4` = c(21, 6, 160, 110),
        `Mazda RX4 Wag` = c(21, 6, 160, 110),
        `Datsun 710` = c(22.8, 4, 108, 93)
      ),
      class = "data.frame",
      row.names = c(NA, 4L)
    )
  )

  expect_equal(
    data_rotate(df, colnames = TRUE),
    structure(
      list(
        `21` = c(6, 160, 110),
        `21` = c(6, 160, 110),
        `22.8` = c(4, 108, 93)
      ),
      class = "data.frame",
      row.names = c("cyl", "disp", "hp")
    )
  )

  expect_equal(
    data_rotate(df, rownames = "property", colnames = TRUE),
    structure(
      list(
        property = c("cyl", "disp", "hp"),
        `21` = c(6, 160, 110),
        `21` = c(6, 160, 110),
        `22.8` = c(4, 108, 93)
      ),
      class = "data.frame",
      row.names = c(NA, 3L)
    )
  )
})

test_that("data_rotate, arg 'colnames' works", {
  df <- mtcars[1:3, 1:4]
  df <- rownames_as_column(df)

  expected <- data.frame(
    `Mazda RX4` = c(21, 6, 160, 110),
    `Mazda RX4 Wag` = c(21, 6, 160, 110),
    `Datsun 710` = c(22.8, 4, 108, 93),
    check.names = FALSE
  )
  row.names(expected) <- c("mpg", "cyl", "disp", "hp")

  expect_identical(
    data_rotate(df, colnames = "rowname"),
    expected
  )
})

test_that("data_rotate warns if mixed types of data", {
  df <- mtcars[1:3, 1:4]
  df <- rownames_as_column(df)

  expect_warning(
    data_rotate(df),
    "mixed types of data"
  )

  df$rowname <- factor(df$rowname)
  expect_warning(
    data_rotate(df),
    "mixed types of data"
  )
})


================================================
FILE: tests/testthat/test-data_seek.R
================================================
test_that("data_seek - simple use case", {
  data(iris)
  out <- data_seek(iris, "Length")
  expect_identical(out$index, c(1L, 3L))
  expect_identical(out$labels, c("Sepal.Length", "Petal.Length"))
})

test_that("data_seek - seek label attribute", {
  data(efc)
  out <- data_seek(efc, "dependency")
  expect_identical(out$index, which(colnames(efc) == out$column))
  expect_identical(out$labels, "elder's dependency")
})

test_that("data_seek - seek label attribute", {
  data(efc)
  out <- data_seek(efc, "female")
  expect_identical(nrow(out), 0L)
  out <- data_seek(efc, "female", seek = "all")
  expect_identical(out$index, which(colnames(efc) == out$column))
  expect_identical(out$labels, "elder's gender")
})

test_that("data_seek - fuzzy match", {
  data(iris)
  out <- data_seek(iris, "Lenght")
  expect_identical(nrow(out), 0L)
  out <- data_seek(iris, "Lenght", fuzzy = TRUE)
  expect_identical(out$index, which(colnames(iris) %in% out$column))
  expect_identical(out$labels, c("Sepal.Length", "Petal.Length"))
})

test_that("data_seek - fuzzy match, value labels", {
  data(efc)
  out <- data_seek(efc, "femlae", seek = "all", fuzzy = TRUE)
  expect_identical(nrow(out), 1L)
  expect_identical(out$index, which(colnames(efc) %in% out$column))
  expect_identical(out$labels, "elder's gender")
})

test_that("data_seek - multiple pattern", {
  data(efc)
  out <- data_seek(efc, c("e16", "e42"))
  expect_identical(nrow(out), 2L)
  expect_identical(out$index, which(colnames(efc) %in% out$column))
  expect_identical(out$labels, c("elder's gender", "elder's dependency"))
  # only one match, typo
  out <- data_seek(efc, c("femlae", "dependency"))
  expect_identical(nrow(out), 1L)
  expect_identical(out$index, which(colnames(efc) %in% out$column))
  expect_identical(out$labels, "elder's dependency")
  # only one match, not searching in value labels
  out <- data_seek(efc, c("female", "dependency"))
  expect_identical(nrow(out), 1L)
  expect_identical(out$index, which(colnames(efc) %in% out$column))
  expect_identical(out$labels, "elder's dependency")
  # two matches
  out <- data_seek(efc, c("female", "dependency"), seek = "all")
  expect_identical(nrow(out), 2L)
  expect_identical(out$index, which(colnames(efc) %in% out$column))
  expect_identical(out$labels, c("elder's gender", "elder's dependency"))
  # only one match, typo
  out <- data_seek(efc, c("femlae", "dependency"), seek = "all")
  expect_identical(nrow(out), 1L)
  expect_identical(out$index, which(colnames(efc) %in% out$column))
  expect_identical(out$labels, "elder's dependency")
  # two matches, despite typo
  out <- data_seek(efc, c("femlae", "dependency"), seek = "all", fuzzy = TRUE)
  expect_identical(nrow(out), 2L)
  expect_identical(out$index, which(colnames(efc) %in% out$column))
  expect_identical(out$labels, c("elder's gender", "elder's dependency"))
})

test_that("data_seek - valid input", {
  expect_error(
    data_seek(rnorm(10), "Length"),
    regex = "`data` must be a data frame."
  )
  expect_error(
    data_seek(iris, "Length", seek = "somewhere"),
    regex = "`seek` must be"
  )
})

test_that("data_seek - print", {
  expect_snapshot(data_seek(iris, "Length"))
  expect_snapshot(data_seek(iris, "abc"))
})


================================================
FILE: tests/testthat/test-data_select.R
================================================
# input check ---------------------

test_that("data_select checks for data frame", {
  expect_error(data_select(NULL), regexp = "provided")
  x <- list(a = 1:2, b = letters[1:3])
  expect_error(data_select(x), regexp = "coerced")
})


# select helpers ---------------------

test_that("data_select works with select helpers", {
  expect_identical(
    data_select(iris, starts_with("Sepal")),
    iris[c("Sepal.Length", "Sepal.Width")]
  )

  expect_identical(
    data_select(iris, ends_with("Width")),
    iris[c("Sepal.Width", "Petal.Width")]
  )

  expect_identical(
    data_select(iris, regex("\\.")),
    iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
  )

  expect_identical(
    data_select(iris, contains("Wid")),
    iris[c("Sepal.Width", "Petal.Width")]
  )
})


# select helpers, negation ---------------------

test_that("data_select works with negation of select helpers", {
  expect_identical(
    data_select(iris, -starts_with("Sepal")),
    iris[c("Petal.Length", "Petal.Width", "Species")]
  )

  expect_identical(
    data_select(iris, -ends_with("Width")),
    iris[c("Sepal.Length", "Petal.Length", "Species")]
  )
})


# select-nse with function  ---------------------

test_that("data_select works with select-functions", {
  expect_identical(
    data_select(iris, is.numeric()),
    iris[sapply(iris, is.numeric)]
  )

  expect_identical(
    data_select(iris, is.numeric),
    iris[sapply(iris, is.numeric)]
  )

  expect_identical(
    data_select(iris, is.factor()),
    iris[sapply(iris, is.factor)]
  )

  expect_identical(
    data_select(iris, is.factor),
    iris[sapply(iris, is.factor)]
  )

  expect_warning(expect_null(data_select(iris, is.logical())))
})


# select-nse with user-function  ---------------------
testfun <- function(i) {
  is.numeric(i) && mean(i, na.rm = TRUE) > 3.5
}
test_that("data_select works with user-defined select-functions", {
  expect_identical(data_select(iris, testfun), iris[sapply(iris, testfun)])
  expect_identical(data_select(iris, -testfun), iris[!sapply(iris, testfun)])

  testfun2 <- function(i) {
    is.numeric(i) && mean(i, na.rm = TRUE) < 5
  }
  expect_identical(
    data_select(iris, select = testfun, exclude = testfun2),
    iris["Sepal.Length"]
  )
  expect_identical(
    data_select(iris, select = testfun, exclude = -testfun2),
    iris["Petal.Length"]
  )
})


# select-nse with negation of functions  ---------------------

test_that("data_select works with negated select-functions", {
  expect_identical(
    data_select(iris, -is.numeric()),
    iris[sapply(iris, function(i) !is.numeric(i))] # nolint
  )

  expect_identical(
    data_select(iris, -is.numeric),
    iris[sapply(iris, function(i) !is.numeric(i))] # nolint
  )

  expect_identical(
    data_select(iris, -is.factor()),
    iris[sapply(iris, function(i) !is.factor(i))] # nolint
  )

  expect_identical(
    data_select(iris, -is.factor),
    iris[sapply(iris, function(i) !is.factor(i))] # nolint
  )

  expect_identical(data_select(iris, -is.logical), iris)
})


# select-nse with ranges  ---------------------

test_that("data_select works with ranges", {
  expect_identical(
    data_select(iris, 2:3),
    iris[2:3]
  )

  expect_identical(
    data_select(iris, Sepal.Width:Petal.Length),
    iris[2:3]
  )
})


# select-nse with negated ranges  ---------------------

test_that("data_select works with negated ranges", {
  expect_identical(
    data_select(iris, -(1:2)),
    iris[c(3, 4, 5)]
  )

  expect_identical(
    data_select(iris, -1:-2),
    iris[c(3, 4, 5)]
  )

  expect_identical(
    data_select(iris, exclude = -1:-2),
    iris[1:2]
  )

  expect_identical(
    data_select(iris, exclude = 2:3),
    iris[c(1, 4, 5)]
  )

  expect_error(
    data_select(iris, -Sepal.Width:Petal.Length),
    "can't mix negative and positive"
  )
  expect_identical(
    data_select(iris, -(Sepal.Width:Petal.Length)),
    iris[c(1, 4, 5)]
  )
})


# select-nse with formulas  ---------------------

test_that("data_select works with formulas", {
  expect_identical(
    data_select(iris, ~ Sepal.Width + Petal.Length),
    iris[2:3]
  )

  expect_identical(
    data_select(iris, exclude = ~ Sepal.Width + Petal.Length),
    iris[c(1, 4, 5)]
  )
})


# select-nse, other cases ---------------------

test_that("data_select works, other cases", {
  expect_identical(data_select(iris), iris)

  expect_identical(
    data_select(iris, c("Petal.Width", "Sepal.Length")),
    iris[c("Petal.Width", "Sepal.Length")]
  )

  expect_identical(
    data_select(iris, -c("Petal.Width", "Sepal.Length")),
    iris[setdiff(colnames(iris), c("Petal.Width", "Sepal.Length"))]
  )

  expect_identical(
    data_select(iris, -Petal.Width),
    iris[setdiff(colnames(iris), "Petal.Width")]
  )

  expect_identical(
    data_select(mtcars, c("am", "gear", "cyl")),
    mtcars[c("am", "gear", "cyl")]
  )

  expect_identical(
    data_select(mtcars, c("vam", "gear", "cyl")),
    mtcars[c("gear", "cyl")]
  )

  expect_warning(expect_null(data_select(mtcars, ends_with("abc"))))

  expect_identical(
    data_select(mtcars, regex("rb$")),
    mtcars["carb"]
  )

  expect_identical(
    data_select(mtcars, regex("^c")),
    mtcars[c("cyl", "carb")]
  )

  expect_warning(expect_null(data_select(mtcars, "^c")))

  expect_identical(
    data_select(mtcars, regex("^C"), ignore_case = TRUE),
    mtcars[c("cyl", "carb")]
  )
})


# select-nse works when called from other function  ---------------------

test_that("data_select from other functions", {
  test_fun1 <- function(data, i) {
    data_select(data, select = i)
  }
  expect_identical(
    test_fun1(iris, c("Sepal.Length", "Sepal.Width")),
    iris[c("Sepal.Length", "Sepal.Width")]
  )

  expect_identical(
    test_fun1(iris, starts_with("Sep")),
    iris[c("Sepal.Length", "Sepal.Width")]
  )

  test_fun1a <- function(data, i) {
    data_select(data, select = i, regex = TRUE)
  }
  expect_identical(
    test_fun1a(iris, "Sep"),
    iris[c("Sepal.Length", "Sepal.Width")]
  )

  test_fun1b <- function(data, i) {
    data_select(data, select = i, regex = TRUE)
  }
  expect_identical(
    test_fun1b(iris, "Width$"),
    iris[c("Sepal.Width", "Petal.Width")]
  )

  test_fun1c <- function(data, i) {
    data_select(data, select = -i)
  }
  expect_identical(
    test_fun1c(iris, c("Sepal.Length", "Sepal.Width")),
    iris[c("Petal.Length", "Petal.Width", "Species")]
  )

  test_fun2 <- function(data) {
    data_select(data, select = starts_with("Sep"))
  }
  expect_identical(
    test_fun2(iris),
    iris[c("Sepal.Length", "Sepal.Width")]
  )

  test_fun3 <- function(data) {
    i <- "Sep"
    data_select(data, select = starts_with(i))
  }
  expect_identical(
    test_fun3(iris),
    iris[, c("Sepal.Length", "Sepal.Width")]
  )

  test_top <- function(x) {
    testfun1 <- function(i) {
      is.numeric(i) && mean(i, na.rm = TRUE) > 3.5
    }
    testfun2 <- function(i) {
      is.numeric(i) && mean(i, na.rm = TRUE) < 5
    }
    data_select(x, select = testfun, exclude = -testfun2)
  }
  expect_identical(test_top(iris), iris["Petal.Length"])
})


# preserve attributes --------------------------

test_that("data_select preserves attributes", {
  skip_if_not_installed("parameters")

  m <- lm(Sepal.Length ~ Species, data = iris)
  out <- parameters::parameters(m)
  a1 <- attributes(out)

  out2 <- data_select(out, 1:3)
  a2 <- attributes(out2)

  expect_identical(sort(names(a1)), sort(names(a2)))
})

# Select helpers work in functions and loops

test_that("select helpers work in functions and loops", {
  foo <- function(data, i) {
    extract_column_names(data, select = starts_with(i))
  }
  expect_identical(
    foo(iris, "Sep"),
    c("Sepal.Length", "Sepal.Width")
  )

  for (i in "Sepal") {
    x <- extract_column_names(iris, select = starts_with(i))
  }
  expect_identical(
    x,
    c("Sepal.Length", "Sepal.Width")
  )

  for (i in "Length") {
    x <- extract_column_names(iris, select = ends_with(i))
  }
  expect_identical(
    x,
    c("Sepal.Length", "Petal.Length")
  )
})

test_that("select helpers work in functions and loops even if there's an object with the same name in the environment above", {
  i <- "Petal"
  foo <- function(data, i) {
    extract_column_names(data, select = starts_with(i))
  }
  expect_identical(
    foo(iris, "Sep"),
    c("Sepal.Length", "Sepal.Width")
  )

  for (i in "Sepal") {
    x <- extract_column_names(iris, select = starts_with(i))
  }
  expect_identical(
    x,
    c("Sepal.Length", "Sepal.Width")
  )

  i <- "Width"

  for (i in "Length") {
    x <- extract_column_names(iris, select = ends_with(i))
  }
  expect_identical(
    x,
    c("Sepal.Length", "Petal.Length")
  )
})

test_that("old solution still works", {
  foo <- function(data) {
    i <- "Sep"
    extract_column_names(data, select = i, regex = TRUE)
  }
  expect_identical(
    foo(iris),
    c("Sepal.Length", "Sepal.Width")
  )
})

test_that("data_select renames variables on the fly", {
  data(mtcars)
  expect_named(
    data_select(mtcars, c(new = "mpg", old = "cyl", hoho = "wt")),
    c("new", "old", "hoho")
  )
  expect_named(
    data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt")),
    c("new", "cyl", "hoho")
  )
  expect_named(
    data_select(mtcars, c("mpg", "cyl", "wt")),
    c("mpg", "cyl", "wt")
  )
  # don't fail for non-existing columns
  expect_named(
    data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt", test = "grea")),
    c("new", "cyl", "hoho")
  )
  # check that excluded variables don't cause troubles
  expect_named(
    data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt"), exclude = "wt"),
    c("new", "cyl")
  )
  # error when names are not unique
  expect_error(
    data_select(mtcars, c(new = "mpg", old = "cyl", new = "wt")), # nolint
    regex = "Following names are duplicated"
  )
  expect_error(
    data_select(mtcars, c(new = "mpg", "cyl", cyl = "wt")), # nolint
    regex = "Following names are duplicated"
  )
  # when new name is used in exclude, it should be ignored
  expect_named(
    data_select(mtcars, c(drat = "mpg"), exclude = "drat"),
    "drat"
  )
})


================================================
FILE: tests/testthat/test-data_separate.R
================================================
test_that("data_separate: simple use case", {
  # simple case
  d_sep <- data.frame(
    x = c("1.a.6", "2.b.7", "3.c.8"),
    stringsAsFactors = FALSE
  )

  expect_error(data_separate(d_sep), regex = "Either")

  # basic
  expect_silent(data_separate(d_sep, guess_columns = "mode", verbose = FALSE))
  expect_silent({
    out <- data_separate(d_sep, guess_columns = "mode")
  })
  expect_identical(colnames(out), c("x_1", "x_2", "x_3"))
  expect_identical(out$x_1, c("1", "2", "3"))
  expect_identical(out$x_2, c("a", "b", "c"))

  # manual separator char
  out2 <- data_separate(
    d_sep,
    separator = "\\.",
    guess_columns = "mode",
    verbose = FALSE
  )
  expect_identical(out, out2)

  # non-existing separator char
  expect_message(
    data_separate(d_sep, separator = "_", guess_columns = "mode"),
    regex = "Separator probably not found"
  )

  # column names
  out <- data_separate(
    d_sep,
    new_columns = c("A1", "B2", "C3"),
    verbose = FALSE
  )
  expect_identical(colnames(out), c("A1", "B2", "C3"))
  expect_identical(out$A1, c("1", "2", "3"))
  expect_identical(out$B2, c("a", "b", "c"))

  out <- data_separate(d_sep, new_columns = letters[1:3], append = TRUE)
  expect_equal(
    out,
    data.frame(
      x = c("1.a.6", "2.b.7", "3.c.8"),
      a = c("1", "2", "3"),
      b = c("a", "b", "c"),
      c = c("6", "7", "8"),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE
  )
})


test_that("data_separate: convert between data_unite and data_separate", {
  d_unite <- data.frame(
    x = as.character(c(NA, 1:3)),
    y = c(letters[1:3], NA_character_),
    z = as.character(6:9),
    m = c("X", NA_character_, "Y", "Z"),
    n = c("NATION", "COUNTRY", "NATION", NA_character_),
    stringsAsFactors = FALSE
  )

  out1 <- data_unite(d_unite, new_column = "test")
  d_sep <- data_separate(
    out1,
    new_columns = c("x", "y", "z", "m", "n"),
    separator = "_"
  )

  expect_identical(d_unite, d_sep)
})


test_that("data_separate: different number of values", {
  d_sep <- data.frame(
    x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
    stringsAsFactors = FALSE
  )

  # basic use-case
  expect_silent(data_separate(d_sep, guess_columns = "mode", verbose = FALSE))
  expect_message(
    expect_message(
      expect_message(
        data_separate(d_sep, guess_columns = "mode"),
        regex = "3 columns"
      ),
      regex = "have been dropped"
    ),
    regex = "filled with `NA`"
  )
  out <- data_separate(d_sep, guess_columns = "mode", verbose = FALSE)
  expect_identical(colnames(out), c("x_1", "x_2", "x_3"))
  expect_identical(out$x_1, c("1", "2", "3", "5"))
  expect_identical(out$x_2, c("a", "b", "c", "j"))
  expect_identical(out$x_3, c("6", "7", "8", NA))

  # fill missings left
  out <- data_separate(
    d_sep,
    guess_columns = "mode",
    fill = "left",
    verbose = FALSE
  )
  expect_identical(colnames(out), c("x_1", "x_2", "x_3"))
  expect_identical(out$x_1, c("1", "2", "3", NA))
  expect_identical(out$x_2, c("a", "b", "c", "5"))
  expect_identical(out$x_3, c("6", "7", "8", "j"))

  # merge extra right
  out <- data_separate(
    d_sep,
    guess_columns = "mode",
    extra = "merge_right",
    verbose = FALSE
  )
  expect_identical(colnames(out), c("x_1", "x_2", "x_3"))
  expect_identical(out$x_1, c("1", "2", "3", "5"))
  expect_identical(out$x_2, c("a", "b", "c", "j"))
  expect_identical(out$x_3, c("6", "7 d", "8", NA))

  # max columns
  out <- data_separate(d_sep, guess_columns = "max", verbose = FALSE)
  expect_equal(
    out,
    data.frame(
      x_1 = c("1", "2", "3", "5"),
      x_2 = c("a", "b", "c", "j"),
      x_3 = c("6", "7", "8", NA),
      x_4 = c(NA, "d", NA, NA),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE
  )

  # min columns
  out <- data_separate(d_sep, guess_columns = "min", verbose = FALSE)
  expect_equal(
    out,
    data.frame(
      x_1 = c("1", "2", "3", "5"),
      x_2 = c("a", "b", "c", "j"),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE
  )

  out <- data_separate(
    d_sep,
    guess_columns = "min",
    extra = "merge_left",
    verbose = FALSE
  )
  expect_equal(
    out,
    data.frame(
      x_1 = c("1 a", "2 b 7", "3 c", "5"),
      x_2 = c("6", "d", "8", "j"),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE
  )

  out <- data_separate(
    d_sep,
    guess_columns = "max",
    fill = "left",
    verbose = FALSE
  )
  expect_equal(
    out,
    data.frame(
      x_1 = c(NA, "2", NA, NA),
      x_2 = c("1", "b", "3", NA),
      x_3 = c("a", "7", "c", "5"),
      x_4 = c("6", "d", "8", "j"),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE
  )
})


test_that("data_separate: multiple columns", {
  d_sep <- data.frame(
    x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
    y = c("m.n.99", "77.f.g", "44.9", NA),
    stringsAsFactors = FALSE
  )

  # select works
  out <- data_separate(
    d_sep,
    select = "x",
    guess_columns = "mode",
    verbose = FALSE
  )
  expect_identical(colnames(out), c("y", "x_1", "x_2", "x_3"))
  expect_identical(out$x_1, c("1", "2", "3", "5"))
  expect_identical(out$x_2, c("a", "b", "c", "j"))
  expect_identical(out$x_3, c("6", "7", "8", NA))

  out <- data_separate(d_sep, guess_columns = "mode", verbose = FALSE)
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    guess_columns = "mode",
    extra = "merge_right",
    verbose = FALSE
  )
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    new_columns = c("A", "B", "C"),
    extra = "merge_right",
    verbose = FALSE
  )
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    new_columns = c("A", "B", "C"),
    extra = "merge_right",
    append = TRUE,
    verbose = FALSE
  )
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    guess_columns = "mode",
    extra = "drop_left",
    verbose = FALSE
  )
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    new_columns = c("A", "B", "C"),
    fill = "value_right",
    extra = "merge_right",
    append = TRUE,
    verbose = FALSE
  )
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    new_columns = c("A", "B", "C"),
    fill = "value_right",
    extra = "merge_right",
    merge_multiple = TRUE,
    append = TRUE,
    verbose = FALSE
  )
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    new_columns = c("A", "B", "C"),
    merge_multiple = TRUE,
    append = TRUE,
    verbose = FALSE
  )
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    guess_columns = "mode",
    fill = "value_left",
    verbose = FALSE
  )
  expect_snapshot(out)
})


test_that("data_separate: multiple columns, different lengths", {
  d_sep <- data.frame(
    x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
    y = c("m.n.99.22", "77.f.g.34", "44.9", NA),
    stringsAsFactors = FALSE
  )

  # separate column names
  out <- data_separate(
    d_sep,
    select = c("x", "y"),
    new_columns = list(x = c("A", "B", "C"), y = c("EE", "FF", "GG")),
    verbose = FALSE
  )
  expect_named(out, c("A", "B", "C", "EE", "FF", "GG"))
  expect_snapshot(out)

  out <- data_separate(
    d_sep,
    select = c("x", "y"),
    new_columns = list(x = c("A", "B", "C"), y = c("EE", "FF", "GG", "HH")),
    verbose = FALSE
  )
  expect_named(out, c("A", "B", "C", "EE", "FF", "GG", "HH"))
  expect_snapshot(out)
})


test_that("data_separate: numeric separator", {
  d_sep <- data.frame(
    x = c("Thisisalongstring", "Doeshe1losteverything", "Wereme2longornot"),
    stringsAsFactors = FALSE
  )

  expect_silent({
    out <- data_separate(
      d_sep,
      guess_columns = "mode",
      separator = c(5, 7, 8, 12),
      verbose = TRUE
    )
  })
  expect_equal(
    out,
    data.frame(
      x_1 = c("This", "Does", "Were"),
      x_2 = c("is", "he", "me"),
      x_3 = c("a", "1", "2"),
      x_4 = c("long", "lost", "long"),
      x_5 = c("string", "everything", "ornot"),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE
  )

  d_sep <- data.frame(
    x = c("Thisisalongstring", "Doeshe1losteverything"),
    y = c("Wereme2longornot", NA),
    stringsAsFactors = FALSE
  )
  expect_silent({
    out <- data_separate(
      d_sep,
      separator = c(5, 7, 8, 12),
      new_columns = LETTERS[1:5]
    )
  })
  expect_equal(
    out,
    data.frame(
      A = c("This", "Does"),
      B = c("is", "he"),
      C = c("a", "1"),
      D = c("long", "lost"),
      E = c("string", "everything"),
      A.1 = c("Were", NA),
      B.1 = c("me", NA),
      C.1 = c("2", NA),
      D.1 = c("long", NA),
      E.1 = c("ornot", NA),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE
  )

  expect_error(
    data_separate(
      d_sep,
      separator = c(5, 7, 8, 12),
      new_columns = LETTERS[1:6]
    ),
    regex = "went wrong"
  )
})


test_that("data_separate: fail if invalid column selected", {
  d_sep <- data.frame(
    x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
    y = c("m.n.99", "77.f.g", "44.9", NA),
    stringsAsFactors = FALSE
  )
  expect_warning(
    expect_message(
      data_separate(d_sep, guess_columns = "mode", select = "z"),
      reg = "not found"
    ),
    regex = "misspelled?"
  )
  expect_identical(
    data_separate(d_sep, guess_columns = "mode", select = "z", verbose = FALSE),
    d_sep
  )
  expect_snapshot(data_separate(d_sep, guess_columns = "mode", select = NULL))
})


test_that("data_separate: numeric column", {
  d_sep <- data.frame(
    x = c(154353523, 535543532, 12342422, 15454334535),
    y = c("m.n.99", "77.f.g", "44.9", NA),
    stringsAsFactors = FALSE
  )
  expect_message(
    data_separate(d_sep, guess_columns = "mode", select = "x"),
    regex = "Separator probably"
  )
  out <- data_separate(
    d_sep,
    guess_columns = "mode",
    select = "x",
    separator = c(3, 6, 9)
  )
  expect_snapshot(out)
})


================================================
FILE: tests/testthat/test-data_shift.R
================================================
# numeric
test_that("slide", {
  x <- c(10, 11, 12)
  expect_identical(slide(x), c(0, 1, 2))

  x <- c(10, 11, 12)
  expect_identical(slide(x, lowest = 10), x)

  x <- c(10, 11, 12)
  expect_identical(slide(x, lowest = 1), c(1, 2, 3))

  x <- c(10, 11, NA, 12)
  expect_identical(slide(x, lowest = 1), c(1, 2, NA, 3))
})

# factor
test_that("slide", {
  data(efc)
  expect_message(expect_identical(slide(efc$e42dep), efc$e42dep))
})

# data frame
test_that("slide", {
  data(iris)
  expect_message(
    out <- slide(iris), # nolint
    "Shifting non-numeric variables is not possible"
  )
  expect_identical(out$Species, iris$Species)
  expect_identical(range(out$Sepal.Length), c(0, 3.6), tolerance = 1e-2)
})

# select helpers ------------------------------
test_that("slide regex", {
  expect_identical(
    slide(mtcars, select = "pg", regex = TRUE),
    slide(mtcars, select = "mpg")
  )
})


================================================
FILE: tests/testthat/test-data_summary.R
================================================
test_that("data_summary, single row summary", {
  data(iris)
  out <- data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
  expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4)
  expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4)
})


test_that("data_summary, single row summary, string expression", {
  data(iris)
  out <- data_summary(iris, "MW = mean(Sepal.Width)", "SD = sd(Sepal.Width)")
  expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4)
  expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4)
})


test_that("data_summary, summary for groups", {
  data(iris)
  out <- data_summary(
    iris,
    MW = mean(Sepal.Width),
    SD = sd(Sepal.Width),
    by = "Species"
  )
  expect_equal(
    out$MW,
    aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width,
    tolerance = 1e-4
  )
  expect_equal(
    out$SD,
    aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width,
    tolerance = 1e-4
  )
})


test_that("data_summary, summary for groups, string expression", {
  data(iris)
  out <- data_summary(
    iris,
    "MW = mean(Sepal.Width)",
    "SD = sd(Sepal.Width)",
    by = "Species"
  )
  expect_equal(
    out$MW,
    aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width,
    tolerance = 1e-4
  )
  expect_equal(
    out$SD,
    aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width,
    tolerance = 1e-4
  )
})


test_that("data_summary, grouped data frames", {
  data(iris)
  d <- data_group(iris, "Species")
  out <- data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
  expect_equal(
    out$MW,
    aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width,
    tolerance = 1e-4
  )
  expect_equal(
    out$SD,
    aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width,
    tolerance = 1e-4
  )
  # "by" overrides groups
  data(mtcars)
  d <- data_group(mtcars, "gear")
  out <- data_summary(d, MW = mean(mpg), SD = sd(mpg), by = "am")
  expect_identical(
    out$MW,
    aggregate(mtcars["mpg"], list(mtcars$am), mean)$mpg
  )
})


test_that("data_summary, summary for multiple groups", {
  data(mtcars)
  out <- data_summary(
    mtcars,
    MW = mean(mpg),
    SD = sd(mpg),
    by = c("am", "gear")
  )
  expect_equal(
    out$MW,
    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg,
    tolerance = 1e-4
  )
  expect_equal(
    out$SD,
    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg,
    tolerance = 1e-4
  )
  x <- data_group(mtcars, c("am", "gear"))
  out <- data_summary(x, MW = mean(mpg), SD = sd(mpg))
  expect_equal(
    out$MW,
    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg,
    tolerance = 1e-4
  )
  expect_equal(
    out$SD,
    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg,
    tolerance = 1e-4
  )
})


test_that("data_summary, errors", {
  data(iris)
  data(mtcars)
  # "by" must be character
  expect_error(
    data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = 5),
    regex = "Argument `by` must be a character string"
  )
  # "by" must be in data
  expect_error(
    data_summary(
      iris,
      MW = mean(Sepal.Width),
      SD = sd(Sepal.Width),
      by = "Speceis"
    ),
    regex = "Variable \"Speceis\" not"
  )
  # by for multiple variables
  expect_error(
    data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "gear")),
    regex = "Variable \"bam\" not"
  )
  expect_error(
    data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "geas")),
    regex = "Did you mean one of \"am\" or \"gear\"?"
  )
  # not a data frame
  expect_error(
    data_summary(
      iris$Sepal.Width,
      MW = mean(Sepal.Width),
      SD = sd(Sepal.Width)
    ),
    regex = "only works for"
  )
  # no expressions
  expect_error(
    data_summary(iris, by = "Species"),
    regex = "No expressions for calculating"
  )
  # wrong expression
  expect_error(
    data_summary(mtcars, mw = mesn(mpg), by = "am"),
    regex = "There was an error"
  )
  # wrong variable name
  expect_error(
    data_summary(mtcars, n = max(mpeg)),
    regex = "There was an error"
  )
  # expression returns more than one value
  expect_error(
    data_summary(
      mtcars,
      n = unique(mpg),
      j = c(min(am), max(am)),
      by = c("am", "gear")
    ),
    regex = "Each expression must return"
  )
})


test_that("data_summary, values_at", {
  data(mtcars)
  out <- data_summary(
    mtcars,
    pos1 = mpg[1],
    pos_end = mpg[length(mpg)],
    by = c("am", "gear")
  )
  # same as:
  # dplyr::summarise(mtcars, pos1 = dplyr::first(mpg), pos_end = dplyr::last(mpg), .by = c("am", "gear"))
  expect_equal(out$pos1, c(21.4, 24.4, 21, 26), tolerance = 1e-3)
  expect_equal(out$pos_end, c(19.2, 17.8, 21.4, 15), tolerance = 1e-3)
})


test_that("data_summary, print", {
  data(mtcars)
  out <- data_summary(
    mtcars,
    MW = mean(mpg),
    SD = sd(mpg),
    by = c("am", "gear")
  )
  expect_snapshot(print(out))
})


test_that("data_summary, with NA", {
  data(efc, package = "datawizard")
  out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code")
  expect_snapshot(print(out))
  out <- data_summary(
    efc,
    MW = mean(c12hour, na.rm = TRUE),
    by = "c172code",
    remove_na = TRUE
  )
  expect_snapshot(print(out))
  # sorting for multiple groups
  out <- data_summary(
    efc,
    MW = mean(c12hour, na.rm = TRUE),
    by = c("e42dep", "c172code")
  )
  expect_snapshot(print(out))
})


test_that("data_summary, inside functions", {
  foo1 <- function(x, ...) {
    datawizard::data_summary(x, ..., by = "Species")
  }

  foo2 <- function(x, by, ...) {
    datawizard::data_summary(x, ..., by = by)
  }

  foo3 <- function(x, by) {
    datawizard::data_summary(x, MW = mean(Sepal.Width), by = by)
  }

  data(iris)
  out1 <- foo1(iris, MW = mean(Sepal.Width))
  out2 <- foo2(iris, by = "Species", MW = mean(Sepal.Width))
  out3 <- foo3(iris, "Species")
  expect_equal(out1$MW, out2$MW, tolerance = 1e-4)
  expect_equal(out1$MW, out3$MW, tolerance = 1e-4)
})


test_that("data_summary, expression as variable", {
  data(mtcars)
  a <- "MW = mean(mpg)"
  b <- "SD = sd(mpg)"
  out <- data_summary(mtcars, a, by = c("am", "gear"))
  expect_named(out, c("am", "gear", "MW"))
  expect_equal(
    out$MW,
    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg,
    tolerance = 1e-4
  )
  expect_error(
    data_summary(mtcars, a, b, by = c("am", "gear")),
    regex = "You cannot mix"
  )
  out <- data_summary(mtcars, c(a, b), by = c("am", "gear"))
  expect_named(out, c("am", "gear", "MW", "SD"))
  expect_equal(
    out$SD,
    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg,
    tolerance = 1e-4
  )
})


test_that("data_summary, extra functions", {
  data(mtcars)
  # n()
  out <- data_summary(mtcars, n = n(), by = c("am", "gear"))
  expect_identical(out$n, c(15L, 4L, 8L, 5L))
})


test_that("data_summary, bayestestR::ci", {
  skip_if_not_installed("bayestestR")
  data(mtcars)
  out <- data_summary(
    mtcars,
    mean_value = mean(mpg),
    ci = bayestestR::ci(mpg),
    by = c("am", "gear")
  )
  expect_named(out, c("am", "gear", "mean_value", "CI", "CI_low", "CI_high"))
  expect_snapshot(out)
  out <- data_summary(
    mtcars,
    mw = mean(mpg),
    test = bayestestR::ci(mpg),
    yolo = c(mean(mpg), sd(mpg)),
    by = c("am", "gear")
  )
  expect_named(
    out,
    c("am", "gear", "mw", "CI", "CI_low", "CI_high", "yolo_1", "yolo_2")
  )
})

test_that("no warning when variable name and function in global env clash, #583", {
  dat <- data.frame(rt = 1:10)
  expect_silent(data_summary(dat, rt = mean(rt)))
})


test_that("allow multiple columns for expressions", {
  set.seed(123)
  d <- data.frame(
    x = rnorm(100, 1, 1),
    y = rnorm(100, 2, 2),
    groups = rep(1:4, each = 25)
  )

  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.75)),
    quant_y = quantile(y, c(0.25, 0.75)),
    suffix = c("Q1", "Q3")
  )
  expect_equal(
    out$quant_xQ1,
    0.50615,
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_named(out, c("quant_xQ1", "quant_xQ3", "quant_yQ1", "quant_yQ3"))

  # automatic suffixes
  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.75)),
    quant_y = quantile(y, c(0.1, 0.9)),
    suffix = NULL
  )
  expect_named(out, c("quant_x25%", "quant_x75%", "quant_y10%", "quant_y90%"))

  # use own suffix only for one expression - other expressions are
  # suffixed with `_1`, `_2`, etc.
  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.75)),
    quant_y = quantile(y, c(0.25, 0.5, 0.75)),
    mean_x = mean(x),
    suffix = list(quant_y = c("_Q1", "_Q2", "_Q3")),
    by = "groups"
  )
  expect_named(
    out,
    c(
      "groups",
      "quant_x25%",
      "quant_x75%",
      "quant_y_Q1",
      "quant_y_Q2",
      "quant_y_Q3",
      "mean_x"
    )
  )

  set.seed(123)
  d <- data.frame(
    x = rnorm(100, 1, 1),
    y = rnorm(100, 2, 2),
    w = rnorm(100, 3, 0.5),
    z = rnorm(100, 4, 3),
    groups = rep(1:4, each = 25)
  )

  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.75)),
    mean_x = mean(x),
    quant_y = quantile(y, c(0.25, 0.5, 0.75))
  )
  expect_equal(
    out,
    data.frame(
      `quant_x25%` = 0.50615,
      `quant_x75%` = 1.69182,
      mean_x = 1.09041,
      `quant_y25%` = 0.39779,
      `quant_y50%` = 1.54834,
      `quant_y75%` = 2.93569
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.75)),
    mean_x = mean(x),
    fivenum = fivenum(y)
  )
  expect_equal(
    out,
    data.frame(
      `quant_x25%` = 0.50615,
      `quant_x75%` = 1.69182,
      mean_x = 1.09041,
      fivenum_1 = -2.10649,
      fivenum_2 = 0.36539,
      fivenum_3 = 1.54834,
      fivenum_4 = 2.96837,
      fivenum_5 = 8.48208
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.75)),
    mean_x = mean(x),
    quant_y = quantile(y, c(0.25, 0.5, 0.75)),
    suffix = list(quant_y = c("_Q1", "_Q2", "_Q3"))
  )
  expect_equal(
    out,
    data.frame(
      `quant_x25%` = 0.50615,
      `quant_x75%` = 1.69182,
      mean_x = 1.09041,
      quant_y_Q1 = 0.39779,
      quant_y_Q2 = 1.54834,
      quant_y_Q3 = 2.93569
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.75)),
    mean_x = mean(x),
    quant_y = quantile(y, c(0.25, 0.5, 0.75)),
    suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q3"))
  )
  expect_equal(
    out,
    data.frame(
      quant_xQ1 = 0.50615,
      quant_xQ3 = 1.69182,
      mean_x = 1.09041,
      quant_y_Q1 = 0.39779,
      quant_y_Q2 = 1.54834,
      quant_y_Q3 = 2.93569
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  out <- data_summary(
    d,
    quant_x = quantile(x, c(0.25, 0.5)),
    quant_w = quantile(w, c(0.25, 0.5)),
    quant_y = quantile(y, c(0.25, 0.5)),
    quant_z = quantile(z, c(0.25, 0.5)),
    suffix = c("_Q1", "_Q2")
  )
  expect_equal(
    out,
    data.frame(
      quant_x_Q1 = 0.50615,
      quant_x_Q2 = 1.06176,
      quant_w_Q1 = 2.73435,
      quant_w_Q2 = 3.01796,
      quant_y_Q1 = 0.39779,
      quant_y_Q2 = 1.54834,
      quant_z_Q1 = 1.81187,
      quant_z_Q2 = 3.98947
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  # errors ------------------------------------------------------------------

  expect_error(
    data_summary(
      d,
      quant_x = quantile(x, c(0.25, 0.75)),
      mean_x = mean(x),
      quant_y = quantile(y, c(0.25, 0.5, 0.75)),
      suffix = list(quant_xy = c("_Q1", "_Q2", "_Q3"))
    ),
    regex = "Names of `suffix` must match the names",
    fixed = TRUE
  )

  expect_error(
    data_summary(
      d,
      quant_x = quantile(x, c(0.25, 0.75)),
      mean_x = mean(x),
      quant_y = quantile(y, c(0.25, 0.5, 0.75)),
      suffix = list(c("Q1", "Q3"), "mean", c("_Q1", "_Q2", "_Q3"))
    ),
    regex = "All elements of `suffix` must have names.",
    fixed = TRUE
  )

  expect_error(
    data_summary(
      d,
      quant_x = quantile(x, c(0.25, 0.75)),
      mean_x = mean(x),
      quant_y = quantile(y, c(0.25, 0.5, 0.75)),
      suffix = c("_Q1", "_Q2", "_Q3")
    ),
    regex = "Argument `suffix` must have the same length",
    fixed = TRUE
  )

  expect_error(
    data_summary(
      d,
      quant_x = quantile(x, c(0.25, 0.75)),
      mean_x = mean(x),
      quant_y = quantile(y, c(0.25, 0.5, 0.75)),
      suffix = list(quant_x = c("_Q1", "_Q2", "_Q3"))
    ),
    regex = "Argument `suffix` must have the same length",
    fixed = TRUE
  )

  expect_error(
    data_summary(
      d,
      quant_x = quantile(x, c(0.25, 0.75)),
      mean_x = mean(x),
      quant_y = quantile(y, c(0.25, 0.5, 0.75)),
      suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q2"))
    ),
    regex = "All suffixes for a single expression must be unique",
    fixed = TRUE
  )

  expect_error(
    data_summary(
      d,
      quant_x = quantile(x, c(0.25, 0.5)),
      quant_w = quantile(w, c(0.25, 0.5)),
      quant_y = quantile(y, c(0.25, 0.5)),
      quant_z = quantile(z, c(0.25, 0.5)),
      suffix = c("_Q1", "_Q2", "_Q3")
    ),
    regex = "Argument `suffix` must have the same length",
    fixed = TRUE
  )
})


================================================
FILE: tests/testthat/test-data_tabulate.R
================================================
test_that("data_tabulate factor", {
  data(efc, package = "datawizard")
  x <- data_tabulate(efc$e42dep)
  expect_identical(
    as.vector(x$Value),
    as.vector(sort(unique(
      addNA(efc$e42dep)
    )))
  )
  expect_identical(x$N, as.vector(table(addNA(efc$e42dep))))
  expect_identical(
    x$`Valid %`,
    as.vector(c(
      100 * table(efc$e42dep) / sum(!is.na(efc$e42dep)),
      NA
    )),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_tabulate numeric", {
  data(efc, package = "datawizard")
  x <- data_tabulate(efc$neg_c_7)
  expect_identical(
    as.vector(x$Value),
    as.vector(sort(unique(
      addNA(efc$neg_c_7)
    )))
  )
  expect_identical(x$N, as.vector(table(addNA(efc$neg_c_7))))
  expect_identical(
    x$`Valid %`,
    as.vector(c(
      100 * table(efc$neg_c_7) / sum(!is.na(efc$neg_c_7)),
      NA
    )),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_tabulate, HTML", {
  skip_if_not_installed("gt")
  data(efc, package = "datawizard")
  expect_s3_class(print_html(data_tabulate(efc$c172code)), "gt_tbl")
  expect_s3_class(print_html(data_tabulate(efc, "c172code")), "gt_tbl")
  expect_s3_class(
    display(data_tabulate(efc, "c172code"), format = "html"),
    "gt_tbl"
  )
})


test_that("data_tabulate, tinytable", {
  skip_if_not_installed("tinytable")
  data(efc, package = "datawizard")
  expect_snapshot(display(data_tabulate(efc$c172code), format = "tt"))
  expect_snapshot(display(data_tabulate(efc, "c172code"), format = "tt"))
})


test_that("data_tabulate, weights", {
  skip_if_not_installed("knitr")
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  # vector/factor
  out1 <- data_tabulate(efc$e42dep, weights = efc$weights)
  out2 <- data_tabulate(efc$e42dep)
  expect_equal(out1$N, c(3, 4, 26, 67, 5), ignore_attr = TRUE)
  expect_equal(out2$N, c(2L, 4L, 28L, 63L, 3L), ignore_attr = TRUE)
  expect_equal(
    out1$N,
    round(xtabs(efc$weights ~ efc$e42dep, addNA = TRUE)),
    ignore_attr = TRUE
  )
  # data frames
  out <- data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights)
  expect_equal(out[[1]]$N, out1$N, ignore_attr = TRUE)
  # mismatch of lengths
  w <- c(efc$weights, 1)
  expect_error(
    data_tabulate(efc$e42dep, weights = w),
    regex = "Length of `weights`"
  )
  # correct table footer
  expect_snapshot(print(data_tabulate(efc$e42dep, weights = efc$weights)))
  expect_snapshot(print_md(data_tabulate(efc$e42dep, weights = efc$weights)))
  expect_snapshot(display(data_tabulate(efc$e42dep, weights = efc$weights)))
  # correct table caption
  expect_snapshot(print(data_tabulate(
    efc,
    c("e42dep", "e16sex"),
    collapse = TRUE,
    weights = efc$weights
  )))
  expect_snapshot(print_md(data_tabulate(
    efc,
    c("e42dep", "e16sex"),
    weights = efc$weights
  )))
  expect_snapshot(display(data_tabulate(
    efc,
    c("e42dep", "e16sex"),
    weights = efc$weights
  )))
})


test_that("data_tabulate data.frame", {
  data(efc, package = "datawizard")
  x <- data_tabulate(efc, c("e16sex", "c172code"))
  expect_s3_class(x, "list")
  expect_length(x, 2L)
  expect_identical(
    attributes(x[[1]]),
    list(
      names = c(
        "Variable",
        "Value",
        "N",
        "Raw %",
        "Valid %",
        "Cumulative %"
      ),
      class = c("datawizard_table", "data.frame"),
      row.names = 1:3,
      type = "numeric",
      varname = "e16sex",
      label = "elder's gender",
      object = "e16sex",
      duplicate_varnames = c(FALSE, TRUE, TRUE),
      total_n = 100L,
      valid_n = 100L
    )
  )
  expect_identical(
    attributes(x[[2]]),
    list(
      names = c(
        "Variable",
        "Value",
        "N",
        "Raw %",
        "Valid %",
        "Cumulative %"
      ),
      class = c("datawizard_table", "data.frame"),
      row.names = 1:4,
      type = "numeric",
      varname = "c172code",
      label = "carer's level of education",
      object = "c172code",
      duplicate_varnames = c(FALSE, TRUE, TRUE, TRUE),
      total_n = 100L,
      valid_n = 90L
    )
  )
  table1 <- x[[1]]
  expect_identical(
    as.vector(table1$Value),
    as.character(c(
      sort(
        unique(efc$e16sex)
      ),
      NA
    ))
  )
  expect_identical(table1$N, as.vector(table(addNA(efc$e16sex))))
  expect_identical(
    table1$`Valid %`,
    as.vector(c(
      100 * table(efc$e16sex) / sum(!is.na(efc$e16sex)),
      NA
    )),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_tabulate unsupported class", {
  data(mtcars)
  expect_warning(
    data_tabulate(lm(mpg ~ hp, data = mtcars)),
    regex = "Can't compute frequency tables"
  )
})


test_that("data_tabulate print", {
  set.seed(123)
  x <- sample.int(3, 1e6, TRUE)
  out <- data_tabulate(x, name = "Large Number")
  expect_identical(
    attributes(out),
    list(
      names = c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %"),
      class = c("datawizard_table", "data.frame"),
      row.names = 1:4,
      type = "integer",
      varname = "Large Number",
      object = "x",
      duplicate_varnames = c(FALSE, TRUE, TRUE, TRUE),
      total_n = 1000000L,
      valid_n = 1000000L
    )
  )
})


test_that("data_tabulate print", {
  data(efc, package = "datawizard")
  expect_snapshot(data_tabulate(efc$e42dep))
})


test_that("data_tabulate print multiple", {
  data(efc, package = "datawizard")
  expect_snapshot(data_tabulate(efc, c("c172code", "e16sex")))
})


test_that("data_tabulate big numbers", {
  set.seed(123)
  x <- sample.int(5, size = 1e7, TRUE)
  expect_snapshot(data_tabulate(x))
  expect_snapshot(print(data_tabulate(x), big_mark = "-"))
  expect_snapshot(print(data_tabulate(x), big_mark = ""))
})


test_that("data_tabulate print multiple, collapse", {
  data(efc, package = "datawizard")
  expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"), collapse = TRUE))
})


test_that("data_tabulate grouped data.frame", {
  skip_if_not_installed("poorman")
  data(efc, package = "datawizard")
  x <- data_tabulate(poorman::group_by(efc, e16sex), "c172code")
  expect_s3_class(x, "list")
  expect_length(x, 2L)
  expect_identical(
    attributes(x[[1]]),
    list(
      names = c(
        "Variable",
        "Group",
        "Value",
        "N",
        "Raw %",
        "Valid %",
        "Cumulative %"
      ),
      class = c("datawizard_table", "data.frame"),
      row.names = 1:4,
      type = "numeric",
      varname = "c172code",
      label = "carer's level of education",
      object = "c172code",
      group_variable = structure(
        list(e16sex = 1),
        .drop = TRUE,
        row.names = 1L,
        class = "data.frame"
      ),
      duplicate_varnames = c(FALSE, TRUE, TRUE, TRUE),
      total_n = 46L,
      valid_n = 41L
    )
  )
  table1 <- x[[1]]
  expect_identical(
    as.vector(table1$Value),
    as.character(c(
      sort(
        unique(efc$c172code)
      ),
      NA
    ))
  )
  expect_identical(
    table1$N,
    as.vector(table(addNA(efc$c172code[efc$e16sex == 1])))
  )
  expect_identical(
    table1$`Valid %`,
    as.vector(c(
      100 *
        table(efc$c172code[efc$e16sex == 1]) /
        sum(!is.na(efc$c172code[efc$e16sex == 1])),
      NA
    )),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_tabulate print grouped data", {
  skip_if_not_installed("poorman")
  data(efc, package = "datawizard")
  expect_snapshot(data_tabulate(poorman::group_by(efc, e16sex), "c172code"))
})

test_that("data_tabulate print, collapse groups", {
  skip_if_not_installed("poorman")
  data(efc, package = "datawizard")
  expect_snapshot(
    data_tabulate(poorman::group_by(efc, e16sex), "c172code", collapse = TRUE)
  )
})

test_that("data_tabulate print, collapse groups, drop levels", {
  skip_if_not_installed("poorman")
  data(efc, package = "datawizard")
  expect_snapshot(
    data_tabulate(
      poorman::group_by(efc, e16sex),
      "e42dep",
      collapse = TRUE,
      drop_levels = TRUE
    )
  )
})

test_that("data_tabulate drop levels", {
  x <- factor(rep(letters[1:3], 3), levels = letters[1:5])
  out1 <- data_tabulate(x, drop_levels = FALSE)
  out2 <- data_tabulate(x, drop_levels = TRUE)
  expect_identical(out1$N, c(3L, 3L, 3L, 0L, 0L, 0L))
  expect_identical(as.character(out1$Value), c("a", "b", "c", "d", "e", NA))
  expect_identical(out2$N, c(3L, 3L, 3L, 0L))
  expect_identical(as.character(out2$Value), c("a", "b", "c", NA))
})


# select helpers ------------------------------

test_that("data_tabulate regex", {
  data(mtcars)
  expect_identical(
    data_tabulate(mtcars, select = "arb", regex = TRUE),
    data_tabulate(mtcars, select = "carb")
  )
})


# missing values ------------------------------

test_that("data_tabulate exclude/include missing values", {
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
  out <- data_tabulate(efc$c172code)
  expect_identical(out$N, c(8L, 66L, 16L, 10L))
  out <- data_tabulate(efc$c172code, remove_na = TRUE)
  expect_identical(out$N, c(8L, 66L, 16L))
  out <- data_tabulate(efc$c172code, weights = efc$weights)
  expect_identical(out$N, c(10, 67, 15, 13))
  out <- data_tabulate(efc$c172code, remove_na = TRUE, weights = efc$weights)
  expect_identical(out$N, c(10, 67, 15))
})


# cross tables ------------------------------

test_that("data_tabulate, cross tables", {
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA

  expect_snapshot(print(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full"
  )))
  expect_snapshot(print(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full",
    remove_na = TRUE
  )))
  expect_snapshot(print(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full",
    weights = efc$weights
  )))
  expect_snapshot(print(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full",
    remove_na = TRUE,
    weights = efc$weights
  ))) # nolint
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = efc$e16sex,
    proportions = "row"
  )))
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = efc$e16sex,
    proportions = "row",
    remove_na = TRUE
  )))
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = efc$e16sex,
    proportions = "row",
    weights = efc$weights
  )))
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = efc$e16sex,
    proportions = "row",
    remove_na = TRUE,
    weights = efc$weights
  ))) # nolint
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "column"
  )))
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "column",
    remove_na = TRUE
  )))
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "column",
    weights = "weights"
  )))
  expect_snapshot(print(data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "column",
    remove_na = TRUE,
    weights = "weights"
  ))) # nolint
  expect_snapshot(print(data_tabulate(
    efc,
    c("c172code", "e42dep"),
    by = "e16sex",
    proportions = "row"
  ))) # nolint
})

test_that("data_tabulate, cross tables, HTML", {
  skip_if_not_installed("gt")
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA

  expect_s3_class(
    print_html(data_tabulate(
      efc$c172code,
      by = efc$e16sex,
      proportions = "full"
    )),
    "gt_tbl"
  )
  expect_s3_class(
    print_html(data_tabulate(
      efc$c172code,
      by = efc$e16sex,
      proportions = "full",
      remove_na = TRUE
    )),
    "gt_tbl"
  ) # nolint
  expect_s3_class(
    print_html(data_tabulate(
      efc$c172code,
      by = efc$e16sex,
      proportions = "full",
      weights = efc$weights
    )),
    "gt_tbl"
  ) # nolint
  expect_s3_class(
    print_html(data_tabulate(
      efc$c172code,
      by = efc$e16sex,
      proportions = "full",
      remove_na = TRUE,
      weights = efc$weights
    )),
    "gt_tbl"
  ) # nolint
  expect_s3_class(
    print_html(data_tabulate(
      efc,
      "c172code",
      by = efc$e16sex,
      proportions = "row"
    )),
    "gt_tbl"
  )
  expect_s3_class(
    print_html(data_tabulate(
      efc,
      "c172code",
      by = efc$e16sex,
      proportions = "row",
      remove_na = TRUE,
      weights = efc$weights
    )),
    "gt_tbl"
  ) # nolint
  expect_s3_class(
    display(
      data_tabulate(
        efc,
        "c172code",
        by = efc$e16sex,
        proportions = "row",
        remove_na = TRUE,
        weights = efc$weights
      ),
      format = "html"
    ),
    "gt_tbl"
  ) # nolint
})

test_that("data_tabulate, cross tables, tinytable", {
  skip_if_not_installed("tinytable")
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA

  expect_snapshot(display(
    data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"),
    format = "tt"
  ))
  expect_snapshot(display(
    data_tabulate(
      efc$c172code,
      by = efc$e16sex,
      proportions = "full",
      remove_na = TRUE
    ),
    format = "tt"
  ))
  expect_snapshot(display(
    data_tabulate(
      efc$c172code,
      by = efc$e16sex,
      proportions = "full",
      weights = efc$weights
    ),
    format = "tt"
  ))
  expect_snapshot(display(
    data_tabulate(
      efc$c172code,
      by = efc$e16sex,
      proportions = "full",
      remove_na = TRUE,
      weights = efc$weights
    ),
    format = "tt"
  ))
  expect_snapshot(display(
    data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"),
    format = "tt"
  ))
  expect_snapshot(display(
    data_tabulate(
      efc,
      "c172code",
      by = efc$e16sex,
      proportions = "row",
      remove_na = TRUE,
      weights = efc$weights
    ),
    format = "tt"
  ))
})

test_that("data_tabulate, cross tables, grouped df", {
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
  grp <- data_group(efc, "e42dep")
  expect_snapshot(print(data_tabulate(
    grp,
    "c172code",
    by = "e16sex",
    proportions = "row"
  )))
  skip_if_not_installed("gt")
  expect_s3_class(
    print_html(data_tabulate(
      grp,
      "c172code",
      by = "e16sex",
      proportions = "row"
    )),
    "gt_tbl"
  ) # nolint
  expect_s3_class(
    print_html(data_tabulate(
      efc,
      c("e16sex", "e42dep"),
      by = "c172code",
      proportions = "row"
    )),
    "gt_tbl"
  ) # nolint
})

test_that("data_tabulate, cross tables, print/format works", {
  data(mtcars)
  x <- data_tabulate(mtcars, c("cyl", "am"), by = "gear")
  expect_snapshot(print(x))
})

test_that("data_tabulate, cross tables, errors by", {
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
  expect_error(
    data_tabulate(efc$c172code, by = "e16sex"),
    regex = "If `by` is a string"
  )
  expect_error(
    data_tabulate(efc$c172code, by = efc$e16sex[-1]),
    regex = "Length of `by`"
  )
  expect_error(
    data_tabulate(efc, "c172code", by = efc$e16sex[-1]),
    regex = "Length of `by`"
  )
  expect_error(
    data_tabulate(efc, "c172code", by = "c16sex"),
    regex = "not found"
  )
  expect_error(
    data_tabulate(efc, "c172code", by = c("e16sex", "e42dep")),
    regex = "You may use"
  )
})

test_that("data_tabulate, cross tables, errors weights", {
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
  expect_error(
    data_tabulate(efc$c172code, weights = "weights"),
    regex = "If `weights`"
  )
  expect_error(
    data_tabulate(efc$c172code, weights = efc$weights[-1]),
    regex = "Length of `weights`"
  )
  expect_error(
    data_tabulate(efc, "c172code", weights = efc$weights[-1]),
    regex = "Length of `weights`"
  )
  expect_error(
    data_tabulate(efc, "c172code", weights = "weigths"),
    regex = "not found"
  )
  expect_error(
    data_tabulate(efc, "c172code", weights = c("e16sex", "e42dep")),
    regex = "length 1"
  )
  expect_error(
    data_tabulate(efc$c172code, weights = efc$wweight),
    regex = "not found"
  )
})

test_that("data_tabulate, cross tables, modify structure", {
  skip_if_not_installed("knitr")
  data(efc, package = "datawizard")
  x <- data_group(efc, c("c172code", "e16sex"))
  out <- data_tabulate(x, "c172code")
  out[] <- lapply(
    out,
    data_select,
    exclude = c("Variable", "Raw %", "Cumulative %")
  )
  junk <- capture.output(print_md(out))
  expect_false(grepl("Variable", junk[3], fixed = TRUE))
  expect_false(grepl("Raw %", junk[3], fixed = TRUE))
  # display() default to markdown
  junk <- capture.output(display(out))
  expect_false(grepl("Variable", junk[3], fixed = TRUE))
  expect_false(grepl("Raw %", junk[3], fixed = TRUE))
})


# markdown -------------------------

test_that("data_tabulate, cross tables, markdown", {
  skip_if_not_installed("knitr")
  data(efc, package = "datawizard")
  set.seed(123)
  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  efc$e16sex[sample.int(nrow(efc), 5)] <- NA

  expect_snapshot(print_md(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full"
  )))
  expect_snapshot(print_md(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full",
    remove_na = TRUE
  )))
  expect_snapshot(print_md(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full",
    weights = efc$weights
  )))
  expect_snapshot(print_md(data_tabulate(
    efc$c172code,
    by = efc$e16sex,
    proportions = "full",
    remove_na = TRUE,
    weights = efc$weights
  ))) # nolint
  expect_snapshot(print_md(data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "column",
    remove_na = TRUE,
    weights = "weights"
  ))) # nolint
  expect_snapshot(print_md(data_tabulate(
    efc,
    c("c172code", "e42dep"),
    by = "e16sex",
    proportions = "row"
  ))) # nolint
  expect_snapshot(display(data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "column",
    remove_na = TRUE,
    weights = "weights"
  ))) # nolint
  expect_snapshot(display(data_tabulate(
    efc,
    c("c172code", "e42dep"),
    by = "e16sex",
    proportions = "row"
  ))) # nolint
})


# validate against table -------------------------

test_that("data_tabulate, validate against table", {
  data(mtcars)
  # frequency table
  out1 <- as.data.frame(table(mtcars$cyl))
  out2 <- data_tabulate(mtcars$cyl, remove_na = TRUE)
  expect_identical(out1$Freq, out2$N)
  # crosstable
  out1 <- data_arrange(
    as.data.frame(table(mtcars$cyl, mtcars$gear)),
    c("Var1", "Var2")
  )
  out2 <- data_rename(
    data_to_long(
      as.data.frame(data_tabulate(
        mtcars$cyl,
        by = mtcars$gear,
        remove_na = TRUE
      )),
      2:4,
      names_to = "Var2",
      values_to = "Freq"
    ),
    "mtcars$cyl",
    "Var1"
  )
  out1[[2]] <- as.character(out1[[2]])
  expect_equal(out1, out2, ignore_attr = TRUE)
})


test_that("data_tabulate, correct 0% for proportions", {
  data(efc, package = "datawizard")
  out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
  expect_identical(
    format(out[[1]])[[4]],
    c("0 (0.0%)", "0 (0.0%)", "0 (0.0%)", "0 (0.0%)", "", "0")
  )
  expect_snapshot(print(out[[1]]))
})


# coercing to data frame -------------------------

test_that("data_tabulate, as.data.frame, frequency tables", {
  data(mtcars)
  # frequency table
  x <- data_tabulate(mtcars$cyl)
  out <- as.data.frame(x)
  expect_named(
    out,
    c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")
  )
  expect_identical(
    out$Variable,
    c("mtcars$cyl", "mtcars$cyl", "mtcars$cyl", "mtcars$cyl")
  )
  expect_false(any(vapply(out[2:ncol(out)], is.character, logical(1))))
  # frequency tables
  x <- data_tabulate(mtcars, select = c("cyl", "am"))
  out <- as.data.frame(x)
  expect_named(out, c("var", "table"))
  expect_equal(
    vapply(out, class, character(1)),
    c("character", "AsIs"),
    ignore_attr = TRUE
  )
  expect_length(out$table, 2L)
  expect_named(
    out$table[[1]],
    c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")
  )
  expect_identical(out$table[[1]]$Variable, c("cyl", "cyl", "cyl", "cyl"))
  expect_false(any(vapply(
    out$table[[1]][2:ncol(out$table[[1]])],
    is.character,
    logical(1)
  )))
})


test_that("data_tabulate, as.data.frame, cross tables", {
  data(mtcars)
  # cross table
  x <- data_tabulate(mtcars, "cyl", by = "am")
  out <- as.data.frame(x)
  expect_named(out, c("var", "table"))
  expect_equal(
    vapply(out, class, character(1)),
    c("character", "AsIs"),
    ignore_attr = TRUE
  )
  expect_length(out$table, 1L)
  expect_named(out$table[[1]], c("cyl", "0", "1", "NA"))
  expect_identical(nrow(out$table[[1]]), 4L)
  # cross tables
  x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am")
  out <- as.data.frame(x)
  expect_named(out, c("var", "table"))
  expect_equal(
    vapply(out, class, character(1)),
    c("character", "AsIs"),
    ignore_attr = TRUE
  )
  expect_length(out$table, 2L)
  expect_named(out$table[[1]], c("cyl", "0", "1", "NA"))
  expect_identical(nrow(out$table[[1]]), 4L)
})


test_that("data_tabulate, as.data.frame, cross tables with total N", {
  # cross table, with total
  x <- data_tabulate(mtcars, "cyl", by = "am")
  out <- as.data.frame(x, add_total = TRUE)
  expect_named(out, c("var", "table"))
  expect_equal(
    vapply(out, class, character(1)),
    c("character", "AsIs"),
    ignore_attr = TRUE
  )
  expect_length(out$table, 1L)
  expect_named(out$table[[1]], c("cyl", "0", "1", "<NA>", "Total"))
  expect_identical(nrow(out$table[[1]]), 5L)
  expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total"))
  # cross tables, with total
  x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am")
  out <- as.data.frame(x, add_total = TRUE)
  expect_named(out, c("var", "table"))
  expect_equal(
    vapply(out, class, character(1)),
    c("character", "AsIs"),
    ignore_attr = TRUE
  )
  expect_length(out$table, 2L)
  expect_named(out$table[[1]], c("cyl", "0", "1", "<NA>", "Total"))
  expect_identical(nrow(out$table[[1]]), 5L)
  expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total"))
})


# table methods -----------------------------

test_that("data_tabulate, table methods", {
  data(mtcars)

  # datawizard_table
  x <- data_tabulate(mtcars$cyl)
  expect_type(as.table(x), "list")
  expect_s3_class(as.table(x, simplify = TRUE), "table")
  expect_snapshot(as.table(x))

  # datawizard_tables
  x <- data_tabulate(mtcars, "cyl")
  expect_type(as.table(x), "list")
  expect_s3_class(as.table(x, simplify = TRUE), "table")
  expect_snapshot(as.table(x))

  # test remove_na
  x <- data_tabulate(mtcars, "cyl", remove_na = TRUE)
  expect_identical(x[[1]]$N, as.vector(as.table(x, simplify = TRUE)))
  x <- data_tabulate(mtcars, "cyl")
  expect_identical(
    x[[1]]$N,
    as.vector(as.table(x, simplify = TRUE, remove_na = FALSE))
  )
  expect_snapshot(as.table(x, remove_na = FALSE))

  # datawizard_tables, multiple
  x <- data_tabulate(mtcars, c("cyl", "gear"))
  expect_identical(unlist(lapply(as.table(x), class)), rep("table", 2L))
  expect_type(as.table(x, simplify = TRUE), "list") # no simplification
  expect_type(as.table(x, simplify = FALSE), "list")
  expect_snapshot(as.table(x))

  # datawizard_crosstab
  x <- data_tabulate(mtcars$cyl, mtcars$gear)
  expect_type(as.table(x), "list")
  expect_s3_class(as.table(x, simplify = TRUE), "table")
  expect_snapshot(as.table(x))
  expect_snapshot(as.table(x, simplify = TRUE))

  # datawizard_crosstabs
  x <- data_tabulate(mtcars, "cyl", by = "gear")
  expect_type(as.table(x), "list")
  expect_s3_class(as.table(x, simplify = TRUE), "table")
  expect_snapshot(as.table(x))
  expect_snapshot(as.table(x, simplify = TRUE))

  # datawizard_crosstabs, multiple
  x <- data_tabulate(mtcars, c("am", "cyl"), by = "gear")
  expect_identical(unlist(lapply(as.table(x), class)), rep("table", 2L))
  expect_identical(
    x[[1]]$`3`[1:2],
    as.vector(as.table(x)[[1]][, 1, drop = TRUE])
  )
  expect_identical(
    x[[2]]$`4`[1:3],
    as.vector(as.table(x)[[2]][, 2, drop = TRUE])
  )
  expect_type(as.table(x), "list")
  expect_type(as.table(x, simplify = TRUE), "list") # no simplification
  expect_snapshot(as.table(x))

  # grouped data frames
  d <- data_group(mtcars, "am")
  x <- data_tabulate(d, "cyl", by = "gear")
  expect_named(as.table(x), c("am (0)", "am (1)"))
  expect_snapshot(as.table(x))

  # messages - no missings to remove
  expect_silent(as.table(data_tabulate(mtcars, "cyl")))
  expect_silent(as.table(data_tabulate(mtcars, "cyl"), verbose = FALSE))
})


test_that("data_tabulate, table methods, only warn if necessary", {
  # missings
  data(efc)

  # single variable
  expect_message(as.table(data_tabulate(efc$c172code)))
  expect_silent(as.table(data_tabulate(efc$c172code, remove_na = TRUE)))
  expect_silent(as.table(data_tabulate(efc$c172code), remove_na = FALSE))
  expect_silent(as.table(data_tabulate(efc$c172code), verbose = FALSE))

  # cross table
  expect_message(
    as.table(data_tabulate(efc, "c172code", by = "e42dep")),
    regex = "Removing NA values"
  )
  expect_silent(as.table(data_tabulate(
    efc,
    "c172code",
    by = "e42dep",
    remove_na = TRUE
  )))
  expect_silent(as.table(
    data_tabulate(efc, "c172code", by = "e42dep"),
    remove_na = FALSE
  ))
  expect_silent(as.table(
    data_tabulate(efc, "c172code", by = "e42dep"),
    verbose = FALSE
  ))

  # no missings
  data(mtcars)

  # single variable
  expect_silent(as.table(data_tabulate(mtcars$gear)))
  expect_silent(as.table(data_tabulate(mtcars$gear, remove_na = TRUE)))
  expect_silent(as.table(data_tabulate(mtcars$gear), verbose = FALSE))

  # cross table
  expect_silent(as.table(data_tabulate(mtcars, "gear", by = "cyl")))
  expect_silent(as.table(data_tabulate(
    mtcars,
    "gear",
    by = "cyl",
    remove_na = TRUE
  )))
  expect_silent(as.table(
    data_tabulate(mtcars, "gear", by = "cyl"),
    verbose = FALSE
  ))

  # group DF throws no warning
  d <- data_group(mtcars, "am")
  expect_silent(as.table(data_tabulate(d, "cyl", by = "gear")))
})


test_that("data_tabulate, cross tables, extract proportions", {
  data(efc, package = "datawizard")
  out <- data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "row",
    remove_na = TRUE
  )
  tab <- table(efc$c172code, efc$e16sex) /
    rowSums(table(efc$c172code, efc$e16sex))
  dimnames(tab) <- list(c("1", "2", "3"), c("male", "female"))
  expect_equal(
    as.prop.table(out, verbose = FALSE),
    list(tab),
    ignore_attr = TRUE,
    tolerance = 1e-4
  )
  expect_equal(
    as.prop.table(out, verbose = FALSE, simplify = TRUE),
    tab,
    ignore_attr = TRUE,
    tolerance = 1e-4
  )
  out <- data_tabulate(
    efc,
    "c172code",
    by = "e16sex",
    proportions = "col",
    remove_na = TRUE
  )
  tab <- as.table(t(
    t(table(efc$c172code, efc$e16sex)) /
      colSums(table(efc$c172code, efc$e16sex))
  ))
  dimnames(tab) <- list(c("1", "2", "3"), c("male", "female"))
  expect_equal(
    as.prop.table(out, verbose = FALSE),
    list(tab),
    ignore_attr = TRUE,
    tolerance = 1e-4
  )
  expect_equal(
    as.prop.table(out, verbose = FALSE, simplify = TRUE),
    tab,
    ignore_attr = TRUE,
    tolerance = 1e-4
  )
})


================================================
FILE: tests/testthat/test-data_to_factor.R
================================================
# numeric
test_that("to_factor", {
  x <- c(10, 11, 12)
  expect_identical(
    to_factor(x),
    structure(1:3, .Label = c("10", "11", "12"), class = "factor")
  )

  data(efc)
  x <- to_factor(efc$c172code)
  expect_identical(
    levels(x),
    c(
      "low level of education",
      "intermediate level of education",
      "high level of education"
    )
  )
  x <- to_factor(efc$c172code, labels_to_levels = FALSE)
  expect_identical(levels(x), c("1", "2", "3"))
})

# numeric, partially labelled
test_that("to_factor", {
  x <- c(10, 11, 12)
  attr(x, "labels") <- c(ten = 10, twelve = 12)
  expect_message(
    expect_identical(
      to_factor(x),
      structure(1:3, levels = c("ten", "11", "twelve"), class = "factor")
    ),
    regexp = "Not all factor levels"
  )
})

# factor
test_that("to_factor", {
  data(efc)
  expect_identical(to_factor(efc$e42dep), efc$e42dep)
})

# data frame
test_that("to_factor", {
  data(iris)
  out <- to_factor(iris)
  expect_identical(out$Species, iris$Species)
  expect_true(all(vapply(out, is.factor, TRUE)))
  expect_identical(
    levels(out$Sepal.Length),
    c(
      "4.3",
      "4.4",
      "4.5",
      "4.6",
      "4.7",
      "4.8",
      "4.9",
      "5",
      "5.1",
      "5.2",
      "5.3",
      "5.4",
      "5.5",
      "5.6",
      "5.7",
      "5.8",
      "5.9",
      "6",
      "6.1",
      "6.2",
      "6.3",
      "6.4",
      "6.5",
      "6.6",
      "6.7",
      "6.8",
      "6.9",
      "7",
      "7.1",
      "7.2",
      "7.3",
      "7.4",
      "7.6",
      "7.7",
      "7.9"
    )
  )

  out <- to_factor(iris, select = starts_with("Sep"), append = TRUE)
  expect_identical(
    colnames(out),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "Sepal.Length_f",
      "Sepal.Width_f"
    )
  )
  expect_identical(sum(vapply(out, is.factor, TRUE)), 3L)
})


# select helpers ------------------------------
test_that("to_factor regex", {
  expect_identical(
    to_factor(mtcars, select = "yl", regex = TRUE),
    to_factor(mtcars, select = "cyl")
  )
  expect_identical(
    to_factor(mtcars, select = "yl$", regex = TRUE),
    to_factor(mtcars, select = "cyl")
  )
})


# SPSS file, many value labels  -----------------------------------

skip_if_not_installed("httr")
skip_if_not_installed("haven")

skip_on_cran()

skip_if_not_installed("curl")
skip_if_offline()

# Output validated against SPSS output from original dataset

test_that("data_read, convert many labels correctly", {
  temp_file <- tempfile(fileext = ".sav")
  request <- httr::GET(
    "https://raw.github.com/easystats/circus/master/data/spss_many_labels.sav"
  )
  httr::stop_for_status(request)
  writeBin(httr::content(request, type = "raw"), temp_file)

  d <- data_read(
    temp_file,
    convert_factors = FALSE,
    verbose = FALSE
  )
  expect_identical(
    levels(to_factor(d$selv1)),
    c(
      "Vignette 1 weiblich (Gülsen E. Reinigungskraft B)",
      "Vignette 2 weiblich (Gülsen E. Anwältin B)",
      "Vignette 3 weiblich (Monika E. Reinigungskraft B)",
      "Vignette 4 weiblich (Monika E. Anwältin B)",
      "Vignette 5 männlich (Hasan E. Reinigungskraft B)",
      "Vignette 6 männlich (Hasan E. Anwalt B)",
      "Vignette 7 männlich (Martin E. Reinigungskraft B)",
      "Vignette 8 männlich (Martin E. Anwalt B)",
      "Vignette 9 weiblich (Gülsen E. Reinigungskraft E)",
      "Vignette 10 weiblich (Gülsen E. Anwältin E)",
      "Vignette 11 weiblich (Monika E. Reinigungskraft E)",
      "Vignette 12 weiblich (Monika E. Anwältin E)",
      "Vignette 13 männlich (Hasan E. Reinigungskraft E)",
      "Vignette 14 männlich (Hasan E. Anwalt E)",
      "Vignette 15 männlich (Martin E. Reinigungskraft E)",
      "Vignette 16 männlich (Martin E. Anwalt E)"
    )
  )
  expect_snapshot(data_tabulate(to_factor(d$selv1)))

  expect_identical(levels(to_factor(d$c12)), c("ja", "nein", "keine Angabe"))
  expect_snapshot(data_tabulate(to_factor(d$c12)))

  expect_identical(
    levels(to_factor(d$c12a)),
    c("Filter", "ja", "nein", "keine Angabe")
  )
  expect_snapshot(data_tabulate(to_factor(d$c12a)))
  expect_identical(
    levels(to_factor(d$c12c)),
    c(
      "Filter",
      "0 = keine",
      "1",
      "2",
      "3",
      "4",
      "5",
      "6",
      "7",
      "8",
      "9",
      "10 = sehr starke",
      "weiß nicht / keine Angabe"
    )
  )
  expect_snapshot(data_tabulate(to_factor(d$c12c)))
  unlink(temp_file)
})


test_that("to_factor works with haven_labelled, convert many labels correctly", {
  skip_if_not_installed("withr")
  withr::with_tempfile("temp_file", fileext = ".sav", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/EFC.sav"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- haven::read_spss(temp_file)
    x <- to_factor(d$c172code)
    expect_identical(
      levels(x),
      c(
        "low level of education",
        "intermediate level of education",
        "high level of education"
      )
    )
  })
})


================================================
FILE: tests/testthat/test-data_to_long.R
================================================
set.seed(123)
wide_data <- data.frame(replicate(3, sample.int(5)))

test_that("data_to_long works", {
  expect_equal(
    head(data_to_long(wide_data)),
    data.frame(
      Name = c("X1", "X2", "X3", "X1", "X2", "X3"),
      Value = c(3L, 3L, 2L, 2L, 1L, 3L),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )

  expect_equal(
    head(data_to_long(
      wide_data,
      select = c(1, 2),
      names_to = "Column",
      values_to = "Numbers",
      rows_to = "Row"
    )),
    data.frame(
      X3 = c(2L, 2L, 3L, 3L, 1L, 1L),
      Row = c(1, 1, 2, 2, 3, 3),
      Column = c("X1", "X2", "X1", "X2", "X1", "X2"),
      Numbers = c(3L, 3L, 2L, 1L, 5L, 2L),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_to_long works - using row names as idvar", {
  data(mtcars)
  out <- data_to_long(mtcars, select = 2:4)
  expect_equal(
    dim(out),
    c(96, 10),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    colnames(out),
    c("mpg", "drat", "wt", "qsec", "vs", "am", "gear", "carb", "name", "value"),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
  expect_equal(
    head(out$value),
    c(8, 304, 150, 8, 472, 205),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )
})


test_that("data_to_long works - complex dataset", {
  skip_if_not_installed("psych")
  data <- psych::bfi

  long <- data_to_long(
    data,
    select = regex("\\d"),
    names_to = "Item",
    values_to = "Score",
    rows_to = "Participant"
  )

  expect_snapshot(str(long))

  long$Facet <- gsub("\\d", "", long$Item)
  long$Item <- gsub("[A-Z]", "", long$Item)
  long$Item <- paste0("I", long$Item)

  long1 <- data_to_long(
    data,
    select = starts_with("A"),
    names_to = "Item",
    values_to = "Score",
    rows_to = "Participant"
  )

  expect_identical(unique(long1$Item), c("A1", "A2", "A3", "A4", "A5"))
  expect_identical(unique(long1$Score), c(2L, 4L, 3L, 5L, 6L, 1L, NA))
  expect_identical(ncol(long1), 26L)
  expect_identical(nrow(long1), 14000L)

  long1 <- data_to_long(
    data,
    select = starts_with("a"),
    names_to = "Item",
    values_to = "Score",
    rows_to = "Participant"
  )

  expect_identical(ncol(long1), 30L)
  expect_identical(nrow(long1), nrow(data))

  long1 <- data_to_long(
    data,
    select = starts_with("a"),
    names_to = "Item",
    values_to = "Score",
    rows_to = "Participant",
    ignore_case = TRUE
  )

  expect_identical(unique(long1$Item), c("A1", "A2", "A3", "A4", "A5", "age"))
  expect_identical(ncol(long1), 25L)
  expect_identical(nrow(long1), 16800L)

  long1 <- data_to_long(
    data,
    select = c(1:5, 28),
    names_to = "Item",
    values_to = "Score",
    rows_to = "Participant",
    ignore_case = TRUE
  )

  expect_identical(unique(long1$Item), c("A1", "A2", "A3", "A4", "A5", "age"))
  expect_identical(ncol(long1), 25L)
  expect_identical(nrow(long1), 16800L)
})


test_that("data_to_long: arg 'cols' overrides 'select'", {
  skip_if_not_installed("psych")
  data <- psych::bfi

  expect_identical(
    data_to_long(
      wide_data,
      select = c(1, 2),
      names_to = "Column",
      values_to = "Numbers",
      rows_to = "Row"
    ),
    data_to_long(
      wide_data,
      cols = c(1, 2),
      names_to = "Column",
      values_to = "Numbers",
      rows_to = "Row"
    )
  )

  expect_identical(
    data_to_long(
      data,
      cols = regex("\\d"),
      names_to = "Item",
      values_to = "Score",
      rows_to = "Participant"
    ),
    data_to_long(
      data,
      select = regex("\\d"),
      names_to = "Item",
      values_to = "Score",
      rows_to = "Participant"
    )
  )

  expect_identical(
    data_to_long(
      data,
      cols = starts_with("A"),
      names_to = "Item",
      values_to = "Score",
      rows_to = "Participant"
    ),
    data_to_long(
      data,
      select = starts_with("A"),
      names_to = "Item",
      values_to = "Score",
      rows_to = "Participant"
    )
  )
})


d <- data.frame(
  age = c(20, 30, 40),
  sex = c("Female", "Male", "Male"),
  score_t1 = c(30, 35, 32),
  score_t2 = c(33, 34, 37),
  speed_t1 = c(2, 3, 1),
  speed_t2 = c(3, 4, 5),
  stringsAsFactors = FALSE
)

test_that("data_to_long works as expected - simple dataset", {
  out <- data_to_long(d, starts_with("score"))
  expect_identical(
    out$name,
    c("score_t1", "score_t2", "score_t1", "score_t2", "score_t1", "score_t2")
  )
  expect_identical(
    out$value,
    c(d$score_t1, d$score_t2)[c(1, 4, 2, 5, 3, 6)]
  )

  out <- data_to_long(
    d,
    contains("t2"),
    names_to = "NewCol",
    values_to = "Time"
  )
  expect_identical(
    out$NewCol,
    c("score_t2", "speed_t2", "score_t2", "speed_t2", "score_t2", "speed_t2")
  )
  expect_identical(out$Time, c(33, 3, 34, 4, 37, 5))
})


test_that("data_to_long works as expected - select-helper inside functions, using regex", {
  test_fun <- function(data, i) {
    data_to_long(data, select = i, regex = TRUE)
  }
  out <- test_fun(d, "^score")
  expect_identical(
    out$name,
    c("score_t1", "score_t2", "score_t1", "score_t2", "score_t1", "score_t2")
  )
  expect_identical(
    out$value,
    c(d$score_t1, d$score_t2)[c(1, 4, 2, 5, 3, 6)]
  )
})


test_that("data_to_long: need to provide sep or pattern if several names_to", {
  expect_error(
    data_to_long(wide_data, names_to = c("foo", "foo2")),
    "you supply multiple names"
  )
})

test_that("data_to_long: can't use sep or pattern if only one names_to", {
  expect_error(
    data_to_long(wide_data, names_to = "foo", names_sep = "_"),
    "can't use `names_sep`"
  )

  expect_error(
    data_to_long(wide_data, names_to = "foo", names_pattern = "_"),
    "can't use `names_pattern`"
  )
})

test_that("data_to_long: error if no columns to reshape", {
  # since #602, we no longer have the case that .select_nse() returns no
  # columns, because we error before when no column found, instead of returning
  # NULL or a vector of lenght zero.
  expect_error(
    data_to_long(wide_data, cols = "foo"),
    "Possibly misspelled"
  )
})


# EQUIVALENCE WITH TIDYR - PIVOT_LONGER -------------------------------------------

# Examples coming from: https://tidyr.tidyverse.org/articles/pivot.html#longer

test_that("data_to_long equivalent to pivot_longer: ex 1", {
  skip_if_not_installed("tidyr")

  x <- tidyr::relig_income %>% # nolint
    tidyr::pivot_longer(!religion, names_to = "income", values_to = "count")

  y <- tidyr::relig_income %>% # nolint
    data_to_long(cols = -religion, names_to = "income", values_to = "count")

  expect_equal(x, y, ignore_attr = TRUE)
})


test_that("data_to_long equivalent to pivot_longer: ex 2", {
  skip_if_not_installed("tidyr")

  x <- tidyr::billboard %>%
    tidyr::pivot_longer(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    )

  y <- tidyr::billboard %>%
    data_to_long(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    )

  expect_equal(x, y, ignore_attr = TRUE)
})


test_that("data_to_long equivalent to pivot_longer: ex 3", {
  skip_if_not_installed("tidyr")

  x <- tidyr::billboard %>%
    tidyr::pivot_longer(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank",
      values_drop_na = TRUE
    )

  y <- tidyr::billboard %>%
    data_to_long(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank",
      values_drop_na = TRUE
    )

  expect_equal(x, y, ignore_attr = TRUE)
})


test_that("data_to_long equivalent to pivot_longer: ex 4", {
  skip_if_not_installed("tidyr")

  x <- tidyr::billboard %>%
    tidyr::pivot_longer(
      cols = starts_with("wk"),
      names_to = "week",
      names_prefix = "wk",
      values_to = "rank",
      values_drop_na = TRUE
    )

  y <- tidyr::billboard %>%
    data_to_long(
      select = starts_with("wk"),
      names_to = "week",
      names_prefix = "wk",
      values_to = "rank",
      values_drop_na = TRUE
    )

  expect_equal(x, y, ignore_attr = TRUE)
})


test_that("data_to_long equivalent to pivot_longer: ex 5", {
  skip_if_not_installed("tidyr")

  suppressWarnings({
    x <- tidyr::who %>%
      tidyr::pivot_longer(
        cols = 5:60,
        names_to = c("diagnosis", "gender", "age"),
        names_sep = "_",
        values_to = "count"
      )
  })

  y <- tidyr::who %>%
    data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_sep = "_",
      values_to = "count"
    )

  expect_equal(x, y, ignore_attr = TRUE)
})

test_that("data_to_long equivalent to pivot_longer: ex 6", {
  skip_if_not_installed("tidyr")

  x <- tidyr::who %>%
    tidyr::pivot_longer(
      cols = new_sp_m014:newrel_f65,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    )

  y <- tidyr::who %>%
    data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    )

  expect_equal(x, y, ignore_attr = TRUE)
})


# tests coming from tidyr's repo
# https://github.com/tidyverse/tidyr/blob/main/tests/testthat/test-pivot-long.R

test_that("can reshape all cols to long", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(x = 1:2, y = 3:4)
  pv <- data_to_long(df, x:y)

  expect_named(pv, c("name", "value"))
  expect_identical(pv$name, rep(names(df), 2))
  expect_identical(pv$value, c(1L, 3L, 2L, 4L))
})

test_that("values interleaved correctly", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(
    x = c(1, 2),
    y = c(10, 20),
    z = c(100, 200)
  )
  pv <- data_to_long(df, 1:3)

  expect_identical(pv$value, c(1, 10, 100, 2, 20, 200))
})

test_that("preserves original keys", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(x = 1:2, y = 2, z = 1:2)
  pv <- data_to_long(df, y:z)

  expect_named(pv, c("x", "name", "value"))
  expect_identical(pv$x, rep(df$x, each = 2))
})

test_that("can drop missing values", {
  skip_if_not_installed("tidyr")

  df <- data.frame(x = c(1, NA), y = c(NA, 2))
  pv <- data_to_long(df, x:y, values_drop_na = TRUE)

  expect_identical(pv$name, c("x", "y"))
  expect_identical(pv$value, c(1, 2))
})

test_that("mixed columns are automatically coerced", {
  skip_if_not_installed("tidyr")

  df <- data.frame(x = factor("a"), y = factor("b"))
  pv <- data_to_long(df, x:y)

  expect_identical(pv$value, factor(c("a", "b")))
})

test_that("error when overwriting existing column", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(x = 1, y = 2)

  expect_error(
    data_to_long(df, y, names_to = "x"),
    regexp = "are already present"
  )
})

test_that("preserve date format", {
  skip_if_not_installed("tidyr")

  family <- tidyr::tibble(
    family = 1:3,
    dob_child1 = as.Date(c("1998-11-26", "2004-10-10", "2000-12-05")),
    dob_child2 = as.Date(c("2000-01-29", NA, "2004-04-05"))
  )

  tidyr <- tidyr::pivot_longer(family, !family, names_to = "child")
  datawiz <- data_to_long(family, -family, names_to = "child")

  expect_identical(tidyr, datawiz)
})


test_that("works with labelled data", {
  data(efc, package = "datawizard")
  out <- data_to_long(
    efc,
    select = c("e16sex", "c172code"),
    names_to = "Dummy",
    values_to = "Score"
  )
  expect_identical(nrow(out), 200L)
  expect_identical(attributes(out$e42dep)$label, "elder's dependency")
})


test_that("don't convert factors to integer", {
  data("mtcars")
  mtcars <- mtcars[c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 18L, 29L, 31L), ]
  mtcars$am_f <- factor(mtcars$am)
  mtcars$cyl_f <- factor(mtcars$cyl)

  mtcars$id <- factor(seq_len(nrow(mtcars)))
  mtcars_long <- data_to_long(
    mtcars,
    select = c("mpg", "qsec", "disp"),
    names_to = "g"
  )
  expect_snapshot(print(mtcars_long))
})


test_that("tell user about typos", {
  data("mtcars")
  expect_silent(data_to_long(
    mtcars,
    select = c("mpg", "hp", "disp"),
    names_to = "time",
    values_to = "count"
  ))
  expect_error(
    data_to_long(
      mtcars,
      select = c("mpg", "ho", "dist"),
      names_to = "time",
      values_to = "count"
    ),
    regex = "Following"
  )
})


================================================
FILE: tests/testthat/test-data_to_numeric.R
================================================
test_that("convert data frame to numeric", {
  expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = TRUE))
  expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = FALSE))
})

test_that("convert character to numeric", {
  expect_identical(to_numeric(c("xyz", "ab")), c(2, 1))
})

test_that("convert character to numeric Date", {
  expect_warning(expect_identical(
    to_numeric(as.Date("2022-01-01")),
    as.numeric(as.Date("2022-01-01"))
  ))
  expect_warning(expect_identical(
    to_numeric(as.POSIXct("2022-01-01")),
    as.numeric(as.POSIXct("2022-01-01"))
  ))
  expect_warning(expect_identical(
    to_numeric(as.POSIXlt("2022-01-01")),
    as.numeric(as.POSIXlt("2022-01-01"))
  ))
})

test_that("convert character to numeric preserve levels", {
  x <- head(as.factor(mtcars$gear))
  expect_identical(
    to_numeric(x, dummy_factors = FALSE),
    c(2, 2, 2, 1, 1, 1)
  )
  expect_identical(
    to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE),
    c(4, 4, 4, 3, 3, 3)
  )
})

test_that("convert character to numeric lowest", {
  d <- head(mtcars)
  d$vs <- as.factor(d$vs)
  model <- glm(vs ~ wt + mpg, data = d, family = "binomial")
  expect_identical(
    to_numeric(insight::get_response(model), dummy_factors = FALSE),
    c(1, 1, 2, 2, 1, 2)
  )
  expect_identical(
    to_numeric(insight::get_response(model), dummy_factors = FALSE, lowest = 0),
    c(0, 0, 1, 1, 0, 1)
  )
})

test_that("convert factor to numeric", {
  f <- factor(substring("statistics", 1:10, 1:10))
  expect_snapshot(to_numeric(f, dummy_factors = TRUE))
})

test_that("convert factor to numeric", {
  expect_identical(to_numeric(c("abc", "xyz")), c(1, 2))
  expect_identical(to_numeric(c("123", "789")), c(123, 789))
  expect_identical(to_numeric(c("1L", "2e-3")), c(1, 0.002))
  expect_identical(to_numeric(c("1L", "2e-3", "ABC")), c(1, 2, 3))
})

test_that("convert factor to numeric, dummy factors", {
  expect_identical(
    to_numeric(c("abc", "xyz"), dummy_factors = TRUE),
    data.frame(abc = c(1, 0), xyz = c(0, 1)),
    ignore_attr = TRUE
  )
  expect_identical(
    to_numeric(c("1L", "2e-3", "ABC"), dummy_factors = TRUE),
    data.frame(`1L` = c(1, 0, 0), `2e-3` = c(0, 1, 0), ABC = c(0, 0, 1)),
    ignore_attr = TRUE
  )
})

test_that("convert factor to numeric, append", {
  data(efc)
  expect_identical(
    colnames(to_numeric(efc, dummy_factors = TRUE)),
    c(
      "c12hour",
      "e16sex",
      "e42dep.1",
      "e42dep.2",
      "e42dep.3",
      "e42dep.4",
      "c172code",
      "neg_c_7"
    ),
    ignore_attr = TRUE
  )
  expect_identical(
    colnames(to_numeric(efc, dummy_factors = TRUE, append = TRUE)),
    c(
      "c12hour",
      "e16sex",
      "e42dep",
      "c172code",
      "neg_c_7",
      "e42dep_n",
      "e42dep_n.1",
      "e42dep_n.2",
      "e42dep_n.3",
      "e42dep_n.4"
    ),
    ignore_attr = TRUE
  )
  expect_identical(
    colnames(to_numeric(efc, append = TRUE, dummy_factors = FALSE)),
    c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_n"),
    ignore_attr = TRUE
  )
  expect_identical(
    colnames(to_numeric(efc, append = FALSE, dummy_factors = FALSE)),
    c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7"),
    ignore_attr = TRUE
  )
})

test_that("convert factor to numeric, all numeric", {
  data(mtcars)
  expect_identical(to_numeric(mtcars), mtcars)
})

test_that("convert factor to numeric, dummy factors, with NA", {
  x1 <- factor(rep(c("a", "b"), 3))
  x2 <- factor(c("a", NA_character_, "a", "b", "a", "b"))
  x3 <- factor(c(NA_character_, "b", "a", "b", "a", "b"))
  x4 <- factor(c("a", "b", "a", "b", "a", NA_character_))
  x5 <- factor(c(NA_character_, "b", "a", "b", "a", NA_character_))
  x6 <- factor(c(NA_character_, "b", NA_character_, "b", "a", NA_character_))
  x7 <- factor(c(
    NA_character_,
    "b",
    "a",
    "b",
    "a",
    "b",
    NA_character_,
    "b",
    "a",
    NA_character_,
    "a",
    "b",
    "a",
    "b",
    "a",
    NA_character_
  ))

  # same observations are missing
  expect_identical(
    which(!complete.cases(to_numeric(x1, dummy_factors = TRUE))),
    which(is.na(x1))
  )
  expect_identical(
    which(!complete.cases(to_numeric(x2, dummy_factors = TRUE))),
    which(is.na(x2))
  )
  expect_identical(
    which(!complete.cases(to_numeric(x3, dummy_factors = TRUE))),
    which(is.na(x3))
  )
  expect_identical(
    which(!complete.cases(to_numeric(x4, dummy_factors = TRUE))),
    which(is.na(x4))
  )
  expect_identical(
    which(!complete.cases(to_numeric(x5, dummy_factors = TRUE))),
    which(is.na(x5))
  )
  expect_identical(
    which(!complete.cases(to_numeric(x6, dummy_factors = TRUE))),
    which(is.na(x6))
  )
  expect_identical(
    which(!complete.cases(to_numeric(x7, dummy_factors = TRUE))),
    which(is.na(x7))
  )

  # output has same number of observation as input
  expect_identical(nrow(to_numeric(x1, dummy_factors = TRUE)), length(x1))
  expect_identical(nrow(to_numeric(x2, dummy_factors = TRUE)), length(x2))
  expect_identical(nrow(to_numeric(x3, dummy_factors = TRUE)), length(x3))
  expect_identical(nrow(to_numeric(x4, dummy_factors = TRUE)), length(x4))
  expect_identical(nrow(to_numeric(x5, dummy_factors = TRUE)), length(x5))
  expect_identical(nrow(to_numeric(x6, dummy_factors = TRUE)), length(x6))
  expect_identical(nrow(to_numeric(x7, dummy_factors = TRUE)), length(x7))
})

test_that("to_numeric, inverse factor levels", {
  f <- c(0, 0, 1, 1, 1, 0)
  x1 <- factor(f, levels = c(0, 1))
  x2 <- factor(f, levels = c(1, 0))
  out <- to_numeric(x1, dummy_factors = FALSE, preserve_levels = FALSE)
  expect_identical(out, c(1, 1, 2, 2, 2, 1))
  out <- to_numeric(x2, dummy_factors = FALSE, preserve_levels = FALSE)
  expect_identical(out, c(2, 2, 1, 1, 1, 2))
  out <- to_numeric(x1, dummy_factors = FALSE, preserve_levels = TRUE)
  expect_identical(out, c(0, 0, 1, 1, 1, 0))
  out <- to_numeric(x2, dummy_factors = FALSE, preserve_levels = TRUE)
  expect_identical(out, c(1, 1, 0, 0, 0, 1))
})

# select helpers ------------------------------
test_that("to_numeric regex", {
  expect_identical(
    to_numeric(mtcars, select = "pg", regex = TRUE),
    to_numeric(mtcars, select = "mpg")
  )
})


test_that("to_numeric works with haven_labelled, convert many labels correctly", {
  skip_on_cran()
  skip_if_not_installed("httr")
  skip_if_not_installed("haven")
  skip_if_not_installed("withr")
  skip_if_not_installed("curl")
  skip_if_offline()

  withr::with_tempfile("temp_file", fileext = ".sav", code = {
    request <- httr::GET(
      "https://raw.github.com/easystats/circus/main/data/EFC.sav"
    )
    httr::stop_for_status(request)
    writeBin(httr::content(request, type = "raw"), temp_file)

    d <- haven::read_spss(temp_file)
    x <- to_numeric(d$c172code)
    expect_identical(as.vector(table(x)), c(180L, 506L, 156L))
  })
})


test_that("to_numeric preserves correct label order", {
  x <- factor(c(1, 2, 3, 4))
  x <- assign_labels(x, values = c("one", "two", "three", "four"))
  out <- to_numeric(x, dummy_factors = FALSE)
  expect_identical(
    attributes(out)$labels,
    c(one = 1, two = 2, three = 3, four = 4)
  )
  # correctly reverse scale
  out <- to_numeric(reverse_scale(x), dummy_factors = FALSE)
  expect_identical(
    attributes(out)$labels,
    c(one = 4, two = 3, three = 2, four = 1)
  )
  # factor with alphabetical values
  x <- factor(letters[1:4])
  x <- assign_labels(x, values = c("one", "two", "three", "four"))
  out <- to_numeric(x, dummy_factors = FALSE)
  expect_identical(
    attributes(out)$labels,
    c(one = 1, two = 2, three = 3, four = 4)
  )
  # correctly reverse scale
  out <- to_numeric(reverse_scale(x), dummy_factors = FALSE)
  expect_identical(
    attributes(out)$labels,
    c(one = 4, two = 3, three = 2, four = 1)
  )
})


================================================
FILE: tests/testthat/test-data_to_wide.R
================================================
test_that("data_to_wide works", {
  long_data <- data.frame(
    Row_ID = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5),
    name = c(
      "X1",
      "X1",
      "X1",
      "X1",
      "X1",
      "X2",
      "X2",
      "X2",
      "X2",
      "X2",
      "X3",
      "X3",
      "X3",
      "X3",
      "X3"
    ),
    value = c(3L, 2L, 5L, 4L, 1L, 3L, 1L, 2L, 5L, 4L, 2L, 3L, 1L, 4L, 5L),
    stringsAsFactors = FALSE
  )

  expect_equal(
    data_to_wide(
      long_data,
      names_from = "name",
      values_from = "value",
      id_cols = "Row_ID"
    ),
    data.frame(
      Row_ID = c(1, 2, 3, 4, 5),
      X1 = c(3L, 2L, 5L, 4L, 1L),
      X2 = c(3L, 1L, 2L, 5L, 4L),
      X3 = c(2L, 3L, 1L, 4L, 5L),
      stringsAsFactors = FALSE
    ),
    ignore_attr = TRUE,
    tolerance = 1e-3
  )

  long_data$X1 <- 5
  expect_error(
    data_to_wide(
      long_data,
      names_from = "name",
      values_from = "value",
      id_cols = "Row_ID"
    ),
    regexp = "Some values of the columns specified in `names_from`"
  )
})


test_that("data_to_wide, names_prefix works", {
  skip_if_not_installed("tidyr")

  out <- data_to_wide(
    tidyr::fish_encounters,
    names_from = "station",
    values_from = "seen",
    names_prefix = "foo_"
  )

  expect_named(
    out,
    c(
      "fish",
      "foo_Release",
      "foo_I80_1",
      "foo_Lisbon",
      "foo_Rstr",
      "foo_Base_TD",
      "foo_BCE",
      "foo_BCW",
      "foo_BCE2",
      "foo_BCW2",
      "foo_MAE",
      "foo_MAW"
    )
  )
})


test_that("data_to_wide, values_fill deprecated", {
  skip_if_not_installed("tidyr")

  expect_warning(
    data_to_wide(
      tidyr::fish_encounters,
      names_from = "station",
      values_from = "seen",
      values_fill = c(1, 2)
    ),
    regexp = "`values_fill` is defunct",
    fixed = TRUE
  )
})


# EQUIVALENCE WITH TIDYR - PIVOT_WIDER -----------------------------------------------

# Examples coming from: https://tidyr.tidyverse.org/articles/pivot.html#wider
# and from https://github.com/tidyverse/tidyr/blob/main/tests/testthat/test-pivot-wide.R

### From tidyr tests

test_that("can pivot all cols to wide", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(key = c("x", "y", "z"), val = 1:3)
  pv <- data_to_wide(df, names_from = "key", values_from = "val")

  expect_named(pv, c("x", "y", "z"))
  expect_identical(nrow(pv), 1L)
})

test_that("non-pivoted cols are preserved", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(a = 1, key = c("x", "y"), val = 1:2)
  pv <- data_to_wide(df, names_from = "key", values_from = "val")

  expect_named(pv, c("a", "x", "y"))
  expect_identical(nrow(pv), 1L)
})

test_that("implicit missings turn into explicit missings", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(a = 1:2, key = c("x", "y"), val = 1:2)
  pv <- data_to_wide(df, names_from = "key", values_from = "val")

  expect_identical(pv$a, c(1L, 2L))
  expect_identical(pv$x, c(1L, NA))
  expect_identical(pv$y, c(NA, 2L))
})

test_that("error when overwriting existing column", {
  skip_if_not_installed("tidyr")

  df <- tidyr::tibble(
    a = c(1, 1),
    key = c("a", "b"),
    val = c(1, 2)
  )

  expect_error(
    data_to_wide(df, names_from = "key", values_from = "val"),
    regexp = "Some values of the columns specified"
  )
})

test_that("data_to_wide: fill values, #293", {
  skip_if_not_installed("tidyr")

  weekdays <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

  daily <- tidyr::tibble(
    day = factor(c("Tue", "Thu", "Fri", "Mon"), levels = weekdays),
    value = c(2, 3, 1, 5),
    type = factor(c("A", "B", "B", "A"))
  )

  expect_identical(
    tidyr::pivot_wider(
      daily,
      names_from = type,
      values_from = value
    ),
    data_to_wide(
      daily,
      names_from = "type",
      values_from = "value"
    )
  )
})

test_that("data_to_wide, id_cols works correctly, #293", {
  skip_if_not_installed("tidyr")

  updates <- tidyr::tibble(
    county = c("Wake", "Wake", "Wake", "Guilford", "Guilford"),
    date = c(as.Date("2020-01-01") + 0:2, as.Date("2020-01-03") + 0:1),
    system = c("A", "B", "C", "A", "C"),
    value = c(3.2, 4, 5.5, 2, 1.2)
  )

  expect_identical(
    tidyr::pivot_wider(
      updates,
      id_cols = county,
      names_from = system,
      values_from = value
    ),
    data_to_wide(
      updates,
      id_cols = "county",
      names_from = "system",
      values_from = "value"
    )
  )
})


### Examples from tidyr website

test_that("data_to_wide equivalent to pivot_wider: ex 1", {
  skip_if_not_installed("tidyr")

  x <- tidyr::pivot_wider(
    tidyr::fish_encounters,
    names_from = "station",
    values_from = "seen"
  )

  y <- data_to_wide(
    tidyr::fish_encounters,
    names_from = "station",
    values_from = "seen"
  )

  expect_equal(x, y, ignore_attr = TRUE)
})

test_that("data_to_wide equivalent to pivot_wider: ex 2", {
  skip_if_not_installed("tidyr")

  production <- tidyr::expand_grid(
    product = c("A", "B"),
    country = c("AI", "EI"),
    year = 2000:2014
  ) %>%
    data_filter((product == "A" & country == "AI") | product == "B")

  production$production <- rnorm(nrow(production))

  x <- production %>%
    tidyr::pivot_wider(
      names_from = c(product, country),
      values_from = production
    )

  y <- production %>%
    data_to_wide(
      names_from = c("product", "country"),
      values_from = "production"
    )

  expect_identical(x, y)
})

test_that("data_to_wide equivalent to pivot_wider: ex 3", {
  skip_if_not_installed("tidyr")

  x <- tidyr::us_rent_income %>%
    tidyr::pivot_wider(
      names_from = variable,
      values_from = c(estimate, moe)
    )

  y <- tidyr::us_rent_income %>%
    data_to_wide(
      names_from = "variable",
      values_from = c("estimate", "moe")
    )

  expect_identical(x, y)
})

test_that("data_to_wide equivalent to pivot_wider: ex 4", {
  skip_if_not_installed("tidyr")

  x <- tidyr::us_rent_income %>%
    tidyr::pivot_wider(
      names_from = variable,
      names_sep = ".",
      values_from = c(estimate, moe)
    )

  y <- tidyr::us_rent_income %>%
    data_to_wide(
      names_from = "variable",
      names_sep = ".",
      values_from = c("estimate", "moe")
    )

  expect_identical(x, y)
})

test_that("data_to_wide equivalent to pivot_wider: ex 5", {
  skip_if_not_installed("tidyr")

  contacts <- tidyr::tribble(
    ~field, ~value,
    "name", "Jiena McLellan",
    "company", "Toyota",
    "name", "John Smith",
    "company", "google",
    "email", "john@google.com",
    "name", "Huxley Ratcliffe"
  )
  contacts$person_id <- cumsum(contacts$field == "name")

  x <- tidyr::pivot_wider(contacts, names_from = field, values_from = value)
  y <- data_to_wide(contacts, names_from = "field", values_from = "value")

  expect_identical(x, y)
})


test_that("data_to_wide equivalent to pivot_wider: ex 6", {
  skip_if_not_installed("tidyr")

  production <- tidyr::expand_grid(
    product = c("A", "B"),
    country = c("AI", "EI"),
    year = 2000:2014
  ) %>%
    data_filter((product == "A" & country == "AI") | product == "B")

  production$production <- rnorm(nrow(production))

  x <- production %>%
    tidyr::pivot_wider(
      names_from = c(product, country),
      values_from = production,
      names_glue = "prod_{product}_{country}"
    )

  y <- production %>%
    data_to_wide(
      names_from = c("product", "country"),
      values_from = "production",
      names_glue = "prod_{product}_{country}"
    )

  expect_identical(x, y)
})


test_that("data_to_wide, names_glue works", {
  skip_if_not_installed("tidyr")

  df <- data.frame(
    food = c(
      "banana",
      "banana",
      "banana",
      "banana",
      "cheese",
      "cheese",
      "cheese",
      "cheese"
    ),
    binary = rep(c("yes", "no"), 4),
    car = c(
      "toyota",
      "subaru",
      "mazda",
      "skoda",
      "toyota",
      "subaru",
      "mazda",
      "skoda"
    ),
    fun = c(2, 4, 3, 6, 2, 4, 2, 3),
    stringsAsFactors = FALSE
  )

  x <- df %>%
    tidyr::pivot_wider(
      id_cols = food,
      names_from = c(car, binary),
      names_glue = "{binary}_{car}",
      values_from = fun
    )

  y <- df %>%
    data_to_wide(
      id_cols = "food",
      names_from = c("car", "binary"),
      names_glue = "{binary}_{car}",
      values_from = "fun"
    )

  expect_identical(x, y, ignore_attr = TRUE)
})


test_that("preserve date format", {
  skip_if_not_installed("tidyr")

  family <- tidyr::tibble(
    family = c(1L, 1L, 2L, 2L, 3L, 3L),
    child = c(
      "dob_child1",
      "dob_child2",
      "dob_child1",
      "dob_child2",
      "dob_child1",
      "dob_child2"
    ),
    value = as.Date(c(
      "1998-11-26",
      "2000-01-29",
      "2004-10-10",
      NA,
      "2000-12-05",
      "2004-04-05"
    ))
  )

  tidyr <- tidyr::pivot_wider(
    family,
    names_from = "child",
    values_from = "value"
  )
  datawiz <- data_to_wide(family, names_from = "child", values_from = "value")

  expect_identical(tidyr, datawiz)
})


test_that("#293", {
  skip_if_not_installed("tidyr")

  weekdays <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

  daily <- tidyr::tibble(
    day = factor(c("Tue", "Thu", "Fri", "Mon"), levels = weekdays),
    value = c(2, 3, 1, 5)
  )

  expect_identical(
    tidyr::pivot_wider(daily, names_from = day, values_from = value),
    data_to_wide(daily, names_from = "day", values_from = "value")
  )
})


test_that("new names starting with digits are not corrected automatically", {
  skip_if_not_installed("tidyr")

  percentages <- tidyr::tibble(
    year = c(2018, 2019, 2020, 2020),
    type = factor(c("A", "B", "A", "B"), levels = c("A", "B")),
    percentage = c(100, 100, 40, 60)
  )

  tidyr <- tidyr::pivot_wider(
    percentages,
    names_from = c(year, type),
    values_from = percentage
  )
  datawiz <- data_to_wide(
    percentages,
    names_from = c("year", "type"),
    values_from = "percentage"
  )
  expect_identical(tidyr, datawiz)
})


test_that("Preserve column name when names_from column only has one unique value", {
  d <- data.frame(
    Value = rnorm(10),
    Level = paste0("Participant_", 1:10),
    Parameter = "Intercept",
    stringsAsFactors = FALSE
  )
  out <- data_to_wide(
    d,
    values_from = "Value",
    names_from = "Parameter",
    names_sep = "_"
  )
  expect_named(out, c("Level", "Intercept"))
  expect_identical(nrow(out), 10L)

  d <- data.frame(
    Value = rnorm(10),
    Level = paste0("Participant_", 1:10),
    Parameter = c("Intercept", "abc"),
    stringsAsFactors = FALSE
  )
  out <- data_to_wide(
    d,
    values_from = "Value",
    names_from = "Parameter",
    names_sep = "_"
  )
  expect_named(out, c("Level", "Intercept", "abc"))
  expect_identical(nrow(out), 10L)
})


test_that("data_to_wide with multiple values_from and unbalanced panel", {
  skip_if_not_installed("tidyr")

  long_df <- tidyr::tibble(
    subject_id = c(1, 1, 2, 2, 3, 5, 4, 4),
    time = rep(c(1, 2), 4),
    score = c(10, NA, 15, 12, 18, 11, NA, 14),
    anxiety = c(5, 7, 6, NA, 8, 4, 5, NA)
  )

  tidyr <- tidyr::pivot_wider(
    long_df,
    id_cols = "subject_id",
    names_from = time,
    values_from = c(score, anxiety)
  )
  datawiz <- data_to_wide(
    long_df,
    id_cols = "subject_id",
    names_from = "time",
    values_from = c("score", "anxiety")
  )
  expect_identical(tidyr, datawiz)
})


test_that("data_to_wide preserves empty columns", {
  long_df <- data.frame(
    subject_id = c(1, 1, 2, 2, 3, 5, 4, 4),
    time = rep(c(1, 2), 4),
    score = c(10, NA, 15, 12, 18, 11, NA, 14),
    anxiety = c(5, 7, 6, NA, 8, 4, 5, NA),
    test = rep(NA_real_, 8)
  )

  out <- data_to_wide(
    long_df,
    id_cols = "subject_id",
    names_from = "time",
    values_from = c("score", "anxiety", "test")
  )

  expect_equal(
    out,
    data.frame(
      subject_id = c(1, 2, 3, 5, 4),
      score_1 = c(10, 15, 18, NA, NA),
      score_2 = c(NA, 12, NA, 11, 14),
      anxiety_1 = c(5, 6, 8, NA, 5),
      anxiety_2 = c(7, NA, NA, 4, NA),
      test_1 = as.double(c(NA, NA, NA, NA, NA)),
      test_2 = as.double(c(NA, NA, NA, NA, NA))
    ),
    ignore_attr = TRUE
  )
})


test_that("data_to_wide, check for valid columns", {
  long_df <- data.frame(
    subject_id = c(1, 1, 2, 2, 3, 5, 4, 4),
    time = rep(c(1, 2), 4),
    score = c(10, NA, 15, 12, 18, 11, NA, 14),
    anxiety = c(5, 7, 6, NA, 8, 4, 5, NA),
    test = rep(NA_real_, 8)
  )

  expect_error(
    data_to_wide(
      long_df,
      id_cols = "id",
      names_from = "time",
      values_from = c("score", "anxiety", "test")
    ),
    regexp = "`id_cols` must be the names of",
    fixed = TRUE
  )

  expect_error(
    data_to_wide(
      long_df,
      id_cols = "subject_id",
      names_from = "times",
      values_from = c("score", "anxiety", "test")
    ),
    regexp = "`names_from` must be the name of",
    fixed = TRUE
  )

  expect_warning(
    data_to_wide(
      long_df,
      id_cols = "subject_id",
      names_from = "time",
      values_from = c("scores", "anxiety", "test")
    ),
    regexp = "Following variable(s) were not found",
    fixed = TRUE
  )

  expect_error(
    expect_warning(expect_warning(expect_warning(
      data_to_wide(
        long_df,
        id_cols = "subject_id",
        names_from = "time",
        values_from = c("a", "b", "c")
      )
    ))),
    regexp = "No variable defined",
    fixed = TRUE
  )
})


test_that("data_to_wide, select helper for values_from", {
  long_df <- data.frame(
    subject_id = c(1, 1, 2, 2, 3, 5, 4, 4),
    time = rep(c(1, 2), 4),
    score_a = c(10, NA, 15, 12, 18, 11, NA, 14),
    score_b = c(5, 7, 6, NA, 8, 4, 5, NA),
    score_c = rep(NA_real_, 8)
  )

  out <- data_to_wide(
    long_df,
    id_cols = "subject_id",
    names_from = "time",
    values_from = starts_with("score_")
  )

  expect_equal(
    out,
    data.frame(
      subject_id = c(1, 2, 3, 5, 4),
      score_a_1 = c(10, 15, 18, NA, NA),
      score_a_2 = c(NA, 12, NA, 11, 14),
      score_a_1 = c(5, 6, 8, NA, 5),
      score_a_2 = c(7, NA, NA, 4, NA),
      score_a_1 = as.double(c(NA, NA, NA, NA, NA)),
      score_a_2 = as.double(c(NA, NA, NA, NA, NA))
    ),
    ignore_attr = TRUE
  )
})


================================================
FILE: tests/testthat/test-data_unique.R
================================================
# Preparations

df1 <- data.frame(
  id = c(1, 2, 3, 1, 3),
  year = c(2022, 2022, 2022, 2022, 2000),
  item1 = c(NA, 1, 1, 2, 3),
  item2 = c(NA, 1, 1, 2, 3),
  item3 = c(NA, 1, 1, 2, 3)
)

expected1 <- data.frame(
  id = c(1, 2, 3),
  year = c(2022, 2022, 2022),
  item1 = c(2, 1, 1),
  item2 = c(2, 1, 1),
  item3 = c(2, 1, 1)
)

expected2 <- data.frame(
  id = c(1, 2, 3),
  year = c(2022, 2022, 2022),
  item1 = c(NA, 1, 1),
  item2 = c(NA, 1, 1),
  item3 = c(NA, 1, 1)
)

expected3 <- data.frame(
  id = c(1, 2, 3),
  year = c(2022, 2022, 2000),
  item1 = c(2, 1, 3),
  item2 = c(2, 1, 3),
  item3 = c(2, 1, 3)
)

expected4 <- data.frame(
  id = c(1, 2, 3, 3),
  year = c(2022, 2022, 2022, 2000),
  item1 = c(2, 1, 1, 3),
  item2 = c(2, 1, 1, 3),
  item3 = c(2, 1, 1, 3)
)

# Testing

test_that("data_unique returns original data if no duplicates", {
  test <- data.frame(x = c(1, 2), y = c(3, 4))
  expect_identical(
    data_unique(test, c("x", "y"), verbose = FALSE),
    test
  )
  expect_identical(
    data_unique(test, "x", verbose = FALSE),
    test
  )
})

test_that("data_unique basic", {
  expect_identical(
    data_unique(df1, select = "id", verbose = FALSE),
    expected1
  )
})

test_that("data_unique basic method best", {
  expect_identical(
    data_unique(df1, select = "id", keep = "best", verbose = FALSE),
    expected1
  )
})

test_that("data_unique basic method first", {
  expect_identical(
    data_unique(df1, select = "id", keep = "first", verbose = FALSE),
    expected2
  )
})

test_that("data_unique basic method last", {
  expect_identical(
    data_unique(df1, select = "id", keep = "last", verbose = FALSE),
    expected3
  )
})

test_that("data_unique unquoted", {
  expect_identical(
    data_unique(df1, select = id, verbose = FALSE),
    expected1
  )
})

test_that("data_unique vector", {
  expect_identical(
    data_unique(df1, select = 1, verbose = FALSE),
    expected1
  )
})

test_that("data_unique select-helper", {
  expect_identical(
    data_unique(df1, select = contains("id"), verbose = FALSE),
    expected1
  )
})

test_that("data_unique multiple IDs", {
  x <- data_unique(df1, select = c("id", "year"), verbose = FALSE)
  rownames(x) <- NULL
  expect_identical(
    x,
    expected4
  )
})

test_that("data_unique multiple IDs formula", {
  x <- data_unique(df1, select = ~ id + year, verbose = FALSE)
  rownames(x) <- NULL
  expect_identical(
    x,
    expected4
  )
})

test_that("data_unique multiple IDs vector", {
  x <- data_unique(df1, select = 1:2, verbose = FALSE)
  rownames(x) <- NULL
  expect_identical(
    x,
    expected4
  )
})

test_that("data_unique preserve attributes", {
  attr(df1, "testing") <- "custom.attribute"
  x <- attributes(data_unique(df1, id, verbose = FALSE))
  expect_identical(
    x$testing,
    "custom.attribute"
  )
})

test_that("data_unique, arg 'verbose' works", {
  expect_message(
    data_unique(df1, select = ~ id + year),
    "removed, with method"
  )
})

test_that("data_unique works with groups", {
  df <- data.frame(
    g = c(1, 1, 2, 2),
    x = c(1, 1, 2, 1)
  )
  df <- data_group(df, "g")

  expected <- data.frame(
    g = c(1, 2, 2),
    x = c(1, 2, 1)
  )
  expected <- data_group(expected, "g")

  x <- data_unique(df, "x", verbose = FALSE)
  expect_identical(x, expected, ignore_attr = TRUE)

  y <- attributes(x)

  expect_identical(attributes(df)$class, y$class)
  expect_identical(attributes(df)$groups, y$groups)
})


================================================
FILE: tests/testthat/test-data_unite.R
================================================
d_unite <- data.frame(
  x = c(NA, 1:3),
  y = c(letters[1:3], NA_character_),
  z = 6:9,
  m = c("X", NA_character_, "Y", "Z"),
  n = c("NATION", "COUNTRY", "NATION", NA_character_),
  stringsAsFactors = FALSE
)


# for following tests, we need to check for correct column names,
# and correct values in new variable

test_that("data_unite: simple use case", {
  # basic
  out <- data_unite(d_unite, new_column = "xyz")
  expect_identical(colnames(out), "xyz")
  expect_identical(
    out$xyz,
    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
  )
  # use existing column name
  out <- data_unite(d_unite, new_column = "x")
  expect_identical(colnames(out), "x")
  expect_identical(
    out$x,
    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
  )
  # select
  out <- data_unite(d_unite, new_column = "xyz", select = c("x", "n"))
  expect_identical(
    colnames(out),
    c(setdiff(colnames(d_unite), c("x", "n")), "xyz")
  )
  expect_identical(
    out$xyz,
    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
  )
  # select, use existing column name
  out <- data_unite(d_unite, new_column = "x", select = c("x", "n"))
  expect_identical(
    colnames(out),
    c(setdiff(colnames(d_unite), c("x", "n")), "x")
  )
  expect_identical(
    out$x,
    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
  )
})


test_that("data_unite: remove_na", {
  # basic
  out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE)
  expect_identical(colnames(out), "xyz")
  expect_identical(
    out$xyz,
    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
  )
  # use existing column name
  out <- data_unite(d_unite, new_column = "x", remove_na = TRUE)
  expect_identical(colnames(out), "x")
  expect_identical(
    out$x,
    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
  )
  # select
  out <- data_unite(
    d_unite,
    new_column = "xyz",
    remove_na = TRUE,
    select = c("x", "n")
  )
  expect_identical(
    colnames(out),
    c(setdiff(colnames(d_unite), c("x", "n")), "xyz")
  )
  expect_identical(
    out$xyz,
    c("NATION", "1_COUNTRY", "2_NATION", "3")
  )
  # select, use existing column name
  out <- data_unite(
    d_unite,
    new_column = "x",
    remove_na = TRUE,
    select = c("x", "n")
  )
  expect_identical(
    colnames(out),
    c(setdiff(colnames(d_unite), c("x", "n")), "x")
  )
  expect_identical(
    out$x,
    c("NATION", "1_COUNTRY", "2_NATION", "3")
  )
})


test_that("data_unite: append", {
  # basic
  out <- data_unite(d_unite, new_column = "xyz", append = TRUE)
  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
  expect_identical(
    out$xyz,
    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
  )
  # remove NA
  out <- data_unite(
    d_unite,
    new_column = "xyz",
    remove_na = TRUE,
    append = TRUE
  )
  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
  expect_identical(
    out$xyz,
    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
  )
  # append, using existing column name
  expect_message({
    out <- data_unite(d_unite, new_column = "x", append = TRUE)
  })
  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
  expect_identical(
    out$x,
    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
  )
  # append, using existing column name, and remove NA
  expect_message({
    out <- data_unite(
      d_unite,
      new_column = "x",
      remove_na = TRUE,
      append = TRUE
    )
  })
  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
  expect_identical(
    out$x,
    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
  )
})


test_that("data_unite: combine select and append", {
  # basic
  out <- data_unite(
    d_unite,
    new_column = "xyz",
    append = TRUE,
    select = c("x", "n")
  )
  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
  expect_identical(
    out$xyz,
    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
  )
  # remove NA
  out <- data_unite(
    d_unite,
    new_column = "xyz",
    remove_na = TRUE,
    append = TRUE,
    select = c("x", "n")
  )
  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
  expect_identical(
    out$xyz,
    c("NATION", "1_COUNTRY", "2_NATION", "3")
  )
  # append, using existing column name
  expect_message({
    out <- data_unite(
      d_unite,
      new_column = "x",
      append = TRUE,
      select = c("x", "n")
    )
  })
  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
  expect_identical(
    out$x,
    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
  )
  # append, using existing column name, and remove NA
  expect_message({
    out <- data_unite(
      d_unite,
      new_column = "x",
      remove_na = TRUE,
      append = TRUE,
      select = c("x", "n")
    )
  })
  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
  expect_identical(
    out$x,
    c("NATION", "1_COUNTRY", "2_NATION", "3")
  )
})


test_that("data_unite: errors", {
  expect_error(data_unite(d_unite), regex = "No name")
  expect_error(
    data_unite(d_unite, new_column = c("a", "b")),
    regex = "a single string"
  )
  expect_error(
    expect_warning(data_unite(d_unite, new_column = "a", select = "huhu")),
    regex = "At least"
  )
})


================================================
FILE: tests/testthat/test-data_write.R
================================================
skip_if_not_installed("httr")
skip_if_not_installed("haven")
skip_if_not_installed("readr")

skip_on_cran()

skip_if_not_installed("curl")
skip_if_offline()

# prepare data set ---------------

data(efc)
d <- data_filter(efc, 1:5)
d$e42dep <- droplevels(d$e42dep)


# data encryption with rds ------------------

test_that("data_write, encrypting rds files", {
  skip_if_not_installed("withr")
  skip_if_not_installed("openssl")
  withr::with_tempfile("tmp", fileext = ".rds", code = {
    expect_warning(data_write(d, tmp, password = "test"), "Remember")

    # no password, returns encrypted data frame
    d2 <- data_read(tmp, verbose = FALSE)
    expect_named(d2, "out")
    expect_false(identical(d, d2))

    # password, returns decrypted data frame
    d2 <- data_read(tmp, password = "test")
    expect_identical(d, d2)

    # wrong password
    expect_error(data_read(tmp, password = "text"), "File does not appear")

    # invalid password arguments
    expect_error(
      data_read(tmp, password = c("test", "test2")),
      regex = "The password must be a single"
    )
    expect_error(
      data_read(tmp, password = 123),
      regex = "The password must be a single"
    )
    expect_error(
      data_read(tmp, password = ""),
      regex = "The password must be a single"
    )
    expect_error(
      data_write(d, tmp, password = c("test", "test2")),
      regex = "The password must be a single"
    )
    expect_error(
      data_write(d, tmp, password = 123),
      regex = "The password must be a single"
    )
    expect_error(
      data_write(d, tmp, password = ""),
      regex = "The password must be a single"
    )

    # not encrypted
    data_write(d, tmp)
    expect_error(data_read(tmp, password = "test"), "File does not appear")

    # check other decryption functions, should fail when encrypted with datawizard
    expect_warning(data_write(d, tmp, password = "test"))
    out <- readRDS(tmp)
    key <- openssl::sha256(charToRaw("test"))
    expect_error(openssl::aes_cbc_decrypt(out, key = key))

    # check other encryption functions, should fail imported with datawizard
    x <- serialize(d, NULL)
    key <- openssl::sha256(charToRaw("test"))
    saveRDS(openssl::aes_cbc_encrypt(x, key = key), tmp)
    expect_error(data_read(tmp, password = "test"), "File does not appear")
  })
})


# data encryption with rdata ------------------

test_that("data_write, encrypting rdata files", {
  skip_if_not_installed("withr")
  skip_if_not_installed("openssl")
  withr::with_tempfile("tmp", fileext = ".rdata", code = {
    expect_warning(data_write(d, tmp, password = "test"), "Remember")

    # no password, returns encrypted data frame
    d2 <- data_read(tmp, verbose = FALSE)
    expect_named(d2, "out")

    # password, returns decrypted data frame
    d2 <- data_read(tmp, password = "test")
    expect_identical(d, d2)
  })
})


# data encryption with parquet ------------------

test_that("data_write, encrypting parquet files", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".parquet", code = {
    expect_error(
      data_write(d, tmp, password = "test"),
      "Data encryption is not supported"
    )
  })
})


# SPSS -------------------------------------

test_that("data_write, SPSS", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".sav", code = {
    expect_message(data_write(d, tmp))
    d2 <- data_read(tmp, verbose = FALSE)
    expect_equal(
      to_factor(d, select = c("e16sex", "c172code")),
      d2,
      ignore_attr = TRUE
    )
    # data encryption not available for SPSS etc.
    expect_error(
      data_write(d, tmp, password = "test"),
      "Data encryption is not supported"
    )
  })
})


test_that("data_write, SPSS, mixed types of labelled vectors", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".sav", code = {
    d <- data.frame(
      a = 1:3,
      b = letters[1:3],
      c = factor(letters[1:3]),
      d = as.Date(c("2022-01-01", "2022-02-01", "2022-03-01")),
      e = c(TRUE, FALSE, FALSE),
      stringsAsFactors = FALSE
    )

    # Date and Logical cannot be labelled
    d$a <- assign_labels(
      d$a,
      variable = "First",
      values = c("one", "two", "three")
    )
    d$b <- assign_labels(d$b, variable = "Second", values = c("A", "B", "C"))
    d$c <- assign_labels(
      d$c,
      variable = "Third",
      values = c("ey", "bee", "see")
    )

    expect_message(data_write(d, tmp), regex = "Preparing")
  })
})


# Stata -------------------------------------

test_that("data_write, Stata", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".dta", code = {
    data_write(d, tmp, verbose = FALSE)
    d2 <- data_read(tmp, verbose = FALSE)

    expect_equal(
      to_factor(d, select = c("e16sex", "c172code")),
      d2,
      ignore_attr = TRUE
    )

    # data encryption not available for SPSS etc.
    expect_error(
      data_write(d, tmp, password = "test"),
      "Data encryption is not supported"
    )
  })
})


# csv -------------------------

test_that("data_write, CSV, keep numeric", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".csv", code = {
    data_write(d, tmp)
    d2 <- data_read(tmp)

    expect_equal(
      to_numeric(d, dummy_factors = FALSE, preserve_levels = TRUE),
      d2,
      ignore_attr = TRUE
    )

    # data encryption not available for SPSS etc.
    expect_error(
      data_write(d, tmp, password = "test"),
      "Data encryption is not supported"
    )
  })
})

test_that("data_write, CSV, convert to factor", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".csv", code = {
    data_write(d, tmp, convert_factors = TRUE)
    d2 <- data_read(tmp)
    out <- to_factor(d, select = c("e16sex", "c172code"))
    out$e16sex <- as.character(out$e16sex)
    out$c172code <- as.character(out$c172code)
    out$e42dep <- as.numeric(as.character(out$e42dep))
    expect_equal(out, d2, ignore_attr = TRUE)
  })
})

test_that("data_write, CSV, create labels file", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".csv", code = {
    # file for labels
    fpath <- dirname(tmp)
    fname <- sub("\\.csv$", "", basename(tmp))
    tmp2 <- paste0(fpath, .Platform$file.sep, fname, "_labels.csv")
    on.exit(unlink(tmp2))

    data(efc)
    expect_message(data_write(efc, tmp, save_labels = TRUE))
    d <- data_read(tmp2)

    expect_identical(d$variable[2], "e16sex")
    expect_identical(d$label[2], "elder's gender")
    expect_identical(d$labels[2], "1=male; 2=female")

    expect_message(data_write(efc, tmp, save_labels = TRUE, delimiter = ";"))
    d <- data_read(tmp2)
    expect_identical(d$variable[2], "e16sex")
    expect_identical(d$label[2], "elder's gender")
    expect_identical(d$labels[2], "1=male; 2=female")
  })
})


# invalid file type -------------------------

test_that("data_write, no file extension", {
  expect_error(data_write(d, "mytestfile"))
  expect_error(data_write(d, NULL))
})


# writing character vector works for missing value labels ------------------

test_that("data_write, existing variable label but missing value labels", {
  skip_if_not_installed("withr")
  withr::with_tempfile("tmp", fileext = ".sav", code = {
    d <- data.frame(
      a = letters[1:3],
      stringsAsFactors = FALSE
    )
    d$a <- assign_labels(d$a, variable = "First")
    # expect message, but no error
    expect_message(data_write(d, tmp), regex = "Preparing")

    # check if data is really the same
    d2 <- data_read(tmp, verbose = FALSE)
    expect_identical(d2, d)
  })
})


================================================
FILE: tests/testthat/test-demean.R
================================================
test_that("demean works", {
  df <- iris

  set.seed(123)
  df$ID <- sample.int(4, nrow(df), replace = TRUE) # fake-ID

  set.seed(123)
  df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable

  set.seed(123)
  x <- demean(
    df,
    select = c("Sepal.Length", "Petal.Length"),
    by = "ID",
    append = FALSE
  )
  expect_snapshot(head(x))

  set.seed(123)
  expect_message(
    {
      x <- demean(
        df,
        select = c("Sepal.Length", "binary", "Species"),
        by = "ID",
        append = FALSE
      )
    },
    "have been coerced to numeric"
  )
  expect_snapshot(head(x))

  set.seed(123)
  expect_message(
    {
      y <- demean(
        df,
        select = ~ Sepal.Length + binary + Species,
        by = ~ID,
        append = FALSE
      )
    },
    "have been coerced to numeric"
  )
  expect_message(
    {
      z <- demean(
        df,
        select = c("Sepal.Length", "binary", "Species"),
        by = "ID",
        append = FALSE
      )
    },
    "have been coerced to numeric"
  )
  expect_identical(y, z)

  set.seed(123)
  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
  expect_named(
    x,
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "ID",
      "binary",
      "Sepal.Length_between",
      "Petal.Length_between",
      "Sepal.Length_within",
      "Petal.Length_within"
    )
  )
  expect_snapshot(head(x))

  df$Sepal.Length_within <- df$Sepal.Length
  expect_error(
    demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID"),
    regex = "One or more of"
  )
})

test_that("demean interaction term", {
  dat <- data.frame(
    a = c(1, 2, 3, 4, 1, 2, 3, 4),
    x = c(4, 3, 3, 4, 1, 2, 1, 2),
    y = c(1, 2, 1, 2, 4, 3, 2, 1),
    ID = c(1, 2, 3, 1, 2, 3, 1, 2)
  )

  set.seed(123)
  expect_snapshot(demean(
    dat,
    select = c("a", "x*y"),
    by = "ID",
    append = FALSE
  ))
})

test_that("demean shows message if some vars don't exist", {
  dat <- data.frame(
    a = c(1, 2, 3, 4, 1, 2, 3, 4),
    x = c(4, 3, 3, 4, 1, 2, 1, 2),
    y = c(1, 2, 1, 2, 4, 3, 2, 1),
    ID = c(1, 2, 3, 1, 2, 3, 1, 2)
  )

  set.seed(123)
  expect_error(
    demean(dat, select = "foo", by = "ID"),
    regexp = "not found"
  )
})


# see issue #520
test_that("demean for cross-classified designs (by > 1)", {
  skip_if_not_installed("poorman")

  data(efc, package = "datawizard")
  dat <- na.omit(efc)
  dat$e42dep <- factor(dat$e42dep)
  dat$c172code <- factor(dat$c172code)

  x2a <- dat %>%
    data_group(e42dep) %>%
    data_modify(
      c12hour_e42dep = mean(c12hour)
    ) %>%
    data_ungroup() %>%
    data_group(c172code) %>%
    data_modify(
      c12hour_c172code = mean(c12hour)
    ) %>%
    data_ungroup() %>%
    data_modify(
      c12hour_within = c12hour - c12hour_e42dep - c12hour_c172code
    )

  out <- degroup(
    dat,
    select = "c12hour",
    by = c("e42dep", "c172code"),
    suffix_demean = "_within"
  )

  expect_equal(
    out$c12hour_e42dep_between,
    x2a$c12hour_e42dep,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
  expect_equal(
    out$c12hour_within,
    x2a$c12hour_within,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )

  x2a <- dat %>%
    data_group(e42dep) %>%
    data_modify(
      c12hour_e42dep = mean(c12hour, na.rm = TRUE),
      neg_c_7_e42dep = mean(neg_c_7, na.rm = TRUE)
    ) %>%
    data_ungroup() %>%
    data_group(c172code) %>%
    data_modify(
      c12hour_c172code = mean(c12hour, na.rm = TRUE),
      neg_c_7_c172code = mean(neg_c_7, na.rm = TRUE)
    ) %>%
    data_ungroup() %>%
    data_modify(
      c12hour_within = c12hour - c12hour_e42dep - c12hour_c172code,
      neg_c_7_within = neg_c_7 - neg_c_7_e42dep - neg_c_7_c172code
    )

  out <- degroup(
    dat,
    select = c("c12hour", "neg_c_7"),
    by = c("e42dep", "c172code"),
    suffix_demean = "_within"
  )

  expect_equal(
    out$c12hour_e42dep_between,
    x2a$c12hour_e42dep,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
  expect_equal(
    out$neg_c_7_c172code_between,
    x2a$neg_c_7_c172code,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
  expect_equal(
    out$neg_c_7_within,
    x2a$neg_c_7_within,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
  expect_equal(
    out$c12hour_within,
    x2a$c12hour_within,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )

  # More than 2 groupings
  mu <- 100
  ul <- setNames(c(-1, -3, 0, 4), nm = letters[1:4])
  uL <- setNames(c(10, 30, 0, -40), nm = LETTERS[1:4])
  um <- setNames(c(100, 150, -250), nm = month.abb[1:3])

  dat <- expand.grid(l = letters[1:4], L = LETTERS[1:4], m = month.abb[1:3])

  set.seed(111)
  e <- rnorm(nrow(dat) - 1) |> round(2)
  e <- append(e, -sum(e))

  dat$y <- mu + ul[dat$l] + uL[dat$L] + um[dat$m] + e
  dat$z <- mu + ul[dat$l] + uL[dat$L] + um[dat$m] + 10 * e

  dat_dem <- datawizard::demean(
    dat,
    by = c("l", "L", "m"),
    select = c("y", "z")
  )

  expect_equal(dat_dem$y_l_between, ave(dat$y, dat$l), ignore_attr = TRUE)
  expect_equal(dat_dem$y_L_between, ave(dat$y, dat$L), ignore_attr = TRUE)
  expect_equal(dat_dem$y_m_between, ave(dat$y, dat$m), ignore_attr = TRUE)
  expect_equal(rowSums(dat_dem[grepl("^y_", colnames(dat_dem))]), dat$y)
  expect_equal(rowSums(dat_dem[grepl("^z_", colnames(dat_dem))]), dat$z)
})


test_that("demean, sanity checks", {
  data(efc, package = "datawizard")
  dat <- na.omit(efc)
  dat$e42dep <- factor(dat$e42dep)
  dat$c172code <- factor(dat$c172code)

  expect_error(
    degroup(
      dat,
      select = c("c12hour", "neg_c_8"),
      by = c("e42dep", "c172code"),
      suffix_demean = "_within"
    ),
    regex = "Variable \"neg_c_8\" was not found"
  )
  expect_error(
    degroup(
      dat,
      select = c("c12hour", "neg_c_8"),
      by = c("e42dep", "c173code"),
      suffix_demean = "_within"
    ),
    regex = "Variables \"neg_c_8\" and \"c173code\" were not found"
  )
})


test_that("demean for nested designs (by > 1), nested = TRUE", {
  data(efc, package = "datawizard")
  dat <- na.omit(efc)
  dat$e42dep <- factor(dat$e42dep)
  dat$c172code <- factor(dat$c172code)

  x_ijk <- dat$c12hour
  xbar_k <- ave(x_ijk, dat$e42dep, FUN = mean)
  xbar_jk <- ave(x_ijk, dat$e42dep, dat$c172code, FUN = mean)

  L3_between <- xbar_k
  L2_between <- xbar_jk - xbar_k
  L1_within <- x_ijk - xbar_jk

  out <- degroup(
    dat,
    select = "c12hour",
    by = c("e42dep", "c172code"),
    nested = TRUE,
    suffix_demean = "_within"
  )

  expect_equal(
    out$c12hour_within,
    L1_within,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
  expect_equal(
    out$c12hour_e42dep_between,
    L3_between,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )
  expect_equal(
    out$c12hour_c172code_between,
    L2_between,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )

  # Following #635
  testdf <- data.frame(
    roman = c("I", "I", "I", "I", "II", "II", "II", "II"),
    alphabet = c("a", "a", "b", "b", "a", "b", "a", "b"),
    val1 = c(1, 2, 3, 4, 5, 6, 7, 8),
    val2 = c(1, 2, 3, 4, 5, 6, 7, 8),
    val3 = c(1, 2, 3, 4, 5, 6, 7, 8)
  )

  out <- datawizard::demean(
    testdf,
    select = c("val1", "val2", "val3"),
    by = "roman/alphabet",
    append = FALSE
  )

  expect_named(
    out,
    c(
      "val1_roman_between",
      "val1_alphabet_between",
      "val2_roman_between",
      "val2_alphabet_between",
      "val3_roman_between",
      "val3_alphabet_between",
      "val1_within",
      "val2_within",
      "val3_within"
    )
  )

  expect_equal(
    as.vector(out$val1_within),
    c(-0.5, 0.5, -0.5, 0.5, -1, -1, 1, 1)
  )
  expect_equal(out$val1_within, out$val2_within)
  expect_equal(out$val1_within, out$val3_within)

  expect_equal(
    as.vector(out$val1_roman_between),
    c(2.5, 2.5, 2.5, 2.5, 6.5, 6.5, 6.5, 6.5)
  )
  expect_equal(out$val1_roman_between, out$val2_roman_between)
  expect_equal(out$val1_roman_between, out$val3_roman_between)

  expect_equal(
    as.vector(out$val1_alphabet_between),
    c(-1, -1, 1, 1, -0.5, 0.5, -0.5, 0.5)
  )
  expect_equal(out$val1_alphabet_between, out$val2_alphabet_between)
  expect_equal(out$val1_alphabet_between, out$val3_alphabet_between)

  expect_equal(rowSums(out[, grepl("^val1", names(out))]), testdf$val1)
})


================================================
FILE: tests/testthat/test-describe_distribution.R
================================================
skip_if_not_installed("bayestestR")

# numeric ---------------------------------------

test_that("describe_distribution - numeric: works with basic numeric vector", {
  x <- describe_distribution(mtcars$mpg)
  expect_identical(dim(x), c(1L, 9L))
  expect_identical(round(x$Mean), 20)
})

test_that("describe_distribution - numeric: correctly handles missing values", {
  no_missing <- describe_distribution(mtcars$mpg)
  test <- mtcars$mpg
  test[1] <- NA
  with_missing <- describe_distribution(test)
  expect_identical(with_missing$n, 31L)
  expect_identical(with_missing$n_Missing, 1L)
  expect_false(with_missing$Mean == no_missing$Mean)
})

test_that("describe_distribution - numeric: works with quartiles", {
  x <- describe_distribution(mtcars$mpg, quartiles = TRUE)
  expect_identical(dim(x), c(1L, 11L))
  expect_true("Q1" %in% names(x))
  expect_true("Q3" %in% names(x))
})

test_that("describe_distribution - numeric: works with range", {
  x <- describe_distribution(mtcars$mpg, range = FALSE)
  expect_identical(dim(x), c(1L, 7L))
  expect_false("min" %in% names(x))
  expect_false("max" %in% names(x))
})

test_that("describe_distribution - NULL for date", {
  v <- as.Date(c("2022-01-01", "2022-01-02"))
  expect_warning(expect_null(describe_distribution(v)))
})


# data frame ---------------------------------------

test_that("describe_distribution - data frame: works with basic data frame", {
  x <- describe_distribution(mtcars)
  expect_identical(dim(x), c(11L, 10L))
  expect_identical(round(x[1, "Mean"]), 20)
})

test_that("describe_distribution - data frame: correctly handles missing values", {
  no_missing <- describe_distribution(mtcars)
  test <- mtcars
  test[1, ] <- NA
  with_missing <- describe_distribution(test)
  expect_identical(unique(with_missing$n), 31L)
  expect_identical(unique(with_missing$n_Missing), 1L)
  expect_false(unique(with_missing$Mean == no_missing$Mean))
})

test_that("describe_distribution - data frame: works with quartiles", {
  x <- describe_distribution(mtcars, quartiles = TRUE)
  expect_identical(dim(x), c(11L, 12L))
  expect_true("Q1" %in% names(x))
  expect_true("Q3" %in% names(x))
})

test_that("describe_distribution - data frame: works with range", {
  x <- describe_distribution(mtcars, range = FALSE)
  expect_identical(dim(x), c(11L, 8L))
  expect_false("min" %in% names(x))
  expect_false("max" %in% names(x))
})


# factor ---------------------------------------

test_that("describe_distribution - factor", {
  expect_snapshot(describe_distribution(factor(substring(
    "statistics",
    1:10,
    1:10
  ))))
})


# character ---------------------------------------

test_that("describe_distribution - character", {
  expect_snapshot(describe_distribution(as.character(ToothGrowth$supp)))
})


# list ---------------------------------------

test_that("describe_distribution - list: works with basic list", {
  x <- list(mtcars$mpg, mtcars$cyl)
  stored <- describe_distribution(x)
  unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
  named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl))
  mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl))

  expect_identical(dim(stored), c(2L, 10L))
  expect_identical(round(stored$Mean), c(20, 6))
  expect_identical(dim(unnamed), c(2L, 10L))
  expect_identical(round(unnamed$Mean), c(20, 6))
  expect_identical(dim(named), c(2L, 10L))
  expect_identical(round(named$Mean), c(20, 6))
  expect_identical(dim(mix), c(2L, 10L))
  expect_identical(round(mix$Mean), c(20, 6))
})

test_that("describe_distribution - list: works with include_factors", {
  x1 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl)))
  y <- describe_distribution(list(mtcars$mpg))
  expect_identical(x1, y)

  x2 <- describe_distribution(
    list(mtcars$mpg, factor(mtcars$cyl)),
    include_factors = TRUE
  )
  expect_identical(dim(x2), c(2L, 10L))
  expect_identical(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)"))

  x3 <- describe_distribution(
    list(mtcars$mpg, foo = factor(mtcars$cyl)),
    include_factors = TRUE
  )
  expect_identical(dim(x3), c(2L, 10L))
  expect_identical(x3$Variable, c("mtcars$mpg", "foo"))
})

test_that("describe_distribution - list: correctly removes character elements", {
  x <- describe_distribution(list(mtcars$mpg, "something"))
  y <- describe_distribution(list(mtcars$mpg))
  expect_identical(x, y)
})

test_that("describe_distribution - list: correctly handles variable names", {
  x <- list(mtcars$mpg, mtcars$cyl)
  stored <- describe_distribution(x)
  unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
  named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl))
  mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl))

  expect_identical(stored$Variable, c("Var_1", "Var_2"))
  expect_identical(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl"))
  expect_identical(named$Variable, c("foo", "foo2"))
  expect_identical(mix$Variable, c("foo", "mtcars$cyl"))
})

test_that("describe_distribution - list: correctly handles missing values", {
  no_missing <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
  test <- mtcars$mpg
  test2 <- mtcars$cyl
  test[1] <- NA
  test2[1] <- NA
  with_missing <- describe_distribution(list(test, test2))
  expect_identical(unique(with_missing$n), 31L)
  expect_identical(unique(with_missing$n_Missing), 1L)
  expect_false(unique(with_missing$Mean == no_missing$Mean))
})

test_that("describe_distribution - list: works with quartiles", {
  x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), quartiles = TRUE)
  expect_identical(dim(x), c(2L, 12L))
  expect_true("Q1" %in% names(x))
  expect_true("Q3" %in% names(x))
})

test_that("describe_distribution - list: works with range", {
  x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), range = FALSE)
  expect_identical(dim(x), c(2L, 8L))
  expect_false("min" %in% names(x))
  expect_false("max" %in% names(x))
})


# select ----------------------

test_that("describe_distribution - select", {
  data(iris)
  out <- describe_distribution(iris, select = starts_with("Petal"))

  expect_identical(out$Variable, c("Petal.Length", "Petal.Width"))
  expect_equal(out$Mean, c(3.758000, 1.199333), tolerance = 1e-3)

  expect_null(describe_distribution(iris, select = "Species"))
  out <- describe_distribution(iris, select = "Species", include_factors = TRUE)
  exp <- describe_distribution(iris$Species)
  expect_identical(out$Range, exp$Range)
})


# select and grouped df ----------------------

test_that("describe_distribution - grouped df", {
  data(iris)
  x <- data_group(iris, Species)
  out <- describe_distribution(x, select = starts_with("Petal"))

  expect_snapshot(out)
  expect_equal(
    out$Mean,
    c(1.462, 0.246, 4.26, 1.326, 5.552, 2.026),
    tolerance = 1e-3
  )
})

# Mostly to test printing
test_that("describe_distribution - grouped df and multiple groups", {
  x <- data.frame(
    grp1 = rep(letters[1:3], each = 20),
    grp2 = rep(letters[1:3], 20),
    values = 1:30
  )
  x <- data_group(x, c("grp1", "grp2"))
  expect_snapshot(describe_distribution(x))
})

test_that("argument 'by' works", {
  # basic
  grouped <- data_group(mtcars, c("am", "vs"))
  expect_identical(
    describe_distribution(grouped),
    describe_distribution(mtcars, by = c("am", "vs")),
    ignore_attr = TRUE
  )

  # mixing data_group() and arg 'by'
  grouped <- data_group(mtcars, c("am", "vs"))
  half_grouped <- data_group(mtcars, "am")
  expect_identical(
    describe_distribution(grouped),
    describe_distribution(half_grouped, by = "vs"),
    ignore_attr = TRUE
  )

  expect_error(
    describe_distribution(mtcars, by = 2),
    "must be a character vector"
  )
})

test_that("empty groups are discarded from the output, #608", {
  dat <- data.frame(
    grp1 = factor("a", levels = c("a", "b")),
    grp2 = factor(c("A", "B")),
    value = 1:2
  )
  dat <- data_group(dat, c("grp1", "grp2"))
  expect_no_error(
    suppressWarnings(describe_distribution(dat, ci = 0.95))
  )
})

# distribution_mode --------------------------
test_that("distribution_mode works as expected", {
  # atomic vector
  expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 5)), 3)
  expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 4, 5)), 3)
  expect_identical(distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)), 3.7)

  # list
  expect_identical(distribution_mode(list(1, 2, 3, 3, 4, 5)), list(3))

  # scalar
  expect_identical(distribution_mode("a"), "a")

  # empty
  expect_null(distribution_mode(NULL))
})

# select helpers ------------------------------
test_that("describe_distribution regex", {
  expect_equal(
    describe_distribution(mtcars, select = "pg", regex = TRUE),
    describe_distribution(mtcars, select = "mpg"),
    ignore_attr = TRUE
  )
})

# formatting ------------------------------
test_that("describe_distribution formatting", {
  data(iris)
  x <- describe_distribution(iris$Sepal.Width, quartiles = TRUE)
  expect_snapshot(format(x))
})

# other -----------------------------------
test_that("return NA in CI if sample is too sparse", {
  set.seed(123456)
  expect_silent(expect_message(
    {
      res <- describe_distribution(
        mtcars[mtcars$cyl == "6", ],
        wt,
        centrality = "map",
        ci = 0.95
      )
    },
    regex = "Bootstrapping"
  ))
  expect_equal(res$CI_low_map, 2.6462, tolerance = 1e-2)
  expect_equal(res$CI_high_map, 3.4531, tolerance = 1e-2)

  x <- c(
    2.5,
    2.5,
    2.5,
    2.5,
    2.5,
    2.5,
    2.2,
    2.2,
    2.2,
    2.5,
    2.5,
    2.5,
    2.5,
    2.5,
    2.5
  )
  expect_message(
    {
      out <- describe_distribution(x, centrality = "map")
    },
    regex = "Could not calculate"
  )
  expect_identical(out$MAP, NA_real_)
  expect_silent(describe_distribution(x, centrality = "map", verbose = FALSE))
})

# check for reserved column names
test_that("errors on invalid column names (reserved word)", {
  data(mtcars)

  out <- data_to_long(
    mtcars,
    cols = 1:3,
    names_to = "Variable",
    values_to = "Values"
  )
  out <- data_group(out, c("gear", "Variable"))
  expect_error(
    describe_distribution(out, select = "Values"),
    regex = "Following variable names are reserved"
  )

  out <- data_to_long(
    mtcars,
    cols = 1:3,
    names_to = "Variable",
    values_to = "Values"
  )
  expect_error(
    describe_distribution(out, select = "Variable"),
    regex = "Following variable names are reserved"
  )
})

# multiple centralities
test_that("multiple centralities work", {
  data(iris)
  out <- describe_distribution(
    iris,
    select = "Petal.Width",
    centrality = c("median", "mean")
  )
  expect_identical(dim(out), c(1L, 12L))
  expect_named(
    out,
    c(
      "Variable",
      "Median",
      "MAD",
      "Mean",
      "SD",
      "IQR",
      "Min",
      "Max",
      "Skewness",
      "Kurtosis",
      "n",
      "n_Missing"
    )
  )
  out <- describe_distribution(
    iris,
    select = "Petal.Width",
    centrality = list("median", "mean")
  )
  expect_identical(dim(out), c(1L, 12L))
  expect_named(
    out,
    c(
      "Variable",
      "Median",
      "MAD",
      "Mean",
      "SD",
      "IQR",
      "Min",
      "Max",
      "Skewness",
      "Kurtosis",
      "n",
      "n_Missing"
    )
  )
})


test_that("(multiple) centralities with CIs", {
  data(iris)
  x <- iris$Sepal.Width
  set.seed(123456)
  expect_message(
    {
      out <- describe_distribution(
        x,
        centrality = "all",
        ci = 0.95,
        iterations = 100
      )
    },
    regex = "For more stable intervals"
  )
  expect_named(
    out,
    c(
      "Median",
      "MAD",
      "Mean",
      "SD",
      "MAP",
      "IQR",
      "CI_low_mean",
      "CI_high_mean",
      "CI_low_median",
      "CI_high_median",
      "CI_low_MAP",
      "CI_high_MAP",
      "Min",
      "Max",
      "Skewness",
      "Kurtosis",
      "n",
      "n_Missing"
    )
  )
  expect_snapshot(print(out, table_width = Inf))
  expect_silent(describe_distribution(
    x,
    centrality = "all",
    ci = 0.95,
    iterations = 100,
    verbose = FALSE
  ))

  set.seed(123456)
  out <- describe_distribution(
    x,
    centrality = "mean",
    ci = 0.95,
    iterations = 100,
    verbose = FALSE
  )
  expect_named(
    out,
    c(
      "Mean",
      "SD",
      "IQR",
      "CI_low_mean",
      "CI_high_mean",
      "Min",
      "Max",
      "Skewness",
      "Kurtosis",
      "n",
      "n_Missing"
    )
  )
  expect_snapshot(print(out, table_width = Inf))

  set.seed(123456)
  out <- describe_distribution(
    x,
    centrality = c("MAP", "median"),
    ci = 0.95,
    iterations = 100,
    verbose = FALSE
  )
  expect_named(
    out,
    c(
      "Median",
      "MAD",
      "MAP",
      "IQR",
      "CI_low_MAP",
      "CI_high_MAP",
      "CI_low_median",
      "CI_high_median",
      "Min",
      "Max",
      "Skewness",
      "Kurtosis",
      "n",
      "n_Missing"
    )
  )
  expect_snapshot(print(out, table_width = Inf))

  # only one message for data frame
  expect_silent(expect_message(describe_distribution(iris, ci = 0.95)))
})


test_that("display() method exports to markdown", {
  skip_if_not_installed("knitr")
  data(iris)
  out <- describe_distribution(iris)
  expect_error(display(out, format = "invalid"), regex = "Invalid option")
  expect_snapshot(display(out))
})


test_that("display() method exports to tinytable", {
  skip_if_not_installed("tinytable")
  data(iris)
  out <- describe_distribution(iris)
  expect_snapshot(display(out, format = "tt"))
})


================================================
FILE: tests/testthat/test-distributions.R
================================================
test_that("distributions", {
  skip_if_not_installed("bayestestR")
  skip_if_not_installed("parameters")

  set.seed(123)
  x <- bayestestR::distribution_normal(100)

  expect_equal(kurtosis(x)$Kurtosis, -0.1119534, tolerance = 0.01)
  expect_equal(skewness(x)$Skewness, -5.881466e-17, tolerance = 0.01)
  expect_equal(as.numeric(smoothness(x, "diff")), 1.183699, tolerance = 0.01)
  expect_equal(as.numeric(smoothness(x, "cor")), 0.9979799, tolerance = 0.01)
})


================================================
FILE: tests/testthat/test-empty-dataframe.R
================================================
test_that("remove empty with character", {
  tmp <- data.frame(
    a = c(1, 2, 3, NA, 5),
    b = c(1, NA, 3, NA, 5),
    c = c(NA, NA, NA, NA, NA),
    d = c(1, NA, 3, NA, 5)
  )

  expect_identical(empty_columns(tmp), c(c = 3L))
  expect_identical(empty_rows(tmp), 4L)

  expect_identical(dim(remove_empty_columns(tmp)), c(5L, 3L))
  expect_identical(dim(remove_empty_rows(tmp)), c(4L, 4L))
  expect_identical(dim(remove_empty(tmp)), c(4L, 3L))

  expect_snapshot(remove_empty_columns(tmp))
  expect_snapshot(remove_empty_rows(tmp))
  expect_snapshot(remove_empty(tmp))
})


test_that("remove empty columns with character", {
  tmp <- data.frame(
    a = c(1, 2, 3, NA, 5),
    b = c("", NA, "", NA, ""),
    c = c(NA, NA, NA, NA, NA),
    d = c(1, NA, 3, NA, 5),
    e = c("", "", "", "", ""),
    stringsAsFactors = FALSE
  )

  expect_identical(empty_columns(tmp), c(b = 2L, c = 3L, e = 5L))
  expect_identical(dim(remove_empty_columns(tmp)), c(5L, 2L))
  expect_identical(dim(remove_empty(tmp)), c(4L, 2L))
})


test_that("remove empty rows with character", {
  tmp <- data.frame(
    a = c(1, "", 3, NA, 5),
    b = c("", NA, "", NA, ""),
    c = c(NA, NA, NA, NA, NA),
    d = c(1, NA, 3, NA, 5),
    e = c("", "", "", "", ""),
    f = factor(c("", "", "", "", "")),
    g = factor(c("", NA, "", NA, "")),
    stringsAsFactors = FALSE
  )

  expect_identical(empty_rows(tmp), c(2L, 4L))
  expect_identical(dim(remove_empty_rows(tmp)), c(3L, 7L))
  expect_identical(dim(remove_empty(tmp)), c(3L, 2L))
})

test_that("empty_columns with only NA characters", {
  tmp <- data.frame(
    var1 = c(1, 1, 1),
    var2 = c(NA_character_, NA_character_, NA_character_)
  )
  expect_identical(empty_columns(tmp), c(var2 = 2L))
})


test_that("works with non-ascii chars", {
  tmp <- data.frame(
    a = c(1, 2, 3, NA, 5),
    b = c("", NA, "", NA, ""),
    c = c(NA, NA, NA, NA, NA),
    d = c("test", "Se\x96ora", "works fine", "this too", "yeah"),
    e = c("", "", "", "", ""),
    stringsAsFactors = FALSE
  )
  expect_identical(empty_columns(tmp), c(b = 2L, c = 3L, e = 5L))
})


================================================
FILE: tests/testthat/test-extract_column_names.R
================================================
test_that("extract_column_names works as expected", {
  expect_identical(
    extract_column_names(iris, starts_with("Sepal")),
    c("Sepal.Length", "Sepal.Width")
  )

  expect_identical(
    extract_column_names(iris, starts_with("Sepal", "Petal")),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )

  expect_identical(
    extract_column_names(iris, ends_with("Width")),
    c("Sepal.Width", "Petal.Width")
  )

  expect_identical(
    extract_column_names(iris, ends_with("Length", "Width")),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )

  expect_identical(
    extract_column_names(iris, regex("\\.")),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )

  expect_identical(
    extract_column_names(iris, c("Petal.Width", "Sepal.Length")),
    c("Petal.Width", "Sepal.Length")
  )

  expect_identical(
    extract_column_names(iris, contains("Wid")),
    c("Sepal.Width", "Petal.Width")
  )

  expect_identical(
    extract_column_names(iris, contains("en", "idt")),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )

  expect_identical(
    extract_column_names(mtcars, c("am", "gear", "cyl")),
    c("am", "gear", "cyl")
  )

  expect_identical(
    extract_column_names(mtcars, c("vam", "gear", "cyl")),
    c("gear", "cyl")
  )

  expect_warning(expect_null(extract_column_names(mtcars, ends_with("abc"))))

  expect_identical(
    extract_column_names(mtcars, regex("rb$")),
    "carb"
  )

  expect_identical(
    extract_column_names(mtcars, regex("^c")),
    c("cyl", "carb")
  )

  expect_warning(expect_null(extract_column_names(mtcars, "^c")))

  expect_identical(
    extract_column_names(mtcars, regex("^C"), ignore_case = TRUE),
    c("cyl", "carb")
  )

  expect_identical(
    extract_column_names(iris, "Width$", regex = TRUE),
    c("Sepal.Width", "Petal.Width")
  )
})


test_that("extract_column_names from other functions", {
  test_fun1 <- function(data, i) {
    extract_column_names(data, select = i)
  }
  expect_identical(
    test_fun1(iris, c("Sepal.Length", "Sepal.Width")),
    c("Sepal.Length", "Sepal.Width")
  )

  expect_identical(
    test_fun1(iris, starts_with("Sep")),
    c("Sepal.Length", "Sepal.Width")
  )

  test_fun1a <- function(data, i) {
    extract_column_names(data, select = i, regex = TRUE)
  }
  expect_identical(
    test_fun1a(iris, "Sep"),
    c("Sepal.Length", "Sepal.Width")
  )

  test_fun1b <- function(data, i) {
    extract_column_names(data, select = i, regex = TRUE)
  }
  expect_identical(
    test_fun1b(iris, "Width$"),
    c("Sepal.Width", "Petal.Width")
  )

  test_fun2 <- function(data) {
    extract_column_names(data, select = starts_with("Sep"))
  }
  expect_identical(
    test_fun2(iris),
    c("Sepal.Length", "Sepal.Width")
  )

  test_fun3 <- function(data) {
    i <- "Sep"
    extract_column_names(data, select = starts_with(i))
  }
  expect_identical(
    test_fun3(iris),
    c("Sepal.Length", "Sepal.Width")
  )
})

test_that("extract_column_names regex", {
  expect_identical(
    extract_column_names(mtcars, select = "pg", regex = TRUE),
    extract_column_names(mtcars, select = "mpg")
  )
})

test_that("extract_column_names works correctly with minus sign", {
  expect_identical(
    extract_column_names(iris, -"Sepal.Length"),
    c("Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )

  expect_identical(
    extract_column_names(iris, -c("Sepal.Length", "Petal.Width")),
    c("Sepal.Width", "Petal.Length", "Species")
  )

  expect_identical(
    extract_column_names(iris, -1),
    c("Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )

  expect_error(
    extract_column_names(iris, -1:2),
    regexp = "can't mix negative"
  )

  expect_identical(
    extract_column_names(iris, -(1:2)),
    c("Petal.Length", "Petal.Width", "Species")
  )

  expect_identical(
    extract_column_names(iris, -c(1, 3)),
    c("Sepal.Width", "Petal.Width", "Species")
  )

  expect_identical(
    extract_column_names(iris, -starts_with("Sepal", "Petal")),
    "Species"
  )

  expect_identical(
    extract_column_names(iris, -ends_with("Length", "Width")),
    "Species"
  )

  expect_identical(
    extract_column_names(iris, -contains("en", "idt")),
    "Species"
  )

  expect_identical(
    extract_column_names(
      iris,
      -c("Sepal.Length", "Petal.Width"),
      exclude = "Species"
    ),
    c("Sepal.Width", "Petal.Length")
  )
})

test_that("extract_column_names with square brackets", {
  expect_identical(
    extract_column_names(mtcars, select = names(mtcars)[-1]),
    extract_column_names(mtcars, select = 2:11)
  )
})


================================================
FILE: tests/testthat/test-labelled_data.R
================================================
data(efc, package = "datawizard")

# reverse -----------------------------------

test_that("reverse, labels preserved", {
  # factor, label
  expect_identical(
    attr(reverse(efc$e42dep), "label", exact = TRUE),
    "elder's dependency"
  )
  # factor, labels
  expect_named(
    attr(reverse(efc$e42dep), "labels", exact = TRUE),
    names(attr(efc$e42dep, "labels", exact = TRUE))
  )
  expect_equal(
    attr(reverse(efc$e42dep), "labels", exact = TRUE),
    rev(attr(efc$e42dep, "labels", exact = TRUE)),
    ignore_attr = TRUE
  )
  # numeric
  expect_named(
    attr(reverse(efc$c12hour), "labels", exact = TRUE),
    names(attr(efc$c12hour, "labels", exact = TRUE))
  )
  # data frame
  labels <- sapply(reverse(efc), attr, which = "label", exact = TRUE)
  expect_identical(
    labels,
    c(
      c12hour = "average number of hours of care per week",
      e16sex = "elder's gender",
      e42dep = "elder's dependency",
      c172code = "carer's level of education",
      neg_c_7 = "Negative impact with 7 items"
    )
  )
})


# data_merge -----------------------------------

test_that("data_merge, labels preserved", {
  labels <- sapply(
    data_merge(efc[1:2], efc[3:4], verbose = FALSE),
    attr,
    which = "label",
    exact = TRUE
  )
  expect_identical(
    labels,
    c(
      c12hour = "average number of hours of care per week",
      e16sex = "elder's gender",
      e42dep = "elder's dependency",
      c172code = "carer's level of education"
    )
  )
})


# data_extract -----------------------------------

test_that("data_extract, labels preserved", {
  # factor
  expect_equal(
    attr(data_extract(efc, select = "e42dep"), "labels", exact = TRUE),
    attr(efc$e42dep, "labels", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  expect_equal(
    attr(data_extract(efc, select = "c172code"), "labels", exact = TRUE),
    attr(efc$c172code, "labels", exact = TRUE),
    ignore_attr = TRUE
  )
  # data frame
  labels <- sapply(
    data_extract(efc, select = c("e42dep", "c172code")),
    attr,
    which = "label",
    exact = TRUE
  )
  expect_identical(
    labels,
    c(e42dep = "elder's dependency", c172code = "carer's level of education")
  )
})


# categorize -----------------------------------

test_that("categorize, labels preserved", {
  # factor
  expect_equal(
    attr(categorize(efc$e42dep), "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  expect_equal(
    attr(categorize(efc$c12hour), "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# data_reorder -----------------------------------

test_that("data_reorder, labels preserved", {
  expect_equal(
    attr(data_reorder(efc, "e42dep")[[1]], "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# data_remove -----------------------------------

test_that("data_remove, labels preserved", {
  expect_equal(
    attr(data_remove(efc, "e42dep")[[1]], "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# data_rename -----------------------------------

test_that("data_rename, labels preserved", {
  # factor
  x <- data_rename(efc, "e42dep", "dependency")
  expect_equal(
    attr(x$dependency, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  x <- data_rename(efc, "c12hour", "careload")
  expect_equal(
    attr(x$careload, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # data frame
  labels <- sapply(
    data_remove(efc, starts_with("c1")),
    attr,
    which = "label",
    exact = TRUE
  )
  expect_identical(
    labels,
    c(
      e16sex = "elder's gender",
      e42dep = "elder's dependency",
      neg_c_7 = "Negative impact with 7 items"
    )
  )
})


# data_addprefix -----------------------------------

test_that("data_addprefix, labels preserved", {
  x <- data_addprefix(efc, "new_")
  # factor
  expect_equal(
    attr(x$new_e42dep, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  expect_equal(
    attr(x$new_c12hour, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# data_suffix -----------------------------------

test_that("data_addsuffix, labels preserved", {
  x <- data_addsuffix(efc, "_new")
  # factor
  expect_equal(
    attr(x$e42dep_new, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  expect_equal(
    attr(x$c12hour_new, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# to_numeric -----------------------------------

test_that("to_numeric, labels preserved", {
  x <- to_numeric(efc, dummy_factors = FALSE)
  # factor
  expect_equal(
    attr(x$e42dep, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  expect_equal(
    attr(x$c12hour, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )

  x <- to_numeric(efc, dummy_factors = TRUE)
  # numeric
  expect_equal(
    attr(x$c12hour, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# data_match -----------------------------------

test_that("data_match, labels preserved", {
  x <- data_match(efc, data.frame(c172code = 1, e16sex = 2), match = "or")
  # factor
  expect_equal(
    attr(x$e42dep, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  expect_equal(
    attr(x$c12hour, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # filtered
  expect_equal(
    attr(x$c172code, "label", exact = TRUE),
    attr(efc$c172code, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  expect_equal(
    attr(x$c172code, "labels", exact = TRUE),
    attr(efc$c172code, "labels", exact = TRUE),
    ignore_attr = TRUE
  )
})


# data_filter -----------------------------------

test_that("data_filter, labels preserved", {
  x <- data_filter(efc, c172code == 1 & c12hour > 40)
  # factor
  expect_identical(
    attr(x$e42dep, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE)
  )
  # numeric
  expect_identical(
    attr(x$c12hour, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE)
  )
})


# convert_to_na -----------------------------------

test_that("convert_to_na, labels preserved", {
  expect_message({
    x <- convert_to_na(efc, na = c(2, "2"), select = starts_with("e"))
  })
  # factor
  expect_equal(
    attr(x$e42dep, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  expect_equal(
    attr(x$e16sex, "label", exact = TRUE),
    attr(efc$e16sex, "label", exact = TRUE),
    ignore_attr = TRUE
  )

  # factor
  x <- convert_to_na(efc$e42dep, na = "2")
  expect_equal(
    attr(x, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # numeric
  x <- convert_to_na(efc$e16sex, na = 2)
  expect_equal(
    attr(x, "label", exact = TRUE),
    attr(efc$e16sex, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  # drop unused value labels
  x <- convert_to_na(efc$c172code, na = 2)
  expect_identical(
    attr(x, "labels", exact = TRUE),
    c(`low level of education` = 1, `high level of education` = 3)
  )
})


# data_select -----------------------------------

test_that("data_select, labels preserved", {
  x <- data_select(efc, starts_with("c"))
  # numeric
  expect_equal(
    attr(x$c12hour, "label", exact = TRUE),
    attr(efc$c12hour, "label", exact = TRUE),
    ignore_attr = TRUE
  )

  x <- data_select(efc, starts_with("e"))
  # factor
  expect_equal(
    attr(x$e42dep, "label", exact = TRUE),
    attr(efc$e42dep, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# recode_values -----------------------------------

test_that("recode_values, labels preserved", {
  options(data_recode_pattern = NULL)
  data(efc)
  x <- recode_values(efc$c172code, recode = list(`0` = 1:2, `1` = 3))
  expect_equal(
    attr(x, "label", exact = TRUE),
    attr(efc$c172code, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  expect_null(attr(x, "labels", exact = TRUE))
})


# slide -----------------------------------

test_that("slide, labels preserved", {
  data(efc)
  suppressMessages({
    x <- slide(efc)
  })
  expect_equal(
    attr(x$c172code, "label", exact = TRUE),
    attr(efc$c172code, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  expect_null(attr(x$c172code, "labels", exact = TRUE))

  x <- slide(efc$c172code)
  expect_equal(
    attr(x, "label", exact = TRUE),
    attr(efc$c172code, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


# to_factor -----------------------------------

test_that("to_factor, labels preserved", {
  data(efc)
  x <- to_factor(efc)
  expect_equal(
    attr(x$c172code, "label", exact = TRUE),
    attr(efc$c172code, "label", exact = TRUE),
    ignore_attr = TRUE
  )
  expect_null(attr(x$c172code, "labels", exact = TRUE))

  x <- to_factor(efc$c172code)
  expect_equal(
    attr(x, "label", exact = TRUE),
    attr(efc$c172code, "label", exact = TRUE),
    ignore_attr = TRUE
  )
})


================================================
FILE: tests/testthat/test-labels_to_levels.R
================================================
test_that("labels_to_levels, numeric", {
  expect_message(
    labels_to_levels(1:10),
    regex = "only works"
  )
})

test_that("labels_to_levels, factor", {
  data(efc)
  x <- as.factor(efc$c172code)
  attr(x, "labels") <- c(low = 1, mid = 2, high = 3)
  x <- labels_to_levels(x)
  expect_identical(levels(x), c("low", "mid", "high"))
  expect_equal(table(x), table(efc$c172code), ignore_attr = TRUE)

  x <- as.ordered(efc$c172code)
  attr(x, "labels") <- c(low = 1, mid = 2, high = 3)
  x <- labels_to_levels(x)
  expect_identical(levels(x), c("low", "mid", "high"))
  expect_s3_class(x, "ordered")
})

test_that("labels_to_levels, factor, error on no labels", {
  data(efc)
  data(iris)
  x <- as.factor(efc$c172code)
  expect_error(labels_to_levels(x), regex = "Could not change factor")
  expect_error(labels_to_levels(iris), regex = "Could not change factor")
})

test_that("labels_to_levels, data frame, append", {
  data(efc)
  out <- labels_to_levels(efc, append = "_ll")
  expect_named(
    out,
    c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_ll")
  )
})

test_that("labels_to_levels, data frame, append", {
  data(iris)
  d <- as.data.frame(lapply(iris, as.factor))
  expect_identical(labels_to_levels(d), d)
})

test_that("labels_to_levels, factor, data frame", {
  data(efc)
  out <- labels_to_levels(efc)
  expect_identical(
    levels(out$e42dep),
    c(
      "independent",
      "slightly dependent",
      "moderately dependent",
      "severely dependent"
    )
  )
  expect_identical(sum(vapply(efc, is.factor, TRUE)), 1L)
})

test_that("labels_to_levels, factor, with random value numbers (no sequential order)", {
  x <- c(5, 5, 1, 3, 1, 7)
  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
  out <- to_factor(x, labels_to_levels = TRUE)
  expect_identical(
    as.character(out),
    c("don't know", "don't know", "yes", "maybe", "yes", "no")
  )
  expect_identical(levels(out), c("yes", "maybe", "don't know", "no"))

  x <- c(4, 4, 1, 2, 1, 3)
  attr(x, "labels") <- c(a = 1, b = 2, c = 3, d = 4)
  out <- to_factor(x, labels_to_levels = TRUE)
  expect_identical(as.character(out), c("d", "d", "a", "b", "a", "c"))
  expect_identical(levels(out), c("a", "b", "c", "d"))

  x <- c(4, 4, 1, 2, 1, 3)
  attr(x, "labels") <- c(d = 1, c = 2, b = 3, a = 4)
  out <- to_factor(x, labels_to_levels = TRUE)
  expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b"))
  expect_identical(levels(out), c("d", "c", "b", "a"))

  x <- c(5, 5, 1, 3, 1, 7)
  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
  out <- to_factor(x, labels_to_levels = TRUE)
  expect_identical(
    out,
    structure(
      c(3L, 3L, 1L, 2L, 1L, 4L),
      levels = c("yes", "maybe", "don't know", "no"),
      class = "factor"
    )
  )
  expect_identical(
    as.character(out),
    c("don't know", "don't know", "yes", "maybe", "yes", "no")
  )

  x <- c(5, 5, 1, 3, 1, 7, 4)
  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
  expect_message(
    {
      out <- to_factor(x, labels_to_levels = TRUE)
    },
    regex = "Not all factor levels"
  )
  expect_identical(
    out,
    structure(
      c(4L, 4L, 1L, 2L, 1L, 5L, 3L),
      levels = c("yes", "maybe", "4", "don't know", "no"),
      class = "factor"
    )
  )
  expect_identical(
    as.character(out),
    c("don't know", "don't know", "yes", "maybe", "yes", "no", "4")
  )

  x <- c(5, 5, 1, 3, 1, 7)
  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5)
  expect_message({
    out <- to_factor(x, labels_to_levels = TRUE)
  })
  expect_identical(
    out,
    structure(
      c(3L, 3L, 1L, 2L, 1L, 4L),
      levels = c("yes", "3", "don't know", "no"),
      class = "factor"
    )
  )
  expect_identical(
    as.character(out),
    c("don't know", "don't know", "yes", "3", "yes", "no")
  )

  x <- c(5, 5, 1, 3, 1, 7, 6)
  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5)
  expect_message({
    out <- to_factor(x, labels_to_levels = TRUE)
  })
  expect_identical(
    out,
    structure(
      c(3L, 3L, 1L, 2L, 1L, 5L, 4L),
      levels = c("yes", "3", "don't know", "6", "no"),
      class = "factor"
    )
  )
  expect_identical(
    as.character(out),
    c("don't know", "don't know", "yes", "3", "yes", "no", "6")
  )
})


================================================
FILE: tests/testthat/test-makepredictcall.R
================================================
test_that("makepredictcall", {
  data("mtcars")
  nd <- data.frame(hp = c(100, 200))

  m1 <- lm(mpg ~ scale(hp, scale = FALSE), mtcars)
  m2 <- lm(mpg ~ center(hp), mtcars)

  m3 <- lm(mpg ~ scale(hp), mtcars)
  m4 <- lm(mpg ~ standardize(hp), mtcars)

  p1 <- predict(m1, nd)
  expect_equal(p1, predict(m2, nd))
  expect_equal(p1, predict(m3, nd))
  expect_equal(p1, predict(m4, nd))

  X <- matrix(rnorm(100), ncol = 2)
  Y <- rnorm(50)
  expect_error(lm(Y ~ standardize(X)), "matrices")
})


test_that("makepredictcall, normalize", {
  data("mtcars")
  m1 <- lm(mpg ~ normalize(hp), data = mtcars)
  m2 <- lm(mpg ~ hp, data = mtcars)
  m3 <- lm(mpg ~ normalize(hp, include_bounds = FALSE), data = mtcars)

  out1 <- predict(m1, newdata = data.frame(hp = c(100, 110, 120)))
  out2 <- predict(m2, newdata = data.frame(hp = c(100, 110, 120)))
  out3 <- predict(m3, newdata = data.frame(hp = c(100, 110, 120)))

  expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE)

  out1 <- predict(m1, newdata = data.frame(hp = 110))
  out2 <- predict(m2, newdata = data.frame(hp = 110))
  out3 <- predict(m3, newdata = data.frame(hp = 110))

  expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE)
})


test_that("makepredictcall, rescale", {
  data("mtcars")
  m1 <- lm(mpg ~ rescale(hp, to = c(50, 80)), data = mtcars)
  m2 <- lm(mpg ~ hp, data = mtcars)
  m3 <- lm(mpg ~ rescale(hp), data = mtcars)

  out1 <- predict(m1, newdata = data.frame(hp = c(100, 110, 120)))
  out2 <- predict(m2, newdata = data.frame(hp = c(100, 110, 120)))
  out3 <- predict(m3, newdata = data.frame(hp = c(100, 110, 120)))

  expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE)

  out1 <- predict(m1, newdata = data.frame(hp = 110))
  out2 <- predict(m2, newdata = data.frame(hp = 110))
  out3 <- predict(m3, newdata = data.frame(hp = 110))

  expect_equal(out1, out2, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out1, out3, tolerance = 1e-3, ignore_attr = TRUE)
  expect_equal(out2, out3, tolerance = 1e-3, ignore_attr = TRUE)
})


================================================
FILE: tests/testthat/test-mean_sd.R
================================================
test_that("mean_sd", {
  x <- c(-1, 0, 1)
  msd <- mean_sd(x)
  expect_identical(unname(msd), x)
  expect_named(msd, c("-SD", "Mean", "+SD"))

  msd <- mean_sd(mtcars[["mpg"]])
  mmad <- median_mad(mtcars[["mpg"]])
  expect_identical(
    unname(msd),
    mean(mtcars[["mpg"]]) + c(-1, 0, 1) * sd(mtcars[["mpg"]])
  )
  expect_identical(
    unname(mmad),
    median(mtcars[["mpg"]]) + c(-1, 0, 1) * mad(mtcars[["mpg"]])
  )

  msd2 <- mean_sd(mtcars[["mpg"]], times = 3L)
  expect_length(msd2, n = 3 * 2 + 1)
  expect_identical(unname(msd2[3:5]), unname(msd))
  expect_equal(
    unname(diff(msd2)),
    rep(sd(mtcars[["mpg"]]), 6),
    tolerance = 0.00001
  )
  expect_named(
    msd2,
    c("-3 SD", "-2 SD", "-1 SD", "Mean", "+1 SD", "+2 SD", "+3 SD")
  )
})


================================================
FILE: tests/testthat/test-means_by_group.R
================================================
test_that("mean_by_group", {
  skip_if_not_installed("emmeans")
  data(efc)
  expect_snapshot(means_by_group(efc, "c12hour", "e42dep"))
  expect_snapshot(means_by_group(efc, "c12hour", "e42dep", ci = 0.99))
  expect_snapshot(means_by_group(efc, "c12hour", "e42dep", ci = NA))
  expect_snapshot(means_by_group(efc, c("neg_c_7", "c12hour"), "e42dep"))
  expect_snapshot(means_by_group(
    efc,
    c("neg_c_7", "c12hour"),
    "e42dep",
    ci = NA
  ))
  expect_snapshot(means_by_group(
    efc,
    c("neg_c_7", "c12hour"),
    "e42dep",
    ci = 0.99
  ))
  expect_snapshot(means_by_group(efc$c12hour, efc$e42dep))
  expect_snapshot(means_by_group(efc$c12hour, efc$e42dep, ci = NA))
})

test_that("mean_by_group, weighted", {
  skip_if_not_installed("emmeans")
  data(efc)
  set.seed(123)
  efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
  expect_snapshot(
    means_by_group(efc, "c12hour", "e42dep", weights = "weight"),
    variant = "windows"
  )
  expect_snapshot(
    means_by_group(efc, "c12hour", "e42dep", weights = "weight", ci = NA),
    variant = "windows"
  )
})


================================================
FILE: tests/testthat/test-normalize.R
================================================
test_that("normalize work as expected", {
  expect_equal(
    normalize(c(0, 1, 5, -5, -2)),
    c(0.5, 0.6, 1, 0, 0.3),
    ignore_attr = TRUE
  )

  expect_equal(
    normalize(c(0, 1, 5, -5, -2), include_bounds = FALSE),
    c(0.5, 0.58, 0.9, 0.1, 0.34),
    ignore_attr = TRUE
  )

  expect_equal(
    normalize(c(0, 1, 5, -5, -2), include_bounds = 0.01),
    c(0.5, 0.598, 0.99, 0.01, 0.304),
    ignore_attr = TRUE,
    tolerance = 1e-4
  )

  expect_equal(
    normalize(c(0, 1, 5, -5, -2), include_bounds = "a", verbose = FALSE),
    c(0.5, 0.6, 1, 0, 0.3),
    ignore_attr = TRUE,
    tolerance = 1e-4
  )

  expect_warning(normalize(
    c(0, 1, 5, -5, -2),
    include_bounds = "a",
    verbose = TRUE
  ))

  expect_snapshot(head(normalize(trees)))
})


test_that("normalize: only NAs", {
  expect_equal(
    normalize(c(NA_real_, NA_real_)),
    c(NA_real_, NA_real_),
    ignore_attr = TRUE
  )
})


test_that("normalize: with Inf", {
  expect_equal(
    normalize(c(1, 2, 3, NA, Inf)),
    c(0, 0.5, 1, NA, Inf),
    ignore_attr = TRUE
  )
})


test_that("normalize: with Inf", {
  expect_equal(
    normalize(c(1, 2, 3, -Inf, Inf)),
    c(0, 0.5, 1, -Inf, Inf),
    ignore_attr = TRUE
  )
})


test_that("normalize: all Inf", {
  expect_equal(
    normalize(c(-Inf, Inf)),
    c(-Inf, Inf),
    ignore_attr = TRUE
  )
})


test_that("normalize: all Na or Inf", {
  expect_equal(
    normalize(c(NA, -Inf, NA, Inf)),
    c(NA, -Inf, NA, Inf),
    ignore_attr = TRUE
  )
})


test_that("normalize: only one value", {
  foo <- 1
  expect_warning(
    normalize(x = foo),
    regexp = "Variable `foo` contains only one unique value and will"
  )
  expect_warning(
    {
      y <- normalize(x = 12)
    },
    regexp = "Variable `12` contains only one unique value and will"
  )
  expect_equal(y, 12, ignore_attr = TRUE)

  expect_silent(normalize(x = foo, verbose = FALSE))
  expect_equal(normalize(x = foo, verbose = FALSE), 1, ignore_attr = TRUE)
})

test_that("normalize: only two values", {
  expect_warning({
    y <- normalize(x = c(1, 2))
  })
  expect_equal(y, c(0, 1), ignore_attr = TRUE)

  expect_silent(normalize(x = c(1, 2), verbose = FALSE))
  expect_equal(
    normalize(x = c(1, 2), verbose = FALSE),
    c(0, 1),
    ignore_attr = TRUE
  )
})

test_that("normalize: factor", {
  expect_identical(
    normalize(factor(1:3)),
    factor(1:3)
  )
})

test_that("normalize: matrix", {
  expect_equal(
    normalize(matrix(1:4, ncol = 2)),
    matrix(seq(0, 1, by = 0.3333), ncol = 2),
    tolerance = 1e-3
  )
})

test_that("normalize: select", {
  skip_if_not_installed("poorman")

  expect_equal(
    normalize(
      iris,
      select = starts_with("Petal\\.L")
    ) %>%
      poorman::pull(Petal.Length),
    normalize(iris$Petal.Length),
    ignore_attr = TRUE
  )
})

test_that("normalize: exclude", {
  skip_if_not_installed("poorman")

  expect_identical(
    normalize(
      iris,
      exclude = ends_with("ecies")
    ),
    iris %>%
      normalize(select = 1:4)
  )
})

test_that("normalize, with append", {
  out_n <- normalize(iris, "Sepal.Width", append = TRUE)
  manual <- (iris$Sepal.Width - min(iris$Sepal.Width)) /
    diff(range(iris$Sepal.Width))
  expect_equal(out_n$Sepal.Width_n, manual, ignore_attr = TRUE)
})


# with grouped data -------------------------------------------

test_that("normalize (grouped data)", {
  skip_if_not_installed("poorman")

  datawizard <- iris %>%
    poorman::group_by(Species) %>%
    normalize(Sepal.Width) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  manual <- iris %>%
    poorman::group_by(Species) %>%
    poorman::mutate(
      Sepal.Width = (Sepal.Width - min(Sepal.Width)) / diff(range(Sepal.Width))
    ) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  expect_identical(datawizard, manual)
})

test_that("normalize (grouped data), with append", {
  skip_if_not_installed("poorman")

  datawizard_n <- iris %>%
    poorman::group_by(Species) %>%
    normalize(Sepal.Width, append = TRUE) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width_n)

  manual_n <- iris %>%
    poorman::group_by(Species) %>%
    poorman::mutate(
      Sepal.Width = (Sepal.Width - min(Sepal.Width)) / diff(range(Sepal.Width))
    ) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  expect_identical(datawizard_n, manual_n)
})

test_that("normalize, include bounds (grouped data)", {
  skip_if_not_installed("poorman")

  datawizard <- iris %>%
    poorman::group_by(Species) %>%
    normalize(Sepal.Width, include_bounds = TRUE) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  manual <- iris %>%
    poorman::group_by(Species) %>%
    poorman::mutate(
      Sepal.Width = (Sepal.Width - min(Sepal.Width)) / diff(range(Sepal.Width))
    ) %>%
    poorman::ungroup() %>%
    poorman::pull(Sepal.Width)

  expect_identical(datawizard, manual)
})


test_that("normalize, factor (grouped data)", {
  skip_if_not_installed("poorman")

  datawizard <- iris %>%
    poorman::group_by(Species) %>%
    normalize(Species) %>%
    poorman::ungroup() %>%
    poorman::pull(Species)

  manual <- iris$Species

  expect_identical(datawizard, manual)
})

# select helpers ------------------------------
test_that("normalize regex", {
  expect_identical(
    normalize(mtcars, select = "pg", regex = TRUE),
    normalize(mtcars, select = "mpg")
  )
})


================================================
FILE: tests/testthat/test-print.dw_transformer.R
================================================
test_that("print.dw_transformer", {
  data(iris)
  expect_snapshot(rescale(iris$Sepal.Length))
  expect_snapshot(normalize(iris$Sepal.Length))
  expect_snapshot(center(iris$Sepal.Length))
  expect_snapshot(standardize(iris$Sepal.Length))
})


================================================
FILE: tests/testthat/test-ranktransform.R
================================================
test_that("ranktransform works with NAs", {
  x <- c(NA_real_, NA_real_)
  expect_identical(ranktransform(x), x)
})

test_that("ranktransform works with factors", {
  x <- factor(c("apple", "bear", "banana", "dear"))
  expect_identical(ranktransform(x), x)
})

test_that("ranktransform works with unique value vectors", {
  x <- c(1L, 1L, 1L)

  expect_identical(suppressWarnings(ranktransform(x)), x)

  expect_warning(
    ranktransform(x),
    "Variable `x` contains only one unique value and will not be normalized."
  )
})

test_that("ranktransform works with two unique value vectors", {
  x <- c(1L, 1L, 1L, 2L, 2L, 2L)

  expect_identical(suppressWarnings(ranktransform(x)), c(2, 2, 2, 5, 5, 5))

  expect_warning(
    ranktransform(x),
    "Consider converting it"
  )
})

test_that("signed rank works as expected", {
  x <- c(-1, 2, -3, 4)

  sr <- ranktransform(x, sign = TRUE)
  r <- ranktransform(x, sign = FALSE)

  expect_identical(sr, x) # unchanged
  expect_identical(r, c(2, 3, 1, 4))

  x <- c(1, -2, -2, 4, 0, 3, -14, 0)
  expect_warning(ranktransform(x, sign = TRUE))
  expect_true(all(is.na(suppressWarnings(
    ranktransform(x, sign = TRUE)[c(5, 8)]
  ))))
})

test_that("argument 'zeros' works", {
  x <- c(-1, 0, 2, -3, 4)
  expect_warning(
    ranktransform(x, sign = TRUE),
    "cannot be sign-rank"
  )
  expect_identical(
    ranktransform(x, sign = TRUE, zeros = "signrank"),
    c(-2, 0, 3, -4, 5)
  )
  expect_error(
    ranktransform(x, sign = TRUE, zeros = "foo"),
    "should be one of"
  )
})

test_that("ranktransform works with data frames", {
  set.seed(123)
  expect_snapshot(ranktransform(BOD))
})


# with grouped data -------------------------------------------

test_that("ranktransform works with data frames (grouped data)", {
  skip_if_not_installed("poorman")

  set.seed(123)
  value1 <- sample.int(20, 9, replace = TRUE)
  set.seed(456)
  value2 <- sample.int(20, 9, replace = TRUE)

  test_df <- data.frame(
    id = rep(c("A", "B", "C"), each = 3),
    value1 = value1,
    value2 = value2,
    stringsAsFactors = FALSE
  )

  # nolint start: nested_pipe_linter
  expect_identical(
    test_df %>%
      poorman::group_by(id) %>%
      ranktransform(exclude = "id") %>%
      poorman::ungroup(),
    data.frame(
      id = rep(c("A", "B", "C"), each = 3),
      value1 = c(2, 3, 1, 1, 2, 3, 2, 1, 3),
      value2 = c(3, 2, 1, 1, 3, 2, 2, 3, 1),
      stringsAsFactors = FALSE
    )
  )
  # nolint end
})


test_that("ranktransform works with data frames containing NAs (grouped data)", {
  skip_if_not_installed("poorman")

  set.seed(789)
  value1 <- sample(c(1:15, NA), 9, replace = TRUE)
  set.seed(10)
  value2 <- sample(c(1:15, NA), 9, replace = TRUE)

  test_df <- data.frame(
    id = rep(c("A", "B", "C"), each = 3),
    value1 = value1,
    value2 = value2,
    stringsAsFactors = FALSE
  )

  # nolint start: nested_pipe_linter
  expect_identical(
    test_df %>%
      poorman::group_by(id) %>%
      ranktransform(exclude = "id") %>%
      poorman::ungroup(),
    data.frame(
      id = rep(c("A", "B", "C"), each = 3),
      value1 = c(2, NA, 1, 1, 3, 2, 2, NA, 1),
      value2 = c(3, 1, 2, NA, 2, 1, 3, 1, 2),
      stringsAsFactors = FALSE
    )
  )
  # nolint end
})

# select helpers ------------------------------
test_that("ranktransform regex", {
  expect_identical(
    ranktransform(mtcars, select = "pg", regex = TRUE),
    ranktransform(mtcars, select = "mpg")
  )
})


================================================
FILE: tests/testthat/test-recode_into.R
================================================
test_that("recode_into", {
  x <- 1:10
  out <- recode_into(
    x > 5 ~ "a",
    x > 2 & x <= 5 ~ "b",
    default = "c"
  )
  expect_identical(out, c("c", "c", "b", "b", "b", "a", "a", "a", "a", "a"))
})

test_that("recode_into, overwrite", {
  x <- 1:30
  expect_warning(
    recode_into(
      x > 1 ~ "a",
      x > 10 & x <= 15 ~ "b",
      default = "c",
      overwrite = TRUE
    ),
    regex = "overwritten"
  )
  # validate results
  x <- 1:10
  expect_silent({
    out <- recode_into(
      x >= 3 & x <= 7 ~ 1,
      x > 5 ~ 2,
      default = 0,
      verbose = FALSE
    )
  })
  expect_identical(out, c(0, 0, 1, 1, 1, 2, 2, 2, 2, 2))
  expect_warning(
    recode_into(
      x >= 3 & x <= 7 ~ 1,
      x > 5 ~ 2,
      default = 0
    ),
    regex = "case 6"
  )

  x <- 1:10
  expect_silent({
    out <- recode_into(
      x >= 3 & x <= 7 ~ 1,
      x > 5 ~ 2,
      default = 0,
      overwrite = FALSE,
      verbose = FALSE
    )
  })
  expect_identical(out, c(0, 0, 1, 1, 1, 1, 1, 2, 2, 2))
  expect_warning(
    recode_into(
      x >= 3 & x <= 7 ~ 1,
      x > 5 ~ 2,
      default = 0,
      overwrite = FALSE
    ),
    regex = "case 6"
  )
})

test_that("recode_into, don't overwrite", {
  x <- 1:30
  expect_warning(
    recode_into(
      x > 1 ~ "a",
      x > 10 & x <= 15 ~ "b",
      default = "c",
      overwrite = FALSE
    ),
    regex = "altered"
  )
})

test_that("recode_into, check mixed types", {
  x <- 1:10
  expect_error(
    {
      out <- recode_into(
        x > 5 ~ 1,
        x > 2 & x <= 5 ~ "b"
      )
    },
    regexp = "Recoding not carried out"
  )
})

test_that("recode_into, complain about default = NULL", {
  x <- 1:10
  expect_warning(
    {
      out <- recode_into(
        x > 5 ~ "c",
        x > 2 & x <= 5 ~ "b",
        default = NULL
      )
    },
    regexp = "Default value"
  )
  expect_identical(out, c(NA, NA, "b", "b", "b", "c", "c", "c", "c", "c"))
})

test_that("recode_into, data frame", {
  data(mtcars)
  out <- recode_into(
    mtcars$mpg > 20 & mtcars$cyl == 6 ~ 1,
    mtcars$mpg <= 20 ~ 2,
    default = 0
  )
  expect_identical(
    out,
    c(
      1,
      1,
      0,
      1,
      2,
      2,
      2,
      0,
      0,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      0,
      0,
      0,
      0,
      2,
      2,
      2,
      2,
      0,
      0,
      0,
      2,
      2,
      2,
      0
    )
  )
  d <- mtcars
  out <- recode_into(
    mpg > 20 & cyl == 6 ~ 1,
    mpg <= 20 ~ 2,
    default = 0,
    data = d
  )
  expect_identical(
    out,
    c(
      1,
      1,
      0,
      1,
      2,
      2,
      2,
      0,
      0,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      2,
      0,
      0,
      0,
      0,
      2,
      2,
      2,
      2,
      0,
      0,
      0,
      2,
      2,
      2,
      0
    )
  )
})

test_that("recode_into, works inside functions", {
  test <- function() {
    set.seed(123)
    d <- data.frame(
      x = sample.int(5, 30, TRUE),
      y = sample(letters[1:5], 30, TRUE),
      stringsAsFactors = FALSE
    )
    recode_into(
      x %in% 1:3 & y %in% c("a", "b") ~ 1,
      x > 3 ~ 2,
      data = d,
      default = 0
    )
  }
  expect_identical(
    test(),
    c(
      1,
      1,
      1,
      0,
      0,
      2,
      2,
      0,
      1,
      1,
      2,
      0,
      0,
      0,
      2,
      1,
      1,
      2,
      1,
      0,
      1,
      1,
      0,
      2,
      0,
      1,
      2,
      2,
      1,
      2
    )
  )
})

test_that("recode_into, check differen input length", {
  x <- 1:10
  y <- 10:30
  expect_error(
    {
      out <- recode_into(
        x > 5 ~ 1,
        y > 10 ~ 2
      )
    },
    regexp = "matching conditions"
  )
})

test_that("recode_into, check different input length", {
  x <- 1:5
  y <- c(5, 2, 3, 1, 4)
  expect_warning(
    {
      out <- recode_into(
        x == 2 ~ 1,
        y == 2 & x == 2 ~ 2,
        default = 0
      )
    },
    regexp = "Several recode patterns"
  )
})

test_that("recode_into, make sure recode works with missing in original variable", {
  data(mtcars)
  mtcars$mpg[c(3, 10, 12, 15, 16)] <- NA
  mtcars$cyl[c(2, 15, 16)] <- NA
  d_recode_na <<- as.data.frame(mtcars)
  out1_recoded_na <- recode_into(
    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
    d_recode_na$mpg <= 20 ~ 2,
    d_recode_na$cyl == 4 ~ 3,
    default = 0,
    preserve_na = TRUE
  )
  out2_recoded_na <- recode_into(
    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
    d_recode_na$mpg <= 20 ~ 2,
    default = 0,
    preserve_na = TRUE
  )
  expect_message(
    {
      out3_recoded_na <- recode_into(
        d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
        d_recode_na$mpg <= 20 ~ 2,
        d_recode_na$cyl == 4 ~ 3,
        default = 0,
        preserve_na = FALSE
      )
    },
    regex = "Missing values in original variable"
  )
  expect_message(
    {
      out4_recoded_na <- recode_into(
        d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
        d_recode_na$mpg <= 20 ~ 2,
        default = 0,
        preserve_na = FALSE
      )
    },
    regex = "Missing values in original variable"
  )
  # one NA in mpg is overwritten by valid value from cyl, total 5 NA
  expect_identical(
    out1_recoded_na,
    c(
      1,
      NA,
      3,
      1,
      2,
      2,
      2,
      3,
      3,
      NA,
      2,
      NA,
      2,
      2,
      NA,
      NA,
      2,
      3,
      3,
      3,
      3,
      2,
      2,
      2,
      2,
      3,
      3,
      3,
      2,
      2,
      2,
      3
    )
  )
  # total 6 NA
  expect_identical(
    out2_recoded_na,
    c(
      1,
      NA,
      NA,
      1,
      2,
      2,
      2,
      0,
      0,
      NA,
      2,
      NA,
      2,
      2,
      NA,
      NA,
      2,
      0,
      0,
      0,
      0,
      2,
      2,
      2,
      2,
      0,
      0,
      0,
      2,
      2,
      2,
      0
    )
  )
  # NA is preserved, set to default if not overwritten by other recodes
  expect_identical(
    out3_recoded_na,
    c(
      1,
      0,
      3,
      1,
      2,
      2,
      2,
      3,
      3,
      0,
      2,
      0,
      2,
      2,
      0,
      0,
      2,
      3,
      3,
      3,
      3,
      2,
      2,
      2,
      2,
      3,
      3,
      3,
      2,
      2,
      2,
      3
    )
  )
  expect_identical(
    out4_recoded_na,
    c(
      1,
      0,
      0,
      1,
      2,
      2,
      2,
      0,
      0,
      0,
      2,
      0,
      2,
      2,
      0,
      0,
      2,
      0,
      0,
      0,
      0,
      2,
      2,
      2,
      2,
      0,
      0,
      0,
      2,
      2,
      2,
      0
    )
  )
})

test_that("recode_into, NA doesn't need to be of exact type", {
  data(mtcars)
  x1 <- recode_into(
    mpg > 10 ~ 1,
    gear == 5 ~ NA_real_,
    data = mtcars,
    verbose = FALSE
  )
  x2 <- recode_into(
    mpg > 10 ~ 1,
    gear == 5 ~ NA,
    data = mtcars,
    verbose = FALSE
  )
  expect_identical(x1, x2)
})


================================================
FILE: tests/testthat/test-replace_nan_inf.R
================================================
test_that("extract from data frame", {
  x <- c(1, 2, NA, 3, NaN, 4, NA, 5, Inf, -Inf, 6, 7)

  expect_identical(
    replace_nan_inf(x),
    c(1, 2, NA, 3, NA, 4, NA, 5, NA, NA, 6, 7)
  )

  # a data frame
  df <- data.frame(
    x = c(1, NA, 5, Inf, 2, NA),
    y = c(3, NaN, 4, -Inf, 6, 7),
    stringsAsFactors = FALSE
  )

  expect_identical(
    replace_nan_inf(df),
    structure(
      list(
        x = c(1, NA, 5, NA, 2, NA),
        y = c(3, NA, 4, NA, 6, 7)
      ),
      row.names = c(NA, -6L),
      class = "data.frame"
    )
  )

  expect_identical(
    replace_nan_inf(df, select = starts_with("x")),
    structure(
      list(
        x = c(1, NA, 5, NA, 2, NA),
        y = c(3, NaN, 4, -Inf, 6, 7)
      ),
      row.names = c(NA, -6L),
      class = "data.frame"
    )
  )
})


================================================
FILE: tests/testthat/test-rescale_weights.R
================================================
test_that("rescale_weights works as expected", {
  data(nhanes_sample)
  # convert tibble into data frame, so check-hard GHA works
  nhanes_sample <- as.data.frame(nhanes_sample)

  expect_snapshot(head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")))

  expect_snapshot(head(rescale_weights(
    nhanes_sample,
    "WTINT2YR",
    c("SDMVSTRA", "SDMVPSU")
  )))

  expect_snapshot(head(rescale_weights(
    nhanes_sample,
    probability_weights = "WTINT2YR",
    method = "kish"
  )))

  out <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")
  expect_equal(sum(out$rescaled_weights_a), 2992, tolerance = 1e-3)
  expect_equal(sum(out$rescaled_weights_b), 2244.71451, tolerance = 1e-3)
  out <- rescale_weights(nhanes_sample, "WTINT2YR", method = "kish")
  expect_equal(sum(out$rescaled_weights), 2162.53961, tolerance = 1e-3)
  out <- rescale_weights(
    nhanes_sample,
    "WTINT2YR",
    by = "SDMVPSU",
    method = "kish"
  )
  expect_equal(sum(out$rescaled_weights), 2163.3657, tolerance = 1e-3)
})


test_that("rescale_weights works as expected", {
  data(nhanes_sample)
  # convert tibble into data frame, so check-hard GHA works
  nhanes_sample <- as.data.frame(nhanes_sample)[1:20, ]

  # add NAs
  set.seed(123)
  nhanes_sample$WTINT2YR[sample.int(nrow(nhanes_sample), 5)] <- NA

  expect_snapshot(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA"))
  expect_snapshot(rescale_weights(nhanes_sample, "WTINT2YR", method = "kish"))
})


test_that("rescale_weights nested works as expected", {
  data(nhanes_sample)
  # convert tibble into data frame, so check-hard GHA works
  nhanes_sample <- as.data.frame(nhanes_sample)

  expect_snapshot(
    rescale_weights(
      data = head(nhanes_sample, n = 30),
      by = c("SDMVSTRA", "SDMVPSU"),
      probability_weights = "WTINT2YR",
      nest = TRUE
    )
  )

  expect_warning(
    {
      x <- rescale_weights(
        data = head(nhanes_sample),
        by = "SDMVPSU",
        probability_weights = "WTINT2YR",
        nest = TRUE
      )
    },
    "Only one group variable selected"
  )

  expect_identical(
    x,
    rescale_weights(
      data = head(nhanes_sample),
      by = "SDMVPSU",
      probability_weights = "WTINT2YR"
    )
  )
})


test_that("rescale_weights errors and warnings", {
  data(nhanes_sample)
  expect_error(
    rescale_weights(
      data = head(nhanes_sample, n = 30),
      by = c("a", "SDMVSTRA", "c"),
      probability_weights = "WTINT2YR"
    ),
    regex = "The following"
  )
  expect_error(
    rescale_weights(
      data = head(nhanes_sample, n = 30),
      by = "SDMVSTRA",
      probability_weights = NULL
    ),
    regex = "is missing, but required"
  )
  expect_error(
    rescale_weights(
      data = head(nhanes_sample, n = 30),
      by = NULL,
      probability_weights = "WTINT2YR"
    ),
    regex = "must be specified"
  )
  expect_error(
    rescale_weights(
      data = head(nhanes_sample, n = 30),
      by = "abc",
      probability_weights = "WTINT2YR",
      method = "kish"
    ),
    regex = "The following variable"
  )
  expect_warning(
    rescale_weights(
      data = head(nhanes_sample, n = 30),
      by = "SDMVSTRA",
      probability_weights = "WTINT2YR",
      nest = TRUE,
      method = "kish"
    ),
    regex = "is ignored"
  )
  expect_error(
    rescale_weights(
      data = head(nhanes_sample, n = 30),
      probability_weights = "WTINT2YR",
      method = "dish"
    ),
    regex = "Invalid option for argument"
  )

  nhanes_sample$rescaled_weights_a <- 1
  expect_warning(
    {
      out <- rescale_weights(
        data = head(nhanes_sample, n = 30),
        by = "SDMVSTRA",
        probability_weights = "WTINT2YR"
      )
    },
    regex = "The variable name"
  )
  expect_named(
    out,
    c(
      "total",
      "age",
      "RIAGENDR",
      "RIDRETH1",
      "SDMVPSU",
      "SDMVSTRA",
      "WTINT2YR",
      "rescaled_weights_a",
      "rescaled_weights_a_1",
      "rescaled_weights_b"
    )
  )
})


================================================
FILE: tests/testthat/test-reshape_ci.R
================================================
test_that("reshape_ci with single CI level", {
  df <- data.frame(
    Parameter = c("Term 1"),
    CI = c(0.8),
    CI_low = c(0.2),
    CI_high = c(0.5),
    stringsAsFactors = FALSE
  )

  df_reshape <- reshape_ci(df)

  expect_snapshot(df_reshape)
})


test_that("reshape_ci with multiple CI levels", {
  x <- data.frame(
    Parameter = c("Term 1", "Term 2", "Term 1", "Term 2"),
    CI = c(0.8, 0.8, 0.9, 0.9),
    CI_low = c(0.2, 0.3, 0.1, 0.15),
    CI_high = c(0.5, 0.6, 0.8, 0.85),
    stringsAsFactors = FALSE
  )

  expect_snapshot(reshape_ci(x))
  expect_snapshot(reshape_ci(reshape_ci(x)))
})


================================================
FILE: tests/testthat/test-row_count.R
================================================
test_that("row_count", {
  d_mn <- data.frame(
    c1 = c(1, 2, NA, 4),
    c2 = c(NA, 2, NA, 5),
    c3 = c(NA, 4, NA, NA),
    c4 = c(2, 3, 7, 8)
  )
  expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0))
  expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1))
  d_mn <- data.frame(
    c1 = c("a", "b", NA, "c"),
    c2 = c(NA, "b", NA, "d"),
    c3 = c(NA, 4, NA, NA),
    c4 = c(2, 3, 7, Inf),
    stringsAsFactors = FALSE
  )
  expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0))
  expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1))
})

test_that("row_count, errors or messages", {
  data(iris)
  expect_error(
    expect_warning(row_count(iris, select = "abc")),
    regex = "must be a valid"
  )
  expect_error(
    expect_warning(row_count(iris, select = "abc", count = 3)),
    regex = "No columns"
  )
  expect_error(row_count(iris[1], count = 3), regex = "with at least")
  expect_error(
    row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2),
    regex = "one row"
  )
})

test_that("row_count, allow_coercion match", {
  d_mn <- data.frame(
    c1 = c("1", "2", NA, "3"),
    c2 = c(NA, "2", NA, "3"),
    c3 = c(NA, 4, NA, NA),
    c4 = c(2, 3, 7, Inf),
    stringsAsFactors = FALSE
  )
  expect_identical(
    row_count(d_mn, count = 2, allow_coercion = TRUE),
    c(1, 2, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = 2, allow_coercion = FALSE),
    c(1, 0, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = "2", allow_coercion = FALSE),
    c(0, 2, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = factor("2"), allow_coercion = TRUE),
    c(1, 2, 0, 0)
  )
  expect_error(
    row_count(d_mn, count = factor("2"), allow_coercion = FALSE),
    regex = "No column has"
  )

  # mix character / factor
  d_mn <- data.frame(
    c1 = factor(c("1", "2", NA, "3")),
    c2 = c("2", "1", NA, "3"),
    c3 = c(NA, 4, NA, NA),
    c4 = c(2, 3, 7, Inf),
    stringsAsFactors = FALSE
  )
  expect_identical(
    row_count(d_mn, count = 2, allow_coercion = TRUE),
    c(2, 1, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = 2, allow_coercion = FALSE),
    c(1, 0, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = "2", allow_coercion = FALSE),
    c(1, 0, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = "2", allow_coercion = TRUE),
    c(2, 1, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = factor("2"), allow_coercion = FALSE),
    c(0, 1, 0, 0)
  )
  expect_identical(
    row_count(d_mn, count = factor("2"), allow_coercion = TRUE),
    c(2, 1, 0, 0)
  )
})


================================================
FILE: tests/testthat/test-row_means.R
================================================
test_that("row_means/sums", {
  d_mn <- data.frame(
    c1 = c(1, 2, NA, 4),
    c2 = c(NA, 2, NA, 5),
    c3 = c(NA, 4, NA, NA),
    c4 = c(2, 3, 7, 8)
  )
  expect_equal(
    row_means(d_mn, min_valid = 4),
    c(NA, 2.75, NA, NA),
    tolerance = 1e-3
  )
  expect_equal(
    row_means(d_mn, min_valid = 3),
    c(NA, 2.75, NA, 5.66667),
    tolerance = 1e-3
  )
  expect_equal(
    row_means(d_mn, min_valid = 2),
    c(1.5, 2.75, NA, 5.66667),
    tolerance = 1e-3
  )
  expect_equal(
    row_means(d_mn, min_valid = 1),
    c(1.5, 2.75, 7, 5.66667),
    tolerance = 1e-3
  )
  expect_equal(
    row_means(d_mn, min_valid = 0.5),
    c(1.5, 2.75, NA, 5.66667),
    tolerance = 1e-3
  )
  expect_equal(
    row_means(d_mn, min_valid = 0.75),
    c(NA, 2.75, NA, 5.66667),
    tolerance = 1e-3
  )
  expect_equal(
    row_means(d_mn, min_valid = 2, digits = 1),
    c(1.5, 2.8, NA, 5.7),
    tolerance = 1e-1
  )
  expect_message(row_means(iris), regex = "Only numeric")
  expect_equal(
    row_means(iris, verbose = FALSE),
    rowMeans(iris[, 1:4]),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_equal(
    row_sums(d_mn, min_valid = 4),
    c(NA, 11, NA, NA),
    tolerance = 1e-3
  )
  expect_equal(
    row_sums(d_mn, min_valid = 3),
    c(NA, 11, NA, 17),
    tolerance = 1e-3
  )
  expect_message(row_sums(iris), regex = "Only numeric")
})

test_that("row_means/sums, errors or messages", {
  data(iris)
  expect_error(
    expect_warning(row_means(iris, select = "abc")),
    regex = "No columns"
  )
  expect_error(
    expect_warning(row_sums(iris, select = "abc")),
    regex = "No columns"
  )
  expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric")
  expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value")
  expect_error(row_means(iris, min_valid = "a"), regex = "numeric value")
  expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric")
  expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE))
  expect_error(row_sums(iris[1], min_valid = 1), regex = "two numeric")
  expect_message(row_sums(iris[1:3, ], min_valid = 3), regex = "Only numeric")
  expect_silent(row_sums(iris[1:3, ], min_valid = 3, verbose = FALSE))
})


================================================
FILE: tests/testthat/test-select_nse.R
================================================
foo <- function(data, select = NULL, exclude = NULL, regex = FALSE) {
  .select_nse(
    select,
    data,
    exclude = exclude,
    regex = regex,
    ignore_case = FALSE
  )
}

test_that(".select_nse needs data", {
  expect_error(
    foo(select = "Sepal.Length", data = NULL),
    regexp = "must be provided"
  )
})

test_that(".select_nse needs a data frame or something coercible to a dataframe", {
  expect_identical(
    foo(select = "Sepal.Length", data = as.matrix(head(iris))),
    "Sepal.Length"
  )
  expect_error(
    foo(select = "Sepal.Length", data = list(1:3, 1:2)),
    regexp = "must be a data frame"
  )
})

test_that(".select_nse: arg 'select' works", {
  expect_identical(
    foo(iris, select = NULL),
    names(iris)
  )
  expect_identical(
    foo(iris, Petal.Length),
    "Petal.Length"
  )
  expect_identical(
    foo(iris, c("Petal.Length", "Sepal.Width")),
    c("Petal.Length", "Sepal.Width")
  )
  expect_identical(
    foo(iris, c(3, 2)),
    c("Petal.Length", "Sepal.Width")
  )
  expect_identical(
    foo(iris, 1:5),
    names(iris)
  )
  expect_identical(
    foo(iris, is.numeric),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )
  expect_identical(
    foo(iris, is.numeric()),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  )
  expect_identical(
    extract_column_names(iris, sepal.length, ignore_case = TRUE),
    "Sepal.Length"
  )
  expect_identical(
    foo(iris, starts_with("Petal")),
    c("Petal.Length", "Petal.Width")
  )
  expect_identical(
    foo(iris, ends_with("Length")),
    c("Sepal.Length", "Petal.Length")
  )
  expect_identical(
    foo(iris, contains("Length")),
    c("Sepal.Length", "Petal.Length")
  )
  expect_identical(
    foo(iris, regex("Length$")),
    c("Sepal.Length", "Petal.Length")
  )
  expect_identical(
    foo(iris, "Len", regex = TRUE),
    c("Sepal.Length", "Petal.Length")
  )
})


test_that(".select_nse: arg 'exclude' works", {
  expect_identical(
    foo(iris, exclude = c("Petal.Length", "Sepal.Width")),
    c("Sepal.Length", "Petal.Width", "Species")
  )
  expect_identical(
    foo(iris, exclude = c(3, 2)),
    c("Sepal.Length", "Petal.Width", "Species")
  )
  expect_identical(
    foo(iris, exclude = starts_with("Petal")),
    c("Sepal.Length", "Sepal.Width", "Species")
  )
  expect_identical(
    foo(iris, exclude = ends_with("Length")),
    c("Sepal.Width", "Petal.Width", "Species")
  )
  expect_identical(
    foo(iris, exclude = contains("Length")),
    c("Sepal.Width", "Petal.Width", "Species")
  )
  expect_identical(
    foo(iris, exclude = regex("Length$")),
    c("Sepal.Width", "Petal.Width", "Species")
  )
})

test_that(".select_nse: args 'select' and 'exclude' at the same time", {
  expect_identical(
    foo(iris, select = contains("Length"), exclude = starts_with("Petal")),
    "Sepal.Length"
  )
  expect_identical(
    foo(iris, select = contains("Length"), exclude = contains("Length")),
    character(0)
  )
})

test_that(".select_nse: misc", {
  iris2 <- iris[, 1:3]
  expect_identical(
    foo(iris, select = names(iris2)),
    c("Sepal.Length", "Sepal.Width", "Petal.Length")
  )
  expect_identical(
    foo(iris, select = names(iris2)[2]),
    "Sepal.Width"
  )
})

test_that(".select_nse: works with function and namespace", {
  model <- lm(Petal.Length ~ Petal.Width, data = iris)
  out <- data_select(
    iris,
    insight::find_predictors(model, effects = "fixed", flatten = TRUE)
  )
  expect_identical(out, iris["Petal.Width"])

  fun <- function(x) {
    data_select(iris, x)
  }
  out <- fun(insight::find_predictors(model, effects = "fixed", flatten = TRUE))
  expect_identical(out, iris["Petal.Width"])

  x <- "Sepal.Length"
  out <- fun(insight::find_predictors(model, effects = "fixed", flatten = TRUE))
  expect_identical(out, iris["Petal.Width"])
})

test_that(".select_nse: allow character vector with :", {
  data(mtcars)
  out <- data_select(mtcars, c("cyl:hp", "wt", "vs:gear"))
  expect_named(out, c("cyl", "disp", "hp", "wt", "vs", "am", "gear"))
  out <- data_select(mtcars, c("cyl:hp", "wta", "vs:gear"))
  expect_named(out, c("cyl", "disp", "hp", "vs", "am", "gear"))
  out <- data_select(mtcars, c("hp:cyl", "wta", "vs:gear"))
  expect_named(out, c("hp", "disp", "cyl", "vs", "am", "gear"))
  out <- data_select(mtcars, c("cyl:hq", "wt", "vs:gear"))
  expect_named(out, c("wt", "vs", "am", "gear"))

  expect_warning(
    center(mtcars, c("cyl:hp", "wta", "vs:gear"), verbose = TRUE),
    regex = "Did you mean \"wt\""
  )
  expect_warning(
    center(mtcars, c("cyl:hq", "wt", "vs:gear"), verbose = TRUE),
    regex = "Did you mean one of \"hp\""
  )
})


================================================
FILE: tests/testthat/test-skewness-kurtosis.R
================================================
test_that("skewness", {
  expect_equal(skewness(iris$Sepal.Length)[[1]], 0.314911, tolerance = 1e-3)
  expect_equal(
    skewness(iris$Sepal.Length, type = 1)[[1]],
    0.3117531,
    tolerance = 1e-3
  )
  expect_equal(
    skewness(iris$Sepal.Length, type = 3)[[1]],
    0.3086407,
    tolerance = 1e-3
  )
})

test_that("kurtosis", {
  expect_equal(kurtosis(iris$Sepal.Length)[[1]], -0.552064, tolerance = 1e-3)
  expect_equal(
    kurtosis(iris$Sepal.Length, type = 1)[[1]],
    -0.5735679,
    tolerance = 1e-3
  )
  expect_equal(
    kurtosis(iris$Sepal.Length, type = 3)[[1]],
    -0.6058125,
    tolerance = 1e-3
  )
})

test_that("kurtosis and skewness with bootstrapping", {
  skip_if_not_installed("boot")

  set.seed(123)
  expect_equal(
    skewness(iris$Sepal.Length, iterations = 100)[[2]],
    0.1262203,
    tolerance = 1e-3
  )

  set.seed(123)
  expect_equal(
    kurtosis(iris$Sepal.Length, iterations = 100)[[2]],
    0.1878741,
    tolerance = 1e-3
  )
})

test_that("skewness works with data frames", {
  skip_if_not_installed("boot")

  set.seed(123)
  expect_snapshot(skewness(iris[, 1:4]))

  set.seed(123)
  expect_snapshot(skewness(iris[, 1:4], iterations = 100))
})

test_that("kurtosis works with data frames", {
  skip_if_not_installed("boot")

  set.seed(123)
  expect_snapshot(kurtosis(iris[, 1:4]))

  set.seed(123)
  expect_snapshot(kurtosis(iris[, 1:4], iterations = 100))
})


test_that("skewness works with matrices", {
  skip_if_not_installed("boot")

  set.seed(123)
  expect_snapshot(skewness(as.matrix(iris[, 1:4])))

  set.seed(123)
  expect_snapshot(skewness(as.matrix(iris[, 1:4]), iterations = 100))
})

test_that("kurtosis works with matrices", {
  skip_if_not_installed("boot")

  set.seed(123)
  expect_snapshot(kurtosis(as.matrix(iris[, 1:4])))

  set.seed(123)
  expect_snapshot(kurtosis(as.matrix(iris[, 1:4]), iterations = 100))
})

test_that("skewness uses type 1 if not enough obs for type 2", {
  expect_warning(
    test <- skewness(c(1, 2), type = "2"),
    "Need at least 3 complete obs"
  )
  expect_equal(test, skewness(c(1, 2), type = "1"))
})

test_that("kurtosis uses type 1 if not enough obs for type 2", {
  expect_warning(
    test <- kurtosis(c(1, 2, 3), type = "2"),
    "Need at least 4 complete obs"
  )
  expect_equal(test, kurtosis(c(1, 2, 3), type = "1"))
})


================================================
FILE: tests/testthat/test-smoothness.R
================================================
test_that("smoothness works", {
  set.seed(123)
  x <- (-10:10)^3 + rnorm(21, 0, 100)
  expect_equal(smoothness(x)[[1]], 0.9030014, tolerance = 0.001)
  expect_equal(smoothness(x, method = "auto")[[1]], 1.750452, tolerance = 0.001)
})

test_that("smoothness works with iterations", {
  skip_if_not_installed("boot")

  set.seed(123)
  x <- (-10:10)^3 + rnorm(21, 0, 100)
  expect_equal(
    smoothness(x, iterations = 100)[[1]],
    0.9030014,
    tolerance = 0.001
  )
  expect_equal(
    smoothness(x, method = "auto", iterations = 100)[[1]],
    1.750452,
    tolerance = 0.001
  )
})


test_that("smoothness with lag works", {
  set.seed(123)
  x <- (-10:10)^3 + rnorm(21, 0, 100)
  expect_equal(smoothness(x, lag = 0.5)[[1]], 0.5859015, tolerance = 0.001)
  expect_error(smoothness(x, lag = 0), "'lag' cannot be that small.")
})

test_that("smoothness works with data frames", {
  skip_if(getRversion() < "4.0.0")
  set.seed(123)
  expect_snapshot(smoothness(BOD))
})


================================================
FILE: tests/testthat/test-standardize-data.R
================================================
# standardize.numeric -----------------------------------------------------
test_that("standardize.numeric", {
  x <- standardize(seq(0, 1, length.out = 100))
  expect_equal(mean(x), 0, tolerance = 0.01)

  x <- standardize(seq(0, 1, length.out = 100), two_sd = TRUE)
  expect_equal(sd(x), 0.5, tolerance = 0.01)

  x <- standardize(seq(0, 1, length.out = 100), robust = TRUE)
  expect_equal(median(x), 0, tolerance = 0.01)

  x <- standardize(seq(0, 1, length.out = 100), robust = TRUE, two_sd = TRUE)
  expect_equal(mad(x), 0.5, tolerance = 0.01)

  expect_message(standardize(c(0, 0, 0, 1, 1)))

  x <- standardize(c(-1, 0, 1), reference = seq(3, 4, length.out = 100))
  expect_equal(mean(x), -11.943, tolerance = 0.01)
})


# standardize factor / Date -----------------------------------------------
test_that("standardize.numeric", {
  f <- factor(c("c", "a", "b"))
  expect_identical(standardize(f), f)
  expect_equal(standardize(f, force = TRUE), c(1, -1, 0), ignore_attr = TRUE)

  d <- as.Date(c("1989/08/06", "1989/08/04", "1989/08/05"))
  expect_identical(standardize(d), d)
  expect_equal(standardize(d, force = TRUE), c(1, -1, 0), ignore_attr = TRUE)
})


# standardize.data.frame --------------------------------------------------

test_that("standardize.data.frame", {
  skip_if_not_installed("poorman")

  data(iris)
  x <- standardize(iris)
  expect_equal(mean(x$Sepal.Length), 0, tolerance = 0.01)
  expect_length(levels(x$Species), 3)
  expect_equal(
    mean(subset(x, Species == "virginica")$Sepal.Length),
    0.90,
    tolerance = 0.01
  )

  # check class attributes
  expect_identical(
    vapply(x, class, character(1)),
    c(
      Sepal.Length = "numeric",
      Sepal.Width = "numeric",
      Petal.Length = "numeric",
      Petal.Width = "numeric",
      Species = "factor"
    )
  )

  x2 <- standardize(x = iris[1, ], reference = iris)
  expect_true(all(x2[1, ] == x[1, ]))

  x <- standardize(poorman::group_by(iris, Species))
  expect_equal(mean(x$Sepal.Length), 0, tolerance = 0.01)
  expect_length(levels(x$Species), 3)
  expect_equal(
    mean(subset(x, Species == "virginica")$Sepal.Length),
    0,
    tolerance = 0.01
  )
})


test_that("standardize.data.frame, NAs", {
  skip_if_not_installed("poorman")

  data(iris)
  iris$Sepal.Width[c(148, 65, 33, 58, 54, 93, 114, 72, 32, 23)] <- NA
  iris$Sepal.Length[c(11, 30, 141, 146, 13, 149, 6, 8, 48, 101)] <- NA

  x <- standardize(iris)
  expect_equal(
    head(x$Sepal.Length),
    c(-0.9163, -1.1588, -1.4013, -1.5226, -1.0376, NA),
    tolerance = 0.01
  )
  expect_equal(
    head(x$Sepal.Width),
    c(1.0237, -0.151, 0.3189, 0.0839, 1.2586, 1.9635),
    tolerance = 0.01
  )
  expect_identical(mean(x$Sepal.Length), NA_real_)

  x <- standardize(iris, two_sd = TRUE)
  expect_equal(
    head(x$Sepal.Length),
    c(-0.4603, -0.5811, -0.7019, -0.7623, -0.5207, NA),
    tolerance = 0.01
  )
  expect_equal(
    head(x$Sepal.Width),
    c(0.5118, -0.0755, 0.1594, 0.042, 0.6293, 0.9817),
    tolerance = 0.01
  )
  expect_identical(mean(x$Sepal.Length), NA_real_)

  x <- standardize(poorman::group_by(iris, .data$Species))
  expect_equal(
    head(x$Sepal.Length),
    c(0.2547, -0.3057, -0.8661, -1.1463, -0.0255, NA),
    tolerance = 0.01
  )
  expect_equal(
    head(x$Sepal.Width),
    c(0.2369, -1.0887, -0.5584, -0.8235, 0.502, 1.2974),
    tolerance = 0.01
  )
  expect_identical(mean(x$Sepal.Length), NA_real_)
})


test_that("standardize.data.frame, apend", {
  skip_if_not_installed("poorman")

  data(iris)
  iris$Sepal.Width[c(26, 43, 56, 11, 66, 132, 23, 133, 131, 28)] <- NA
  iris$Sepal.Length[c(32, 12, 109, 92, 119, 49, 83, 113, 64, 30)] <- NA

  x <- standardize(iris, append = TRUE)
  expect_identical(
    colnames(x),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "Sepal.Length_z",
      "Sepal.Width_z",
      "Petal.Length_z",
      "Petal.Width_z"
    )
  )
  expect_equal(
    head(x$Sepal.Length_z),
    c(-0.8953, -1.1385, -1.3816, -1.5032, -1.0169, -0.5306),
    tolerance = 0.01
  )
  expect_equal(
    head(x$Sepal.Width_z),
    c(1.04, -0.1029, 0.3543, 0.1257, 1.2685, 1.9542),
    tolerance = 0.01
  )
  expect_identical(mean(x$Sepal.Length_z), NA_real_)

  x <- standardize(iris, two_sd = TRUE, append = TRUE)
  expect_equal(
    head(x$Sepal.Length_z),
    c(-0.4477, -0.5692, -0.6908, -0.7516, -0.5084, -0.2653),
    tolerance = 0.01
  )
  expect_equal(
    head(x$Sepal.Width_z),
    c(0.52, -0.0514, 0.1771, 0.0629, 0.6343, 0.9771),
    tolerance = 0.01
  )
  expect_identical(mean(x$Sepal.Length_z), NA_real_)

  x <- standardize(poorman::group_by(iris, .data$Species), append = TRUE)
  expect_equal(
    head(x$Sepal.Length_z),
    c(0.2746, -0.2868, -0.8483, -1.129, -0.0061, 1.1168),
    tolerance = 0.01
  )
  expect_equal(
    head(x$Sepal.Width_z),
    c(0.1766, -1.1051, -0.5924, -0.8487, 0.4329, 1.2019),
    tolerance = 0.01
  )
  expect_identical(mean(x$Sepal.Length_z), NA_real_)
})


test_that("standardize.data.frame, weights", {
  skip_if_not_installed("poorman")

  x <- rexp(30)
  w <- rpois(30, 20) + 1

  expect_equal(
    sqrt(cov.wt(cbind(x, x), w)$cov[1, 1]),
    attr(standardize(x, weights = w), "scale"),
    tolerance = 1e-4
  )
  expect_equal(
    standardize(x, weights = w),
    standardize(data.frame(x), weights = w)$x,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )

  # name and vector give same results
  expect_equal(
    standardize(mtcars, exclude = "cyl", weights = mtcars$cyl),
    standardize(mtcars, weights = "cyl"),
    tolerance = 1e-4
  )

  d <- poorman::group_by(mtcars, am)
  expect_warning(standardize(d, weights = d$cyl))
})


# Unstandardize -----------------------------------------------------------
test_that("unstandardize, numeric", {
  data(iris)
  x <- standardize(iris$Petal.Length)
  rez <- unstandardize(x)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  rez <- unstandardize(x, reference = iris$Petal.Length)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  rez <- unstandardize(
    x,
    center = mean(iris$Petal.Length),
    scale = stats::sd(iris$Petal.Length)
  )
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  rez <- unstandardize(
    0,
    center = mean(iris$Petal.Length),
    scale = stats::sd(iris$Petal.Length)
  )
  expect_equal(rez, mean(iris$Petal.Length), tolerance = 1e-3)

  x <- standardize(iris$Petal.Length, robust = TRUE, two_sd = TRUE)
  rez <- unstandardize(x, robust = TRUE, two_sd = TRUE)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  x <- scale(iris$Petal.Length)
  rez <- unstandardize(x)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  x <- scale(iris$Petal.Length, center = 3, scale = 2)
  rez <- unstandardize(x)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)
})

test_that("unstandardize, data frame", {
  skip_if_not_installed("poorman")

  data(iris)
  x <- standardize(iris)
  rez <- unstandardize(x)
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  x <- standardize(iris, select = "Petal.Length")
  rez <- unstandardize(x)
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  x <- standardize(iris, select = starts_with("Pet"))
  rez <- unstandardize(x, select = starts_with("Pet"))
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  x <- standardize(iris, select = "Petal.Length")
  rez <- unstandardize(
    x,
    center = c(Petal.Length = mean(iris$Petal.Length)),
    scale = c(Petal.Length = stats::sd(iris$Petal.Length))
  )
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  expect_error(unstandardize(
    x,
    center = mean(iris$Petal.Length),
    scale = stats::sd(iris$Petal.Length)
  ))

  x <- standardize(iris)
  rez <- unstandardize(x, center = rep(0, 4), scale = rep(1, 4))
  expect_equal(rez, x, tolerance = 0.1, ignore_attr = TRUE)

  data(iris)
  x <- standardize(iris, robust = TRUE, two_sd = TRUE)
  rez <- unstandardize(x, robust = TRUE, two_sd = TRUE)
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)
})

test_that("un/standardize, matrix", {
  set.seed(4)
  x <- matrix(sample(8), nrow = 4)
  colnames(x) <- letters[1:2]
  rownames(x) <- LETTERS[1:4]

  z1 <- standardize(x)
  z2 <- scale(x)

  expect_equal(z1, z2, ignore_attr = TRUE)
  expect_equal(unstandardize(z1), x, ignore_attr = TRUE)
  expect_identical(unstandardize(z2), unstandardize(z1))
})

test_that("unstandardize with reference (data frame)", {
  x <- standardize(x = iris, reference = iris)
  x2 <- unstandardize(x, reference = iris)
  expect_equal(x2, iris, ignore_attr = TRUE)

  x <- standardize(x = iris, reference = iris, robust = TRUE)
  x2 <- unstandardize(x, reference = iris, robust = TRUE)
  expect_equal(x2, iris, ignore_attr = TRUE)
})

test_that("unstandardize does nothing with characters and factors", {
  expect_identical(
    unstandardise(c("a", "b")),
    c("a", "b")
  )
  expect_identical(
    unstandardise(factor(c(1, 2))),
    factor(c(1, 2))
  )
})

# select helpers ------------------------------
test_that("standardize regex", {
  expect_identical(
    standardize(mtcars, select = "pg", regex = TRUE),
    standardize(mtcars, select = "mpg")
  )
})

# standardize when only providing one of center/scale ---------------
test_that("standardize when only providing one of center/scale", {
  x <- 1:10
  expect_identical(
    as.vector(datawizard::standardize(x, center = FALSE)),
    x / sd(x)
  )
  expect_identical(
    as.vector(datawizard::standardize(x, center = 2)),
    (x - 2) / sd(x)
  )
  expect_identical(
    as.vector(datawizard::standardize(x, scale = FALSE)),
    as.vector(datawizard::center(x))
  )
  expect_identical(
    as.vector(datawizard::standardize(x, scale = 1.5)),
    (x - mean(x)) / 1.5
  )
})


# grouped data

test_that("unstandardize: grouped data", {
  skip_if_not_installed("poorman")

  # 1 group, 1 standardized var
  stand <- poorman::group_by(mtcars, cyl)
  stand <- standardize(stand, "mpg")
  unstand <- unstandardize(stand, select = "mpg")
  expect_identical(
    poorman::ungroup(unstand),
    mtcars,
    ignore_attr = TRUE
  )

  expect_s3_class(unstand, "grouped_df")

  # 2 groups, 1 standardized var
  set.seed(123)
  test <- iris
  test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE)
  stand <- poorman::group_by(test, Species, grp)
  stand <- standardize(stand, "Sepal.Length")
  expect_identical(
    poorman::ungroup(unstandardize(stand, select = "Sepal.Length")),
    test
  )

  # 2 groups, 2 standardized vars
  set.seed(123)
  test <- iris
  test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE)
  stand <- poorman::group_by(test, Species, grp)
  stand <- standardize(stand, c("Sepal.Length", "Petal.Length"))
  expect_identical(
    poorman::ungroup(unstandardize(
      stand,
      select = c("Sepal.Length", "Petal.Length")
    )),
    test
  )

  expect_s3_class(unstand, "grouped_df")

  # can't recover attributes
  stand <- poorman::group_by(iris, Species)
  stand <- standardize(stand, "Sepal.Length")
  attr(stand, "groups") <- NULL

  expect_error(
    unstandardize(stand, "Sepal.Length"),
    regexp = "Couldn't retrieve the necessary information"
  )

  # normalize applied on grouped data but unstandardize applied on ungrouped data
  stand <- poorman::group_by(mtcars, cyl)
  stand <- standardize(stand, "mpg")
  stand <- poorman::ungroup(stand)

  expect_error(
    unstandardize(stand, "mpg"),
    regexp = "must provide the arguments"
  )

  # standardize applied on grouped data but unstandardize applied different grouped
  # data
  stand <- poorman::group_by(stand, am)
  expect_error(
    unstandardize(stand, "mpg"),
    regexp = "Couldn't retrieve the necessary"
  )
})


================================================
FILE: tests/testthat/test-standardize_datagrid.R
================================================
# standardize -----------------------------------------------------
test_that("standardize.datagrid", {
  x <- insight::get_datagrid(
    iris,
    by = "Sepal.Length",
    range = "sd",
    length = 3
  )
  out <- standardize(x)
  expect_identical(as.numeric(out$Sepal.Length), c(-1, 0, 1), tolerance = 1e-3)
  expect_identical(as.numeric(out$Sepal.Width), c(0, 0, 0), tolerance = 1e-3)

  x <- insight::get_datagrid(iris, by = "Sepal.Length = c(-1, 0)")
  out <- unstandardize(x, select = "Sepal.Length")
  expect_identical(
    out$Sepal.Length[1:2],
    c(mean(iris$Sepal.Length) - sd(iris$Sepal.Length), mean(iris$Sepal.Length)),
    tolerance = 1e-3
  )
})


================================================
FILE: tests/testthat/test-standardize_models.R
================================================
# standardize.lm ----------------------------------------------------------
test_that("standardize.lm", {
  iris2 <- na.omit(iris)
  iris_z <- standardize(iris2)

  m0 <- lm(Sepal.Length ~ Species * Petal.Width, data = iris_z)
  m1 <- lm(Sepal.Length ~ Species * Petal.Width, data = iris2)
  model <- standardize(m1)
  expect_identical(coef(m0), coef(model))
})

test_that("standardize, mlm", {
  m <- lm(cbind(mpg, hp) ~ cyl + am, data = mtcars)
  m2 <- lm(scale(cbind(mpg, hp)) ~ scale(cyl) + scale(am), data = mtcars)

  mz <- standardize(m)
  expect_equal(coef(mz), coef(m2), ignore_attr = TRUE, tolerance = 1e-4)
})

test_that("standardize | errors", {
  my_lm_external_formula <- function(.dat, predicted, predictor) {
    my_formula <- as.formula(paste0(predicted, "~", predictor))
    lm(formula = my_formula, data = .dat)
  }

  m <- my_lm_external_formula(mtcars, "mpg", "am")
  ers <- capture_error(standardize(m))
  expect_match(
    as.character(ers),
    "Try instead to standardize the data",
    fixed = TRUE
  )
})


test_that("standardize | problematic formulas", {
  data(mtcars)
  m <- lm(mpg ~ hp, data = mtcars)
  expect_equal(
    coef(standardise(m)),
    c(`(Intercept)` = -3.14935717633686e-17, hp = -0.776168371826586),
    tolerance = 1e-4
  )

  colnames(mtcars)[1] <- "1_mpg"
  m <- lm(`1_mpg` ~ hp, data = mtcars)
  expect_error(standardise(m), regex = "Looks like")

  # works interactive only
  # data(mtcars)
  # m <- lm(mtcars$mpg ~ mtcars$hp)
  # expect_error(standardise(m), regex = "model formulas")

  m <- lm(mtcars[, 1] ~ hp, data = mtcars)
  expect_error(standardise(m), regex = "indexed data")
})


# Transformations ---------------------------------------------------------
test_that("transformations", {
  skip_if_not_installed("effectsize")
  # deal with log / sqrt terms
  expect_message(standardize(lm(mpg ~ sqrt(cyl) + log(hp), mtcars)))
  expect_message(standardize(lm(mpg ~ sqrt(cyl), mtcars)))
  expect_message(standardize(lm(mpg ~ log(hp), mtcars)))

  # difference between stand-methods:
  mt <- mtcars
  mt$hp_100 <- mt$hp / 100
  fit_exp <- lm(mpg ~ exp(hp_100), mt)
  fit_scale1 <- lm(scale(mpg) ~ exp(scale(hp_100)), mt)
  fit_scale2 <- lm(scale(mpg) ~ scale(exp(hp_100)), mt)
  expect_equal(
    effectsize::standardize_parameters(fit_exp, method = "refit")[2, 2],
    unname(coef(fit_scale1)[2]),
    ignore_attr = TRUE
  )

  expect_equal(
    effectsize::standardize_parameters(fit_exp, method = "basic")[2, 2],
    unname(coef(fit_scale2)[2]),
    ignore_attr = TRUE
  )

  # fmt: skip
  d <- data.frame(
    time = factor(rep(c("1", "2", "3", "4", "5"), 6)),
    group = rep(rep(c(1, 2), 3), each = 5L),
    sum = rep(c(0, 5, 10, 15, 20, 0, 20, 25, 45, 50), 3)
  )
  m <- lm(log(sum + 1) ~ as.numeric(time) * group, data = d)

  expect_message({
    out <- standardize(m)
  })
  expect_identical(
    coef(m),
    c(
      `(Intercept)` = -0.4575,
      `as.numeric(time)` = 0.5492,
      group = 0.3379,
      `as.numeric(time):group` = 0.15779
    ),
    tolerance = 0.01
  )
})


# W/ weights --------------------------------------------------------------
test_that("weights", {
  expect_warning(standardize(mtcars, weights = "xx"))

  m <- lm(mpg ~ wt + hp, weights = cyl, mtcars)

  sm <- standardize(m, weights = TRUE)
  sm_data <- insight::get_data(sm, source = "frame")
  sm_data2 <- standardize(
    mtcars,
    select = c("mpg", "wt", "hp"),
    weights = "cyl"
  )
  expect_identical(
    sm_data[, c("mpg", "wt", "hp")],
    sm_data2[, c("mpg", "wt", "hp")]
  )

  expect_error(standardize(m, weights = TRUE, robust = TRUE), NA)

  # no weights in stding
  sm_xw <- standardize(m, weights = FALSE)
  sm_data_xw <- insight::get_data(sm_xw, source = "frame")
  expect_false(isTRUE(all.equal(coef(sm)[-1], coef(sm_xw)[-1])))

  skip_if_not_installed("effectsize")
  # refit and posthoc should give same results
  stdREFIT <- effectsize::standardize_parameters(m, method = "refit")
  expect_equal(
    stdREFIT[[2]],
    effectsize::standardize_parameters(m, method = "posthoc")[[2]],
    ignore_attr = TRUE
  )

  expect_equal(
    stdREFIT[[2]],
    effectsize::standardize_parameters(m, method = "basic")[[2]],
    ignore_attr = TRUE
  )
})


# weights + missing data --------------------------------------------------
test_that("weights + NA", {
  set.seed(1234)
  data(iris)

  # data setup
  iris$weight_me <- runif(nrow(iris))
  iris$Sepal.Length[sample(nrow(iris), size = 10)] <- NA
  iris$weight_me[sample(nrow(iris), size = 10)] <- NA

  # standardize 2nd data set
  iris2 <- standardize(
    iris,
    select = c("Sepal.Length", "Petal.Width"),
    remove_na = "all"
  )
  iris3 <- standardize(
    iris,
    select = c("Sepal.Length", "Petal.Width"),
    weights = "weight_me",
    remove_na = "selected"
  )

  m1 <- lm(
    Sepal.Length ~ Species + Petal.Width,
    data = iris,
    weights = weight_me
  )

  # weights, missing data, but data isn't weight-stdized
  m2 <- lm(
    Sepal.Length ~ Species + Petal.Width,
    data = iris2,
    weights = weight_me
  )
  sm2 <- standardize(m1, weights = FALSE)
  expect_identical(coef(m2), coef(sm2))

  # weights, missing data, and data is weight-stdized
  m3 <- lm(
    Sepal.Length ~ Species + Petal.Width,
    data = iris3,
    weights = weight_me
  )
  sm3 <- standardize(m1, weights = TRUE)
  expect_identical(coef(m3), coef(sm3))
})


# weights + missing data ´+ na.action = na.exclude --------------------------------------------------
test_that("weights + NA + na.exclude", {
  skip_if_not_installed("effectsize")
  set.seed(1234)
  data(iris)

  # data setup
  iris$weight_me <- runif(nrow(iris))
  iris$Sepal.Length[sample(nrow(iris), size = 25)] <- NA
  iris$weight_me[sample(nrow(iris), size = 15)] <- NA
  d <- iris

  m1 <- lm(
    Sepal.Length ~ Species + Petal.Width,
    data = d,
    weights = weight_me,
    na.action = na.exclude
  )
  m2 <- lm(Sepal.Length ~ Species + Petal.Width, data = d, weights = weight_me)

  expect_identical(
    coef(standardize(m2)),
    coef(standardize(m1)),
    tolerance = 1e-3
  )
  expect_identical(
    effectsize::standardize_parameters(m1, method = "basic")[[2]],
    effectsize::standardize_parameters(m2, method = "basic")[[2]],
    tolerance = 1e-3
  )
})

# subset ------------------
test_that("fail with subset", {
  data("mtcars")

  mod1 <- lm(mpg ~ hp, data = mtcars, subset = cyl > 4)

  expect_error(standardise(mod1), regexp = "subset")
})


# don't standardize non-Gaussian response ------------------------------------
test_that("standardize non-Gaussian response", {
  skip_on_cran()
  skip_if_not_installed("lme4")
  set.seed(1234)
  data(sleepstudy, package = "lme4")

  m1 <- glm(Reaction ~ Days, family = Gamma(), data = sleepstudy)
  m2 <- glm(
    Reaction ~ Days,
    family = Gamma(link = "identity"),
    data = sleepstudy
  )
  m3 <- glm(Reaction ~ Days, family = inverse.gaussian(), data = sleepstudy)

  expect_identical(
    coef(standardize(m1)),
    c(`(Intercept)` = 0.00338, Days = -0.00034),
    tolerance = 1e-2
  )
  expect_identical(
    coef(standardize(m2)),
    c(`(Intercept)` = 298.48571, Days = 29.70754),
    tolerance = 1e-3
  )
  expect_identical(
    coef(standardize(m3)),
    c(`(Intercept)` = 1e-05, Days = 0),
    tolerance = 1e-3
  )
})


# variables evaluated in the environment $$$ ------------------------------
test_that("variables evaluated in the environment", {
  m <- lm(mtcars$mpg ~ mtcars$cyl + am, data = mtcars)
  w <- capture_error(standardize(m))
  expect_true(any(grepl("Using `$`", w, fixed = TRUE)))

  ## Note:
  # No idea why this is suddenly not giving a warning on older R versions.
  m <- lm(mtcars$mpg ~ mtcars$cyl + mtcars$am, data = mtcars)
  w <- capture_error(standardize(m))
  expect_true(any(grepl("Using `$`", w, fixed = TRUE)))
})


# mediation models --------------------------------------------------------
test_that("standardize mediation", {
  skip_on_cran()
  skip_if_not_installed("mediation")
  set.seed(444)
  data(jobs, package = "mediation")
  jobs$econ_hard <- jobs$econ_hard * 20
  b.int <- lm(job_seek ~ treat * age + econ_hard + sex, data = jobs)
  d.int <- lm(depress2 ~ treat * job_seek * age + econ_hard + sex, data = jobs)

  med1 <- mediation::mediate(
    b.int,
    d.int,
    sims = 200,
    treat = "treat",
    mediator = "job_seek"
  )
  med2 <- mediation::mediate(
    b.int,
    d.int,
    sims = 200,
    treat = "treat",
    mediator = "job_seek",
    covariates = list(age = mean(jobs$age))
  )

  out1 <- summary(standardize(med1))
  expect_message({
    out2 <- summary(standardize(med2))
  })
  expect_identical(
    unlist(out1[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]),
    unlist(out2[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]),
    tolerance = 0.1
  )

  med0 <- mediation::mediate(
    standardize(b.int),
    standardize(d.int),
    sims = 200,
    treat = "treat",
    mediator = "job_seek"
  )
  out0 <- summary(med0)
  medz <- standardize(mediation::mediate(
    b.int,
    d.int,
    sims = 200,
    treat = "treat",
    mediator = "job_seek"
  ))
  outz <- summary(medz)
  expect_identical(
    unlist(out0[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]),
    unlist(outz[c("d0", "d1", "z0", "z1", "n0", "n1", "tau.coef")]),
    tolerance = 0.1
  )
})

# Offsets -----------------------------------------------------------------

test_that("offsets", {
  skip_if_not_installed("effectsize")
  skip_if_not_installed("parameters")

  m <- lm(mpg ~ hp + offset(wt), data = mtcars)

  expect_warning({
    mz1 <- standardize(m)
  })
  expect_warning({
    mz2 <- standardize(m, two_sd = TRUE)
  })
  expect_identical(c(1, 2) * coef(mz1), coef(mz2))

  m <- glm(cyl ~ hp + offset(wt), family = poisson(), data = mtcars)
  expect_warning(
    {
      mz <- standardize(m)
    },
    regexp = NA
  )

  par1 <- parameters::model_parameters(mz)
  par2 <- effectsize::standardize_parameters(m, method = "basic")
  expect_identical(par2[2, 2], par1[2, 2], tolerance = 0.05)
})


# BRMS --------------------------------------------------------------------

test_that("brms", {
  skip_on_cran()
  skip_on_os(c("windows", "mac"))
  skip_if_not_installed("brms")
  skip_if_not_installed("RcppEigen")
  skip_if_not_installed("BH")

  invisible(
    capture.output({
      mod <- brms::brm(
        mpg ~ hp,
        data = mtcars,
        refresh = 0,
        chains = 1,
        silent = 2
      )
    })
  )

  expect_warning(
    standardize(mod),
    regexp = "without adjusting priors may lead to bogus"
  )
})

# fixest --------------------------------------------------------------------

test_that("fixest", {
  skip_if_not_installed("fixest")

  mtcars_stand <- standardize(mtcars)
  orig <- fixest::feols(
    drat ~ mpg + hp^2 | cyl + am,
    data = mtcars,
    se = "hetero"
  )
  # TODO: Remove this suppressWarnings() when a new version of `fixest` that
  # contains the fix for https://github.com/lrberge/fixest/issues/618 is on CRAN
  # (CRAN version is 0.13.2 at the time of writing).
  suppressWarnings({
    auto_stand <- standardize(orig)
  })
  manual_stand <- fixest::feols(
    drat ~ mpg + hp^2 | cyl + am,
    data = mtcars_stand,
    se = "hetero"
  )

  # Need to unname because I(hp^2) in the manual one becomes I(I(hp ^2)) in the
  # automated one.
  expect_identical(
    unname(auto_stand$coefficients),
    unname(manual_stand$coefficients)
  )
  expect_identical(unname(auto_stand$se), unname(manual_stand$se))

  ### Inform the user if some terms are log() or sqrt()
  orig <- fixest::feols(
    drat ~ log(mpg) | cyl + am,
    data = mtcars
  )
  # TODO: same as above
  expect_message(
    suppressWarnings(standardize(orig)),
    "Formula contains log- or sqrt-terms"
  )
  orig <- fixest::feols(
    drat ~ sqrt(mpg) | cyl + am,
    data = mtcars
  )
  # TODO: same as above
  expect_message(
    suppressWarnings(standardize(orig)),
    "Formula contains log- or sqrt-terms"
  )
})


================================================
FILE: tests/testthat/test-std_center.R
================================================
d <- iris[1:4, ]

# standardize -----------------------------------------------------
test_that("standardize.data.frame", {
  x <- standardise(d, select = c("Sepal.Length", "Sepal.Width"))
  expect_equal(
    as.vector(x$Sepal.Length),
    as.vector(scale(d$Sepal.Length)),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$Petal.Length),
    as.vector(d$Petal.Length),
    tolerance = 0.001
  )
  expect_equal(
    colnames(x),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )

  x <- standardise(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE)
  expect_equal(
    as.vector(x$Sepal.Length_z),
    as.vector(scale(d$Sepal.Length)),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$Sepal.Length),
    as.vector(d$Sepal.Length),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$Petal.Length),
    as.vector(d$Petal.Length),
    tolerance = 0.001
  )
  expect_equal(
    colnames(x),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "Sepal.Length_z",
      "Sepal.Width_z"
    )
  )
})

test_that("standardize other classes", {
  d <- data.frame(
    a = 1:5,
    b = factor(letters[1:5]),
    c = as.Date(c(
      "2022-03-22",
      "2022-01-02",
      "2022-02-02",
      "2021-04-02",
      "2020-01-19"
    )),
    d = c(TRUE, TRUE, FALSE, FALSE, TRUE),
    e = as.complex(1:5)
  )

  x <- standardize(d$a)
  expect_equal(
    as.numeric(x),
    c(-1.26491, -0.63246, 0, 0.63246, 1.26491),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- standardize(d$b)
  expect_equal(as.numeric(x), 1:5, tolerance = 1e-3, ignore_attr = TRUE)
  x <- standardize(d$b, force = TRUE)
  expect_equal(
    as.numeric(x),
    c(-1.26491, -0.63246, 0, 0.63246, 1.26491),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- standardize(d$c)
  expect_equal(
    x,
    as.Date(
      c(
        "2022-03-22",
        "2022-01-02",
        "2022-02-02",
        "2021-04-02",
        "2020-01-19"
      )
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- standardize(d$c, force = TRUE)
  expect_equal(
    as.numeric(x),
    c(0.76992, 0.53121, 0.62488, -0.29975, -1.62626),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- standardize(d$d)
  expect_equal(
    x,
    c(TRUE, TRUE, FALSE, FALSE, TRUE),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_message(x <- standardize(d$d, force = TRUE))
  expect_equal(
    x,
    c(0.7303, 0.7303, -1.09545, -1.09545, 0.7303),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  ## TODO conflict with standardize.default() in effectsize

  expect_warning(
    x <- standardize(d$e),
    "cannot be standardized"
  )
  expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE)
})


# center -----------------------------------------------------
test_that("center.data.frame", {
  x <- center(d, select = c("Sepal.Length", "Sepal.Width"))
  expect_equal(
    as.vector(x$Sepal.Length),
    as.vector(d$Sepal.Length - mean(d$Sepal.Length)),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$Petal.Length),
    as.vector(d$Petal.Length),
    tolerance = 0.001
  )
  expect_equal(
    colnames(x),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  )

  x <- center(d, select = c("Sepal.Length", "Sepal.Width"), append = TRUE)
  expect_equal(
    as.vector(x$Sepal.Length_c),
    as.vector(d$Sepal.Length - mean(d$Sepal.Length)),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$Sepal.Length),
    as.vector(d$Sepal.Length),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$Petal.Length),
    as.vector(d$Petal.Length),
    tolerance = 0.001
  )
  expect_equal(
    colnames(x),
    c(
      "Sepal.Length",
      "Sepal.Width",
      "Petal.Length",
      "Petal.Width",
      "Species",
      "Sepal.Length_c",
      "Sepal.Width_c"
    )
  )
})

test_that("center other classes", {
  d <- data.frame(
    a = 1:5,
    b = factor(letters[1:5]),
    c = as.Date(c(
      "2022-03-22",
      "2022-01-02",
      "2022-02-02",
      "2021-04-02",
      "2020-01-19"
    )),
    d = c(TRUE, TRUE, FALSE, FALSE, TRUE),
    e = as.complex(1:5)
  )

  x <- center(d$a)
  expect_equal(
    as.numeric(x),
    c(-2, -1, 0, 1, 2),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- center(d$b)
  expect_equal(as.numeric(x), 1:5, tolerance = 1e-3, ignore_attr = TRUE)
  x <- center(d$b, force = TRUE)
  expect_equal(
    as.numeric(x),
    c(-2, -1, 0, 1, 2),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- center(d$c)
  expect_equal(
    x,
    as.Date(
      c(
        "2022-03-22",
        "2022-01-02",
        "2022-02-02",
        "2021-04-02",
        "2020-01-19"
      )
    ),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- center(d$c, force = TRUE)
  expect_equal(
    as.numeric(x),
    c(254.8, 175.8, 206.8, -99.2, -538.2),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  x <- center(d$d)
  expect_equal(
    x,
    c(TRUE, TRUE, FALSE, FALSE, TRUE),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_message(x <- center(d$d, force = TRUE))
  expect_equal(
    x,
    c(0.4, 0.4, -0.6, -0.6, 0.4),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_message(x <- center(d$e))
  expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE)
})


================================================
FILE: tests/testthat/test-std_center_scale_args.R
================================================
d <- data.frame(a = 1:5, b = 21:25, c = 41:45)

test_that("standardize", {
  x <- standardize(d)
  expect_equal(as.vector(x$a), as.vector(scale(d$a)), tolerance = 0.001)
})

test_that("standardize", {
  x <- standardize(d, center = 5, scale = 2)
  expect_equal(as.vector(x$a), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001)
})

test_that("standardize", {
  x <- standardize(d, center = c(5, 25, 45), scale = c(3, 3, 3))
  expect_equal(
    as.vector(x$a),
    c(-1.33333, -1, -0.66667, -0.33333, 0),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$b),
    c(-1.33333, -1, -0.66667, -0.33333, 0),
    tolerance = 0.001
  )
})

test_that("standardize", {
  x <- standardize(d, center = c(c = 45, a = 5, b = 25), scale = c(3, 3, 3))
  expect_equal(
    as.vector(x$a),
    c(-1.33333, -1, -0.66667, -0.33333, 0),
    tolerance = 0.001
  )
  expect_equal(
    as.vector(x$b),
    c(-1.33333, -1, -0.66667, -0.33333, 0),
    tolerance = 0.001
  )
})

test_that("standardize", {
  x <- standardize(d, center = c(c = 45, a = 5, b = 25), scale = c(1, 2, 3))
  expect_equal(as.vector(x$a), c(-4, -3, -2, -1, 0), tolerance = 0.001)
  expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001)
})

test_that("standardize", {
  x <- standardize(
    d,
    center = c(c = 45, a = 5, b = 25),
    scale = c(a = 1, b = 2, c = 3)
  )
  expect_equal(as.vector(x$a), c(-4, -3, -2, -1, 0), tolerance = 0.001)
  expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001)
})

test_that("standardize", {
  x <- standardize(
    d,
    center = c(c = 45, a = 5, b = 25),
    scale = c(c = 3, b = 2, a = 1)
  )
  expect_equal(as.vector(x$a), c(-4, -3, -2, -1, 0), tolerance = 0.001)
  expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001)
})

test_that("standardize", {
  x <- standardize(
    d,
    center = c(c = 45, a = 5, b = 25),
    scale = c(c = 1, b = 2, a = 3)
  )
  expect_equal(
    as.vector(x$a),
    c(-1.33333, -1, -0.66667, -0.33333, 0),
    tolerance = 0.001
  )
  expect_equal(as.vector(x$b), c(-2, -1.5, -1, -0.5, 0), tolerance = 0.001)
})


================================================
FILE: tests/testthat/test-text_format.R
================================================
test_that("text formatting helpers work as expected", {
  expect_snapshot(text_format(
    c(
      "A very long First",
      "Some similar long Second",
      "Shorter Third",
      "More or less long Fourth",
      "And finally the Last"
    ),
    width = 20
  ))

  expect_snapshot(text_format(
    c(
      "A very long First",
      "Some similar long Second",
      "Shorter Third",
      "More or less long Fourth",
      "And finally the Last"
    ),
    last = " or ",
    enclose = "`",
    width = 20
  ))

  expect_identical(
    text_fullstop(c("something", "something else.")),
    c("something.", "something else.")
  )

  expect_identical(
    text_lastchar(c("ABC", "DEF"), n = 2),
    c("BC", "EF"),
    ignore_attr = TRUE
  )

  expect_identical(
    text_concatenate(c("First", "Second")),
    "First and Second"
  )

  expect_identical(
    text_concatenate("First"),
    "First"
  )

  expect_identical(
    text_concatenate(c("First", "Second", "Last")),
    "First, Second and Last"
  )

  expect_identical(
    text_concatenate(
      c("First", "Second", "Last"),
      last = " or ",
      enclose = "`"
    ),
    "`First`, `Second` or `Last`"
  )

  expect_identical(
    text_remove(c("one!", "two", "three!"), "!"),
    c("one", "two", "three")
  )

  expect_identical(
    text_paste(c("A", "", "B"), c("42", "42", "42")),
    c("A, 42", "42", "B, 42")
  )

  expect_identical(
    text_paste(c("A", "", "B"), c("42", "42", "42"), enclose = "`"),
    c("`A`, `42`", "`42`", "`B`, `42`")
  )
})

test_that("text formatters respect `width` argument", {
  expect_snapshot({
    long_text <- strrep("abc ", 100)
    cat(text_format(long_text, width = 50))
    cat(text_format(long_text, width = 80))

    withr::with_options(list(width = 50), code = {
      cat(text_format(long_text))
    })
  })
})


================================================
FILE: tests/testthat/test-unnormalize.R
================================================
test_that("unnormalize work as expected", {
  x <- normalize(c(0, 1, 5, -5, -2))
  expect_equal(
    unnormalize(x),
    c(0, 1, 5, -5, -2),
    ignore_attr = TRUE
  )
  expect_error(
    unnormalize(c(0, 1, 5, -5, -2)),
    "Can't unnormalize variable"
  )
})

test_that("unnormalize error if not supported", {
  expect_error(
    unnormalize(c("a", "b")),
    "can't be unnormalized"
  )
})

test_that("unnormalize and unstandardized x 4", {
  set.seed(123)
  x <- rnorm(6, 4, 10)

  z <- standardise(x)
  expect_named(attributes(z), c("center", "scale", "robust", "class"))
  expect_equal(attributes(z)$center, 8.47, tolerance = 0.01)
  expect_equal(unstandardise(z), x, ignore_attr = TRUE)

  z <- center(x)
  expect_named(attributes(z), c("center", "scale", "robust", "class"))
  expect_equal(unstandardise(z), x, ignore_attr = TRUE)

  z <- normalize(x)
  expect_named(
    attributes(z),
    c(
      "include_bounds",
      "flag_bounds",
      "min_value",
      "vector_length",
      "range_difference",
      "class"
    )
  )
  expect_equal(unnormalize(z), x, ignore_attr = TRUE)

  z <- change_scale(x, to = c(-3, 14.5))
  expect_named(
    attributes(z),
    c(
      "min_value",
      "max_value",
      "new_min",
      "new_max",
      "range_difference",
      "to_range",
      "class"
    )
  )
  expect_equal(unnormalize(z), x, ignore_attr = TRUE)

  z <- change_scale(x, range = c(-100, 100))
  expect_named(
    attributes(z),
    c(
      "min_value",
      "max_value",
      "new_min",
      "new_max",
      "range_difference",
      "to_range",
      "class"
    )
  )
  expect_equal(unnormalize(z), x, ignore_attr = TRUE)
})

# select helpers ------------------------------
test_that("unnormalize regex", {
  x <- normalize(mtcars, select = "mpg")
  expect_identical(
    unnormalize(x, select = "pg", regex = TRUE),
    unnormalize(x, select = "mpg")
  )
})


test_that("unnormalize: grouped data", {
  skip_if_not_installed("poorman")

  # 1 group, 1 normalized var
  norm <- poorman::group_by(mtcars, cyl)
  norm <- normalize(norm, "mpg")
  expect_identical(
    poorman::ungroup(unnormalize(norm, "mpg")),
    mtcars,
    ignore_attr = TRUE # unnormalize removed rownames
  )

  # 2 groups, 1 normalized var
  set.seed(123)
  test <- iris
  test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE)
  norm <- poorman::group_by(test, Species, grp)
  norm <- normalize(norm, "Sepal.Length")
  expect_identical(
    poorman::ungroup(unnormalize(norm, "Sepal.Length")),
    test
  )

  # 2 groups, 2 normalized vars
  set.seed(123)
  test <- iris
  test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE)
  norm <- poorman::group_by(test, Species, grp)
  norm <- normalize(norm, c("Sepal.Length", "Petal.Length"))
  unnorm <- unnormalize(norm, c("Sepal.Length", "Petal.Length"))
  expect_identical(
    poorman::ungroup(unnorm),
    test
  )

  expect_s3_class(unnorm, "grouped_df")

  # can't recover attributes
  norm <- poorman::group_by(iris, Species)
  norm <- normalize(norm, "Sepal.Length")
  attr(norm, "groups") <- NULL

  expect_error(
    unnormalize(norm, "Sepal.Length"),
    regexp = "Couldn't retrieve the necessary information"
  )

  # normalize applied on grouped data but unnormalize applied on ungrouped data
  norm <- poorman::group_by(mtcars, cyl)
  norm <- normalize(norm, "mpg")
  norm <- poorman::ungroup(norm)

  expect_error(
    unnormalize(norm, "mpg"),
    regexp = "Can't unnormalize variable"
  )

  # normalize applied on grouped data but unnormalize applied different grouped
  # data
  norm <- poorman::group_by(norm, am)
  expect_error(
    unnormalize(norm, "mpg"),
    regexp = "Couldn't retrieve the necessary"
  )
})


================================================
FILE: tests/testthat/test-utils.R
================================================
test_that(".coerce_to_dataframe works for matrices", {
  mat <- matrix(c(1, 2, 3, 11, 12, 13), nrow = 2, ncol = 3, byrow = TRUE)

  expect_equal(
    .coerce_to_dataframe(mat),
    data.frame(
      V1 = c(1, 11),
      V2 = c(2, 12),
      V3 = c(3, 13)
    )
  )
})

test_that(".coerce_to_dataframe works for vectors and list", {
  expect_equal(
    .coerce_to_dataframe(1:3),
    data.frame(data = 1:3)
  )

  expect_equal(
    .coerce_to_dataframe(c("a", "b", "c")),
    data.frame(data = c("a", "b", "c"), stringsAsFactors = FALSE)
  )

  expect_equal(
    .coerce_to_dataframe(list(var1 = 1:3, var2 = 4:6)),
    data.frame(var1 = 1:3, var2 = 4:6)
  )
})

test_that(".coerce_to_dataframe errors correctly if can't coerce", {
  expect_error(
    .coerce_to_dataframe(list(var1 = 1:3, var2 = 4:5)),
    regexp = "object that can be coerced"
  )
})

test_that(".is_sorted works", {
  expect_true(.is_sorted(1:3))
  expect_true(.is_sorted(c("a", "b", "c")))
  expect_true(.is_sorted(factor(c("a", "b", "c"))))

  expect_false(.is_sorted(c(1, 3, 2)))
  expect_false(.is_sorted(c("b", "a", "c")))
  expect_false(.is_sorted(factor(c("b", "a", "c"))))
})


================================================
FILE: tests/testthat/test-utils_cols.R
================================================
test_char <- data.frame(
  a = c("iso", 2, 5),
  b = c("year", 3, 6),
  c = c(NA, 5, 7),
  stringsAsFactors = FALSE
)

test_num <- data.frame(
  a = c(5, 2, 5),
  b = c(3, 3, 6),
  c = c(NA, 5, 7)
)

test_na <- data.frame(
  a = c(NA, 2, 5),
  b = c(NA, 3, 6),
  c = c(NA, 5, 7)
)

test_that("row_to_colnames works", {
  test <- row_to_colnames(test_char, verbose = FALSE)
  expect_identical(
    colnames(test),
    c("iso", "year", "x1")
  )

  test <- row_to_colnames(test_num, verbose = FALSE)
  expect_identical(
    colnames(test),
    c("5", "3", "x1")
  )

  test <- row_to_colnames(test_na, verbose = FALSE)
  expect_identical(
    colnames(test),
    c("x1", "x2", "x3")
  )
})

test_that("row_to_colnames: check arg 'row'", {
  expect_error(
    row_to_colnames(test_num, row = "hi", verbose = FALSE),
    regexp = "Argument `row`"
  )
  expect_error(
    row_to_colnames(test_num, row = 6),
    regexp = "You used row = 6"
  )
  expect_error(
    row_to_colnames(test_num, row = c(3, 5), verbose = FALSE),
    regexp = "Argument `row`"
  )
  expect_identical(
    row_to_colnames(test_num, verbose = FALSE),
    row_to_colnames(test_num, row = 1, verbose = FALSE)
  )
})

test_that("row_to_colnames: check arg 'na_prefix'", {
  test <- row_to_colnames(test_char, na_prefix = "foo", verbose = FALSE)
  expect_identical(
    colnames(test),
    c("iso", "year", "foo1")
  )

  test <- row_to_colnames(test_num, na_prefix = "foo", verbose = FALSE)
  expect_identical(
    colnames(test),
    c("5", "3", "foo1")
  )
})

#-----------------------------------------------------

foo <- data.frame(
  ARG = c("BRA", "FRA"),
  `1960` = c(1960, 1960),
  `2000` = c(2000, 2000),
  stringsAsFactors = FALSE
)

test_that("colnames_to_row works", {
  test <- colnames_to_row(foo)
  expect_identical(
    colnames(test),
    c("x1", "x2", "x3")
  )
  expect_true(
    all(
      test[1, 1] == "ARG",
      test[1, 2] == "X1960",
      test[1, 3] == "X2000"
    )
  )
  expect_s3_class(test, "data.frame")
})

test_that("colnames_to_row: check arg 'prefix'", {
  test <- colnames_to_row(foo, prefix = "hi")
  expect_identical(
    colnames(test),
    c("hi1", "hi2", "hi3")
  )
  expect_error(
    colnames_to_row(test_num, prefix = 6),
    regexp = "Argument `prefix`"
  )
  expect_error(
    colnames_to_row(test_num, prefix = c("A", "B")),
    regexp = "Argument `prefix`"
  )
  expect_identical(
    colnames_to_row(test),
    colnames_to_row(test, prefix = "x")
  )
})


================================================
FILE: tests/testthat/test-utils_rows.R
================================================
test_that("rownames_as_column works", {
  test <- rownames_as_column(mtcars, "new_column")
  expect_true("new_column" %in% names(test))
  expect_identical(test[1, "new_column"], "Mazda RX4")
})

test_that("rownames_as_column doesn't work if var is not a character", {
  expect_error(
    rownames_as_column(mtcars, var = 1),
    regexp = "Argument 'var' must be of type character"
  )
  expect_error(
    rownames_as_column(mtcars, var = TRUE),
    regexp = "Argument 'var' must be of type character"
  )
})

test_that("rownames_as_column uses 'rowname' as default column name", {
  test <- rownames_as_column(mtcars, var = NULL)
  expect_true("rowname" %in% names(test))
})

test_that("rownames_as_column preserves labels", {
  test_data <- mtcars
  test_data <- assign_labels(test_data, select = "hp", variable = "horsepower")

  # ungrouped
  with_id <- rownames_as_column(test_data)
  expect_identical(
    attributes(with_id$hp)$label,
    "horsepower"
  )

  # grouped
  with_id_grouped <- data_group(test_data, "cyl")
  with_id_grouped <- rownames_as_column(with_id_grouped)
  expect_identical(
    attributes(with_id_grouped$hp)$label,
    "horsepower"
  )
})

test_that("rownames_as_column preserves other attribs", {
  test_data <- standardize(mtcars)

  # ungrouped
  with_id <- rownames_as_column(test_data)
  expect_false(is.null(attributes(with_id)$center))

  # grouped
  with_id_grouped <- data_group(test_data, "cyl")
  with_id_grouped <- rownames_as_column(with_id_grouped)
  expect_false(is.null(attributes(with_id_grouped)$center))
})

test_that("rownames_as_column errors if already var of same name", {
  expect_error(
    rownames_as_column(mtcars, "mpg"),
    "already a variable named"
  )
})

#-------------------------------------------------

test_that("rowid_as_column works", {
  test <- rowid_as_column(mtcars, "new_column")
  expect_true("new_column" %in% names(test))
  expect_identical(test$new_column, 1:32)
})

test_that("rowid_as_column works with grouped data", {
  test_data <- data_group(iris, "Species")
  test <- rowid_as_column(test_data)
  expect_identical(test$rowid, rep(1:50, 3))
  expect_true("rowid" %in% names(test))
})

test_that("rowid_as_column doesn't work if var is not a character", {
  expect_error(
    rowid_as_column(mtcars, var = 1),
    regexp = "Argument 'var' must be of type character"
  )
  expect_error(
    rowid_as_column(mtcars, var = TRUE),
    regexp = "Argument 'var' must be of type character"
  )
})

test_that("rowid_as_column uses 'rowid' as default column name", {
  test <- rowid_as_column(mtcars, var = NULL)
  expect_true("rowid" %in% names(test))
})

test_that("rowid_as_column preserves labels", {
  test_data <- mtcars
  test_data <- assign_labels(test_data, select = "hp", variable = "horsepower")

  # ungrouped
  with_id <- rowid_as_column(test_data)
  expect_identical(
    attributes(with_id$hp)$label,
    "horsepower"
  )

  # grouped
  with_id_grouped <- data_group(test_data, "cyl")
  with_id_grouped <- rowid_as_column(with_id_grouped)
  expect_identical(
    attributes(with_id_grouped$hp)$label,
    "horsepower"
  )
})

test_that("rowid_as_column preserves other attribs", {
  test_data <- standardize(mtcars)

  # ungrouped
  with_id <- rowid_as_column(test_data)
  expect_false(is.null(attributes(with_id)$center))

  # grouped
  with_id_grouped <- data_group(test_data, "cyl")
  with_id_grouped <- rowid_as_column(with_id_grouped)
  expect_false(is.null(attributes(with_id_grouped)$center))
})

test_that("rowid_as_column has no issue if another variable is called 'var'", {
  foo <- data.frame(
    grp = c("A", "A", "B", "B"),
    var = 1:4,
    stringsAsFactors = FALSE
  )

  out <- data_group(foo, grp)
  out <- rowid_as_column(out)
  expect_named(out, c("rowid", "grp", "var"))
})

test_that("rowid_as_column errors if already var of same name", {
  expect_error(
    rowid_as_column(mtcars, "mpg"),
    "already a variable named"
  )
})

#-------------------------------------------------

test_that("column_as_rownames works", {
  continents <- c(
    "Africa",
    "Asia",
    "Europe",
    "North America",
    "Oceania",
    "South America"
  )
  test <- data.frame(
    continent = continents,
    some_value = seq(1, 6, by = 1)
  )
  test2 <- column_as_rownames(test, "continent")
  expect_identical(rownames(test2), continents)
  expect_identical(ncol(test2), 1L)

  test3 <- column_as_rownames(test, 1)
  expect_identical(rownames(test3), continents)
  expect_identical(ncol(test3), 1L)
})

test_that("column_as_rownames sanity checks work", {
  continents <- c(
    "Africa",
    "Asia",
    "Europe",
    "North America",
    "Oceania",
    "South America"
  )
  test <- data.frame(
    continent = continents,
    some_value = seq(1, 6, by = 1)
  )
  expect_error(column_as_rownames(test, TRUE), regexp = "Argument `var`")
  expect_error(
    column_as_rownames(test, "foo"),
    regexp = "not in the data frame"
  )
  expect_error(column_as_rownames(test, 0), regexp = "does not exist")
  expect_error(column_as_rownames(test, 3), regexp = "does not exist")
})

test_that("rownames_as_column and column_as_rownames cancel each other", {
  test <- rownames_as_column(mtcars)
  test2 <- column_as_rownames(test)
  expect_identical(test2, mtcars)
})

test_that("column_as_rownames preserves labels", {
  test_data <- rownames_as_column(mtcars)
  test_data <- assign_labels(test_data, select = "hp", variable = "horsepower")

  # ungrouped
  with_id <- column_as_rownames(test_data)
  expect_identical(
    attributes(with_id$hp)$label,
    "horsepower"
  )

  # grouped
  with_id_grouped <- data_group(test_data, "cyl")
  with_id_grouped <- column_as_rownames(with_id_grouped)
  expect_identical(
    attributes(with_id_grouped$hp)$label,
    "horsepower"
  )
})


test_that("column_as_rownames preserves other attribs", {
  test_data <- rownames_as_column(standardize(mtcars))

  # ungrouped
  with_id <- column_as_rownames(test_data, "rowname")
  expect_false(is.null(attributes(with_id)$center))

  # grouped
  with_id_grouped <- data_group(test_data, "cyl")
  with_id_grouped <- column_as_rownames(with_id_grouped)
  expect_false(is.null(attributes(with_id_grouped)$center))
})


================================================
FILE: tests/testthat/test-weighted-stats.R
================================================
test_that("weighted centrality and dispersion measures work as expected", {
  x <- c(3.7, 3.3, 3.5, 2.8)
  wt <- c(5, 5, 4, 1) / 15

  set.seed(123)
  expect_equal(weighted_mean(x, wt), 3.453333, tolerance = 0.001)
  expect_equal(weighted_median(x, wt), 3.5, tolerance = 0.001)
  expect_equal(weighted_sd(x, wt), 0.2852935, tolerance = 0.001)
  expect_equal(weighted_mad(x, wt), 0.29652, tolerance = 0.001)
})

test_that("weighted centrality and dispersion measures work with NA", {
  x <- c(3.7, 3.3, NA, 3.5, 2.8, 5.5)
  wt <- c(5, 5, 4, NA, 1, 7) / 15

  set.seed(123)
  expect_equal(weighted_mean(x, wt), 4.238889, tolerance = 0.001)
  expect_equal(weighted_median(x, wt), 3.7, tolerance = 0.001)
  expect_equal(weighted_sd(x, wt), 1.237671, tolerance = 0.001)
  expect_equal(weighted_mad(x, wt), 0.59304, tolerance = 0.001)
})

test_that("weighted centrality and dispersion measures work with NA when not removed", {
  x <- c(3.7, 3.3, NA, 3.5, 2.8, 5.5)
  wt <- c(5, 5, 4, NA, 1, 7) / 15

  set.seed(123)
  expect_identical(weighted_mean(x, wt, remove_na = FALSE), NA_real_)
  expect_identical(weighted_median(x, wt, remove_na = FALSE), NA_real_)
  expect_identical(weighted_sd(x, wt, remove_na = FALSE), NA_real_)
  expect_identical(weighted_mad(x, wt, remove_na = FALSE), NA_real_)
})

test_that("weighted centrality and dispersion measures work with Inf", {
  x <- c(3.7, 3.3, NA, 3.5, 2.8, 5.5, Inf, 4)
  wt <- c(5, 5, 4, NA, 1, 7, 3, Inf) / 15

  set.seed(123)
  expect_equal(weighted_mean(x, wt), 4.238889, tolerance = 0.001)
  expect_equal(weighted_median(x, wt), 3.7, tolerance = 0.001)
  expect_equal(weighted_sd(x, wt), 1.237671, tolerance = 0.001)
  expect_equal(weighted_mad(x, wt), 0.59304, tolerance = 0.001)
})


================================================
FILE: tests/testthat/test-winsorization.R
================================================
test_that("testing Winsorization of factors", {
  expect_identical(winsorize(as.factor(mtcars$am)), as.factor(mtcars$am))
})

test_that("with missing values", {
  skip_if_not_installed("ggplot2")

  expect_snapshot(suppressWarnings(head(winsorize(na.omit(
    ggplot2::msleep$brainwt
  )))))
  expect_length(winsorize(as.factor(ggplot2::msleep$vore)), 83L)
})

test_that("winsorize: threshold must be between 0 and 1", {
  expect_error(
    winsorize(sample(1:10, 5), threshold = -0.1),
    regexp = "must be a scalar between 0 and 0.5"
  )
  expect_error(
    winsorize(sample(1:10, 5), threshold = 1.1),
    regexp = "must be a scalar between 0 and 0.5"
  )
  expect_error(
    winsorize(sample(1:10, 5), method = "zscore", threshold = -3),
    regexp = "must be a scalar greater than 0"
  )
  expect_error(
    winsorize(
      sample(1:10, 5),
      method = "zscore",
      threshold = -3,
      robust = TRUE
    ),
    regexp = "must be a scalar greater than 0"
  )
  expect_error(
    winsorize(sample(1:10, 5), method = "raw", threshold = 1.1),
    regexp = "must be of length 2 for lower and upper bound"
  )
})

test_that("winsorize on data.frame", {
  iris2 <- winsorize(iris)
  expect_identical(
    iris2$Sepal.Length,
    winsorize(iris$Sepal.Length)
  )
  expect_identical(
    iris2$Petal.Width,
    winsorize(iris$Petal.Width)
  )
  expect_named(iris2, names(iris))
})


================================================
FILE: tests/testthat.R
================================================
library(testthat)
library(datawizard)

test_check("datawizard")


================================================
FILE: vignettes/.gitignore
================================================
*.html
*.R


================================================
FILE: vignettes/bibliography.bib
================================================
@Manual{revelle2018,
  title = {psych: Procedures for Psychological, Psychometric, and Personality Research},
  author = {William Revelle},
  organization = { Northwestern University},
  address = { Evanston, Illinois},
  year = {2018},
  note = {R package version 1.8.12},
  url = {https://CRAN.R-project.org/package=psych}
}


@article{makowski2018psycho,
  title={The psycho package: An efficient and publishing-oriented workflow for psychological science},
  author={Makowski, Dominique},
  journal={Journal of Open Source Software},
  volume={3},
  number={22},
  pages={470},
  year={2018}
}


@article{menard2011standards,
  title={Standards for standardized logistic regression coefficients},
  author={Menard, Scott},
  journal={Social Forces},
  volume={89},
  number={4},
  pages={1409--1428},
  year={2011},
  publisher={The University of North Carolina Press}
}


@article{schielzeth2010simple,
  title={Simple means to improve the interpretability of regression coefficients},
  author={Schielzeth, Holger},
  journal={Methods in Ecology and Evolution},
  volume={1},
  number={2},
  pages={103--113},
  year={2010},
  publisher={Wiley Online Library}
}


@article{gelman2008scaling,
  title={Scaling regression inputs by dividing by two standard deviations},
  author={Gelman, Andrew},
  journal={Statistics in medicine},
  volume={27},
  number={15},
  pages={2865--2873},
  year={2008},
  publisher={Wiley Online Library}
}


@article{menard2004six,
  title={Six approaches to calculating standardized logistic regression coefficients},
  author={Menard, Scott},
  journal={The American Statistician},
  volume={58},
  number={3},
  pages={218--223},
  year={2004},
  publisher={Taylor \& Francis}
}


@article{bring1994standardize,
  title={How to standardize regression coefficients},
  author={Bring, Johan},
  journal={The American Statistician},
  volume={48},
  number={3},
  pages={209--213},
  year={1994},
  publisher={Taylor \& Francis}
}


@article{neter1989applied,
  title={Applied linear regression models},
  author={Neter, John and Wasserman, William and Kutner, Michael H},
  year={1989},
  publisher={Irwin Homewood, IL}
}


@book{gelman_data_2007,
	address = {Cambridge; New York},
	series = {Analytical methods for social research},
	title = {Data analysis using regression and multilevel/hierarchical models},
	isbn = {978-0-521-86706-1 978-0-521-68689-1},
	publisher = {Cambridge University Press},
	author = {Gelman, Andrew and Hill, Jennifer},
	year = {2007}
}


@inproceedings{bafumi_fitting_2006,
	address = {Philadelphia, PA},
	title = {Fitting Multilevel Models When Predictors and Group Effects Correlate.},
	author = {Bafumi, Joseph and Gelman, Andrew},
	year = {2006}
}


@article{bell_fixed_2019,
	title = {Fixed and random effects models: making an informed choice},
	issn = {1573-7845},
	url = {https://doi.org/10.1007/s11135-018-0802-x},
	doi = {10.1007/s11135-018-0802-x},
	journal = {Quality \& Quantity},
	author = {Bell, Andrew and Fairbrother, Malcolm and Jones, Kelvyn},
	volume = {53},
	year = {2019},
	pages = {1051--1074}
}


@article{bell_explaining_2015,
	title = {Explaining Fixed Effects: Random Effects Modeling of Time-Series Cross-Sectional and Panel Data},
	volume = {3},
	issn = {2049-8470, 2049-8489},
	url = {https://www.cambridge.org/core/product/identifier/S2049847014000077/type/journal_article},
	doi = {10.1017/psrm.2014.7},
	number = {1},
	journal = {Political Science Research and Methods},
	author = {Bell, Andrew and Jones, Kelvyn},
	month = jan,
	year = {2015},
	pages = {133--153}
}


@article{heisig_costs_2017,
	title = {The Costs of Simplicity: Why Multilevel Models May Benefit from Accounting for Cross-Cluster Differences in the Effects of Controls},
	volume = {82},
	issn = {0003-1224, 1939-8271},
	url = {http://journals.sagepub.com/doi/10.1177/0003122417717901},
	doi = {10.1177/0003122417717901},
	number = {4},
	journal = {American Sociological Review},
	author = {Heisig, Jan Paul and Schaeffer, Merlin and Giesecke, Johannes},
	month = aug,
	year = {2017},
	pages = {796--827}
}


@article{shor_bayesian_2007,
	title = {A Bayesian Multilevel Modeling Approach to Time-Series Cross-Sectional Data},
	volume = {15},
	issn = {1047-1987, 1476-4989},
	doi = {10.1093/pan/mpm006},
	number = {2},
	journal = {Political Analysis},
	author = {Shor, Boris and Bafumi, Joseph and Keele, Luke and Park, David},
	year = {2007},
	pages = {165--181}
}


@article{mundlak_pooling_1978,
	title = {On the Pooling of Time Series and Cross Section Data},
	volume = {46},
	number = {1},
	journal = {Econometrica},
	author = {Mundlak, Yair},
	month = jan,
	year = {1978},
	pages = {69}
}


================================================
FILE: vignettes/overview_of_vignettes.Rmd
================================================
---
title: "Overview of Vignettes"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Overview of Vignettes}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r message=FALSE, warning=FALSE, include=FALSE}
library(knitr)
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  comment = "#>",
  eval = TRUE
)
```

All package vignettes are available at [https://easystats.github.io/datawizard/](https://easystats.github.io/datawizard/).

## Function Overview

* [Function Reference](https://easystats.github.io/datawizard/reference/index.html)


## Data Preparation

* [Coming from 'tidyverse'](https://easystats.github.io/datawizard/articles/tidyverse_translation.html)
* [A quick summary of selection syntax in `{datawizard}`](https://easystats.github.io/datawizard/articles/selection_syntax.html)


## Statistical Transformations

* [Data Standardization](https://easystats.github.io/datawizard/articles/standardize_data.html)


================================================
FILE: vignettes/selection_syntax.Rmd
================================================
---
title: "A quick summary of selection syntax in `{datawizard}`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{A quick summary of selection syntax in `{datawizard}`}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

pkgs <- c(
  "datawizard",
  "dplyr"
)

if (!all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)))) {
  knitr::opts_chunk$set(eval = FALSE)
}
```

```{r load, echo=FALSE, message=FALSE}
library(datawizard)
library(dplyr)

set.seed(123)
iris <- iris[sample(nrow(iris), 10), ]
row.names(iris) <- NULL
```

```{css, echo=FALSE}
.custom_note {
  border-left: solid 5px hsl(220, 100%, 30%);
  background-color: hsl(220, 100%, 95%);
  padding: 5px;
  margin-bottom: 10px
}
```

This vignette can be referred to by citing the following:

Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. *Journal of Open Source Software*, *7*(78), 4684, https://doi.org/10.21105/joss.04684

# Selecting variables

## Quoted names

This is the most simple way to select one or several variables. Just use a character
vector containing variables names, like in base R.

```{r}
data_select(iris, c("Sepal.Length", "Petal.Width"))
```

## Unquoted names

It is also possible to use unquoted names. This is useful if we use the `tidyverse`
and want to be consistent about the way variable names are passed.

```{r}
iris %>%
  group_by(Species) %>%
  standardise(Petal.Length) %>%
  ungroup()
```


## Positions

In addition to variable names, `select` can also take indices for the variables 
to select in the dataframe.

```{r}
data_select(iris, c(1, 2, 5))
```


## Functions

We can also pass a function to the `select` argument. This function will be applied
to all columns and should return `TRUE` or `FALSE`. For example, if we want to 
keep only numeric columns, we can use `is.numeric`.

```{r}
data_select(iris, is.numeric)
```

Note that we can provide any custom function to `select`, *provided it returns `TRUE` or `FALSE`* when applied to a column.

```{r}
my_function <- function(i) {
  is.numeric(i) && mean(i, na.rm = TRUE) > 3.5
}

data_select(iris, my_function)
```


## Patterns

With larger datasets, it would be tedious to write the names of variables to select,
and it would be fragile to rely on variable positions as they may change later. 
To this end, we can use four select helpers: `starts_with()`, `ends_with()`,
`contains()`, and `regex()`. The first three can take several patterns, while
`regex()` takes a single regular expression.
 
```{r}
data_select(iris, starts_with("Sep", "Peta"))

data_select(iris, ends_with("dth", "ies"))

data_select(iris, contains("pal", "ec"))

data_select(iris, regex("^Sep|ies"))
```


<!-- NOTE: use raw HTML so that vignette can compile even if `evaluate_chunk` is FALSE. -->
<!-- See e.g. #527 -->

<div class="custom_note">
  <p>
    Note: these functions are not exported by `datawizard` but are detected and
    applied internally. This means that they won't be detected by autocompletion
    when we write them.
  </p>
</div>

<div class="custom_note">
  <p>
    Note #2: because these functions are not exported, they will not create
    conflicts with the ones that come from the `tidyverse` and that have the same 
    name. Therefore, we can still use `dplyr` and its friends, it won't change 
    anything for selection in `datawizard` functions!
  </p>
</div>


# Excluding variables

What if we want to keep all variables except for a few ones? There are two ways
we can invert our selection.

The first way is to put a minus sign `"-"` in front of the `select` argument.

```{r}
data_select(iris, -c("Sepal.Length", "Petal.Width"))

data_select(iris, -starts_with("Sep", "Peta"))

data_select(iris, -is.numeric)
```

Note that if we use numeric indices, we can't mix negative and positive values. 
This means that we have to use `select = -(1:2)` if we want to exclude the first
two columns; `select = -1:2` will *not* work:

```{r}
data_select(iris, -(1:2))
```

Same thing for variable names:

```{r}
data_select(iris, -(Petal.Length:Species))
```

The second way is to use the argument `exclude`. This argument has the same 
possibilities as `select`. Although this may not be required in most contexts, 
if we wanted to, we could use both `select` and `exclude` arguments at the same 
time.

```{r}
data_select(iris, exclude = c("Sepal.Length", "Petal.Width"))

data_select(iris, exclude = starts_with("Sep", "Peta"))
```

# Programming with selections

Since `datawizard` 0.6.0, it is possible to pass function arguments and loop indices
in `select` and `exclude` arguments. This makes it easier to program with
`datawizard`.

For example, if we want to let the user decide the selection they want to use:

```{r}
my_function <- function(data, selection) {
  extract_column_names(data, select = selection)
}
my_function(iris, "Sepal.Length")
my_function(iris, starts_with("Sep"))

my_function_2 <- function(data, pattern) {
  extract_column_names(data, select = starts_with(pattern))
}
my_function_2(iris, "Sep")
```

It is also possible to pass these values in loops, for example if we have a list 
of patterns and we want to relocate columns based on these patterns, one by one:

```{r}
new_iris <- iris
for (i in c("Sep", "Pet")) {
  new_iris <- new_iris %>%
    data_relocate(select = starts_with(i), after = -1)
}
new_iris
```

In the loop above, all columns starting with `"Sep"` are moved at the end of the
data frame, and the same thing was made with all columns starting with `"Pet"`.


# Useful to know

## Ignore the case

In every selection that uses variable names, we can ignore the case in the 
selection by applying `ignore_case = TRUE`.

```{r}
data_select(iris, c("sepal.length", "petal.width"), ignore_case = TRUE)

data_select(iris, ~ Sepal.length + petal.Width, ignore_case = TRUE)

data_select(iris, starts_with("sep", "peta"), ignore_case = TRUE)
```

## Formulas

It is also possible to use formulas to select variables:

```{r}
data_select(iris, ~ Sepal.Length + Petal.Width)
```

This made it easier to use selection in custom functions before `datawizard` 
0.6.0, and is kept available for backward compatibility. 


================================================
FILE: vignettes/standardize_data.Rmd
================================================
---
title: "Data Standardization"
output: 
  rmarkdown::html_vignette:
    toc: true
    fig_width: 10.08
    fig_height: 6
vignette: >
  \usepackage[utf8]{inputenc}
  %\VignetteIndexEntry{Data Standardization}
  %\VignetteEngine{knitr::rmarkdown}
---

```{r message=FALSE, warning=FALSE, include=FALSE}
options(knitr.kable.NA = "")
knitr::opts_chunk$set(
  comment = "#>",
  message = FALSE,
  warning = FALSE,
  dpi = 300
)

pkgs <- c(
  "datawizard",
  "poorman",
  "see",
  "ggplot2",
  "parameters",
  "lme4",
  "curl"
)
pkg_available <- all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)))

if (pkg_available) {
  net_available <- curl::has_internet()
} else {
  net_available <- FALSE
}

if (!pkg_available || !net_available) {
  knitr::opts_chunk$set(eval = FALSE)
}
```

This vignette can be referred to by citing the following:

> Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. *Journal of Open Source Software*, *7*(78), 4684, https://doi.org/10.21105/joss.04684

# Introduction

To make sense of their data and effects, scientists might want to standardize
(Z-score) their variables. This makes the data unitless, expressed only in terms
of deviation from an index of centrality (e.g., the mean or the median).
However, aside from some benefits, standardization also comes with challenges
and issues, that the scientist should be aware of.

## Methods of Standardization

The `datawizard` package offers two methods of standardization via the
`standardize()` function:

- **Normal standardization**: center around the *mean*, with *SD* units
  (default).

- **Robust standardization**: center around the *median*, with *MAD* (median
  absolute deviation) units (`robust = TRUE`).

Let's look at the following example:

```{r}
library(datawizard)
library(effectsize) # for data

# let's have a look at what the data look like
data("hardlyworking", package = "effectsize")
head(hardlyworking)

# let's use both methods of standardization
hardlyworking$xtra_hours_z <- standardize(hardlyworking$xtra_hours)
hardlyworking$xtra_hours_zr <- standardize(hardlyworking$xtra_hours, robust = TRUE)
```

We can see that different methods give different central and variation values:

```{r, eval=FALSE}
library(dplyr)

hardlyworking %>%
  select(starts_with("xtra_hours")) %>%
  data_to_long() %>%
  group_by(Name) %>%
  summarise(
    mean = mean(Value),
    sd = sd(Value),
    median = median(Value),
    mad = mad(Value)
  )
```

```{r, echo=FALSE}
library(poorman)

hardlyworking %>%
  select(starts_with("xtra_hours")) %>%
  reshape_longer(names_to = "name", values_to = "value") %>%
  group_by(name) %>%
  summarise(
    mean = mean(value),
    sd = sd(value),
    median = median(value),
    mad = mad(value)
  ) %>%
  knitr::kable(digits = 4)
```

`standardize()` can also be used to standardize a full data frame - where each
numeric variable is standardized separately:

```{r}
hardlyworking_z <- standardize(hardlyworking)
```

```{r, eval=FALSE}
hardlyworking_z %>%
  select(-xtra_hours_z, -xtra_hours_zr) %>%
  data_to_long() %>%
  group_by(Name) %>%
  summarise(
    mean = mean(Value),
    sd = sd(Value),
    median = median(Value),
    mad = mad(Value)
  )
```

```{r, echo=FALSE}
hardlyworking_z %>%
  select(-xtra_hours_z, -xtra_hours_zr) %>%
  reshape_longer(names_to = "name", values_to = "value") %>%
  group_by(name) %>%
  summarise(
    mean = mean(value),
    sd = sd(value),
    median = median(value),
    mad = mad(value)
  ) %>%
  knitr::kable(digits = 4)
```

Weighted standardization is also supported via the `weights` argument, and
factors can also be standardized (if you're into that kind of thing) by setting
`force = TRUE`, which converts factors to treatment-coded dummy variables before
standardizing.

## Variable-wise *vs.* Participant-wise 

Standardization is an important step and extra caution is required in
**repeated-measures designs**, in which there are three ways of standardizing
data:

- **Variable-wise**: The most common method. A simple scaling of each column.

- **Participant-wise**: Variables are standardized "within" each participant,
  *i.e.*, for each participant, by the participant's mean and SD.

- **Full**: Participant-wise first and then re-standardizing variable-wise.

Unfortunately, the method used is often not explicitly stated. This is an issue
as these methods can generate important discrepancies (that can in turn
contribute to the reproducibility crisis). Let's investigate these 3 methods.

### The Data

We will take the `emotion` dataset in which participants were exposed to
negative pictures and had to rate their emotions (**valence**) and the amount of
memories associated with the picture (**autobiographical link**). One could make
the hypothesis that for young participants with no context of war or violence,
the most negative pictures (mutilations) are less related to memories than less
negative pictures (involving for example car crashes or sick people). In other
words, **we expect a positive relationship between valence** (with high values
corresponding to less negativity) **and autobiographical link**.

Let's have a look at the data, averaged by participants:

```{r, eval=FALSE}
# Download the 'emotion' dataset
load(url("https://raw.githubusercontent.com/neuropsychology/psycho.R/master/data/emotion.rda"))

# Discard neutral pictures (keep only negative)
emotion <- emotion %>% filter(Emotion_Condition == "Negative")

# Summary
emotion %>%
  drop_na(Subjective_Valence, Autobiographical_Link) %>%
  group_by(Participant_ID) %>%
  summarise(
    n_Trials = n(),
    Valence_Mean = mean(Subjective_Valence),
    Valence_SD = sd(Subjective_Valence)
  )
```

```{r, echo=FALSE}
load(url("https://raw.githubusercontent.com/neuropsychology/psycho.R/master/data/emotion.rda"))

# Discard neutral pictures (keep only negative)
emotion <- emotion %>% filter(Emotion_Condition == "Negative")

# Summary
emotion %>%
  subset(!(is.na(Subjective_Valence) | is.na(Autobiographical_Link))) %>%
  group_by(Participant_ID) %>%
  summarise(
    n_Trials = n(),
    Valence_Mean = mean(Subjective_Valence),
    Valence_SD = sd(Subjective_Valence)
  )
```

As we can see from the means and SDs, there is a lot of variability **between**
participants both in their means and their individual *within*-participant SD.

### Effect of Standardization 

We will create three data frames standardized with each of the three
techniques.

```{r, warning=FALSE}
Z_VariableWise <- emotion %>%
  standardize()

Z_ParticipantWise <- emotion %>%
  group_by(Participant_ID) %>%
  standardize()

Z_Full <- emotion %>%
  group_by(Participant_ID) %>%
  standardize() %>%
  ungroup() %>%
  standardize()
```

Let's see how these three standardization techniques affected the **Valence**
variable.

### Across Participants

We can calculate the mean and SD of *Valence* across all participants:

```{r, eval=FALSE}
# Create a convenient function to print
summarise_Subjective_Valence <- function(data) {
  df_name <- deparse(substitute(data))
  data %>%
    ungroup() %>%
    summarise(
      DF = df_name,
      Mean = mean(Subjective_Valence),
      SD = sd(Subjective_Valence)
    )
}
# Check the results
rbind(
  summarise_Subjective_Valence(Z_VariableWise),
  summarise_Subjective_Valence(Z_ParticipantWise),
  summarise_Subjective_Valence(Z_Full)
)
```

```{r, echo=FALSE}
# Create a convenient function to print
summarise_Subjective_Valence <- function(data) {
  df_name <- deparse(substitute(data))
  data <- data %>%
    ungroup() %>%
    summarise(
      Mean = mean(Subjective_Valence),
      SD = sd(Subjective_Valence)
    )
  cbind(DF = df_name, data)
}
# Check the results
rbind(
  summarise_Subjective_Valence(Z_VariableWise),
  summarise_Subjective_Valence(Z_ParticipantWise),
  summarise_Subjective_Valence(Z_Full)
) %>%
  knitr::kable(digits = 2)
```


The **means** and the **SD** appear as fairly similar (0 and 1)...

```{r, fig.width=7, fig.height=4.5, results='markup', fig.align='center'}
library(see)
library(ggplot2)

ggplot() +
  geom_density(aes(Z_VariableWise$Subjective_Valence,
    color = "Z_VariableWise"
  ), linewidth = 1) +
  geom_density(aes(Z_ParticipantWise$Subjective_Valence,
    color = "Z_ParticipantWise"
  ), linewidth = 1) +
  geom_density(aes(Z_Full$Subjective_Valence,
    color = "Z_Full"
  ), linewidth = 1) +
  see::theme_modern() +
  labs(color = "")
```

and so do the marginal distributions...

### At the Participant Level

However, we can also look at what happens in the participant level. Let's look at
the first 5 participants:

```{r, eval=FALSE}
# Create convenient function
print_participants <- function(data) {
  df_name <- deparse(substitute(data))
  data %>%
    group_by(Participant_ID) %>%
    summarise(
      DF = df_name,
      Mean = mean(Subjective_Valence),
      SD = sd(Subjective_Valence)
    ) %>%
    head(5) %>%
    select(DF, everything())
}

# Check the results
rbind(
  print_participants(Z_VariableWise),
  print_participants(Z_ParticipantWise),
  print_participants(Z_Full)
)
```


```{r, echo=FALSE}
# Create convenient function
print_participants <- function(data) {
  df_name <- deparse(substitute(data))
  data %>%
    group_by(Participant_ID) %>%
    summarise(
      Mean = mean(Subjective_Valence),
      SD = sd(Subjective_Valence)
    ) %>%
    cbind(DF = df_name, .) %>%
    head(5) %>%
    select(DF, everything())
}

# Check the results
rbind(
  print_participants(Z_VariableWise),
  print_participants(Z_ParticipantWise),
  print_participants(Z_Full)
) %>%
  knitr::kable(digits = 2)
```

Seems like *full* and *participant-wise* standardization give similar results,
but different ones than *variable-wise* standardization.

### Compare

Let's do a **correlation** between the **variable-wise and participant-wise
methods**.

```{r, fig.width=7, fig.height=4.5, results='markup', fig.align='center'}
r <- cor.test(
  Z_VariableWise$Subjective_Valence,
  Z_ParticipantWise$Subjective_Valence
)

data.frame(
  Original = emotion$Subjective_Valence,
  VariableWise = Z_VariableWise$Subjective_Valence,
  ParticipantWise = Z_ParticipantWise$Subjective_Valence
) %>%
  ggplot(aes(x = VariableWise, y = ParticipantWise, colour = Original)) +
  geom_point(alpha = 0.75, shape = 16) +
  geom_smooth(method = "lm", color = "black") +
  scale_color_distiller(palette = 1) +
  ggtitle(paste0("r = ", round(r$estimate, 2))) +
  see::theme_modern()
```

While the three standardization methods roughly present the same characteristics
at a general level (mean 0 and SD 1) and a similar distribution, their values
are not exactly the same!

Let's now answer the original question by investigating the **linear relationship between valence and autobiographical link**. We can do this by
running a mixed-effects model with participants entered as random effects.

```{r}
library(lme4)
m_raw <- lmer(
  formula = Subjective_Valence ~ Autobiographical_Link + (1 | Participant_ID),
  data = emotion
)
m_VariableWise <- update(m_raw, data = Z_VariableWise)
m_ParticipantWise <- update(m_raw, data = Z_ParticipantWise)
m_Full <- update(m_raw, data = Z_Full)
```

We can extract the parameters of interest from each model, and find:

```{r}
# Convenient function
get_par <- function(model) {
  mod_name <- deparse(substitute(model))
  parameters::model_parameters(model) %>%
    mutate(Model = mod_name) %>%
    select(-Parameter) %>%
    select(Model, everything()) %>%
    .[-1, ]
}

# Run the model on all datasets
rbind(
  get_par(m_raw),
  get_par(m_VariableWise),
  get_par(m_ParticipantWise),
  get_par(m_Full)
)
```

As we can see, **variable-wise** standardization only affects **the coefficient** (which is expected, as it changes the unit), but not the test
statistic or statistical significance. However, using **participant-wise**
standardization *does* affect the coefficient **and** the significance. 

**No method is better or more justified, and the choice depends on the specific case, context, data and goal.**

### Conclusion

1. **Standardization can be useful in *some* cases and should be justified**.

2. **Variable and Participant-wise standardization methods *appear* to produce similar data**.

3. **Variable and Participant-wise standardization can lead to different results**.

4. **The chosen method can strongly influence the results and should therefore be explicitly stated and justified to enhance reproducibility of results**.

We showed here yet another way of **sneakily tweaking the data** that can change
the results. To prevent its use as a bad practice, we can only highlight the
importance of open data, open analysis/scripts, and preregistration.

# See also 

- `datawizard::demean()`: <https://easystats.github.io/datawizard/reference/demean.html>
- `standardize_parameters(method = "pseudo")` for mixed-effects models
<https://easystats.github.io/parameters/articles/standardize_parameters_effsize.html>

# References


================================================
FILE: vignettes/tidyverse_translation.Rmd
================================================
---
title: "Coming from 'tidyverse'"
output:
  rmarkdown::html_vignette:
    toc: true
vignette: >
  \usepackage[utf8]{inputenc}
  %\VignetteIndexEntry{Coming from 'tidyverse'}
  %\VignetteEngine{knitr::rmarkdown}
---

```{r setup, message=FALSE, warning=FALSE, include=FALSE, eval = TRUE}
library(knitr)
options(knitr.kable.NA = "")
knitr::opts_chunk$set(
  eval = FALSE,
  message = FALSE,
  warning = FALSE,
  dpi = 300
)

pkgs <- c(
  "dplyr",
  "tidyr"
)
all_deps_available <- all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)))

if (all_deps_available) {
  library(datawizard)
  library(dplyr)
  library(tidyr)
}

# Since we explicitly put `eval = TRUE` for some chunks, we can't rely on
# `knitr::opts_chunk$set(eval = FALSE)` at the beginning of the script.
# Therefore, we introduce a logical that is `FALSE` only if all suggested
# dependencies are not installed (cf easystats/easystats#317)
evaluate_chunk <- all_deps_available && getRversion() >= "4.1.0"
```

This vignette can be referred to by citing the following:

Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. *Journal of Open Source Software*, *7*(78), 4684, https://doi.org/10.21105/joss.04684

```{css, echo=FALSE, eval = TRUE}
.datawizard, .datawizard > .sourceCode {
  background-color: #e6e6ff;
}
.tidyverse, .tidyverse > .sourceCode {
  background-color: #d9f2e5;
}
.custom_note {
  border-left: solid 5px hsl(220, 100%, 30%);
  background-color: hsl(220, 100%, 95%);
  padding: 5px;
  margin-bottom: 10px
}
```

# Introduction

`{datawizard}` package aims to make basic data wrangling easier than
with base R. The data wrangling workflow it supports is similar to the one
supported by the tidyverse package combination of `{dplyr}` and `{tidyr}`. However,
one of its main features is that it has a very few dependencies: `{stats}` and `{utils}`
(included in base R) and `{insight}`, which is the core package of the _easystats_
ecosystem. This package grew organically to simultaneously satisfy the
"0 non-base hard dependency" principle of _easystats_ and the data wrangling needs
of the constituent packages in this ecosystem. It is also
important to note that `{datawizard}` was designed to avoid namespace collisions
with `{tidyverse}` packages.

In this article, we will see how to go through basic data wrangling steps with
`{datawizard}`. We will also compare it to the `{tidyverse}` syntax for achieving the same.
This way, if you decide to make the switch, you can easily find the translations here.
This vignette is largely inspired from `{dplyr}`'s [Getting started vignette](https://dplyr.tidyverse.org/articles/dplyr.html).

<!-- NOTE: use raw HTML so that vignette can compile even if `evaluate_chunk` is FALSE. -->
<!-- See e.g. #527 -->

<div class="custom_note">
  <p>
    Note: In this vignette, we use the native pipe-operator, `|>`, which was
    introduced in R 4.1. Users of R version 3.6 or 4.0 should replace the native
    pipe by magrittr's one (`%>%`) so that examples work.
  </p>
</div>

```{r, eval = evaluate_chunk}
library(dplyr)
library(tidyr)
library(datawizard)

data(efc)
efc <- head(efc)
```

# Workhorses

Before we look at their *tidyverse* equivalents, we can first have a look at
`{datawizard}`'s key functions for data wrangling:

| Function          | Operation                                                        |
| :---------------- | :--------------------------------------------------------------- |
| `data_filter()`   | [to select only certain observations](#filtering)                |
| `data_select()`   | [to select only a few variables](#selecting)                     |
| `data_modify()`   | [to create variables or modify existing ones](#modifying)        |
| `data_arrange()`  | [to sort observations](#sorting)                                 |
| `data_extract()`  | [to extract a single variable](#extracting)                      |
| `data_rename()`   | [to rename variables](#renaming)                                 |
| `data_relocate()` | [to reorder a data frame](#relocating)                           |
| `data_to_long()`  | [to convert data from wide to long](#reshaping)                  |
| `data_to_wide()`  | [to convert data from long to wide](#reshaping)                  |
| `data_join()`     | [to join two data frames](#joining)                              |
| `data_unite()`    | [to concatenate several columns into a single one](#uniting)     |
| `data_separate()` | [to separate a single column into multiple columns](#separating) |

Note that there are a few functions in `{datawizard}` that have no strict equivalent
in `{dplyr}` or `{tidyr}` (e.g `data_rotate()`), and so we won't discuss them in
the next section.

# Equivalence with `{dplyr}` / `{tidyr}`

Before we look at them individually, let's first have a look at the summary table of this equivalence.

| Function          | Tidyverse equivalent(s)                                             |
| :---------------- | :------------------------------------------------------------------ |
| `data_filter()`   | `dplyr::filter()`, `dplyr::slice()`                                 |
| `data_select()`   | `dplyr::select()`                                                   |
| `data_modify()`   | `dplyr::mutate()`                                                   |
| `data_arrange()`  | `dplyr::arrange()`                                                  |
| `data_extract()`  | `dplyr::pull()`                                                     |
| `data_rename()`   | `dplyr::rename()`                                                   |
| `data_relocate()` | `dplyr::relocate()`                                                 |
| `data_to_long()`  | `tidyr::pivot_longer()`                                             |
| `data_to_wide()`  | `tidyr::pivot_wider()`                                              |
| `data_join()`     | `dplyr::inner_join()`, `dplyr::left_join()`, `dplyr::right_join()`, |
|                   | `dplyr::full_join()`, `dplyr::anti_join()`, `dplyr::semi_join()`    |
| `data_peek()`     | `dplyr::glimpse()`                                                  |
| `data_unite()`    | `tidyr::unite()`                                                    |
| `data_separate()` | `tidyr::separate()`                                                 |

## Filtering {#filtering}

`data_filter()` is a wrapper around `subset()`. However, if you want to have several filtering conditions, you can either use `&` (as in `subset()`) or `,` (as in `dplyr::filter()`).

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r filter, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_filter(
    skin_color == "light",
    eye_color == "brown"
  )

# or
starwars |>
  data_filter(
    skin_color == "light" &
      eye_color == "brown"
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  filter(
    skin_color == "light",
    eye_color == "brown"
  )
```
:::

::::

```{r filter, eval = evaluate_chunk, echo = FALSE}
```


<!-- Shorten output to make it easier to read: -->
```{r, echo = FALSE, eval = evaluate_chunk}
starwars <- head(starwars)
```

## Selecting {#selecting}

`data_select()` is the equivalent of `dplyr::select()`.
The main difference between these two functions is that `data_select()` uses two
arguments (`select` and `exclude`) and requires quoted column names if we want to
select several variables, while `dplyr::select()` accepts any unquoted column names.

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r select1, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_select(select = c("hair_color", "skin_color", "eye_color"))
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  select(hair_color, skin_color, eye_color)
```
:::

::::

```{r select1, eval = evaluate_chunk, echo = FALSE}
```

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r select2, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_select(select = -ends_with("color"))
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  select(-ends_with("color"))
```
:::

::::

```{r select2, eval = evaluate_chunk, echo = FALSE}
```

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

<!-- TODO: Although we say the column names need to be quoted, they are unquoted
here and quoting them won't work. Should we comment on that? -->

```{r select3, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_select(select = -(hair_color:eye_color))
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  select(!(hair_color:eye_color))
```
:::

::::

```{r select3, eval = evaluate_chunk, echo = FALSE}
```


:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r select4, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_select(exclude = regex("color$"))
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  select(-contains("color$"))
```
:::

::::

```{r select4, eval = evaluate_chunk, echo = FALSE}
```


:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r select5, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_select(select = is.numeric)
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  select(where(is.numeric))
```
:::

::::

```{r select5, eval = evaluate_chunk, echo = FALSE}
```

You can find a list of all the select helpers with `?data_select`.


## Modifying {#modifying}

`data_modify()` is a wrapper around `base::transform()` but has several additional
benefits:

* it allows us to use newly created variables in the following expressions;
* it works with grouped data;
* it preserves variable attributes such as labels;
* it accepts expressions as character vectors so that it is easy to program with it


This last point is also the main difference between `data_modify()` and
`dplyr::mutate()`.

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r modify1, class.source = "datawizard"}
# ---------- datawizard -----------
efc |>
  data_modify(
    c12hour_c = center(c12hour),
    c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
    c12hour_z2 = standardize(c12hour)
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
efc |>
  mutate(
    c12hour_c = center(c12hour),
    c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
    c12hour_z2 = standardize(c12hour)
  )
```
:::

::::

```{r modify1, eval = evaluate_chunk, echo = FALSE}
```

`data_modify()` supports expressions as strings via its `as_expr()` helper function.

```{r eval=evaluate_chunk}
new_exp <- c(
  "c12hour_c = center(c12hour)",
  "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"
)
data_modify(efc, as_expr(new_exp))
```

This makes it easy to use it in custom functions:

```{r eval=evaluate_chunk}
miles_to_km <- function(data, var) {
  data_modify(
    data,
    as_expr(paste0("km = ", var, "* 1.609344"))
  )
}

distance <- data.frame(miles = c(1, 8, 233, 88, 9))
distance

miles_to_km(distance, "miles")
```


## Sorting {#sorting}

`data_arrange()` is the equivalent of `dplyr::arrange()`. It takes two arguments:
a data frame, and a vector of column names used to sort the rows. Note that contrary
to most other functions in `{datawizard}`, it is not possible to use select helpers
such as `starts_with()` in `data_arrange()`.

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}
:::{}
```{r arrange1, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_arrange(c("hair_color", "height"))
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  arrange(hair_color, height)
```
:::

::::

```{r arrange1, eval = evaluate_chunk, echo = FALSE}
```

You can also sort variables in descending order by putting a `"-"` in front of
their name, like below:

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}
:::{}
```{r arrange2, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_arrange(c("-hair_color", "-height"))
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  arrange(desc(hair_color), -height)
```
:::

::::

```{r arrange2, eval = evaluate_chunk, echo = FALSE}
```


## Extracting {#extracting}

Although we mostly work on data frames, it is sometimes useful to extract a single
column as a vector. This can be done with `data_extract()`, which reproduces the
behavior of `dplyr::pull()`:

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}
:::{}
```{r extract1, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_extract(gender)
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  pull(gender)
```
:::

::::

```{r extract1, eval = evaluate_chunk, echo = FALSE}
```

We can also specify several variables in `select`. In this case, `data_extract()`
is equivalent to `data_select()`:

```{r eval = evaluate_chunk}
starwars |>
  data_extract(select = contains("color"))
```


## Renaming {#renaming}

`data_rename()` is the equivalent of `dplyr::rename()` but the syntax between the
two is different. While `dplyr::rename()` takes new-old pairs of column
names, `data_rename()` requires a vector of column names to rename, and then
a vector of new names for these columns that must be of the same length.

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r rename1, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_rename(
    select = c("sex", "hair_color"),
    replacement = c("Sex", "Hair Color")
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  rename(
    Sex = sex,
    "Hair Color" = hair_color
  )
```
:::

::::

```{r rename1, eval = evaluate_chunk, echo = FALSE}
```

The way `data_rename()` is designed makes it easy to apply the same modifications
to a vector of column names. For example, we can remove underscores and use
TitleCase with the following code:

```{r rename2}
to_rename <- names(starwars)

starwars |>
  data_rename(
    select = to_rename,
    replacement = tools::toTitleCase(gsub("_", " ", to_rename, fixed = TRUE))
  )
```

```{r rename2, eval = evaluate_chunk, echo = FALSE}
```

It is also possible to add a prefix or a suffix to all or a subset of variables
with `data_addprefix()` and `data_addsuffix()`. The argument `select` accepts
all select helpers that we saw above with `data_select()`:

```{r rename3}
starwars |>
  data_addprefix(
    pattern = "OLD.",
    select = contains("color")
  ) |>
  data_addsuffix(
    pattern = ".NEW",
    select = -contains("color")
  )
```

```{r rename3, eval = evaluate_chunk, echo = FALSE}
```

## Relocating {#relocating}

Sometimes, we want to relocate one or a small subset of columns in the dataset.
Rather than typing many names in `data_select()`, we can use `data_relocate()`,
which is the equivalent of `dplyr::relocate()`. Just like `data_select()`, we can
specify a list of variables we want to relocate with `select` and `exclude`.
Then, the arguments `before` and `after`^[Note that we use `before` and `after`
whereas `dplyr::relocate()` uses `.before` and `.after`.] specify where the selected columns should
be relocated:

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r relocate1, class.source = "datawizard"}
# ---------- datawizard -----------
starwars |>
  data_relocate(sex:homeworld, before = "height")
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
starwars |>
  relocate(sex:homeworld, .before = height)
```
:::

::::

```{r relocate1, eval = evaluate_chunk, echo = FALSE}
```

In addition to column names, `before` and `after` accept column indices. Finally,
one can use `before = -1` to relocate the selected columns just before the last
column, or `after = -1` to relocate them after the last column.

```{r eval = evaluate_chunk}
# ---------- datawizard -----------
starwars |>
  data_relocate(sex:homeworld, after = -1)
```


## Reshaping {#reshaping}

### Longer

Reshaping data from wide to long or from long to wide format can be done with
`data_to_long()` and `data_to_wide()`. These functions were designed to match
`tidyr::pivot_longer()` and `tidyr::pivot_wider()` arguments, so that the only
thing to do is to change the function name. However, not all of
`tidyr::pivot_longer()` and `tidyr::pivot_wider()` features are available yet.

We will use the `relig_income` dataset, as in the [`{tidyr}` vignette](https://tidyr.tidyverse.org/articles/pivot.html).

```{r eval = evaluate_chunk}
relig_income
```


We would like to reshape this dataset to have 3 columns: religion, count, and
income. The column "religion" doesn't need to change, so we exclude it with
`-religion`. Then, each remaining column corresponds to an income category.
Therefore, we want to move all these column names to a single column called
"income". Finally, the values corresponding to each of these columns will be
reshaped to be in a single new column, called "count".

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r pivot1, class.source = "datawizard"}
# ---------- datawizard -----------
relig_income |>
  data_to_long(
    -religion,
    names_to = "income",
    values_to = "count"
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
relig_income |>
  pivot_longer(
    !religion,
    names_to = "income",
    values_to = "count"
  )
```
:::

::::

```{r pivot1, eval = evaluate_chunk, echo = FALSE}
```


To explore a bit more the arguments of `data_to_long()`, we will use another
dataset: the `billboard` dataset.
```{r eval = evaluate_chunk}
billboard
```

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r pivot2, class.source = "datawizard"}
# ---------- datawizard -----------
billboard |>
  data_to_long(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank",
    values_drop_na = TRUE
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
billboard |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank",
    values_drop_na = TRUE
  )
```
:::

::::

```{r pivot2, eval = evaluate_chunk, echo = FALSE}
```


### Wider

Once again, we use an example in the `{tidyr}` vignette to show how close `data_to_wide()`
and `pivot_wider()` are:
```{r eval = evaluate_chunk}
fish_encounters
```


:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r pivot3, class.source = "datawizard"}
# ---------- datawizard -----------
fish_encounters |>
  data_to_wide(
    names_from = "station",
    values_from = "seen"
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
fish_encounters |>
  pivot_wider(
    names_from = station,
    values_from = seen
  )
```
:::

::::

```{r pivot3, eval = evaluate_chunk, echo = FALSE}
```


## Joining {#joining}

<!-- explain a bit more the args of data_join -->

In `{datawizard}`, joining datasets is done with `data_join()` (or its alias
`data_merge()`). Contrary to `{dplyr}`, this unique function takes care of all
types of join, which are then specified inside the function with the argument
`join` (by default, `join = "left"`).

Below, we show how to perform the four most common joins: full, left, right and
inner. We will use the datasets `band_members`and `band_instruments` provided by `{dplyr}`:

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r eval = evaluate_chunk}
band_members
```
:::

::: {}

```{r eval = evaluate_chunk}
band_instruments
```
:::

::::


### Full join

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r join1, class.source = "datawizard"}
# ---------- datawizard -----------
band_members |>
  data_join(band_instruments, join = "full")
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
band_members |>
  full_join(band_instruments)
```
:::

::::

```{r join1, eval = evaluate_chunk, echo = FALSE}
```


### Left and right joins

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r join2, class.source = "datawizard"}
# ---------- datawizard -----------
band_members |>
  data_join(band_instruments, join = "left")
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
band_members |>
  left_join(band_instruments)
```
:::

::::

```{r join2, eval = evaluate_chunk, echo = FALSE}
```


:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r join3, class.source = "datawizard"}
# ---------- datawizard -----------
band_members |>
  data_join(band_instruments, join = "right")
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
band_members |>
  right_join(band_instruments)
```
:::

::::

```{r join3, eval = evaluate_chunk, echo = FALSE}
```


### Inner join

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r join4, class.source = "datawizard"}
# ---------- datawizard -----------
band_members |>
  data_join(band_instruments, join = "inner")
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
band_members |>
  inner_join(band_instruments)
```
:::

::::

```{r join4, eval = evaluate_chunk, echo = FALSE}
```


## Uniting {#uniting}

Uniting variables is useful e.g to create unique indices by combining several
variables or to gather years, months, and days into a single date. `data_unite()`
offers an interface very close to `tidyr::unite()`:

```{r eval=evaluate_chunk}
test <- data.frame(
  year = 2002:2004,
  month = c("02", "03", "09"),
  day = c("11", "22", "28"),
  stringsAsFactors = FALSE
)
test
```


:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r unite1, class.source = "datawizard"}
# ---------- datawizard -----------
test |>
  data_unite(
    new_column = "date",
    select = c("year", "month", "day"),
    separator = "-"
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
test |>
  unite(
    col = "date",
    year, month, day,
    sep = "-"
  )
```
:::

::::

```{r unite1, eval = evaluate_chunk, echo = FALSE}
```

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r unite2, class.source = "datawizard"}
# ---------- datawizard -----------
test |>
  data_unite(
    new_column = "date",
    select = c("year", "month", "day"),
    separator = "-",
    append = TRUE
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
test |>
  unite(
    col = "date",
    year, month, day,
    sep = "-",
    remove = FALSE
  )
```
:::

::::

```{r unite2, eval = evaluate_chunk, echo = FALSE}
```


## Separating {#separating}

Separating variables is the counterpart to uniting variables and is useful to split values into multiple columns, e.g. when splitting a date into values for years, months and days. `data_separate()` offers an interface very close to `tidyr::separate()`:

```{r eval=evaluate_chunk}
test <- data.frame(
  date_arrival = c("2002-02-11", "2003-03-22", "2004-09-28"),
  date_departure = c("2002-03-15", "2003-03-28", "2004-09-30"),
  stringsAsFactors = FALSE
)
test
```


:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r separate1, class.source = "datawizard"}
# ---------- datawizard -----------
test |>
  data_separate(
    select = "date_arrival",
    new_columns = c("Year", "Month", "Day")
  )
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
test |>
  separate(
    date_arrival,
    into = c("Year", "Month", "Day")
  )
```
:::

::::

```{r separate1, eval = evaluate_chunk, echo = FALSE}
```


Unlike `tidyr::separate()`, you can separate multiple columns in one step with `data_separate()`.

```{r eval = evaluate_chunk}
test |>
  data_separate(
    new_columns = list(
      date_arrival = c("Arr_Year", "Arr_Month", "Arr_Day"),
      date_departure = c("Dep_Year", "Dep_Month", "Dep_Day")
    )
  )
```


# Other useful functions

`{datawizard}` contains other functions that are not necessarily included in
`{dplyr}` or `{tidyr}` or do not directly modify the data. Some of them are
inspired from the package `janitor`.

## Work with rownames

We can convert a column in rownames and move rownames to a new column with
`rownames_as_column()` and `column_as_rownames()`:

```{r eval = evaluate_chunk}
mtcars <- head(mtcars)
mtcars

mtcars2 <- mtcars |>
  rownames_as_column(var = "model")

mtcars2

mtcars2 |>
  column_as_rownames(var = "model")
```

## Work with row ids

`rowid_as_column()` is close but not identical to `tibble::rowid_to_column()`.
The main difference is when we use it with grouped data. While `tibble::rowid_to_column()`
uses one distinct rowid for every row in the dataset, `rowid_as_column()` creates
one id for every row *in each group*. Therefore, two rows in different groups
can have the same row id.

This means that `rowid_as_column()` is closer to using `n()` in `mutate()`, like
the following:

```{r eval=evaluate_chunk}
test <- data.frame(
  group = c("A", "A", "B", "B"),
  value = c(3, 5, 8, 1),
  stringsAsFactors = FALSE
)
test

test |>
  data_group(group) |>
  tibble::rowid_to_column()

test |>
  data_group(group) |>
  rowid_as_column()

test |>
  data_group(group) |>
  mutate(id = seq_len(n()))
```


## Work with column names

When dealing with messy data, it is sometimes useful to use a row as column
names, and vice versa. This can be done with `row_to_colnames()` and
`colnames_to_row()`.

```{r eval = evaluate_chunk}
x <- data.frame(
  X_1 = c(NA, "Title", 1:3),
  X_2 = c(NA, "Title2", 4:6)
)
x
x2 <- x |>
  row_to_colnames(row = 2)
x2

x2 |>
  colnames_to_row()
```

## Take a quick look at the data

:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"}

::: {}

```{r glimpse, class.source = "datawizard"}
# ---------- datawizard -----------
data_peek(iris)
```
:::

::: {}

```{r, class.source = "tidyverse"}
# ---------- tidyverse -----------
glimpse(iris)
```
:::

::::

```{r glimpse, eval = evaluate_chunk, echo = FALSE}
```