Showing preview only (1,951K chars total). Download the full file or copy to clipboard to get everything.
Repository: easystats/datawizard
Branch: main
Commit: 89daeba7d094
Files: 321
Total size: 1.8 MB
Directory structure:
gitextract_nyyro61y/
├── .Rbuildignore
├── .dev/
│ ├── _BENCHMARK_RESHAPE.R
│ ├── html5.R
│ ├── revdepcheck.R
│ ├── test-value_at.R
│ └── value_at.R
├── .git-blame-ignore-revs
├── .github/
│ ├── .gitignore
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── FUNDING.yml
│ ├── SUPPORT.md
│ ├── dependabot.yaml
│ └── workflows/
│ ├── R-CMD-check-hard.yaml
│ ├── R-CMD-check.yaml
│ ├── check-all-examples.yaml
│ ├── check-link-rot.yaml
│ ├── check-random-test-order.yaml
│ ├── check-readme.yaml
│ ├── check-spelling.yaml
│ ├── check-styling.yaml
│ ├── check-test-warnings.yaml
│ ├── check-vignette-warnings.yaml
│ ├── html-5-check.yaml
│ ├── lint-changed-files.yaml
│ ├── lint.yaml
│ ├── pkgdown-no-suggests.yaml
│ ├── pkgdown.yaml
│ ├── test-coverage-examples.yaml
│ ├── test-coverage.yaml
│ └── update-to-latest-easystats.yaml
├── .gitignore
├── .lintr
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│ ├── adjust.R
│ ├── assign_labels.R
│ ├── categorize.R
│ ├── center.R
│ ├── contrs.R
│ ├── convert_na_to.R
│ ├── convert_to_na.R
│ ├── data.R
│ ├── data_addprefix.R
│ ├── data_arrange.R
│ ├── data_codebook.R
│ ├── data_duplicated.R
│ ├── data_extract.R
│ ├── data_group.R
│ ├── data_match.R
│ ├── data_merge.R
│ ├── data_modify.R
│ ├── data_partition.R
│ ├── data_peek.R
│ ├── data_read.R
│ ├── data_relocate.R
│ ├── data_remove.R
│ ├── data_rename.R
│ ├── data_replicate.R
│ ├── data_rescale.R
│ ├── data_restoretype.R
│ ├── data_reverse.R
│ ├── data_rotate.R
│ ├── data_seek.R
│ ├── data_select.R
│ ├── data_separate.R
│ ├── data_summary.R
│ ├── data_tabulate.R
│ ├── data_to_long.R
│ ├── data_to_wide.R
│ ├── data_unique.R
│ ├── data_unite.R
│ ├── data_write.R
│ ├── data_xtabulate.R
│ ├── datawizard-package.R
│ ├── demean.R
│ ├── describe_distribution.R
│ ├── descriptives.R
│ ├── extract_column_names.R
│ ├── format.R
│ ├── labels_to_levels.R
│ ├── makepredictcall.R
│ ├── mean_sd.R
│ ├── means_by_group.R
│ ├── normalize.R
│ ├── ranktransform.R
│ ├── recode_into.R
│ ├── recode_values.R
│ ├── remove_empty.R
│ ├── replace_nan_inf.R
│ ├── rescale_weights.R
│ ├── reshape_ci.R
│ ├── row_count.R
│ ├── row_means.R
│ ├── select_nse.R
│ ├── skewness_kurtosis.R
│ ├── slide.R
│ ├── smoothness.R
│ ├── standardize.R
│ ├── standardize.models.R
│ ├── text_format.R
│ ├── to_factor.R
│ ├── to_numeric.R
│ ├── unnormalize.R
│ ├── unstandardize.R
│ ├── utils-cols.R
│ ├── utils-rows.R
│ ├── utils.R
│ ├── utils_labels.R
│ ├── utils_standardize_center.R
│ ├── visualisation_recipe.R
│ ├── weighted_mean_median_sd_mad.R
│ └── winsorize.R
├── README.Rmd
├── README.md
├── air.toml
├── cran-comments.md
├── data/
│ ├── efc.RData
│ └── nhanes_sample.RData
├── datawizard.Rproj
├── datawizard.code-workspace
├── inst/
│ ├── CITATION
│ └── WORDLIST
├── man/
│ ├── adjust.Rd
│ ├── as.prop.table.Rd
│ ├── assign_labels.Rd
│ ├── categorize.Rd
│ ├── center.Rd
│ ├── coef_var.Rd
│ ├── coerce_to_numeric.Rd
│ ├── colnames.Rd
│ ├── contr.deviation.Rd
│ ├── convert_na_to.Rd
│ ├── convert_to_na.Rd
│ ├── data_arrange.Rd
│ ├── data_codebook.Rd
│ ├── data_duplicated.Rd
│ ├── data_extract.Rd
│ ├── data_group.Rd
│ ├── data_match.Rd
│ ├── data_merge.Rd
│ ├── data_modify.Rd
│ ├── data_partition.Rd
│ ├── data_peek.Rd
│ ├── data_prefix_suffix.Rd
│ ├── data_read.Rd
│ ├── data_relocate.Rd
│ ├── data_rename.Rd
│ ├── data_replicate.Rd
│ ├── data_restoretype.Rd
│ ├── data_rotate.Rd
│ ├── data_seek.Rd
│ ├── data_separate.Rd
│ ├── data_summary.Rd
│ ├── data_tabulate.Rd
│ ├── data_to_long.Rd
│ ├── data_to_wide.Rd
│ ├── data_unique.Rd
│ ├── data_unite.Rd
│ ├── datawizard-package.Rd
│ ├── demean.Rd
│ ├── describe_distribution.Rd
│ ├── distribution_mode.Rd
│ ├── efc.Rd
│ ├── extract_column_names.Rd
│ ├── labels_to_levels.Rd
│ ├── makepredictcall.dw_transformer.Rd
│ ├── mean_sd.Rd
│ ├── means_by_group.Rd
│ ├── nhanes_sample.Rd
│ ├── normalize.Rd
│ ├── ranktransform.Rd
│ ├── recode_into.Rd
│ ├── recode_values.Rd
│ ├── reexports.Rd
│ ├── remove_empty.Rd
│ ├── replace_nan_inf.Rd
│ ├── rescale.Rd
│ ├── rescale_weights.Rd
│ ├── reshape_ci.Rd
│ ├── reverse.Rd
│ ├── row_count.Rd
│ ├── row_means.Rd
│ ├── rownames.Rd
│ ├── skewness.Rd
│ ├── slide.Rd
│ ├── smoothness.Rd
│ ├── standardize.Rd
│ ├── standardize.default.Rd
│ ├── text_format.Rd
│ ├── to_factor.Rd
│ ├── to_numeric.Rd
│ ├── visualisation_recipe.Rd
│ ├── weighted_mean.Rd
│ └── winsorize.Rd
├── paper/
│ └── JOSS_files/
│ ├── apa.csl
│ ├── paper.Rmd
│ ├── paper.bib
│ ├── paper.log
│ └── paper.md
├── pkgdown/
│ └── _pkgdown.yaml
├── tests/
│ ├── testthat/
│ │ ├── _snaps/
│ │ │ ├── categorize.md
│ │ │ ├── contr.deviation.md
│ │ │ ├── data_codebook.md
│ │ │ ├── data_modify.md
│ │ │ ├── data_partition.md
│ │ │ ├── data_peek.md
│ │ │ ├── data_read.md
│ │ │ ├── data_rescale.md
│ │ │ ├── data_seek.md
│ │ │ ├── data_separate.md
│ │ │ ├── data_summary.md
│ │ │ ├── data_tabulate.md
│ │ │ ├── data_to_factor.md
│ │ │ ├── data_to_long.md
│ │ │ ├── data_to_numeric.md
│ │ │ ├── demean.md
│ │ │ ├── describe_distribution.md
│ │ │ ├── empty-dataframe.md
│ │ │ ├── means_by_group.md
│ │ │ ├── normalize.md
│ │ │ ├── print.dw_transformer.md
│ │ │ ├── ranktransform.md
│ │ │ ├── rescale_weights.md
│ │ │ ├── reshape_ci.md
│ │ │ ├── skewness-kurtosis.md
│ │ │ ├── smoothness.md
│ │ │ ├── text_format.md
│ │ │ ├── windows/
│ │ │ │ └── means_by_group.md
│ │ │ └── winsorization.md
│ │ ├── helper-state.R
│ │ ├── helper.R
│ │ ├── test-adjust.R
│ │ ├── test-assign_labels.R
│ │ ├── test-attributes-grouped-df.R
│ │ ├── test-attributes.R
│ │ ├── test-categorize.R
│ │ ├── test-center.R
│ │ ├── test-coef_var.R
│ │ ├── test-contr.deviation.R
│ │ ├── test-convert_na_to.R
│ │ ├── test-convert_to_na.R
│ │ ├── test-data_addprefix.R
│ │ ├── test-data_arrange.R
│ │ ├── test-data_codebook.R
│ │ ├── test-data_duplicated.R
│ │ ├── test-data_extract.R
│ │ ├── test-data_group.R
│ │ ├── test-data_match.R
│ │ ├── test-data_merge.R
│ │ ├── test-data_modify.R
│ │ ├── test-data_partition.R
│ │ ├── test-data_peek.R
│ │ ├── test-data_read.R
│ │ ├── test-data_recode.R
│ │ ├── test-data_relocate.R
│ │ ├── test-data_remove.R
│ │ ├── test-data_rename.R
│ │ ├── test-data_reorder.R
│ │ ├── test-data_replicate.R
│ │ ├── test-data_rescale.R
│ │ ├── test-data_restoretype.R
│ │ ├── test-data_reverse.R
│ │ ├── test-data_rotate.R
│ │ ├── test-data_seek.R
│ │ ├── test-data_select.R
│ │ ├── test-data_separate.R
│ │ ├── test-data_shift.R
│ │ ├── test-data_summary.R
│ │ ├── test-data_tabulate.R
│ │ ├── test-data_to_factor.R
│ │ ├── test-data_to_long.R
│ │ ├── test-data_to_numeric.R
│ │ ├── test-data_to_wide.R
│ │ ├── test-data_unique.R
│ │ ├── test-data_unite.R
│ │ ├── test-data_write.R
│ │ ├── test-demean.R
│ │ ├── test-describe_distribution.R
│ │ ├── test-distributions.R
│ │ ├── test-empty-dataframe.R
│ │ ├── test-extract_column_names.R
│ │ ├── test-labelled_data.R
│ │ ├── test-labels_to_levels.R
│ │ ├── test-makepredictcall.R
│ │ ├── test-mean_sd.R
│ │ ├── test-means_by_group.R
│ │ ├── test-normalize.R
│ │ ├── test-print.dw_transformer.R
│ │ ├── test-ranktransform.R
│ │ ├── test-recode_into.R
│ │ ├── test-replace_nan_inf.R
│ │ ├── test-rescale_weights.R
│ │ ├── test-reshape_ci.R
│ │ ├── test-row_count.R
│ │ ├── test-row_means.R
│ │ ├── test-select_nse.R
│ │ ├── test-skewness-kurtosis.R
│ │ ├── test-smoothness.R
│ │ ├── test-standardize-data.R
│ │ ├── test-standardize_datagrid.R
│ │ ├── test-standardize_models.R
│ │ ├── test-std_center.R
│ │ ├── test-std_center_scale_args.R
│ │ ├── test-text_format.R
│ │ ├── test-unnormalize.R
│ │ ├── test-utils.R
│ │ ├── test-utils_cols.R
│ │ ├── test-utils_rows.R
│ │ ├── test-weighted-stats.R
│ │ └── test-winsorization.R
│ └── testthat.R
└── vignettes/
├── .gitignore
├── bibliography.bib
├── overview_of_vignettes.Rmd
├── selection_syntax.Rmd
├── standardize_data.Rmd
└── tidyverse_translation.Rmd
================================================
FILE CONTENTS
================================================
================================================
FILE: .Rbuildignore
================================================
^\cache$
^codemeta\.json$
^Meta$
^doc$
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
^Rplots.pdf$
^README-.*\.png$
^CONDUCT\.md$
^SECURITY\.md$
^cran-comments\.md$
^CODE_OF_CONDUCT\.md$
^SUPPORT\.md$
^\.github$
^NEWS$
^docs$
^revdep$
publication/*
^codecov\.yml$
^\.coveralls\.yml$
^\.travis\.yml$
^_pkgdown\.yml$
^_pkgdown\.yaml$
^appveyor\.yml$
^.gitlab-ci\.yml$
^data-raw$
^pkgdown$
^\.httr-oauth$
^CRAN-RELEASE$
^tests/spelling
^LICENSE\.md$
^\.lintr$
\.code-workspace$
^\.circleci$
^tests/manual$
^revdep$
^\.covrignore$
^\.github/ISSUE_TEMPLATE$
^paper.*$
references.bib
^API$
^\.pre-commit-config\.yaml$
^\.github/workflows/R\.yaml$
^\.github/workflows/pr-commands\.yaml$
^hextools/.
^WIP/.
^CRAN-SUBMISSION$
docs
^.dev$
^vignettes/s.
^vignettes/t.
^[\.]?air\.toml$
^\.vscode$
^\.git-blame-ignore-revs$
================================================
FILE: .dev/_BENCHMARK_RESHAPE.R
================================================
# Developer benchmark: compare the current data_to_long()/data_to_wide()
# implementations ("new") against the previous implementations ("old_*",
# assumed to be available in the session, e.g. loaded from an older package
# version) and the tidyr equivalents.
library(tidyr)
library(dplyr)
library(datawizard)

### DATA_TO_LONG ==========================================

# SLOW (5M rows)
# Stack 500k copies of a small wide data frame into a 5-million-row tibble.
wide_data <- data.frame(replicate(5, rnorm(10)))
tmp <- list()
for (i in 1:500000) {
  tmp[[i]] <- wide_data
}
tmp <- data.table::rbindlist(tmp) |>
  as_tibble()

# Lengthen every column of the large data set.
ex1_l <- bench::mark(
  old = old_data_to_long(tmp),
  new = data_to_long(tmp),
  tidyr = pivot_longer(tmp, cols = everything()),
  iterations = 10
)

# Lengthen all columns except the id column ("religion").
ex2_l <- bench::mark(
  old = relig_income %>%
    old_data_to_long(-"religion", names_to = "income", values_to = "count"),
  new = relig_income %>%
    data_to_long(-"religion", names_to = "income", values_to = "count"),
  tidyr = relig_income %>%
    pivot_longer(!religion, names_to = "income", values_to = "count"),
  iterations = 100
)

# Column selection via a select helper (starts_with()).
ex3_l <- bench::mark(
  old = billboard %>%
    old_data_to_long(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    ),
  new = billboard %>%
    data_to_long(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    ),
  tidyr = billboard %>%
    pivot_longer(
      cols = starts_with("wk"),
      names_to = "week",
      values_to = "rank"
    ),
  iterations = 50
)

# Split column names into several variables using a separator (names_sep).
ex4_l <- bench::mark(
  old = who |>
    old_data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_sep = "_",
      values_to = "count"
    ),
  new = who |>
    data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_sep = "_",
      values_to = "count"
    ),
  tidyr = who |>
    pivot_longer(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_sep = "_",
      values_to = "count"
    ),
  iterations = 10
)

# Split column names using a regular expression (names_pattern).
ex5_l <- bench::mark(
  old = who |>
    old_data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    ),
  new = who |>
    data_to_long(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    ),
  tidyr = who |>
    pivot_longer(
      cols = 5:60,
      names_to = c("diagnosis", "gender", "age"),
      names_pattern = "new_?(.*)_(.)(.*)",
      values_to = "count"
    ),
  iterations = 10
)

### DATA_TO_WIDE ==========================================

# Widen with a fill value for missing combinations (values_fill).
ex1_w <- bench::mark(
  old = fish_encounters %>%
    old_data_to_wide(
      names_from = "station",
      values_from = "seen",
      values_fill = 0
    ),
  new = fish_encounters %>%
    data_to_wide(
      names_from = "station",
      values_from = "seen",
      values_fill = 0
    ),
  tidyr = fish_encounters %>%
    pivot_wider(
      names_from = "station",
      values_from = "seen",
      values_fill = 0
    ),
  iterations = 100
)

# Simulated production data: 26 products x 26 countries x 26 years.
production <- expand_grid(
  product = letters,
  country = paste0(letters, "I"),
  year = 2000:2025
) %>%
  mutate(production = rnorm(nrow(.)))

# Widen using multiple name columns at once.
ex2_w <- bench::mark(
  old = production %>%
    old_data_to_wide(
      names_from = c("product", "country"),
      values_from = "production"
    ),
  new = production %>%
    data_to_wide(
      names_from = c("product", "country"),
      values_from = "production"
    ),
  tidyr = production %>%
    pivot_wider(
      names_from = c(product, country),
      values_from = production
    ),
  iterations = 10
)

# Widen with custom column names built from a glue specification (names_glue).
ex3_w <- bench::mark(
  old = production %>%
    old_data_to_wide(
      names_from = c("product", "country"),
      values_from = "production",
      names_glue = "prod_{product}_{country}"
    ),
  new = production %>%
    data_to_wide(
      names_from = c("product", "country"),
      values_from = "production",
      names_glue = "prod_{product}_{country}"
    ),
  tidyr = production %>%
    pivot_wider(
      names_from = c(product, country),
      values_from = production,
      names_glue = "prod_{product}_{country}"
    ),
  iterations = 10
)

# Stack 1000 copies of us_rent_income and assign each pair of rows a unique id
# so the widened result has one row per id.
tmp <- list()
for (i in 1:1000) {
  tmp[[i]] <- us_rent_income
}
tmp <- data.table::rbindlist(tmp) |>
  as_tibble()
tmp$GEOID <- rep(1:52000, each = 2)
tmp$NAME <- as.character(rep(1:52000, each = 2))

# Widen with several value columns at once.
ex4_w <- bench::mark(
  old = tmp %>%
    old_data_to_wide(
      names_from = "variable",
      values_from = c("estimate", "moe")
    ),
  new = tmp %>%
    data_to_wide(
      names_from = "variable",
      values_from = c("estimate", "moe")
    ),
  tidyr = tmp %>%
    pivot_wider(
      names_from = "variable",
      values_from = c("estimate", "moe")
    ),
  iterations = 10
)

# SLOW (1M rows) ============
set.seed(123)
contacts <- tibble(
  id = rep(1:500000, each = 2),
  field = rep(c("a", "b"), 500000),
  value = sample(letters, 1000000, replace = TRUE)
)
ex5_w <- bench::mark(
  old = contacts %>%
    old_data_to_wide(names_from = "field", values_from = "value"),
  new = contacts %>%
    data_to_wide(names_from = "field", values_from = "value"),
  tidyr = contacts %>%
    tidyr::pivot_wider(names_from = field, values_from = value),
  iterations = 1
)

# SLOWER (10M rows) ============
set.seed(123)
contacts <- tibble(
  id = rep(1:5000000, each = 2),
  field = rep(c("a", "b"), 5000000),
  value = sample(letters, 10000000, replace = TRUE)
)
ex6_w <- bench::mark(
  old = contacts %>%
    old_data_to_wide(names_from = "field", values_from = "value"),
  new = contacts %>%
    data_to_wide(names_from = "field", values_from = "value"),
  tidyr = contacts %>%
    tidyr::pivot_wider(names_from = field, values_from = value),
  iterations = 1
)

# Render all benchmark results for easy sharing.
# NOTE(review): reprex:::prex() is an unexported internal helper and may break
# across reprex versions -- confirm before relying on it.
reprex:::prex({
  ### DATA_TO_LONG ==========================================
  ex1_l
  ex2_l
  ex3_l
  ex4_l
  ex5_l
  ### DATA_TO_WIDE ==========================================
  ex1_w
  ex2_w
  ex3_w
  ex4_w
  ex5_w
  ex6_w
})
================================================
FILE: .dev/html5.R
================================================
# Local HTML5 validation of the package documentation.
# Enable Rd-to-HTML5 validation and switch off the (slow) CRAN incoming
# feasibility checks before running R CMD check.
Sys.setenv(
  "_R_CHECK_RD_VALIDATE_RD2HTML_" = "true",
  "_R_CHECK_CRAN_INCOMING_REMOTE_" = "false",
  "_R_CHECK_CRAN_INCOMING_" = "false"
)

# Skip everything unrelated to the documentation so the check stays fast,
# and treat any NOTE as a failure.
doc_check_args <- c(
  "--as-cran",
  "--no-codoc",
  "--no-examples",
  "--no-tests",
  "--no-vignettes",
  "--no-build-vignettes",
  "--ignore-vignettes",
  "--no-install"
)
rcmdcheck::rcmdcheck(
  args = doc_check_args,
  build_args = c("--no-build-vignettes"),
  error_on = "note"
)
================================================
FILE: .dev/revdepcheck.R
================================================
# Developer-only: run reverse-dependency checks for the development version
# of datawizard (requires the non-CRAN `revdepcheck` package).
library(revdepcheck)
# Run the checks with four parallel workers.
revdep_check(num_workers = 4)
# Summarize the check results.
revdep_report()
# Clear the stored revdepcheck state so the next run starts fresh.
revdep_reset()
================================================
FILE: .dev/test-value_at.R
================================================
# Unit tests for value_at() (lives in .dev/value_at.R).
test_that("value_at", {
  data(efc, package = "datawizard")
  # single position on a labelled vector
  expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRUE)
  # position 4 of c12hour is missing unless NAs are removed first
  expect_equal(value_at(efc$c12hour, 4), NA_real_, ignore_attr = TRUE)
  expect_equal(value_at(efc$c12hour, 4, remove_na = TRUE), 168, ignore_attr = TRUE)
  # multiple positions return a vector of values
  expect_equal(value_at(efc$c12hour, 5:7), efc$c12hour[5:7], ignore_attr = TRUE)
  # out-of-range positions fall back to `default` (NULL when not supplied)
  expect_equal(value_at(efc$e42dep, 123456, default = 55), 55, ignore_attr = TRUE)
  expect_null(value_at(efc$e42dep, 123456))
  expect_null(value_at(efc$e42dep, NULL))
  # NA positions must raise an error; use testthat's exact argument name
  # `regexp` (the former `regex =` only worked via partial argument matching)
  expect_error(value_at(efc$e42dep, NA), regexp = "`position` can't")
  expect_error(value_at(efc$e42dep, c(3, NA)), regexp = "`position` can't")
})
================================================
FILE: .dev/value_at.R
================================================
#' @title Find the value(s) at a specific position in a variable
#' @name value_at
#'
#' @description This function can be used to extract one or more values at a
#' specific position in a variable.
#'
#' @param x A vector or factor.
#' @param position An integer or a vector of integers, indicating the position(s)
#' of the value(s) to be returned. Negative values are counted from the end of
#' the vector. If `NA`, an error is thrown.
#' @param remove_na Logical, if `TRUE`, missing values are removed before
#' computing the position. If `FALSE`, missing values are included in the
#' computation.
#' @param default The value to be returned if the position is out of range.
#'
#' @seealso `data_summary()` to use `value_at()` inside a `data_summary()` call.
#'
#' @return A vector with the value(s) at the specified position(s).
#'
#' @examples
#' data(mtcars)
#' # 5th value
#' value_at(mtcars$mpg, 5)
#' # last value
#' value_at(mtcars$mpg, -1)
#' # out of range, return default
#' value_at(mtcars$mpg, 150)
#' # return 2nd and fifth value
#' value_at(mtcars$mpg, c(2, 5))
#' @export
value_at <- function(x, position = 1, default = NULL, remove_na = FALSE) {
  # Optionally drop missing values so positions refer to observed values only.
  if (remove_na) {
    x <- x[!is.na(x)]
  }
  # Resolve each requested position against the (possibly filtered) vector,
  # then collapse the per-position results into one unnamed vector. Positions
  # that resolve to `NULL` (out of range without a default) simply vanish.
  resolved <- lapply(
    position,
    function(pos) .values_at(x = x, position = pos, n = length(x), default = default)
  )
  unlist(resolved, use.names = FALSE)
}
# helper ----
# Resolve a single position within a vector of length `n`; returns `default`
# when the (possibly negative) position falls outside 1..n.
.values_at <- function(x, position, n, default) {
  # A missing position cannot be resolved to an index.
  if (is.na(position)) {
    insight::format_error("`position` can't be `NA`.")
  }
  # Negative positions count backwards from the end (-1 is the last element).
  idx <- if (position < 0L) n + position + 1 else position
  # Out-of-range indices yield the caller-supplied default instead of NA.
  if (idx <= 0 || idx > n) {
    default
  } else {
    x[idx]
  }
}
================================================
FILE: .git-blame-ignore-revs
================================================
# Air formatting
5bd245e0bc12d2eecbcfa480a231b6df3ab7d684
================================================
FILE: .github/.gitignore
================================================
*.html
================================================
FILE: .github/CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at patilindrajeet.science@gmail.com.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
<https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>.
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
For answers to common questions about this code of conduct, see the FAQ at
<https://www.contributor-covenant.org/faq>. Translations are available at <https://www.contributor-covenant.org/translations>.
[homepage]: https://www.contributor-covenant.org
================================================
FILE: .github/CONTRIBUTING.md
================================================
# Contributing to datawizard
This outlines how to propose a change to **datawizard**.
## Fixing typos
Small typos or grammatical errors in documentation may be edited directly using the GitHub web interface, so long as the changes are made in the _source_ file. If you want to fix typos in the documentation, please edit the related `.R` file in the `R/` folder. Do _not_ edit an `.Rd` file in `man/`.
## Filing an issue
The easiest way to propose a change or new feature is to file an issue. If you've found a
bug, you may also create an associated issue. If possible, try to illustrate your proposal or the bug with a minimal [reproducible example](https://www.tidyverse.org/help/#reprex).
## Pull requests
* Please create a Git branch for each pull request (PR).
* Your contributed code should roughly follow the [R style guide](http://style.tidyverse.org), but in particular our [**easystats convention of code-style**](https://github.com/easystats/easystats#convention-of-code-style).
* datawizard uses [roxygen2](https://cran.r-project.org/package=roxygen2), with
[Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html),
for documentation.
* datawizard uses [testthat](https://cran.r-project.org/package=testthat). Adding tests to the PR makes it easier for me to merge your PR into the code base.
* If your PR is a user-visible change, you may add a bullet to the top of `NEWS.md` describing the changes made. You may optionally add your GitHub username, and links to relevant issue(s)/PR(s).
## Code of Conduct
Please note that this project is released with a [Contributor Code of Conduct](https://easystats.github.io/datawizard/CODE_OF_CONDUCT.html). By participating in this project you agree to
abide by its terms.
================================================
FILE: .github/FUNDING.yml
================================================
# These are supported funding model platforms
github: easystats
================================================
FILE: .github/SUPPORT.md
================================================
# Getting help with `{datawizard}`
Thanks for using `{datawizard}`. Before filing an issue, there are a few places
to explore and pieces to put together to make the process as smooth as possible.
Start by making a minimal **repr**oducible **ex**ample using the
[reprex](http://reprex.tidyverse.org/) package. If you haven't heard of or used
reprex before, you're in for a treat! Seriously, reprex will make all of your
R-question-asking endeavors easier (which is a pretty insane ROI for the five to
ten minutes it'll take you to learn what it's all about). For additional reprex
pointers, check out the [Get help!](https://www.tidyverse.org/help/) resource
used by the tidyverse team.
Armed with your reprex, the next step is to figure out where to ask:
* If it's a question: start with StackOverflow. There are more people there to answer questions.
* If it's a bug: you're in the right place, file an issue.
* If you're not sure: let the community help you figure it out! If your
problem _is_ a bug or a feature request, you can easily return here and
report it.
Before opening a new issue, be sure to [search issues and pull requests](https://github.com/easystats/datawizard/issues) to make sure the
bug hasn't been reported and/or already fixed in the development version. By
default, the search will be pre-populated with `is:issue is:open`. You can
[edit the qualifiers](https://help.github.com/articles/searching-issues-and-pull-requests/)
(e.g. `is:pr`, `is:closed`) as needed. For example, you'd simply
remove `is:open` to search _all_ issues in the repo, open or closed.
Thanks for your help!
================================================
FILE: .github/dependabot.yaml
================================================
version: 2
updates:
# Keep dependencies for GitHub Actions up-to-date
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
================================================
FILE: .github/workflows/R-CMD-check-hard.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow only directly installs "hard" dependencies, i.e. Depends,
# Imports, and LinkingTo dependencies. Notably, Suggests dependencies are never
# installed, with the exception of testthat, knitr, and rmarkdown. The cache is
# never used to avoid accidentally restoring a cache containing a suggested
# dependency.
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: R-CMD-check-hard
jobs:
R-CMD-check-hard:
uses: easystats/workflows/.github/workflows/R-CMD-check-hard.yaml@main
================================================
FILE: .github/workflows/R-CMD-check.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow is overkill for most R packages and
# check-standard.yaml is likely a better choice.
# usethis::use_github_action("check-standard") will install it.
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: R-CMD-check
jobs:
R-CMD-check:
uses: easystats/workflows/.github/workflows/R-CMD-check.yaml@main
================================================
FILE: .github/workflows/check-all-examples.yaml
================================================
# Make sure all examples run successfully, even the ones that are not supposed
# to be run or tested on CRAN machines by default.
#
# The examples that fail should use
# - `if (FALSE) { ... }` (if example is included only for illustrative purposes)
# - `try({ ... })` (if the intent is to show the error)
#
# This workflow helps find such failing examples that need to be modified.
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-all-examples
jobs:
check-all-examples:
uses: easystats/workflows/.github/workflows/check-all-examples.yaml@main
================================================
FILE: .github/workflows/check-link-rot.yaml
================================================
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-link-rot
jobs:
check-link-rot:
uses: easystats/workflows/.github/workflows/check-link-rot.yaml@main
================================================
FILE: .github/workflows/check-random-test-order.yaml
================================================
# Run tests in random order
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-random-test-order
jobs:
check-random-test-order:
uses: easystats/workflows/.github/workflows/check-random-test-order.yaml@main
================================================
FILE: .github/workflows/check-readme.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-readme
jobs:
check-readme:
uses: easystats/workflows/.github/workflows/check-readme.yaml@main
================================================
FILE: .github/workflows/check-spelling.yaml
================================================
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-spelling
jobs:
check-spelling:
uses: easystats/workflows/.github/workflows/check-spelling.yaml@main
================================================
FILE: .github/workflows/check-styling.yaml
================================================
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-styling
jobs:
check-styling:
uses: easystats/workflows/.github/workflows/check-styling.yaml@main
================================================
FILE: .github/workflows/check-test-warnings.yaml
================================================
# Running tests with options(warn = 2) to fail on test warnings
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-test-warnings
jobs:
check-test-warnings:
uses: easystats/workflows/.github/workflows/check-test-warnings.yaml@main
================================================
FILE: .github/workflows/check-vignette-warnings.yaml
================================================
# Building vignettes with options(warn = 2) to fail on vignette warnings
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: check-vignette-warnings
jobs:
check-vignette-warnings:
uses: easystats/workflows/.github/workflows/check-vignette-warnings.yaml@main
================================================
FILE: .github/workflows/html-5-check.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: html-5-check
jobs:
html-5-check:
uses: easystats/workflows/.github/workflows/html-5-check.yaml@main
================================================
FILE: .github/workflows/lint-changed-files.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
pull_request:
branches: [main, master]
name: lint-changed-files
jobs:
lint-changed-files:
uses: easystats/workflows/.github/workflows/lint-changed-files.yaml@main
================================================
FILE: .github/workflows/lint.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: lint
jobs:
lint:
uses: easystats/workflows/.github/workflows/lint.yaml@main
================================================
FILE: .github/workflows/pkgdown-no-suggests.yaml
================================================
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: pkgdown-no-suggests
jobs:
pkgdown-no-suggests:
uses: easystats/workflows/.github/workflows/pkgdown-no-suggests.yaml@main
================================================
FILE: .github/workflows/pkgdown.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
release:
types: [published]
workflow_dispatch:
name: pkgdown
jobs:
pkgdown:
uses: easystats/workflows/.github/workflows/pkgdown.yaml@main
================================================
FILE: .github/workflows/test-coverage-examples.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: test-coverage-examples
jobs:
test-coverage-examples:
uses: easystats/workflows/.github/workflows/test-coverage-examples.yaml@main
================================================
FILE: .github/workflows/test-coverage.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
name: test-coverage
jobs:
test-coverage:
uses: easystats/workflows/.github/workflows/test-coverage.yaml@main
================================================
FILE: .github/workflows/update-to-latest-easystats.yaml
================================================
on:
schedule:
# Check for dependency updates once a month
- cron: "0 0 1 * *"
name: update-to-latest-easystats
jobs:
update-to-latest-easystats:
uses: easystats/workflows/.github/workflows/update-to-latest-easystats.yaml@main
================================================
FILE: .gitignore
================================================
# History files
.Rhistory
.Rapp.history
# Session Data files
.RData
# Example code in package build process
*-Ex.R
# Output files from R CMD build
/*.tar.gz
# Output files from R CMD check
/*.Rcheck/
# RStudio files
.Rproj.user/
# produced vignettes
vignettes/*.html
vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# knitr and R markdown default cache directories
/*_cache/
/cache/
# Temporary files created by R markdown
*.utf8.md
*.knit.md
# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/
# =========================
# Operating System Files
# OSX
.DS_Store
.AppleDouble
.LSOverride
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
docs
inst/doc
CRAN-SUBMISSION
================================================
FILE: .lintr
================================================
linters: all_linters(
absolute_path_linter = NULL,
assignment_linter = NULL,
commented_code_linter = NULL,
cyclocomp_linter(25L),
if_not_else_linter(exceptions = character(0L)),
implicit_integer_linter = NULL,
library_call_linter = NULL,
line_length_linter(120L),
namespace_linter = NULL,
nonportable_path_linter = NULL,
object_name_linter = NULL,
object_length_linter(50L),
object_usage_linter = NULL,
todo_comment_linter = NULL,
string_boundary_linter = NULL,
strings_as_factors_linter = NULL,
undesirable_function_linter = NULL,
undesirable_operator_linter = NULL,
unnecessary_concatenation_linter(allow_single_expression = FALSE),
unused_import_linter = NULL
)
================================================
FILE: DESCRIPTION
================================================
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 1.3.1
Authors@R: c(
person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531")),
person("Etienne", "Bacher", , "etienne.bacher@protonmail.com", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-9271-5075")),
person("Dominique", "Makowski", , "dom.makowski@gmail.com", role = "aut",
comment = c(ORCID = "0000-0001-5375-9967")),
person("Daniel", "Lüdecke", , "d.luedecke@uke.de", role = "aut",
comment = c(ORCID = "0000-0002-8895-3206")),
person("Mattan S.", "Ben-Shachar", , "matanshm@post.bgu.ac.il", role = "aut",
comment = c(ORCID = "0000-0002-4287-4801")),
person("Brenton M.", "Wiernik", , "brenton@wiernik.org", role = "aut",
comment = c(ORCID = "0000-0001-9560-6336")),
person("Rémi", "Thériault", , "remi.theriault@mail.mcgill.ca", role = "ctb",
comment = c(ORCID = "0000-0003-4315-6788")),
person("Thomas J.", "Faulkenberry", , "faulkenberry@tarleton.edu", role = "rev"),
person("Robert", "Garrett", , "rcg4@illinois.edu", role = "rev")
)
Maintainer: Etienne Bacher <etienne.bacher@protonmail.com>
Description: A lightweight package to assist in key steps involved in any data
analysis workflow: (1) wrangling the raw data to get it in the needed form,
(2) applying preprocessing steps and statistical transformations, and
    (3) computing statistical summaries of data properties and distributions.
    It is also the data wrangling backend for packages in the 'easystats' ecosystem.
References: Patil et al. (2022) <doi:10.21105/joss.04684>.
License: MIT + file LICENSE
URL: https://easystats.github.io/datawizard/
BugReports: https://github.com/easystats/datawizard/issues
Depends:
R (>= 4.0)
Imports:
insight (>= 1.4.6),
stats,
utils
Suggests:
bayestestR,
boot,
BH,
brms,
curl,
data.table,
dplyr (>= 1.1),
effectsize,
emmeans,
fixest,
gamm4,
ggplot2 (>= 3.5.0),
gt,
haven,
httr,
knitr,
lme4,
mediation,
modelbased,
nanoparquet,
openssl,
parameters (>= 0.21.7),
performance (>= 0.14.0),
poorman (>= 0.2.7),
psych,
readxl,
readr,
rio,
rmarkdown,
rstanarm,
see,
testthat (>= 3.2.1),
tibble,
tidyr,
tinytable (>= 0.13.0),
withr
VignetteBuilder:
knitr
Encoding: UTF-8
Language: en-US
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.3
Config/testthat/edition: 3
Config/testthat/parallel: true
Config/Needs/website: easystats/easystatstemplate
================================================
FILE: LICENSE
================================================
YEAR: 2023
COPYRIGHT HOLDER: datawizard authors
================================================
FILE: LICENSE.md
================================================
# MIT License
Copyright (c) 2023 datawizard authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: NAMESPACE
================================================
# Generated by roxygen2: do not edit by hand
S3method(as.data.frame,datawizard_crosstabs)
S3method(as.data.frame,datawizard_tables)
S3method(as.double,parameters_kurtosis)
S3method(as.double,parameters_skewness)
S3method(as.double,parameters_smoothness)
S3method(as.numeric,parameters_kurtosis)
S3method(as.numeric,parameters_skewness)
S3method(as.numeric,parameters_smoothness)
S3method(as.prop.table,datawizard_crosstab)
S3method(as.prop.table,datawizard_crosstabs)
S3method(as.table,datawizard_crosstab)
S3method(as.table,datawizard_crosstabs)
S3method(as.table,datawizard_table)
S3method(as.table,datawizard_tables)
S3method(assign_labels,character)
S3method(assign_labels,data.frame)
S3method(assign_labels,default)
S3method(assign_labels,factor)
S3method(assign_labels,numeric)
S3method(categorize,data.frame)
S3method(categorize,default)
S3method(categorize,factor)
S3method(categorize,grouped_df)
S3method(categorize,numeric)
S3method(center,AsIs)
S3method(center,Date)
S3method(center,character)
S3method(center,data.frame)
S3method(center,default)
S3method(center,factor)
S3method(center,grouped_df)
S3method(center,logical)
S3method(center,numeric)
S3method(coef_var,default)
S3method(coef_var,numeric)
S3method(convert_na_to,character)
S3method(convert_na_to,data.frame)
S3method(convert_na_to,default)
S3method(convert_na_to,factor)
S3method(convert_na_to,numeric)
S3method(convert_to_na,Date)
S3method(convert_to_na,character)
S3method(convert_to_na,data.frame)
S3method(convert_to_na,default)
S3method(convert_to_na,factor)
S3method(convert_to_na,logical)
S3method(convert_to_na,numeric)
S3method(data_arrange,default)
S3method(data_arrange,grouped_df)
S3method(data_duplicated,data.frame)
S3method(data_duplicated,grouped_df)
S3method(data_extract,data.frame)
S3method(data_filter,data.frame)
S3method(data_filter,grouped_df)
S3method(data_merge,data.frame)
S3method(data_merge,list)
S3method(data_modify,data.frame)
S3method(data_modify,default)
S3method(data_modify,grouped_df)
S3method(data_peek,data.frame)
S3method(data_summary,data.frame)
S3method(data_summary,default)
S3method(data_summary,grouped_df)
S3method(data_summary,matrix)
S3method(data_tabulate,data.frame)
S3method(data_tabulate,default)
S3method(data_tabulate,grouped_df)
S3method(data_unique,data.frame)
S3method(data_unique,grouped_df)
S3method(describe_distribution,character)
S3method(describe_distribution,data.frame)
S3method(describe_distribution,default)
S3method(describe_distribution,factor)
S3method(describe_distribution,grouped_df)
S3method(describe_distribution,list)
S3method(describe_distribution,numeric)
S3method(display,data_codebook)
S3method(display,datawizard_crosstab)
S3method(display,datawizard_crosstabs)
S3method(display,datawizard_table)
S3method(display,datawizard_tables)
S3method(display,parameters_distribution)
S3method(format,data_codebook)
S3method(format,datawizard_crosstab)
S3method(format,datawizard_table)
S3method(format,dw_data_peek)
S3method(format,dw_groupmeans)
S3method(format,parameters_distribution)
S3method(kurtosis,data.frame)
S3method(kurtosis,default)
S3method(kurtosis,matrix)
S3method(kurtosis,numeric)
S3method(labels_to_levels,data.frame)
S3method(labels_to_levels,default)
S3method(labels_to_levels,factor)
S3method(makepredictcall,dw_transformer)
S3method(means_by_group,data.frame)
S3method(means_by_group,default)
S3method(means_by_group,numeric)
S3method(normalize,data.frame)
S3method(normalize,factor)
S3method(normalize,grouped_df)
S3method(normalize,matrix)
S3method(normalize,numeric)
S3method(plot,parameters_distribution)
S3method(plot,visualisation_recipe)
S3method(print,data_codebook)
S3method(print,data_seek)
S3method(print,datawizard_crosstab)
S3method(print,datawizard_crosstabs)
S3method(print,datawizard_table)
S3method(print,datawizard_tables)
S3method(print,dw_data_peek)
S3method(print,dw_data_summary)
S3method(print,dw_groupmeans)
S3method(print,dw_groupmeans_list)
S3method(print,dw_transformer)
S3method(print,parameters_distribution)
S3method(print,parameters_kurtosis)
S3method(print,parameters_skewness)
S3method(print,visualisation_recipe)
S3method(print_html,data_codebook)
S3method(print_html,datawizard_crosstab)
S3method(print_html,datawizard_crosstabs)
S3method(print_html,datawizard_table)
S3method(print_html,datawizard_tables)
S3method(print_html,dw_data_peek)
S3method(print_html,parameters_distribution)
S3method(print_md,data_codebook)
S3method(print_md,datawizard_crosstab)
S3method(print_md,datawizard_crosstabs)
S3method(print_md,datawizard_table)
S3method(print_md,datawizard_tables)
S3method(print_md,dw_data_peek)
S3method(print_md,parameters_distribution)
S3method(ranktransform,data.frame)
S3method(ranktransform,factor)
S3method(ranktransform,grouped_df)
S3method(ranktransform,numeric)
S3method(recode_values,character)
S3method(recode_values,data.frame)
S3method(recode_values,default)
S3method(recode_values,factor)
S3method(recode_values,numeric)
S3method(replace_nan_inf,data.frame)
S3method(replace_nan_inf,default)
S3method(rescale,data.frame)
S3method(rescale,default)
S3method(rescale,grouped_df)
S3method(rescale,numeric)
S3method(reverse,data.frame)
S3method(reverse,default)
S3method(reverse,factor)
S3method(reverse,grouped_df)
S3method(reverse,numeric)
S3method(rowid_as_column,default)
S3method(rowid_as_column,grouped_df)
S3method(skewness,data.frame)
S3method(skewness,default)
S3method(skewness,matrix)
S3method(skewness,numeric)
S3method(slide,data.frame)
S3method(slide,default)
S3method(slide,numeric)
S3method(smoothness,data.frame)
S3method(smoothness,default)
S3method(smoothness,numeric)
S3method(standardize,AsIs)
S3method(standardize,Date)
S3method(standardize,Surv)
S3method(standardize,bcplm)
S3method(standardize,biglm)
S3method(standardize,brmsfit)
S3method(standardize,character)
S3method(standardize,clm2)
S3method(standardize,data.frame)
S3method(standardize,datagrid)
S3method(standardize,default)
S3method(standardize,double)
S3method(standardize,factor)
S3method(standardize,fixest)
S3method(standardize,grouped_df)
S3method(standardize,integer)
S3method(standardize,logical)
S3method(standardize,matrix)
S3method(standardize,mediate)
S3method(standardize,mixor)
S3method(standardize,numeric)
S3method(standardize,visualisation_matrix)
S3method(standardize,wbgee)
S3method(standardize,wbm)
S3method(summary,parameters_kurtosis)
S3method(summary,parameters_skewness)
S3method(to_factor,Date)
S3method(to_factor,character)
S3method(to_factor,data.frame)
S3method(to_factor,default)
S3method(to_factor,double)
S3method(to_factor,factor)
S3method(to_factor,haven_labelled)
S3method(to_factor,logical)
S3method(to_factor,numeric)
S3method(to_numeric,Date)
S3method(to_numeric,POSIXct)
S3method(to_numeric,POSIXlt)
S3method(to_numeric,POSIXt)
S3method(to_numeric,character)
S3method(to_numeric,data.frame)
S3method(to_numeric,default)
S3method(to_numeric,double)
S3method(to_numeric,factor)
S3method(to_numeric,haven_labelled)
S3method(to_numeric,logical)
S3method(to_numeric,numeric)
S3method(unnormalize,data.frame)
S3method(unnormalize,default)
S3method(unnormalize,grouped_df)
S3method(unnormalize,numeric)
S3method(unstandardize,array)
S3method(unstandardize,character)
S3method(unstandardize,data.frame)
S3method(unstandardize,datagrid)
S3method(unstandardize,factor)
S3method(unstandardize,grouped_df)
S3method(unstandardize,matrix)
S3method(unstandardize,numeric)
S3method(unstandardize,visualisation_matrix)
S3method(winsorize,character)
S3method(winsorize,data.frame)
S3method(winsorize,factor)
S3method(winsorize,logical)
S3method(winsorize,numeric)
export(adjust)
export(as.prop.table)
export(assign_labels)
export(categorize)
export(center)
export(centre)
export(change_scale)
export(coef_var)
export(coerce_to_numeric)
export(colnames_to_row)
export(column_as_rownames)
export(contr.deviation)
export(convert_na_to)
export(convert_to_na)
export(data_addprefix)
export(data_addsuffix)
export(data_adjust)
export(data_arrange)
export(data_codebook)
export(data_duplicated)
export(data_extract)
export(data_filter)
export(data_group)
export(data_join)
export(data_match)
export(data_merge)
export(data_modify)
export(data_partition)
export(data_peek)
export(data_read)
export(data_relocate)
export(data_remove)
export(data_rename)
export(data_rename_rows)
export(data_reorder)
export(data_replicate)
export(data_restoretype)
export(data_rotate)
export(data_seek)
export(data_select)
export(data_separate)
export(data_summary)
export(data_tabulate)
export(data_to_long)
export(data_to_wide)
export(data_transpose)
export(data_ungroup)
export(data_unique)
export(data_unite)
export(data_write)
export(degroup)
export(demean)
export(describe_distribution)
export(detrend)
export(display)
export(distribution_coef_var)
export(distribution_mode)
export(empty_columns)
export(empty_rows)
export(extract_column_names)
export(find_columns)
export(kurtosis)
export(labels_to_levels)
export(mean_sd)
export(means_by_group)
export(median_mad)
export(normalize)
export(print_html)
export(print_md)
export(ranktransform)
export(recode_into)
export(recode_values)
export(remove_empty)
export(remove_empty_columns)
export(remove_empty_rows)
export(replace_nan_inf)
export(rescale)
export(rescale_weights)
export(reshape_ci)
export(reshape_longer)
export(reshape_wider)
export(reverse)
export(reverse_scale)
export(row_count)
export(row_means)
export(row_sums)
export(row_to_colnames)
export(rowid_as_column)
export(rownames_as_column)
export(skewness)
export(slide)
export(smoothness)
export(standardise)
export(standardize)
export(text_concatenate)
export(text_format)
export(text_fullstop)
export(text_lastchar)
export(text_paste)
export(text_remove)
export(text_wrap)
export(to_factor)
export(to_numeric)
export(unnormalize)
export(unstandardise)
export(unstandardize)
export(visualisation_recipe)
export(weighted_mad)
export(weighted_mean)
export(weighted_median)
export(weighted_sd)
export(winsorize)
importFrom(insight,display)
importFrom(insight,print_html)
importFrom(insight,print_md)
importFrom(stats,makepredictcall)
================================================
FILE: NEWS.md
================================================
# datawizard 1.3.1
CHANGES
* `data_summary()` now allows expressions to return more than one summary
value. For each value, a new column is created. Additionally, the optional
`suffix` argument controls the naming of these columns; if `suffix = NULL`,
column names are auto-generated (e.g., with numeric suffixes).
* `standardize()` now works on `fixest` estimations (#665).
* `data_read()` and `data_write()` gain a `password` argument, to encrypt and
decrypt data files. This currently only works for R file formats (`.rda`,
`.rds`, and `.rdata`). Data encryption is based on the AES-GCM algorithm using
the `openssl::aes_gcm_encrypt()` function (#675).
FIXES
* Fix a test due to R-devel change (#677).
# datawizard 1.3.0
BREAKING CHANGES
* Argument `values_fill` in `data_to_wide()` is now defunct, because it did not
work as intended (#645).
* `data_to_wide()` no longer removes empty columns that were created after
widening data frames, to behave similarly to `tidyr::pivot_wider()` (#645).
CHANGES
* `data_tabulate()` now saves the table of proportions for crosstables as
attribute, accessible via the new `as.prop.table()` method (#656).
* Due to changes in the package `insight`, `data_tabulate()` no longer prints
decimals when all values in a column are integers (#641).
* Argument `values_from` in `data_to_wide()` now supports select-helpers like
the `select` argument in other `{datawizard}` functions (#645).
* Added a `display()` method for `data_codebook()` (#646).
* `display()` methods now support the `{tinytable}` package. Use `format = "tt"`
to export tables as `tinytable` objects (#646).
* Improved performance for several functions that process grouped data frames
when the input is a grouped `tibble` (#651).
BUG FIXES
* Fixed an issue when `demean()`ing nested structures with more than 2 grouping
variables (#635).
* Fixed an issue when `demean()`ing crossed structures with more than 2 grouping
variables (#638).
* Fixed issue in `data_to_wide()` with multiple variables assigned in
`values_from` when IDs were not balanced (equally spread across observations)
(#644).
* Fixed issue in `data_replicate()` when data frame had only one column to
replicate (#654).
# datawizard 1.2.0
BREAKING CHANGES
* The following deprecated arguments have been removed (#603):
- `drop_na` in `data_match()`
- `safe`, `pattern`, and `verbose` in `data_rename()`
CHANGES
* `data_read()` and `data_write()` now support the `.parquet` file format, via
the *nanoparquet* package (#625).
* `data_tabulate()` gets a `display()` method (#627).
* `data_tabulate()` gets an `as.table()` method to coerce the frequency or
contingency table into a (list of) `table()` object(s). This can be useful for
further statistical analysis, e.g. in combination with `chisq.test()` (#629).
* The `print()` method for `data_tabulate()` now appears in the documentation,
making the `big_mark` argument visible (#627).
BUG FIXES
* Fixed an issue when printing cross tables using `data_tabulate(by = ...)`,
which was caused by the recent changes in `insight::export_table()`.
* Fixed another issue when printing cross tables using `data_tabulate(by = ...)`,
when more than one variable was selected for `select` (#630).
* Fixed typo in the documentation of `data_match()`.
# datawizard 1.1.0
BREAKING CHANGES
* `data_read()` now also returns Bayesian models from packages *brms* and
*rstanarm* as original model objects, and no longer coerces them into data
frames (#606).
* The output format of `describe_distribution()` on grouped data has changed.
Before, it printed one table per group combination. Now, it prints a single
table with group columns at the start (#610).
* The output format of `describe_distribution()` when confidence intervals are
requested has changed. Now, for each centrality measure a confidence interval
is calculated (#617).
* `data_modify()` now always uses values of a vector for a modified or newly
created variable, and no longer tries to detect whether a character value
possibly contains an expression. To allow expression provided as string (or
character vectors), use the helper-function `as_expr()`. Only literal
expressions or strings wrapped in `as_expr()` will be evaluated as
expressions, everything else will be treated as vector with values for new
variables (#605).
CHANGES
* `display()` is now re-exported from package *insight*.
* `data_read()` and `data_write()` now rely on base-R functions for files of
type `.rds`, `.rda` or `.rdata`. Thus, package *rio* is no longer required
to be installed for these file types (#607).
* `data_codebook()` gives an informative warning when no column names matched
the selection pattern (#601).
* `data_to_long()` now errors when columns selected to reshape do not exist in
the data, to avoid nonsensical results that could be missed (#602).
* New argument `by` in `describe_distribution()` (#604).
* `describe_distribution()` now gives informative errors when column names
in the input data frame conflict with column from the output table (#612).
* The methods for `parameters_distribution` objects are now defined in
`datawizard` (they were previously in `parameters`) (#613).
BUG FIXES
* Fixed bug in `data_to_wide()`, where new column names in `names_from` were
ignored when that column only contained one unique value.
* Fixed bug in `describe_distribution()` when some group combinations
didn't appear in the data (#609).
* Fixed bug in `describe_distribution()` when more than one value was
  specified for the `centrality` argument (#617).
* Fixed bug in `describe_distribution()` where setting `verbose = FALSE`
didn't hide some warnings (#617).
* Fixed warning in `data_summary()` when a variable had the same name as
another object in the global environment (#585).
# datawizard 1.0.2
BUG FIXES
* Fixed failing R CMD check on ATLAS, noLD, and OpenBLAS due to small numerical
differences (#592).
# datawizard 1.0.1
BUG FIXES
* Fixed issue in `data_arrange()` for data frames that only had one column.
Formerly, the data frame was coerced into a vector, now the data frame class
is preserved.
* Fixed issue in R-devel (4.5.0) due to a change in how `grep()` handles logical
arguments with missing values (#588).
# datawizard 1.0.0
BREAKING CHANGES AND DEPRECATIONS
* *datawizard* now requires R >= 4.0 (#515).
* Argument `drop_na` in `data_match()` is deprecated now. Please use
`remove_na` instead (#556).
* In `data_rename()` (#567):
- argument `pattern` is deprecated. Use `select` instead.
- argument `safe` is deprecated. The function now errors when `select`
contains unknown column names.
- when `replacement` is `NULL`, an error is now thrown (previously, column
indices were used as new names).
- if `select` (previously `pattern`) is a named vector, then all elements
must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors.
* Order of arguments `by` and `probability_weights` in `rescale_weights()` has
changed, because for `method = "kish"`, the `by` argument is optional (#575).
* The names of the rescaled weights variables in `rescale_weights()` have been
renamed. `pweights_a` and `pweights_b` are now named `rescaled_weights_a`
and `rescaled_weights_b` (#575).
* `print()` methods for `data_tabulate()` with multiple sub-tables (i.e. when
length of `by` was > 1) were revised. Now, an integrated table instead of
multiple tables is returned. Furthermore, `print_html()` did not work, which
was also fixed now (#577).
* `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`,
to append the centered variables to the original data frame, instead of
  returning the de- and group-meaned variables only. Use `append = FALSE`
for the previous default behaviour (i.e. only returning the newly created
variables) (#579).
CHANGES
* `rescale_weights()` gets a `method` argument, to choose method to rescale
weights. Options are `"carle"` (the default) and `"kish"` (#575).
* The `select` argument, which is available in different functions to select
variables, can now also be a character vector with quoted variable names,
including a colon to indicate a range of several variables (e.g. `"cyl:gear"`)
(#551).
* New function `row_sums()`, to calculate row sums (optionally with minimum
amount of valid values), as complement to `row_means()` (#552).
* New function `row_count()`, to count specific values row-wise (#553).
* `data_read()` no longer shows warning about forthcoming breaking changes
in upstream packages when reading `.RData` files (#557).
* `data_modify()` now recognizes `n()`, for example to create an index for data
groups with `1:n()` (#535).
* The `replacement` argument in `data_rename()` now supports glue-styled
tokens (#563).
* `data_summary()` also accepts the results of `bayestestR::ci()` as summary
function (#483).
* `ranktransform()` has a new argument `zeros` to determine how zeros should be
handled when `sign = TRUE` (#573).
BUG FIXES
* `describe_distribution()` no longer errors if the sample was too sparse to compute
CIs. Instead, it warns the user and returns `NA` (#550).
* `data_read()` preserves variable types when importing files from `rds` or
`rdata` format (#558).
# datawizard 0.13.0
BREAKING CHANGES
* `data_rename()` now errors when the `replacement` argument contains `NA` values
or empty strings (#539).
* Removed deprecated functions `get_columns()`, `data_find()`, `format_text()` (#546).
* Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546).
* The default value for the argument `dummy_factors` in `to_numeric()` has
changed from `TRUE` to `FALSE` (#544).
CHANGES
* The `pattern` argument in `data_rename()` can also be a named vector. In this
case, names are used as values for the `replacement` argument (i.e. `pattern`
can be a character vector using `<new name> = "<old name>"`).
* `categorize()` gains a new `breaks` argument, to decide whether breaks are
inclusive or exclusive (#548).
* The `labels` argument in `categorize()` gets two new options, `"range"` and
`"observed"`, to use the range of categorized values as labels (i.e. factor
levels) (#548).
* Minor additions to `reshape_ci()` to work with forthcoming changes in the
`{bayestestR}` package.
# datawizard 0.12.3
CHANGES
* `demean()` (and `degroup()`) now also work for nested designs, if argument
`nested = TRUE` and `by` specifies more than one variable (#533).
* Vignettes are no longer provided in the package, they are now only available
on the website. There is only one "Overview" vignette available in the package,
it contains links to the other vignettes on the website. This is because there
are CRAN errors occurring when building vignettes on macOS and we couldn't
determine the cause after multiple patch releases (#534).
# datawizard 0.12.2
* Remove `htmltools` from `Suggests` in an attempt of fixing an error in CRAN
checks due to failures to build a vignette (#528).
# datawizard 0.12.1
This is a patch release to fix one error on CRAN checks occurring because of a
missing package namespace in one of the vignettes.
# datawizard 0.12.0
BREAKING CHANGES
* The argument `include_na` in `data_tabulate()` and `data_summary()` has been
renamed into `remove_na`. Consequently, to mimic former behaviour, `FALSE` and
`TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former
`include_na = FALSE`).
* Class names for objects returned by `data_tabulate()` have been changed to
`datawizard_table` and `datawizard_crosstable` (resp. the plural forms,
`*_tables`), to provide a clearer and more consistent naming scheme.
CHANGES
* `data_select()` can directly rename selected variables when a named vector
is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`.
* `data_tabulate()` gains an `as.data.frame()` method, to return the frequency
table as a data frame. The structure of the returned object is a nested data
frame, where the first column contains name of the variable for which
frequencies were calculated, and the second column contains the frequency table.
* `demean()` (and `degroup()`) now also work for cross-classified designs, or
more generally, for data with multiple grouping or cluster variables (i.e.
`by` can now specify more than one variable).
# datawizard 0.11.0
BREAKING CHANGES
* Arguments named `group` or `group_by` are deprecated and will be removed
in a future release. Please use `by` instead. This affects the following
functions in *datawizard* (#502).
* `data_partition()`
* `demean()` and `degroup()`
* `means_by_group()`
* `rescale_weights()`
* Following aliases are deprecated and will be removed in a future release (#504):
* `get_columns()`, use `data_select()` instead.
* `data_find()` and `find_columns()`, use `extract_column_names()` instead.
* `format_text()`, use `text_format()` instead.
CHANGES
* `recode_into()` is more relaxed regarding checking the type of `NA` values.
If you recode into a numeric variable, and one of the recode values is `NA`,
you no longer need to use `NA_real_` for numeric `NA` values.
* Improved documentation for some functions.
BUG FIXES
* `data_to_long()` did not work for data frame where columns had attributes
(like labelled data).
# datawizard 0.10.0
BREAKING CHANGES
* The following arguments were deprecated in 0.5.0 and are now removed:
* in `data_to_wide()`: `colnames_from`, `rows_from`, `sep`
* in `data_to_long()`: `colnames_to`
* in `data_partition()`: `training_proportion`
NEW FUNCTIONS
* `data_summary()`, to compute summary statistics of (grouped) data frames.
* `data_replicate()`, to expand a data frame by replicating rows based on another
variable that contains the counts of replications per row.
CHANGES
* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
variables at specific positions or based on logical conditions.
* `data_tabulate()` was revised and gets several new arguments: a `weights`
argument, to compute weighted frequency tables. `include_na` allows to include
or omit missing values from the table. Furthermore, a `by` argument was added,
to compute crosstables (#479, #481).
# datawizard 0.9.1
CHANGES
* `rescale()` gains `multiply` and `add` arguments, to expand ranges by a given
factor or value.
* `to_factor()` and `to_numeric()` now support class `haven_labelled`.
BUG FIXES
* `to_numeric()` now correctly deals with inversed factor levels when
`preserve_levels = TRUE`.
* `to_numeric()` inversed order of value labels when `dummy_factors = FALSE`.
* `convert_to_na()` now preserves attributes for factors when `drop_levels = TRUE`.
# datawizard 0.9.0
NEW FUNCTIONS
* `row_means()`, to compute row means, optionally only for the rows with at
least `min_valid` non-missing values.
* `contr.deviation()` for sum-deviation contrast coding of factors.
* `means_by_group()`, to compute mean values of variables, grouped by levels
of specified factors.
* `data_seek()`, to seek for variables in a data frame, based on their
column names, variables labels, value labels or factor levels. Searching for
labels only works for "labelled" data, i.e. when variables have a `label` or
`labels` attribute.
CHANGES
* `recode_into()` gains an `overwrite` argument to skip overwriting already
recoded cases when multiple recode patterns apply to the same case.
* `recode_into()` gains a `preserve_na` argument to preserve `NA` values
 when recoding.
* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
This allows to read files with non-ASCII characters.
* `datawizard` moves from the GPL-3 license to the MIT license.
* `unnormalize()` and `unstandardize()` now work with grouped data (#415).
* `unnormalize()` now errors instead of emitting a warning if it doesn't have the
necessary info (#415).
BUG FIXES
* Fixed issue in `labels_to_levels()` when values of labels were not in sorted
order and values were not sequentially numbered.
* Fixed issues in `data_write()` when writing labelled data into SPSS format
and vectors were of different type as value labels.
* Fixed issues in `data_write()` when writing labelled data into SPSS format
for character vectors with missing value labels, but existing variable
labels.
* Fixed issue in `recode_into()` with probably wrong case number printed in the
warning when several recode patterns match to one case.
* Fixed issue in `recode_into()` when original data contained `NA` values and
`NA` was not included in the recode pattern.
* Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
faulty syntax.
* Fixed issue in `empty_column()` for strings with invalid multibyte strings.
For such data frames or files, `empty_column()` or `data_read()` no longer
fails.
# datawizard 0.8.0
BREAKING CHANGES
* The following re-exported functions from `{insight}` have now been removed:
`object_has_names()`, `object_has_rownames()`, `is_empty_object()`,
`compact_list()`, `compact_character()`.
* Argument `na.rm` was renamed to `remove_na` throughout `{datawizard}` functions.
`na.rm` is kept for backward compatibility, but will be deprecated and later
removed in future updates.
* The way expressions are defined in `data_filter()` was revised. The `filter`
 argument was replaced by `...`, allowing to separate multiple expressions with
 a comma (which are then combined with `&`). Furthermore, expressions can now also be
 defined as strings, or be provided as character vectors, to allow string-friendly
 programming.
CHANGES
* Weighted-functions (`weighted_sd()`, `weighted_mean()`, ...) gain a `remove_na`
argument, to remove or keep missing and infinite values. By default,
`remove_na = TRUE`, i.e. missing and infinite values are removed by default.
* `reverse_scale()`, `normalize()` and `rescale()` gain an `append` argument
(similar to other data frame methods of transformation functions), to append
recoded variables to the input data frame instead of overwriting existing
variables.
NEW FUNCTIONS
* `rowid_as_column()` to complement `rownames_as_column()` (and to mimic
`tibble::rowid_to_column()`). Note that its behavior is different from
`tibble::rowid_to_column()` for grouped data. See the Details section in the
docs.
* `data_unite()`, to merge values of multiple variables into one new variable.
* `data_separate()`, as counterpart to `data_unite()`, to separate a single
variable into multiple new variables.
* `data_modify()`, to create new variables, or modify or remove existing
variables in a data frame.
MINOR CHANGES
* `to_numeric()` for variables of type `Date`, `POSIXct` and `POSIXlt` now
includes the class name in the warning message.
* Added a `print()` method for `center()`, `standardize()`, `normalize()` and
`rescale()`.
BUG FIXES
* `standardize_parameters()` now works when the package namespace is in the model
formula (#401).
* `data_merge()` no longer yields a warning for `tibbles` when `join = "bind"`.
* `center()` and `standardize()` did not work for grouped data frames (of class
`grouped_df`) when `force = TRUE`.
* The `data.frame` method of `describe_distribution()` returns `NULL` instead of
an error if no valid variable were passed (for example a factor variable with
`include_factors = FALSE`) (#421).
# datawizard 0.7.1
BREAKING CHANGES
* `add_labs()` was renamed into `assign_labels()`. Since `add_labs()` existed
only for a few days, there will be no alias for backwards compatibility.
NEW FUNCTIONS
* `labels_to_levels()`, to use value labels of factors as their levels.
MINOR CHANGES
* `data_read()` now checks if the imported object actually is a data frame (or
coercible to a data frame), and if not, no longer errors, but gives an
informative warning of the type of object that was imported.
BUG FIXES
* Fix test for CRAN check on Mac OS arm64
# datawizard 0.7.0
BREAKING CHANGES
* In selection patterns, expressions like `-var1:var3` to exclude all variables
between `var1` and `var3` are no longer accepted. The correct expression is
`-(var1:var3)`. This is for 2 reasons:
* to be consistent with the behavior for numerics (`-1:2` is not accepted but
`-(1:2)` is);
* to be consistent with `dplyr::select()`, which throws a warning and only
uses the first variable in the first expression.
NEW FUNCTIONS
* `recode_into()`, similar to `dplyr::case_when()`, to recode values from one
or more variables into a new variable.
* `mean_sd()` and `median_mad()` for summarizing vectors to their mean (or
median) and a range of one SD (or MAD) above and below.
* `data_write()` as counterpart to `data_read()`, to write data frames into
CSV, SPSS, SAS, Stata files and many other file types. One advantage over
existing functions to write data in other packages is that labelled (numeric)
data can be converted into factors (with values labels used as factor levels)
even for text formats like CSV and similar. This allows exporting "labelled"
data into those file formats, too.
* `add_labs()`, to manually add value and variable labels as attributes to
variables. These attributes are stored as `"label"` and `"labels"` attributes,
similar to the `labelled` class from the _haven_ package.
MINOR CHANGES
* `data_rename()` gets a `verbose` argument.
* `winsorize()` now errors if the threshold is incorrect (previously, it provided
a warning and returned the unchanged data). The argument `verbose` is now
useless but is kept for backward compatibility. The documentation now contains
details about the valid values for `threshold` (#357).
* In all functions that have arguments `select` and/or `exclude`, there is now
 one warning per misspelled variable. The previous behavior was to throw a
 single warning covering all misspelled variables at once.
* Fixed inconsistent behaviour in `standardize()` when only one of the arguments
`center` or `scale` were provided (#365).
* `unstandardize()` and `replace_nan_inf()` now work with select helpers (#376).
* Added informative warning and error messages to `reverse()`. Furthermore, the
docs now describe the `range` argument more clearly (#380).
* `unnormalize()` errors with unexpected inputs (#383).
BUG FIXES
* `empty_columns()` (and therefore `remove_empty_columns()`) now correctly detects
columns containing only `NA_character_` (#349).
* Select helpers now work in custom functions when argument is called `select`
(#356).
* Fix unexpected warning in `convert_na_to()` when `select` is a list (#352).
* Fixed issue with correct labelling of numeric variables with more than nine
unique values and associated value labels.
# datawizard 0.6.5
MAJOR CHANGES
* Etienne Bacher is the new maintainer.
MINOR CHANGES
* `standardize()`, `center()`, `normalize()` and `rescale()` can be used in
model formulas, similar to `base::scale()`.
* `data_codebook()` now includes the proportion for each category/value, in
addition to the counts. Furthermore, if data contains tagged `NA` values,
these are included in the frequency table.
BUG FIXES
* `center(x)` now works correctly when `x` is a single value and either
`reference` or `center` is specified (#324).
* Fixed issue in `data_codebook()`, which failed for labelled vectors when
values of labels were not in sorted order.
# datawizard 0.6.4
NEW FUNCTIONS
* `data_codebook()`: to generate codebooks of data frames.
* New functions to deal with duplicates: `data_duplicated()` (keep all duplicates,
including the first occurrence) and `data_unique()` (returns the data, excluding
all duplicates except one instance of each, based on the selected method).
MINOR CHANGES
* `.data.frame` methods should now preserve custom attributes.
* The `include_bounds` argument in `normalize()` can now also be a numeric
value, defining the limit to the upper and lower bound (i.e. the distance
to 1 and 0).
* `data_filter()` now works with grouped data.
BUG FIXES
* `data_read()` no longer prints message for empty columns when the data
actually had no empty columns.
* `data_to_wide()` now drops columns that are not in `id_cols` (if specified),
`names_from`, or `values_from`. This is the behaviour observed in `tidyr::pivot_wider()`.
# datawizard 0.6.3
MAJOR CHANGES
* There is a new publication about the `{datawizard}` package:
<https://joss.theoj.org/papers/10.21105/joss.04684>
* Fixes failing tests due to changes in `R-devel`.
* `data_to_long()` and `data_to_wide()` have had significant performance
improvements, sometimes as high as a ten-fold speedup.
MINOR CHANGES
* When column names are misspelled, most functions now suggest which existing
columns possibly could be meant.
* Miscellaneous performance gains.
* `convert_to_na()` now requires argument `na` to be of class 'Date' to convert
specific dates to `NA`. For example, `convert_to_na(x, na = "2022-10-17")`
must be changed to `convert_to_na(x, na = as.Date("2022-10-17"))`.
BUG FIXES
* `data_to_long()` and `data_to_wide()` now correctly keep the `date` format.
# datawizard 0.6.2
BREAKING CHANGES
* Methods for grouped data frames (`.grouped_df`) no longer support
`dplyr::group_by()` for `{dplyr}` before version `0.8.0`.
* `empty_columns()` and `remove_empty_columns()` now also remove columns that
contain only empty characters. Likewise, `empty_rows()` and
`remove_empty_rows()` remove observations that completely have missing or
empty character values.
MINOR CHANGES
* `data_read()` gains a `convert_factors` argument, to turn off automatic
conversion from numeric variables into factors.
BUG FIXES
* `data_arrange()` now works with data frames that were grouped using
`data_group()` (#274).
# datawizard 0.6.1
* Updates tests for upcoming changes in the `{tidyselect}` package (#267).
# datawizard 0.6.0
BREAKING CHANGES
* The minimum needed R version has been bumped to `3.6`.
* Following deprecated functions have been removed:
`data_cut()`, `data_recode()`, `data_shift()`, `data_reverse()`,
`data_rescale()`, `data_to_factor()`, `data_to_numeric()`
* New `text_format()` alias is introduced for `format_text()`, latter of which
will be removed in the next release.
* New `recode_values()` alias is introduced for `change_code()`, latter of which
will be removed in the next release.
* `data_merge()` now errors if columns specified in `by` are not in both
datasets.
* Using negative values in arguments `select` and `exclude` now removes the
columns from the selection/exclusion. The previous behavior was to start the
selection/exclusion from the end of the dataset, which was inconsistent with
the use of "-" with other selecting possibilities.
NEW FUNCTIONS
* `data_peek()`: to peek at values and type of variables in a data frame.
* `coef_var()`: to compute the coefficient of variation.
CHANGES
* `data_filter()` will give more informative messages on malformed syntax of the
`filter` argument.
* It is now possible to use curly brackets to pass variable names to
`data_filter()`, like the following example. See examples section in the
documentation of `data_filter()`.
* The `regex` argument was added to functions that use select-helpers and did
not already have this argument.
* Select helpers `starts_with()`, `ends_with()`, and `contains()` now accept
 several patterns, e.g. `starts_with("Sep", "Petal")`.
* Arguments `select` and `exclude` that are present in most functions have been
improved to work in loops and in custom functions. For example, the following
code now works:
```r
foo <- function(data) {
i <- "Sep"
find_columns(data, select = starts_with(i))
}
foo(iris)
for (i in c("Sepal", "Sp")) {
head(iris) |>
find_columns(select = starts_with(i)) |>
print()
}
```
* There is now a vignette summarizing the various ways to select or exclude
variables in most `{datawizard}` functions.
# datawizard 0.5.1
* Fixes failing tests due to `{poorman}` update.
# datawizard 0.5.0
MAJOR CHANGES
* Following statistical transformation functions have been renamed to not have
`data_*()` prefix, since they do not work exclusively with data frames, but
are typically first of all used with vectors, and therefore had misleading
names:
- `data_cut()` -> `categorize()`
- `data_recode()` -> `change_code()`
- `data_shift()` -> `slide()`
- `data_reverse()` -> `reverse()`
- `data_rescale()` -> `rescale()`
- `data_to_factor()` -> `to_factor()`
- `data_to_numeric()` -> `to_numeric()`
Note that these functions also have `.data.frame()` methods and still work for
data frames as well. Former function names are still available as aliases, but
will be deprecated and removed in a future release.
* Bumps the needed minimum R version to `3.5`.
* Removed deprecated function `data_findcols()`. Please use its replacement,
`data_find()`.
* Removed alias `extract()` for `data_extract()` function since it collided with
`tidyr::extract()`.
* Argument `training_proportion` in `data_partition()` is deprecated. Please use
`proportion` now.
* Given his continued and significant contributions to the package, Etienne
Bacher (@etiennebacher) is now included as an author.
* `unstandardise()` now works for `center(x)`
* `unnormalize()` now works for `change_scale(x)`
* `reshape_wider()` now follows more consistently `tidyr::pivot_wider()` syntax.
Arguments `colnames_from`, `sep`, and `rows_from` are deprecated and should be
replaced by `names_from`, `names_sep`, and `id_cols` respectively.
`reshape_wider()` also gains an argument `names_glue` (#182, #198).
* Similarly, `reshape_longer()` now follows more consistently
`tidyr::pivot_longer()` syntax. Argument `colnames_to` is deprecated and
should be replaced by `names_to`. `reshape_longer()` also gains new arguments:
`names_prefix`, `names_sep`, `names_pattern`, and `values_drop_na` (#189).
CHANGES
* Some of the text formatting helpers (like `text_concatenate()`) gain an
`enclose` argument, to wrap text elements with surrounding characters.
* `winsorize` now accepts "raw" and "zscore" methods (in addition to
"percentile"). Additionally, when `robust` is set to `TRUE` together with
`method = "zscore"`, winsorizes via the median and median absolute deviation
(MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47).
* `convert_na_to` now accepts numeric replacements on character vectors and
single replacement for multiple vector classes. (@rempsyc, #214).
* `data_partition()` now allows to create multiple partitions from the data,
 returning multiple training sets and a remaining test set.
* Functions like `center()`, `normalize()` or `standardize()` no longer fail
when data contains infinite values (`Inf`).
NEW FUNCTIONS
* `row_to_colnames()` and `colnames_to_row()` to move a row to column names, and
column names to row (@etiennebacher, #169).
* `data_arrange()` to sort the rows of a dataframe according to the values of
the selected columns.
BUG FIXES
* Fixed wrong column names in `data_to_wide()` (#173).
# datawizard 0.4.1
BREAKING
* Added the `standardize.default()` method (moved from package **effectsize**),
to be consistent in that the default-method now is in the same package as the
generic. `standardize.default()` behaves exactly like in **effectsize** and
particularly works for regression model objects. **effectsize** now re-exports
`standardize()` from **datawizard**.
NEW FUNCTIONS
* `data_shift()` to shift the value range of numeric variables.
* `data_recode()` to recode old into new values.
* `data_to_factor()` as counterpart to `data_to_numeric()`.
* `data_tabulate()` to create frequency tables of variables.
* `data_read()` to read (import) data files (from text, or foreign statistical
packages).
* `unnormalize()` as counterpart to `normalize()`. This function only works for
variables that have been normalized with `normalize()`.
* `data_group()` and `data_ungroup()` to create grouped data frames, or to
remove the grouping information from grouped data frames.
CHANGES
* `data_find()` was added as alias to `find_columns()`, to have consistent name
 patterns for the **datawizard** functions. `data_findcols()` will be removed
 in a future update and usage is discouraged.
* The `select` argument (and thus, also the `exclude` argument) now also accepts
functions testing for logical conditions, e.g. `is.numeric()` (or
`is.numeric`), or any user-defined function that selects the variables for
which the function returns `TRUE` (like: `foo <- function(x) mean(x) > 3`).
* Arguments `select` and `exclude` now allow the negation of select-helpers,
like `-ends_with("")`, `-is.numeric` or `-Sepal.Width:Petal.Length`.
* Many functions now get a `.default` method, to capture unsupported classes.
This now yields a message and returns the original input, and hence, the
`.data.frame` methods won't stop due to an error.
* The `filter` argument in `data_filter()` can also be a numeric vector, to
indicate row indices of those rows that should be returned.
* `convert_to_na()` gets methods for variables of class `logical` and `Date`.
* `convert_to_na()` for factors (and data frames) gains a `drop_levels`
argument, to drop unused levels that have been replaced by `NA`.
* `data_to_numeric()` gains two more arguments, `preserve_levels` and `lowest`,
to give better control of conversion of factors.
BUG FIXES
* When logicals were passed to `center()` or `standardize()` and `force = TRUE`,
these were not properly converted to numeric variables.
# datawizard 0.4.0
MAJOR CHANGES
* `data_match()` now returns filtered data by default. Old behavior (returning
rows indices) can be set by setting `return_indices = TRUE`.
* The following functions are now re-exported from `{insight}` package:
`object_has_names()`, `object_has_rownames()`, `is_empty_object()`,
`compact_list()`, `compact_character()`
* `data_findcols()` will become deprecated in future updates. Please use the new
replacements `find_columns()` and `get_columns()`.
* The vignette *Analysing Longitudinal or Panel Data* has now moved to
[parameters
package](https://easystats.github.io/parameters/articles/demean.html).
NEW FUNCTIONS
* To convert rownames to a column, and *vice versa*: `rownames_as_column()` and
`column_as_rownames()` (@etiennebacher, #80).
* `find_columns()` and `get_columns()` to find column names or retrieve subsets
of data frames, based on various select-methods (including select-helpers).
These function will supersede `data_findcols()` in the future.
* `data_filter()` as complement for `data_match()`, which works with logical
expressions for filtering rows of data frames.
* For computing weighted centrality measures and dispersion: `weighted_mean()`,
`weighted_median()`, `weighted_sd()` and `weighted_mad()`.
* To replace `NA` in vectors and dataframes: `convert_na_to()` (@etiennebacher,
#111).
MINOR CHANGES
* The `select` argument in several functions (like `data_remove()`,
`reshape_longer()`, or `data_extract()`) now allows the use of select-helpers
for selecting variables based on specific patterns.
* `data_extract()` gains new arguments to allow type-safe return values,
i.e. *always* return a vector *or* a data frame. Thus, `data_extract()` can now
be used to select multiple variables or pull a single variable from data
frames.
* `data_match()` gains a `match` argument, to indicate with which logical
operation matching results should be combined.
* Improved support for *labelled data* for many functions, i.e. returned data
frame will preserve value and variable label attributes, where possible and
applicable.
* `describe_distribution()` now works with lists (@etiennebacher, #105).
* `data_rename()` doesn't use `pattern` anymore to rename the columns if
`replacement` is not provided (@etiennebacher, #103).
* `data_rename()` now adds a suffix to duplicated names in `replacement`
(@etiennebacher, #103).
BUG FIXES
* `data_to_numeric()` produced wrong results for factors when `dummy_factors =
TRUE` and factor contained missing values.
* `data_match()` produced wrong results when data contained missing values.
* Fixed CRAN check issues in `data_extract()` when more than one variable was
extracted from a data frame.
# datawizard 0.3.0
NEW FUNCTIONS
* To find or remove empty rows and columns in a data frame: `empty_rows()`,
`empty_columns()`, `remove_empty_rows()`, `remove_empty_columns()`, and
`remove_empty()`.
* To check for names: `object_has_names()` and `object_has_rownames()`.
* To rotate data frames: `data_rotate()`.
* To reverse score variables: `data_reverse()`.
* To merge/join multiple data frames: `data_merge()` (or its alias
`data_join()`).
* To cut (recode) data into groups: `data_cut()`.
* To replace specific values with `NA`s: `convert_to_na()`.
* To replace `Inf` and `NaN` values with `NA`s: `replace_nan_inf()`.
- Arguments `cols`, `before` and `after` in `data_relocate()` can now also be
numeric values, indicating the position of the destination column.
# datawizard 0.2.3
- New functions:
* to work with lists: `is_empty_object()` and `compact_list()`
* to work with strings: `compact_character()`
# datawizard 0.2.2
- New function `data_extract()` (or its alias `extract()`) to pull single
variables from a data frame, possibly naming each value by the row names of
that data frame.
- `reshape_ci()` gains a `ci_type` argument, to reshape data frames where
CI-columns have prefixes other than `"CI"`.
- `standardize()` and `center()` gain arguments `center` and `scale`, to define
references for centrality and deviation that are used when centering or
standardizing variables.
- `center()` gains the arguments `force` and `reference`, similar to
`standardize()`.
- The functionality of the `append` argument in `center()` and `standardize()`
was revised. This made the `suffix` argument redundant, and thus it was
removed.
- Fixed issue in `standardize()`.
- Fixed issue in `data_findcols()`.
# datawizard 0.2.1
- Exports `plot` method for `visualisation_recipe()` objects from `{see}`
package.
- `centre()`, `standardise()`, `unstandardise()` are exported as aliases for
`center()`, `standardize()`, `unstandardize()`, respectively.
# datawizard 0.2.0.1
- This is mainly a maintenance release that addresses some issues with
conflicting namespaces.
# datawizard 0.2.0
- New function: `visualisation_recipe()`.
- The following function has now moved to *performance* package:
`check_multimodal()`.
- Minor updates to documentation, including a new vignette about `demean()`.
# datawizard 0.1.0
* First release.
================================================
FILE: R/adjust.R
================================================
#' Adjust data for the effect of other variable(s)
#'
#' This function can be used to adjust the data for the effect of other
#' variables present in the dataset. It is based on an underlying fitting of
#' regressions models, allowing for quite some flexibility, such as including
#' factors as random effects in mixed models (multilevel partialization),
#' continuous variables as smooth terms in general additive models (non-linear
#' partialization) and/or fitting these models under a Bayesian framework. The
#' values returned by this function are the residuals of the regression models.
#' Note that a regular correlation between two "adjusted" variables is
#' equivalent to the partial correlation between them.
#'
#' @param data A data frame.
#' @param effect Character vector of column names to be adjusted for (regressed
#' out). If `NULL` (the default), all variables will be selected.
#' @param multilevel If `TRUE`, the factors are included as random factors.
#' Else, if `FALSE` (default), they are included as fixed effects in the
#' simple regression model.
#' @param additive If `TRUE`, continuous variables are included as smooth terms
#' in additive models. The goal is to regress-out potential non-linear
#' effects.
#' @param bayesian If `TRUE`, the models are fitted under the Bayesian framework
#' using `rstanarm`.
#' @param keep_intercept If `TRUE`, the intercept of the model is re-added to
#'   the residuals. This avoids the centering around 0 that happens by default
#'   when regressing out another variable (see the examples below for a
#'   visual representation of this).
#' @inheritParams extract_column_names
#' @inheritParams standardize
#'
#' @return A data frame comparable to `data`, with adjusted variables.
#'
#' @examplesIf all(insight::check_if_installed(c("bayestestR", "rstanarm", "gamm4"), quietly = TRUE))
#' adjusted_all <- adjust(attitude)
#' head(adjusted_all)
#' adjusted_one <- adjust(attitude, effect = "complaints", select = "rating")
#' head(adjusted_one)
#' \donttest{
#' adjust(attitude, effect = "complaints", select = "rating", bayesian = TRUE)
#' adjust(attitude, effect = "complaints", select = "rating", additive = TRUE)
#' attitude$complaints_LMH <- cut(attitude$complaints, 3)
#' adjust(attitude, effect = "complaints_LMH", select = "rating", multilevel = TRUE)
#' }
#'
#' # Generate data
#' data <- bayestestR::simulate_correlation(n = 100, r = 0.7)
#' data$V2 <- (5 * data$V2) + 20 # Add intercept
#'
#' # Adjust
#' adjusted <- adjust(data, effect = "V1", select = "V2")
#' adjusted_icpt <- adjust(data, effect = "V1", select = "V2", keep_intercept = TRUE)
#'
#' # Visualize
#' plot(
#' data$V1, data$V2,
#' pch = 19, col = "blue",
#' ylim = c(min(adjusted$V2), max(data$V2)),
#' main = "Original (blue), adjusted (green), and adjusted - intercept kept (red) data"
#' )
#' abline(lm(V2 ~ V1, data = data), col = "blue")
#' points(adjusted$V1, adjusted$V2, pch = 19, col = "green")
#' abline(lm(V2 ~ V1, data = adjusted), col = "green")
#' points(adjusted_icpt$V1, adjusted_icpt$V2, pch = 19, col = "red")
#' abline(lm(V2 ~ V1, data = adjusted_icpt), col = "red")
#'
#' @export
adjust <- function(
  data,
  effect = NULL,
  select = is.numeric,
  exclude = NULL,
  multilevel = FALSE,
  additive = FALSE,
  bayesian = FALSE,
  keep_intercept = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = FALSE
) {
  # make sure column names are syntactically valid
  .check_dataframe_names(data, action = "error")

  # check for formula notation, convert to character vector
  if (inherits(effect, "formula")) {
    effect <- all.vars(effect)
  }

  # Find predictors: by default, adjust for all variables in the data
  if (is.null(effect)) {
    effect <- names(data)
  }
  if (is.null(select)) {
    select <- is.numeric
  }
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # Factors: for multilevel models, non-numeric variables are moved out of
  # the fixed effects and into a random-effects formula "(1|fac)"
  formula_random <- NULL
  facs <- names(data[effect][!vapply(data[effect], is.numeric, logical(1L))])
  if (length(facs) >= 1 && multilevel) {
    if (additive) {
      formula_random <- stats::as.formula(paste(
        "~",
        paste(paste0("(1|", facs, ")"), collapse = " + ")
      ))
    } else {
      formula_random <- paste(
        "+",
        paste(paste0("(1|", facs, ")"), collapse = " + ")
      )
    }
    effect <- effect[!effect %in% facs]
  }

  # Fit one model per selected variable, regressing it on the (other)
  # effect variables; the residuals are the "adjusted" values
  out <- data.frame(.ID = seq_len(nrow(data)))
  for (var in select) {
    predictors <- effect[effect != var]
    if (additive) {
      # wrap numeric predictors in s() so they enter the GAM as smooth terms
      predictors_num <- names(data[predictors][vapply(
        data[predictors],
        is.numeric,
        logical(1L)
      )])
      # use %in% instead of ==: "==" recycles and silently misaligns when
      # only a subset of the predictors is numeric
      predictors[predictors %in% predictors_num] <- paste0(
        "s(",
        predictors_num,
        ")"
      )
    }
    formula_predictors <- paste(c("1", predictors), collapse = " + ")
    model_formula <- paste(var, "~", formula_predictors)
    x <- .model_adjust_for(
      data = data[unique(c(var, effect, facs))],
      model_formula = model_formula,
      multilevel = multilevel,
      additive = additive,
      bayesian = bayesian,
      formula_random = formula_random,
      keep_intercept = keep_intercept
    )
    out[var] <- x
  }

  # re-add non-adjusted variables and restore the original column order
  # (this also drops the internal ".ID" helper column)
  out[names(data)[!names(data) %in% names(out)]] <- data[names(data)[
    !names(data) %in% names(out)
  ]]
  out[names(data)]
}
#' @rdname adjust
#' @export
# alias for adjust(), for consistency with the data_*() naming scheme
data_adjust <- adjust
#' @keywords internal
# Fit a regression of `model_formula` on `data` and return its residuals,
# i.e. the outcome "adjusted" for the predictors. The model type depends on
# the flags:
#   - additive = TRUE   -> GAM via rstanarm::stan_gamm4() or gamm4::gamm4()
#   - bayesian = TRUE   -> Bayesian model via rstanarm
#   - multilevel = TRUE -> random effects (`formula_random`) via lme4/rstanarm
#   - otherwise         -> plain stats::lm()
# If `keep_intercept` is TRUE, the model intercept is added back to the
# residuals, so the adjusted values are not centered around zero.
# Returns a vector of length nrow(data), with NA for incomplete cases.
.model_adjust_for <- function(
  data,
  model_formula,
  multilevel = FALSE,
  additive = FALSE,
  bayesian = FALSE,
  formula_random = NULL,
  keep_intercept = FALSE
) {
  # Additive -----------------------
  if (additive) {
    # Bayesian
    if (bayesian) {
      insight::check_if_installed("rstanarm")
      model <- rstanarm::stan_gamm4(
        stats::as.formula(model_formula),
        random = formula_random,
        data = data,
        refresh = 0
      )
      # Frequentist
    } else {
      insight::check_if_installed("gamm4")
      model <- gamm4::gamm4(
        stats::as.formula(model_formula),
        random = formula_random,
        data = data
      )
    }
    # Linear -------------------------
  } else if (bayesian) {
    # Bayesian
    insight::check_if_installed("rstanarm")
    if (multilevel) {
      # random-effects part is a string here, pasted onto the fixed formula
      model <- rstanarm::stan_lmer(
        paste(model_formula, formula_random),
        data = data,
        refresh = 0
      )
    } else {
      model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0)
    }
  } else if (multilevel) {
    # Frequentist
    insight::check_if_installed("lme4")
    model <- lme4::lmer(paste(model_formula, formula_random), data = data)
  } else {
    model <- stats::lm(model_formula, data = data)
  }
  adjusted <- insight::get_residuals(model)
  # Re-add intercept if need be
  if (keep_intercept) {
    intercept <- insight::get_intercept(model)
    # Bayesian models return multiple intercept draws; collapse to the median
    if (length(intercept) > 1) {
      intercept <- stats::median(intercept)
    } # For bayesian model
    if (is.na(intercept)) {
      intercept <- 0
    }
    adjusted <- adjusted + intercept
  }
  # Deal with missing data: residuals exist only for complete cases, so map
  # them back to their original row positions and leave NA elsewhere
  out <- rep(NA, nrow(data))
  out[stats::complete.cases(data)] <- as.vector(adjusted)
  out
}
================================================
FILE: R/assign_labels.R
================================================
#' @title Assign variable and value labels
#' @name assign_labels
#'
#' @description
#' Assign variable and value labels to a variable or variables in a data frame.
#' Labels are stored as attributes (`"label"` for variable labels and
#' `"labels"` for value labels).
#'
#' @param x A data frame, factor or vector.
#' @param variable The variable label as string.
#' @param values The value labels as (named) character vector. If `values` is
#' *not* a named vector, the length of labels must be equal to the length of
#' unique values. For a named vector, the left-hand side (LHS) is the value in
#' `x`, the right-hand side (RHS) the associated value label. Non-matching
#' labels are omitted.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return A labelled variable, or a data frame of labelled variables.
#'
#' @examples
#' x <- 1:3
#' # labelling by providing required number of labels
#' assign_labels(
#' x,
#' variable = "My x",
#' values = c("one", "two", "three")
#' )
#'
#' # labelling using named vectors
#' data(iris)
#' out <- assign_labels(
#' iris$Species,
#' variable = "Labelled Species",
#' values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3")
#' )
#' str(out)
#'
#' # data frame example
#' out <- assign_labels(
#' iris,
#' select = "Species",
#' variable = "Labelled Species",
#' values = c(`setosa` = "Spec1", `versicolor` = "Spec2", `virginica` = "Spec3")
#' )
#' str(out$Species)
#'
#' # Partial labelling
#' x <- 1:5
#' assign_labels(
#' x,
#' variable = "My x",
#' values = c(`1` = "lowest", `5` = "highest")
#' )
#' @export
# S3 generic; dispatches to the class-specific methods below
assign_labels <- function(x, ...) {
  UseMethod("assign_labels")
}
#' @export
assign_labels.default <- function(x, verbose = TRUE, ...) {
  # fallback method: labels are not supported for this class, so the input
  # is returned unchanged, optionally with an explanatory message
  if (isTRUE(verbose)) {
    msg <- sprintf(
      "Adding labels currently not possible for variables of class `%s`.",
      class(x)[1]
    )
    insight::format_alert(msg)
  }
  x
}
#' @rdname assign_labels
#' @export
assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {
  # add variable label (stored in the "label" attribute)
  if (!is.null(variable)) {
    if (is.character(variable) && length(variable) == 1L) {
      attr(x, "label") <- variable
    } else {
      insight::format_error(
        "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." # nolint
      )
    }
  }
  # if user just wants to add a variable label, skip next steps
  if (!is.null(values)) {
    # extract unique values
    unique_values <- as.vector(sort(stats::na.omit(unique(x))))
    value_labels <- NULL
    # do we have a named vector for "values"?
    # else check if number of labels and values match
    if (is.null(names(values))) {
      # unnamed labels: their count must equal the number of unique values;
      # labels are then assigned to the sorted unique values in order
      if (length(values) == length(unique_values)) {
        value_labels <- stats::setNames(unique_values, values)
      } else {
        insight::format_error(
          "Cannot add labels. Number of unique values and number of value labels are not equal.",
          sprintf(
            "There are %i unique values and %i provided labels.",
            length(unique_values),
            length(values)
          )
        )
      }
    } else {
      # named labels: names refer to values in `x`, elements are the labels.
      # check whether we have matches of labels and values
      matching_labels <- names(values) %in% unique_values
      if (!all(matching_labels)) {
        insight::format_error(
          "Following labels were associated with values that don't exist:",
          text_concatenate(
            paste0(
              values[!matching_labels],
              " (",
              names(values)[!matching_labels],
              ")"
            ),
            enclose = "`"
          )
        )
      }
      # keep only labels whose values are actually present in `x`
      values <- values[names(values) %in% unique_values]
      if (length(values)) {
        # we need to switch names and values: the "labels" attribute is a
        # named vector with labels as names and values as elements
        value_labels <- stats::setNames(
          coerce_to_numeric(names(values)),
          values
        )
      }
    }
    attr(x, "labels") <- value_labels
  }
  x
}
#' @export
# factors and character vectors are labelled the same way as numeric vectors
assign_labels.factor <- assign_labels.numeric
#' @export
assign_labels.character <- assign_labels.numeric
#' @rdname assign_labels
#' @export
assign_labels.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  values = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # resolve the select/exclude expressions into concrete column names
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # label each selected column in place; all other columns stay untouched
  for (column in select) {
    x[[column]] <- assign_labels(
      x[[column]],
      values = values,
      verbose = verbose,
      ...
    )
  }
  x
}
================================================
FILE: R/categorize.R
================================================
#' @title Recode (or "cut" / "bin") data into groups of values.
#' @name categorize
#'
#' @description
#' This function divides the range of variables into intervals and recodes
#' the values inside these intervals according to their related interval.
#' It is basically a wrapper around base R's `cut()`, providing a simplified
#' and more accessible way to define the interval breaks (cut-off values).
#'
#' @param x A (grouped) data frame, numeric vector or factor.
#' @param split Character vector, indicating at which breaks to split variables,
#' or numeric values with values indicating breaks. If character, may be one
#' of `"median"`, `"mean"`, `"quantile"`, `"equal_length"`, or `"equal_range"`.
#' `"median"` or `"mean"` will return dichotomous variables, split at their
#' mean or median, respectively. `"quantile"` and `"equal_length"` will split
#' the variable into `n_groups` groups, where each group refers to an interval
#' of a specific range of values. Thus, the length of each interval will be
#' based on the number of groups. `"equal_range"` also splits the variable
#' into multiple groups, however, the length of the interval is given, and
#' the number of resulting groups (and hence, the number of breaks) will be
#' determined by how many intervals can be generated, based on the full range
#' of the variable.
#' @param n_groups If `split` is `"quantile"` or `"equal_length"`, this defines
#' the number of requested groups (i.e. resulting number of levels or values)
#' for the recoded variable(s). `"quantile"` will define intervals based
#' on the distribution of the variable, while `"equal_length"` tries to
#' divide the range of the variable into pieces of equal length.
#' @param range If `split = "equal_range"`, this defines the range of values
#' that are recoded into a new value.
#' @param lowest Minimum value of the recoded variable(s). If `NULL` (the default),
#' for numeric variables, the minimum of the original input is preserved. For
#' factors, the default minimum is `1`. For `split = "equal_range"`, the
#' default minimum is always `1`, unless specified otherwise in `lowest`.
#' @param breaks Character, indicating whether breaks for categorizing data are
#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
#' group or interval to begin). Use `labels = "range"` to make this behaviour
#' easier to see.
#' @param labels Character vector of value labels. If not `NULL`, `categorize()`
#' will return factors instead of numeric variables, with `labels` used
#' for labelling the factor levels. Can also be `"mean"`, `"median"`,
#' `"range"` or `"observed"` for a factor with labels as the mean/median,
#' the requested range (even if not all values of that range are present in
#' the data) or observed range (range of the actual recoded values) of each
#' group. See 'Examples'.
#' @param append Logical or string. If `TRUE`, recoded or converted variables
#' get new column names and are appended (column bind) to `x`, thus returning
#' both the original and the recoded variables. The new columns get a suffix,
#' based on the calling function: `"_r"` for recode functions, `"_n"` for
#' `to_numeric()`, `"_f"` for `to_factor()`, or `"_s"` for
#' `slide()`. If `append=FALSE`, original variables in `x` will be
#' overwritten by their recoded versions. If a character value, recoded
#' variables are appended with new column names (using the defined suffix) to
#' the original data frame.
#' @param ... not used.
#' @inheritParams extract_column_names
#'
#' @inherit data_rename seealso
#'
#' @details
#'
#' # Splits and breaks (cut-off values)
#'
#' Breaks are by default _exclusive_, this means that these values indicate
#' the lower bound of the next group or interval to begin. Take a simple
#' example, a numeric variable with values from 1 to 9. The median would be 5,
#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
#' would turn into 2 (compare `cbind(1:9, categorize(1:9))`). The same variable,
#' using `split = "quantile"` and `n_groups = 3` would define breaks at 3.67
#' and 6.33 (see `quantile(1:9, probs = c(1/3, 2/3))`), which means that values
#' from 1 to 3 belong to the first interval and are recoded into 1 (because
#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
#'
#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
#' case the break values indicate the _upper_ bound of the previous group or
#' interval, i.e. each group includes its upper boundary value.
#'
#' # Recoding into groups with equal size or range
#'
#' `split = "equal_length"` and `split = "equal_range"` try to divide the
#' range of `x` into intervals of similar (or same) length. The difference is
#' that `split = "equal_length"` will divide the range of `x` into `n_groups`
#' pieces and thereby defining the intervals used as breaks (hence, it is
#' equivalent to `cut(x, breaks = n_groups)`), while `split = "equal_range"`
#' will cut `x` into intervals that all have the length of `range`, where the
#' first interval by defaults starts at `1`. The lowest (or starting) value
#' of that interval can be defined using the `lowest` argument.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return `x`, recoded into groups. By default `x` is numeric, unless `labels`
#' is specified. In this case, a factor is returned, where the factor levels
#' (i.e. recoded groups) are labelled accordingly.
#'
#' @examples
#' set.seed(123)
#' x <- sample(1:10, size = 50, replace = TRUE)
#'
#' table(x)
#'
#' # by default, at median
#' table(categorize(x))
#'
#' # into 3 groups, based on distribution (quantiles)
#' table(categorize(x, split = "quantile", n_groups = 3))
#'
#' # into 3 groups, user-defined break
#' table(categorize(x, split = c(3, 5)))
#'
#' set.seed(123)
#' x <- sample(1:100, size = 500, replace = TRUE)
#'
#' # into 5 groups, try to recode into intervals of similar length,
#' # i.e. the range within groups is the same for all groups
#' table(categorize(x, split = "equal_length", n_groups = 5))
#'
#' # into 5 groups, try to return same range within groups
#' # i.e. 1-20, 21-40, 41-60, etc. Since the range of "x" is
#' # 1-100, and we have a range of 20, this results into 5
#' # groups, and thus is for this particular case identical
#' # to the previous result.
#' table(categorize(x, split = "equal_range", range = 20))
#'
#' # return factor with value labels instead of numeric value
#' set.seed(123)
#' x <- sample(1:10, size = 30, replace = TRUE)
#' categorize(x, "equal_length", n_groups = 3)
#' categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high"))
#'
#' # cut numeric into groups with the mean or median as a label name
#' x <- sample(1:10, size = 30, replace = TRUE)
#' categorize(x, "equal_length", n_groups = 3, labels = "mean")
#' categorize(x, "equal_length", n_groups = 3, labels = "median")
#'
#' # cut numeric into groups with the requested range as a label name
#' # each category has the same range, and labels indicate this range
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
#' # in this example, each category has the same range, but labels only refer
#' # to the ranges of the actual values (present in the data) inside each group
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
#' @export
# S3 generic; dispatches to the class-specific methods below
categorize <- function(x, ...) {
  UseMethod("categorize")
}
#' @export
categorize.default <- function(x, verbose = TRUE, ...) {
  # fallback method: unsupported classes are returned unchanged, optionally
  # with an explanatory message
  if (isTRUE(verbose)) {
    msg <- paste0(
      "Variables of class `",
      class(x)[1],
      "` can't be recoded and remain unchanged."
    )
    insight::format_alert(msg)
  }
  x
}
#' @rdname categorize
#' @export
categorize.numeric <- function(
  x,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  verbose = TRUE,
  ...
) {
  # sanity check: validate `split` and its required companion arguments
  split <- .sanitize_split_arg(split, n_groups, range)
  # handle aliases
  if (identical(split, "equal_length")) {
    split <- "length"
  }
  if (identical(split, "equal_range")) {
    split <- "range"
  }
  # check for valid values
  breaks <- match.arg(breaks, c("exclusive", "inclusive"))
  # save the original vector, so NA positions can be restored later
  original_x <- x
  # no missings
  x <- stats::na.omit(x)
  # stop if all NA
  if (!length(x)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Variable contains only missing values. No recoding carried out."
      )
    }
    return(original_x)
  }
  if (is.numeric(split)) {
    # user-defined numeric breaks are used as-is
    category_splits <- split
  } else {
    # compute break points according to the requested split method
    category_splits <- switch(
      split,
      median = stats::median(x),
      mean = mean(x),
      length = n_groups,
      quantile = stats::quantile(x, probs = seq_len(n_groups) / n_groups),
      range = .equal_range(x, range, n_groups, lowest),
      NULL
    )
  }
  # complete ranges, including minimum and maximum
  # (for "length", cut() receives the number of groups, not break points)
  if (!identical(split, "length")) {
    category_splits <- unique(c(min(x), category_splits, max(x)))
  }
  # recode into groups; `right` controls inclusive vs. exclusive breaks
  out <- droplevels(cut(
    x,
    breaks = category_splits,
    include.lowest = TRUE,
    right = identical(breaks, "inclusive")
  ))
  # keep the interval labels produced by cut(), used for `labels = "range"`
  cut_result <- out
  levels(out) <- 1:nlevels(out)
  # fix lowest value, add back into original vector
  out <- as.numeric(out)
  if (!is.null(lowest)) {
    out <- out - (min(out) - lowest)
  }
  original_x[!is.na(original_x)] <- out
  # turn into factor?
  .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...)
}
#' @export
categorize.factor <- function(x, ...) {
  # remember the original factor so its label attributes can be restored
  input <- x
  # replace level labels by their position, then recode the integer codes
  levels(x) <- 1:nlevels(x)
  recoded <- as.factor(categorize(as.numeric(x), ...))
  # copy variable labels (but not value labels) back onto the result
  .set_back_labels(recoded, input, include_values = FALSE)
}
#' @rdname categorize
#' @export
categorize.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # resolve the select/exclude expressions into concrete column names
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # when appending, ".process_append()" duplicates the selected columns
  # under new (suffixed) names and points "select" at those copies, so the
  # originals are kept and only the copies get recoded
  if (!isFALSE(append)) {
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r"
    )
    x <- my_args$x
    select <- my_args$select
  }
  # recode each selected column in place
  for (column in select) {
    x[[column]] <- categorize(
      x[[column]],
      split = split,
      n_groups = n_groups,
      range = range,
      lowest = lowest,
      breaks = breaks,
      labels = labels,
      verbose = verbose,
      ...
    )
  }
  x
}
#' @export
# method for dplyr-grouped data frames: the recoding (e.g. the median split)
# is computed separately within each group of rows
categorize.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  breaks = "exclusive",
  labels = NULL,
  append = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # row indices for each group, as stored by dplyr in the "groups" attribute
  grps <- attr(x, "groups", exact = TRUE)[[".rows"]]
  # remember attributes (class, grouping information) to restore them later
  attr_data <- attributes(x)
  # evaluate arguments; grouping variables are excluded from the selection
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    remove_group_var = TRUE,
    verbose = verbose
  )
  # when we append variables, we call ".process_append()", which will
  # create the new variables and updates "select", so new variables are processed
  if (!isFALSE(append)) {
    # process arguments
    my_args <- .process_append(
      x,
      select,
      append,
      append_suffix = "_r"
    )
    # update processed arguments
    x <- my_args$x
    select <- my_args$select
  }
  x <- as.data.frame(x)
  # apply the data.frame method to each group's rows separately
  for (rows in grps) {
    x[rows, ] <- categorize(
      x[rows, , drop = FALSE],
      split = split,
      n_groups = n_groups,
      range = range,
      lowest = lowest,
      breaks = breaks,
      labels = labels,
      select = select,
      exclude = exclude,
      append = FALSE, # need to set to FALSE here, else variable will be doubled
      ignore_case = ignore_case,
      verbose = verbose,
      ...
    )
  }
  # set back class, so data frame still works with dplyr
  x <- .replace_attrs(x, attr_data)
  x
}
# tools --------------------
# Build break points for `split = "equal_range"`: a sequence starting at
# `lowest` (default 1) that increases by `range` up to the maximum of `x`.
# When no interval length is given, it is derived from `n_groups`.
.equal_range <- function(x, range, n_groups, lowest = NULL) {
  start <- if (is.null(lowest)) 1 else lowest
  if (is.null(range)) {
    # derive the interval length from the requested number of groups
    range <- as.numeric(ceiling((max(x) - min(x)) / n_groups))
  }
  seq(start, max(x), by = range)
}
# Validate the `split` argument of `categorize()`. Character values are
# (partially) matched against the supported split methods, and combinations
# that lack a required `n_groups`/`range` raise an informative error.
# Numeric values (user-defined breaks) pass through unchanged.
.sanitize_split_arg <- function(split, n_groups, range) {
  if (is.character(split)) {
    valid_splits <- c(
      "median",
      "mean",
      "quantile",
      "equal_length",
      "equal_range",
      "equal",
      "equal_distance",
      "range",
      "distance"
    )
    split <- match.arg(split, choices = valid_splits)
    # splitting into a fixed number of groups requires `n_groups`
    if (split %in% c("quantile", "equal_length") && is.null(n_groups)) {
      insight::format_error(
        "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified."
      )
    }
    # splitting into ranges requires either `range` or `n_groups`
    if (split == "equal_range" && is.null(n_groups) && is.null(range)) {
      insight::format_error(
        "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified."
      )
    }
  }
  split
}
# Convert the recoded vector back into a factor with user-defined or
# computed labels (called at the end of categorize.numeric()).
#
# original_x: full-length recoded vector, including NA positions
# x:          original (non-recoded) values without missings
# cut_result: factor returned by cut(), whose levels carry interval labels
# labels:     user-provided level labels, or one of "mean", "median",
#             "range", "observed"
# out:        recoded values without missings (used for length checks)
# Returns `original_x`, converted to a factor when labelling succeeded.
.original_x_to_factor <- function(
  original_x,
  x,
  cut_result,
  labels,
  out,
  verbose,
  ...
) {
  if (!is.null(labels)) {
    # one label per group: use them directly as factor levels
    if (length(labels) == length(unique(out))) {
      original_x <- as.factor(original_x)
      levels(original_x) <- labels
    } else if (
      length(labels) == 1 &&
        labels %in% c("mean", "median", "range", "observed")
    ) {
      original_x <- as.factor(original_x)
      no_na_x <- original_x[!is.na(original_x)]
      out <- switch(
        labels,
        # per-group mean/median of the original values as level labels
        mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x,
        median = stats::aggregate(
          x,
          list(no_na_x),
          FUN = stats::median,
          na.rm = TRUE
        )$x,
        # labels basically like what "cut()" returns
        range = levels(cut_result),
        # range based on the values that are actually present in the data
        {
          temp <- stats::aggregate(
            x,
            list(no_na_x),
            FUN = range,
            na.rm = TRUE
          )$x
          apply(temp, 1, function(i) {
            paste0("(", paste(as.vector(i), collapse = "-"), ")")
          })
        }
      )
      levels(original_x) <- insight::format_value(out, ...)
    } else if (isTRUE(verbose)) {
      # label count mismatch: keep the numeric result and warn
      insight::format_warning(
        "Argument `labels` and levels of the recoded variable are not of the same length.",
        "Variable will not be converted to factor."
      )
    }
  }
  original_x
}
================================================
FILE: R/center.R
================================================
#' Centering (Grand-Mean Centering)
#'
#' Performs a grand-mean centering of data.
#'
#' @param x A (grouped) data frame, a (numeric or character) vector or a factor.
#' @param force Logical, if `TRUE`, forces centering of factors as
#' well. Factors are converted to numerical values, with the lowest level
#' being the value `1` (unless the factor has numeric levels, which are
#' converted to the corresponding numeric value).
#' @param robust Logical, if `TRUE`, centering is done by subtracting the
#' median from the variables. If `FALSE`, variables are centered by
#' subtracting the mean.
#' @param append Logical or string. If `TRUE`, centered variables get new
#' column names (with the suffix `"_c"`) and are appended (column bind) to `x`,
#' thus returning both the original and the centered variables. If `FALSE`,
#' original variables in `x` will be overwritten by their centered versions.
#' If a character value, centered variables are appended with new column
#' names (using the defined suffix) to the original data frame.
#' @param verbose Toggle warnings and messages.
#' @param weights Can be `NULL` (for no weighting), or:
#' - For data frames: a numeric vector of weights, or a character of the
#' name of a column in the `data.frame` that contains the weights.
#' - For numeric vectors: a numeric vector of weights.
#' @param center Numeric value, which can be used as alternative to
#' `reference` to define a reference centrality. If `center` is of length 1,
#' it will be recycled to match the length of selected variables for centering.
#' Else, `center` must be of same length as the number of selected variables.
#' Values in `center` will be matched to selected variables in the provided
#' order, unless a named vector is given. In this case, names are matched
#' against the names of the selected variables.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#' @inheritParams standardize
#'
#' @section Selection of variables - the `select` argument:
#' For most functions that have a `select` argument (including this function),
#' the complete input data frame is returned, even when `select` only selects
#' a range of variables. That is, the function is only applied to those variables
#' that have a match in `select`, while all other variables remain unchanged.
#' In other words: for this function, `select` will not omit any non-included
#' variables, so that the returned data frame will include all variables
#' from the input data frame.
#'
#' @note
#' **Difference between centering and standardizing**: Standardized variables
#' are computed by subtracting the mean of the variable and then dividing it by
#' the standard deviation, while centering variables involves only the
#' subtraction.
#'
#' @seealso If centering within-clusters (instead of grand-mean centering)
#' is required, see [demean()]. For standardizing, see [standardize()], and
#' [makepredictcall.dw_transformer()] for use in model formulas.
#'
#' @return The centered variables.
#'
#' @examples
#' data(iris)
#'
#' # entire data frame or a vector
#' head(iris$Sepal.Width)
#' head(center(iris$Sepal.Width))
#' head(center(iris))
#' head(center(iris, force = TRUE))
#'
#' # only the selected columns from a data frame
#' center(anscombe, select = c("x1", "x3"))
#' center(anscombe, exclude = c("x1", "x3"))
#'
#' # centering with reference center and scale
#' d <- data.frame(
#' a = c(-2, -1, 0, 1, 2),
#' b = c(3, 4, 5, 6, 7)
#' )
#'
#' # default centering at mean
#' center(d)
#'
#' # centering, using 0 as mean
#' center(d, center = 0)
#'
#' # centering, using -5 as mean
#' center(d, center = -5)
#' @export
center <- function(x, ...) {
  # S3 generic; dispatches to the class-specific methods below
  # (numeric, factor, data.frame, grouped_df, ...)
  UseMethod("center")
}
#' @rdname center
#' @export
centre <- center # alias: British-English spelling of `center()`
#' @export
center.default <- function(x, verbose = TRUE, ...) {
  # Fallback method: centering is not implemented for this class, so the
  # input is returned unchanged, optionally with an informative message.
  if (isTRUE(verbose)) {
    msg <- sprintf(
      "Centering currently not possible for variables of class `%s`.",
      class(x)[1]
    )
    insight::format_alert(
      msg,
      "You may open an issue at https://github.com/easystats/datawizard/issues."
    )
  }
  x
}
#' @rdname center
#' @export
center.numeric <- function(
  x,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  verbose = TRUE,
  ...
) {
  # `NULL` and `NA` both mean "center at the (weighted) mean/median":
  # the data frame methods cannot pass `NULL` per variable, they pass
  # `NA` instead, so both are normalized to `TRUE` here.
  if (is.null(center) || is.na(center)) {
    center <- TRUE
  }
  my_args <- .process_std_center(
    x,
    weights,
    robust,
    verbose,
    reference,
    center,
    scale = NULL
  )
  dot_args <- list(...)
  # `NULL` means there was nothing valid to center (e.g. all values missing)
  if (is.null(my_args)) {
    return(x)
  }
  if (is.null(my_args$check)) {
    # only one unique value: everything centers to zero
    shifted <- rep(0, length(my_args$vals))
  } else {
    shifted <- as.vector(my_args$vals - my_args$center)
  }
  # rebuild a full-length vector, keeping missing positions as NA
  out <- rep(NA, length(my_args$valid_x))
  out[my_args$valid_x] <- shifted
  attr(out, "center") <- my_args$center
  attr(out, "scale") <- 1
  attr(out, "robust") <- robust
  # restore variable labels from the input
  out <- .set_back_labels(out, x, include_values = FALSE)
  # the transformer class is skipped when called from data frame methods
  if (!isFALSE(dot_args$add_transform_class)) {
    class(out) <- c("dw_transformer", class(out))
  }
  out
}
#' @export
center.factor <- function(
  x,
  robust = FALSE,
  weights = NULL,
  force = FALSE,
  verbose = TRUE,
  ...
) {
  # Factors are only centered on request: coerce the levels to their
  # numeric representation, then delegate to the numeric method.
  if (force) {
    return(center(
      .factor_to_numeric(x),
      weights = weights,
      robust = robust,
      verbose = verbose,
      ...
    ))
  }
  # default: leave factors unchanged
  x
}
#' @export
center.logical <- center.factor # logicals behave like factors: centered only with `force = TRUE`
#' @export
center.character <- center.factor # characters behave like factors: centered only with `force = TRUE`
#' @export
center.Date <- center.factor # dates behave like factors: centered only with `force = TRUE`
#' @export
center.AsIs <- center.numeric # `AsIs` vectors (from `I()`) are centered like numerics
#' @rdname center
#' @inheritParams standardize.data.frame
#' @export
center.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  force = FALSE,
  remove_na = c("none", "selected", "all"),
  append = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
) {
  # resolve select/exclude, which may contain select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # validate and prepare the shared standardize/center arguments
  # (handles weights, appending suffixed columns, NA removal, ...)
  my_args <- .process_std_args(
    x,
    select,
    exclude,
    weights,
    append,
    append_suffix = "_c",
    keep_factors = force,
    remove_na,
    reference,
    .center = center,
    .scale = NULL
  )
  x <- my_args$x
  # center each selected column in place, delegating to the vector methods
  for (column in my_args$select) {
    x[[column]] <- center(
      x[[column]],
      robust = robust,
      weights = my_args$weights,
      verbose = FALSE,
      reference = reference[[column]],
      center = my_args$center[column],
      force = force,
      add_transform_class = FALSE
    )
  }
  # collect per-column centers and scales as data frame attributes
  attr(x, "center") <- vapply(
    x[my_args$select],
    function(column) attr(column, "center", exact = TRUE),
    numeric(1)
  )
  attr(x, "scale") <- vapply(
    x[my_args$select],
    function(column) attr(column, "scale", exact = TRUE),
    numeric(1)
  )
  attr(x, "robust") <- robust
  x
}
#' @export
# Centers variables within each group of a dplyr-grouped data frame,
# i.e. each group is centered at its own (group) mean, by applying the
# data frame method separately to every group's rows.
center.grouped_df <- function(
  x,
  select = NULL,
  exclude = NULL,
  robust = FALSE,
  weights = NULL,
  reference = NULL,
  center = NULL,
  force = FALSE,
  remove_na = c("none", "selected", "all"),
  append = FALSE,
  ignore_case = FALSE,
  verbose = TRUE,
  regex = FALSE,
  ...
) {
  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # prepare grouped-data arguments: extracts group row indices (`grps`),
  # the plain data frame (`x`), and the original attributes (`info`)
  my_args <- .process_grouped_df(
    x,
    select,
    exclude,
    append,
    append_suffix = "_c",
    reference,
    weights,
    keep_factors = force
  )
  # apply the data frame method to each group's rows and write the
  # centered rows back in place
  for (rows in my_args$grps) {
    my_args$x[rows, ] <- center(
      my_args$x[rows, , drop = FALSE],
      select = my_args$select,
      exclude = NULL,
      robust = robust,
      weights = my_args$weights,
      remove_na = remove_na,
      verbose = verbose,
      force = force,
      append = FALSE, # appending was already handled by .process_grouped_df()
      center = center,
      add_transform_class = FALSE,
      ...
    )
  }
  # set back class, so data frame still works with dplyr
  attributes(my_args$x) <- my_args$info
  my_args$x
}
# methods -------------------------
#' @export
print.dw_transformer <- function(x, ...) {
  # print the plain data first, then a grey summary line (if available)
  print(as.vector(x), ...)
  attribs <- attributes(x)
  info <- NULL
  if (!is.null(attribs$scale)) {
    # attributes set by center() / standardize()
    info <- sprintf(
      "(center: %.2g, scale = %.2g)\n",
      attribs$center,
      attribs$scale
    )
  } else if (!is.null(attribs$range_difference)) {
    # attributes set by normalize() / rescale()
    info <- sprintf(
      "(original range = %.2g to %.2g)\n",
      attribs$min_value,
      attribs$min_value + attribs$range_difference
    )
  }
  if (!is.null(info)) {
    insight::print_color(info, color = "grey")
  }
  invisible(x)
}
================================================
FILE: R/contrs.R
================================================
#' Deviation Contrast Matrix
#'
#' Build a deviation contrast matrix, a type of _effects contrast_ matrix.
#'
#' @inheritParams stats::contr.sum
#'
#' @details
#' In effects coding, unlike treatment/dummy coding
#' ([stats::contr.treatment()]), each contrast sums to 0. In regressions models,
#' this results in an intercept that represents the (unweighted) average of the
#' group means. In ANOVA settings, this also guarantees that lower order effects
#' represent _main_ effects (and not _simple_ or _conditional_ effects, as is
#' the case when using R's default [stats::contr.treatment()]).
#' \cr\cr
#' Deviation coding (`contr.deviation`) is a type of effects coding. With
#' deviation coding, the coefficients for factor variables are interpreted as
#' the difference of each factor level from the base level (this is the same
#' interpretation as with treatment/dummy coding). For example, for a factor
#' `group` with levels "A", "B", and "C", with `contr.deviation`, the intercept
#' represents the overall mean (average of the group means for the 3 groups),
#' and the coefficients `groupB` and `groupC` represent the differences between
#' the A group mean and the B and C group means, respectively.
#' \cr\cr
#' Sum coding ([stats::contr.sum()]) is another type of effects coding. With sum
#' coding, the coefficients for factor variables are interpreted as the
#' difference of each factor level from **the grand (across-groups) mean**. For
#' example, for a factor `group` with levels "A", "B", and "C", with
#' `contr.sum`, the intercept represents the overall mean (average of the group
#' means for the 3 groups), and the coefficients `group1` and `group2` represent
#' the differences of the
#' **A** and **B** group means from the overall mean, respectively.
#'
#' @seealso [stats::contr.sum()]
#'
#' @examplesIf !identical(Sys.getenv("IN_PKGDOWN"), "true")
#' \donttest{
#' data("mtcars")
#'
#' mtcars <- data_modify(mtcars, cyl = factor(cyl))
#'
#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.treatment)
#' #> 4 6 8
#' #> Intercept 1 0 0 # mean of the 1st level
#' #> 6 -1 1 0 # 2nd level - 1st level
#' #> 8 -1 0 1 # 3rd level - 1st level
#'
#' contrasts(mtcars$cyl) <- contr.sum
#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.sum)
#' #> 4 6 8
#' #> Intercept 0.333 0.333 0.333 # overall mean
#' #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean
#' #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean
#'
#'
#' contrasts(mtcars$cyl) <- contr.deviation
#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.deviation)
#' #> 4 6 8
#' #> Intercept 0.333 0.333 0.333 # overall mean
#' #> 6 -1.000 1.000 0.000 # 2nd level - 1st level
#' #> 8 -1.000 0.000 1.000 # 3rd level - 1st level
#'
#' ## With Interactions -----------------------------------------
#' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
#' mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
#'
#' mm <- unique(model.matrix(~ cyl * am, data = mtcars))
#' rownames(mm) <- c(
#' "cyl4.am0", "cyl4.am1", "cyl6.am0",
#' "cyl6.am1", "cyl8.am0", "cyl8.am1"
#' )
#'
#' solve(mm)
#' #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
#' #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean
#' #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st
#' #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st
#' #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff
#' #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000
#' #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000
#' }
#'
#' @export
contr.deviation <- function(n, base = 1, contrasts = TRUE, sparse = FALSE) {
  # Start from a treatment-coding (dummy) matrix...
  cont <- stats::contr.treatment(
    n,
    base = base,
    contrasts = contrasts,
    sparse = sparse
  )
  # ...then subtract 1/k (k = number of levels) from every cell, so each
  # column sums to zero; this makes the model intercept the unweighted
  # mean of the group means while keeping the treatment-style
  # "level minus base level" interpretation of the coefficients.
  if (contrasts) {
    k <- nrow(cont)
    cont <- cont - 1 / k
  }
  cont
}
================================================
FILE: R/convert_na_to.R
================================================
#' @title Replace missing values in a variable or a data frame.
#' @name convert_na_to
#'
#' @description
#' Replace missing values in a variable or a data frame.
#'
#' @param x A numeric, factor, or character vector, or a data frame.
#' @param replacement Numeric or character value that will be used to
#' replace `NA`.
#' @param verbose Toggle warnings.
#' @param ... Not used.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return
#' `x`, where `NA` values are replaced by `replacement`.
#'
#' @examples
#' # Convert NA to 0 in a numeric vector
#' convert_na_to(
#' c(9, 3, NA, 2, 3, 1, NA, 8),
#' replacement = 0
#' )
#'
#' # Convert NA to "missing" in a character vector
#' convert_na_to(
#' c("a", NA, "d", "z", NA, "t"),
#' replacement = "missing"
#' )
#'
#' ### For data frames
#'
#' test_df <- data.frame(
#' x = c(1, 2, NA),
#' x2 = c(4, 5, NA),
#' y = c("a", "b", NA)
#' )
#'
#' # Convert all NA to 0 in numeric variables, and all NA to "missing" in
#' # character variables
#' convert_na_to(
#' test_df,
#' replace_num = 0,
#' replace_char = "missing"
#' )
#'
#' # Convert a specific variable in the data frame
#' convert_na_to(
#' test_df,
#' replace_num = 0,
#' replace_char = "missing",
#' select = "x"
#' )
#'
#' # Convert all variables starting with "x"
#' convert_na_to(
#' test_df,
#' replace_num = 0,
#' replace_char = "missing",
#' select = starts_with("x")
#' )
#'
#' # Convert NA to 1 in variable 'x2' and to 0 in all other numeric
#' # variables
#' convert_na_to(
#' test_df,
#' replace_num = 0,
#' select = list(x2 = 1)
#' )
#'
#' @export
convert_na_to <- function(x, ...) {
  # S3 generic; see the methods for numeric, factor, character and
  # data frame inputs
  UseMethod("convert_na_to")
}
#' @export
convert_na_to.default <- function(x, verbose = TRUE, ...) {
  # Fallback method: unsupported classes are returned unchanged,
  # optionally with an informative message.
  if (isTRUE(verbose)) {
    msg <- sprintf(
      "Converting missing values (`NA`) into regular values currently not possible for variables of class `%s`.",
      class(x)[1]
    )
    insight::format_alert(msg)
  }
  x
}
#' @rdname convert_na_to
#' @export
convert_na_to.numeric <- function(x, replacement = NULL, verbose = TRUE, ...) {
  # `replacement` must be a single numeric value; anything else leaves
  # `x` untouched (with an optional warning)
  if (insight::is_empty_object(replacement) || !is.numeric(replacement)) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be a numeric vector.")
    }
    return(x)
  }
  if (length(replacement) > 1) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be of length one.")
    }
    return(x)
  }
  x[is.na(x)] <- replacement
  x
}
#' @export
convert_na_to.factor <- function(x, replacement = NULL, verbose = TRUE, ...) {
  bad_replacement <- insight::is_empty_object(replacement) ||
    length(replacement) > 1
  if (bad_replacement) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be of length one.")
    }
    return(x)
  }
  # `addNA()` makes missing values an explicit NA level; re-assigning a
  # longer level vector then removes that NA level again (an NA entry in
  # the new levels drops the level, turning its entries back into real
  # NA values) while appending `replacement` as a new level, so the NA
  # entries can finally be replaced.
  x <- addNA(x)
  levels(x) <- c(levels(x), replacement)
  x[is.na(x)] <- replacement
  x
}
#' @rdname convert_na_to
#' @export
convert_na_to.character <- function(
  x,
  replacement = NULL,
  verbose = TRUE,
  ...
) {
  # a usable replacement is a non-empty character or numeric value
  valid_type <- !insight::is_empty_object(replacement) &&
    (is.character(replacement) || is.numeric(replacement))
  if (!valid_type) {
    if (isTRUE(verbose)) {
      insight::format_warning(
        "`replacement` needs to be a character or numeric vector."
      )
    }
    return(x)
  }
  if (length(replacement) > 1) {
    if (isTRUE(verbose)) {
      insight::format_warning("`replacement` needs to be of length one.")
    }
    return(x)
  }
  # numeric replacements are coerced to character
  x[is.na(x)] <- as.character(replacement)
  x
}
#' @param replace_num Value to replace `NA` when variable is of type numeric.
#' @param replace_char Value to replace `NA` when variable is of type character.
#' @param replace_fac Value to replace `NA` when variable is of type factor.
#' @inheritParams extract_column_names
#'
#' @rdname convert_na_to
#' @export
convert_na_to.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  replacement = NULL,
  replace_num = replacement,
  replace_char = replacement,
  replace_fac = replacement,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  my_data <- x
  # resolve select/exclude into column names (may be select-helpers)
  select_nse <- .select_nse(
    select,
    data = my_data,
    exclude = exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # default: pick a replacement value per column based on its type;
  # columns of any other type get NULL and are filtered out below
  lookup <- lapply(x, function(y) {
    if (is.numeric(y)) {
      replace_num
    } else if (is.character(y)) {
      replace_char
    } else if (is.factor(y)) {
      replace_fac
    }
  })
  # override for specific vars: `select` may also be a named list (e.g.
  # `list(x2 = 1)`) that maps column names to specific replacement values.
  # NOTE(review): the `try()` guards `eval(select)`, which presumably fails
  # for select-helpers that only work inside `.select_nse()` — confirm.
  try_eval <- try(eval(select), silent = TRUE)
  select_is_list <- !inherits(try_eval, "try-error") && is.list(select)
  if (select_is_list) {
    for (i in select_nse) {
      lookup[[i]] <- select[[i]]
    }
  } else {
    # keep only the selected columns
    lookup <- lookup[names(lookup) %in% select_nse]
  }
  # drop columns without a usable replacement value
  lookup <- Filter(Negate(is.null), lookup)
  # delegate to the vector methods, one column at a time
  for (i in names(lookup)) {
    x[[i]] <- convert_na_to(
      x[[i]],
      replacement = lookup[[i]],
      verbose = verbose
    )
  }
  x
}
================================================
FILE: R/convert_to_na.R
================================================
#' @title Convert non-missing values in a variable into missing values.
#' @name convert_to_na
#'
#' @description
#' Convert non-missing values in a variable into missing values.
#'
#' @param x A vector, factor or a data frame.
#' @param na Numeric, character vector or logical (or a list of numeric, character
#' vectors or logicals) with values that should be converted to `NA`. Numeric
#' values applied to numeric vectors, character values are used for factors,
#' character vectors or date variables, and logical values for logical vectors.
#' @param drop_levels Logical, for factors, when specific levels are replaced
#' by `NA`, should unused levels be dropped?
#' @param ... Not used.
#' @inheritParams extract_column_names
#'
#' @return
#' `x`, where all values in `na` are converted to `NA`.
#'
#' @examples
#' x <- sample(1:6, size = 30, replace = TRUE)
#' x
#' # values 4 and 5 to NA
#' convert_to_na(x, na = 4:5)
#'
#' # data frames
#' set.seed(123)
#' x <- data.frame(
#' a = sample(1:6, size = 20, replace = TRUE),
#' b = sample(letters[1:6], size = 20, replace = TRUE),
#' c = sample(c(30:33, 99), size = 20, replace = TRUE)
#' )
#' # for all numerics, convert 5 to NA. Character/factor will be ignored.
#' convert_to_na(x, na = 5)
#'
#' # for numerics, 5 to NA, for character/factor, "f" to NA
#' convert_to_na(x, na = list(6, "f"))
#'
#' # select specific variables
#' convert_to_na(x, select = c("a", "b"), na = list(6, "f"))
#' @export
convert_to_na <- function(x, ...) {
  # S3 generic; see the methods for numeric, factor/character, Date,
  # logical and data frame inputs
  UseMethod("convert_to_na")
}
#' @export
convert_to_na.default <- function(x, verbose = TRUE, ...) {
  # Fallback method: unsupported classes are returned unchanged,
  # optionally with an informative message.
  if (isTRUE(verbose)) {
    msg <- sprintf(
      "Converting values into missing values (`NA`) currently not possible for variables of class `%s`.",
      class(x)[1]
    )
    insight::format_alert(msg)
  }
  x
}
#' @rdname convert_to_na
#' @export
convert_to_na.numeric <- function(x, na = NULL, verbose = TRUE, ...) {
  # `na` may be a mixed-type list; keep only its numeric elements
  if (is.list(na)) {
    numeric_elements <- vapply(na, is.numeric, FUN.VALUE = TRUE)
    na <- unlist(na[numeric_elements], use.names = FALSE)
  }
  if (insight::is_empty_object(na) || !is.numeric(na)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a numeric variable.",
        "To do this, `na` needs to be a numeric vector, or a list that contains numeric vector elements."
      )
    }
    return(x)
  }
  x[x %in% na] <- NA
  # remove value labels that referred to the replaced values
  value_labels <- attr(x, "labels", exact = TRUE)
  attr(x, "labels") <- value_labels[!value_labels %in% na]
  x
}
#' @rdname convert_to_na
#' @export
convert_to_na.factor <- function(
  x,
  na = NULL,
  drop_levels = FALSE,
  verbose = TRUE,
  ...
) {
  # `na` may be a mixed-type list; keep only its character elements
  if (is.list(na)) {
    character_elements <- vapply(na, is.character, FUN.VALUE = TRUE)
    na <- unlist(na[character_elements], use.names = FALSE)
  }
  usable <- !insight::is_empty_object(na) &&
    (is.factor(na) || is.character(na))
  if (!usable) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a factor or character variable.",
        "To do this, `na` needs to be a character vector, or a list that contains character vector elements."
      )
    }
    return(x)
  }
  x[x %in% na] <- NA
  # capture value labels before droplevels(), which discards attributes
  value_labels <- attr(x, "labels", exact = TRUE)
  if (is.factor(x) && isTRUE(drop_levels)) {
    # save label attribute
    variable_label <- attr(x, "label", exact = TRUE)
    x <- droplevels(x)
    # droplevels() discards attributes, so we need to re-assign them
    attr(x, "label") <- variable_label
  }
  # remove value labels that referred to the replaced values
  attr(x, "labels") <- value_labels[!value_labels %in% na]
  x
}
#' @export
convert_to_na.character <- convert_to_na.factor # character vectors share the factor method
#' @export
convert_to_na.Date <- function(x, na = NULL, verbose = TRUE, ...) {
  # if we have a list, use the first date element
  if (is.list(na)) {
    na <- na[vapply(na, .is_date, FUN.VALUE = logical(1L))]
    # unwrap whenever the filtered list contains at least one date: a
    # strict `length(na) > 1` check would leave a length-1 list wrapped,
    # so `.is_date(na)` below would fail and nothing would be converted
    if (length(na) >= 1) {
      na <- na[[1]]
    }
  }
  if (insight::is_empty_object(na) || !.is_date(na)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a date/time variable.",
        "To do this, `na` must be of class 'Date'."
      )
    }
  } else {
    # convert all occurrences of `na` to missing values
    matches <- which(x == na)
    x[matches] <- NA
  }
  x
}
#' @export
convert_to_na.logical <- function(x, na = NULL, verbose = TRUE, ...) {
  # `na` may be a mixed-type list; keep only its logical elements
  if (is.list(na)) {
    logical_elements <- vapply(na, is.logical, FUN.VALUE = TRUE)
    na <- unlist(na[logical_elements], use.names = FALSE)
  }
  if (insight::is_empty_object(na) || !is.logical(na)) {
    if (isTRUE(verbose)) {
      insight::format_alert(
        "Could not convert values into `NA` for a logical variable.",
        "To do this, `na` needs to be a logical vector, or a list that contains logical vector elements."
      )
    }
    return(x)
  }
  # convert all matching values to missing values
  x[which(x == na)] <- NA
  x
}
#' @rdname convert_to_na
#' @export
convert_to_na.data.frame <- function(
  x,
  select = NULL,
  exclude = NULL,
  na = NULL,
  drop_levels = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # resolve select/exclude into column names (may be select-helpers)
  columns <- .select_nse(
    select,
    x,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # delegate to the vector methods, one column at a time
  for (column in columns) {
    x[[column]] <- convert_to_na(
      x[[column]],
      na = na,
      drop_levels = drop_levels,
      verbose = verbose,
      ...
    )
  }
  x
}
================================================
FILE: R/data.R
================================================
#' @docType data
#' @title Sample dataset from the National Health and Nutrition Examination Survey
#' @name nhanes_sample
#' @keywords data
#'
#' @description Selected variables from the National Health and Nutrition Examination
#' Survey that are used in the example from Lumley (2010), Appendix E.
#'
#' @references Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
NULL
#' @docType data
#' @title Sample dataset from the EFC Survey
#' @name efc
#' @keywords data
#'
#' @description Selected variables from the EUROFAMCARE survey. Useful when
#' testing on "real-life" data sets, including random missing values. This
#' data set also has value and variable label attributes.
NULL
================================================
FILE: R/data_addprefix.R
================================================
#' Add a prefix or suffix to column names
#'
#' @rdname data_prefix_suffix
#' @inheritParams extract_column_names
#' @param pattern A character string, which will be added as prefix or suffix
#' to the column names.
#' @param ... Other arguments passed to or from other functions.
#'
#' @seealso
#' [data_rename()] for more fine-grained column renaming.
#' @examples
#' # Add prefix / suffix to all columns
#' head(data_addprefix(iris, "NEW_"))
#' head(data_addsuffix(iris, "_OLD"))
#'
#' @export
data_addprefix <- function(
  data,
  pattern,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # resolve select/exclude into column names (may be select-helpers)
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # prepend `pattern` to the names of all selected columns
  hits <- which(colnames(data) %in% select)
  colnames(data)[hits] <- paste(pattern, colnames(data)[hits], sep = "")
  data
}
#' @rdname data_prefix_suffix
#' @export
data_addsuffix <- function(
  data,
  pattern,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # resolve select/exclude into column names (may be select-helpers)
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # append `pattern` to the names of all selected columns
  hits <- which(colnames(data) %in% select)
  colnames(data)[hits] <- paste(colnames(data)[hits], pattern, sep = "")
  data
}
================================================
FILE: R/data_arrange.R
================================================
#' Arrange rows by column values
#'
#' `data_arrange()` orders the rows of a data frame by the values of selected
#' columns.
#'
#' @param data A data frame, or an object that can be coerced to a data frame.
#' @param select Character vector of column names. Use a dash just before column
#' name to arrange in decreasing order, for example `"-x1"`.
#' @param safe Do not throw an error if one of the variables specified doesn't
#' exist.
#'
#' @return A data frame.
#'
#' @examples
#'
#' # Arrange using several variables
#' data_arrange(head(mtcars), c("gear", "carb"))
#'
#' # Arrange in decreasing order
#' data_arrange(head(mtcars), "-carb")
#'
#' # Throw an error if one of the variables specified doesn't exist
#' try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE))
#' @export
data_arrange <- function(data, select = NULL, safe = TRUE) {
  # S3 generic; dispatches to the default or grouped_df method
  UseMethod("data_arrange")
}
#' @export
# Orders the rows of a data frame by the values of the columns named in
# `select`; a leading "-" on a name requests decreasing order for that
# column. With `safe = TRUE`, unknown column names only produce a warning
# and are dropped from the sort; with `safe = FALSE` they are an error.
data_arrange.default <- function(data, select = NULL, safe = TRUE) {
  # nothing to sort by
  if (is.null(select) || length(select) == 0) {
    return(data)
  }
  original_x <- data
  # Input validation check
  data <- .coerce_to_dataframe(data)
  # Remove tidyverse attributes, will add them back at the end
  # NOTE(review): `tbl_input` is assigned but never read in this method
  # (the grouped_df method uses it to restore the tibble class) — confirm
  # whether the tbl_df class should be restored here as well.
  if (inherits(original_x, "tbl_df")) {
    tbl_input <- TRUE
    data <- as.data.frame(data, stringsAsFactors = FALSE)
  } else {
    tbl_input <- FALSE
  }
  # find which vars should be decreasing
  desc <- select[startsWith(select, "-")]
  desc <- gsub("^-", "", desc)
  select <- gsub("^-", "", select)
  # check for variables that are not in data
  dont_exist <- setdiff(select, colnames(data))
  if (length(dont_exist) > 0) {
    if (safe) {
      insight::format_warning(
        paste0(
          "The following column(s) don't exist in the dataset: ",
          text_concatenate(dont_exist),
          "."
        ),
        .misspelled_string(names(data), dont_exist, "Possibly misspelled?")
      )
    } else {
      insight::format_error(
        paste0(
          "The following column(s) don't exist in the dataset: ",
          text_concatenate(dont_exist),
          "."
        ),
        .misspelled_string(names(data), dont_exist, "Possibly misspelled?")
      )
    }
    # NOTE(review): `desc` is not filtered here, so a non-existing column
    # requested as decreasing (e.g. "-foo") still reaches the xtfrm() loop
    # below in safe mode — verify intended behavior.
    select <- select[-which(select %in% dont_exist)]
  }
  if (length(select) == 0) {
    return(data)
  }
  # NOTE(review): this shortcut returns early when all sort columns pass
  # `.is_sorted()`; whether that is correct for columns requested in
  # decreasing order depends on `.is_sorted()`'s semantics — confirm.
  already_sorted <- all(vapply(
    data[, select, drop = FALSE],
    .is_sorted,
    logical(1L)
  ))
  if (isTRUE(already_sorted)) {
    return(data)
  }
  out <- data
  # reverse order for variables that should be decreasing
  # (xtfrm() gives a sortable numeric proxy; negating it flips the order)
  if (length(desc) > 0) {
    for (i in desc) {
      out[[i]] <- -xtfrm(out[[i]])
    }
  }
  # apply ordering: sort keys come from `out` (with negated proxies),
  # but rows are taken from the unmodified `data`
  if (length(select) == 1) {
    out <- data[order(out[[select]]), , drop = FALSE]
  } else {
    out <- data[do.call(order, out[, select]), , drop = FALSE]
  }
  if (!insight::object_has_rownames(data)) {
    rownames(out) <- NULL
  }
  # add back custom attributes
  out <- .replace_attrs(out, attributes(original_x))
  out
}
#' @export
data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) {
  input <- data
  # row indices of each group, as stored by dplyr in the "groups" attribute
  row_groups <- attr(data, "groups", exact = TRUE)[[".rows"]]
  # Strip the tibble class before subsetting; it is restored below.
  tbl_input <- inherits(data, "tbl_df")
  if (tbl_input) {
    data <- as.data.frame(data, stringsAsFactors = FALSE)
  }
  # sort each group's rows separately, then stack the groups again
  sorted_groups <- lapply(row_groups, function(idx) {
    data_arrange.default(data[idx, ], select = select, safe = safe)
  })
  out <- do.call(rbind, sorted_groups)
  if (!insight::object_has_rownames(data)) {
    rownames(out) <- NULL
  }
  # add back tidyverse attributes
  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }
  # add back custom attributes
  .replace_attrs(out, attributes(input))
}
================================================
FILE: R/data_codebook.R
================================================
#' Generate a codebook of a data frame.
#'
#' `data_codebook()` generates codebooks from data frames, i.e. overviews
#' of all variables and some more information about each variable (like
#' labels, values or value range, frequencies, amount of missing values).
#'
#' @param data A data frame, or an object that can be coerced to a data frame.
#' @param variable_label_width Length of variable labels. Longer labels will be
#' wrapped at `variable_label_width` chars. If `NULL`, longer labels will not
#' be split into multiple lines. Only applies to _labelled data_.
#' @param value_label_width Length of value labels. Longer labels will be
#' shortened, where the remaining part is truncated. Only applies to
#' _labelled data_ or factor levels.
#' @param range_at Indicates how many unique values in a numeric vector are
#' needed in order to print a range for that variable instead of a frequency
#' table for all numeric values. Can be useful if the data contains numeric
#' variables with only a few unique values and where full frequency tables
#' instead of value ranges should be displayed.
#' @param max_values Number of maximum values that should be displayed. Can be
#' used to avoid too many rows when variables have lots of unique values.
#' @param font_size For HTML tables, the font size.
#' @param line_padding For HTML tables, the distance (in pixel) between lines.
#' @param row_color For HTML tables, the fill color for odd rows.
#' @inheritParams standardize.data.frame
#' @inheritParams extract_column_names
#' @inheritParams data_tabulate
#'
#' @return A formatted data frame, summarizing the content of the data frame.
#' Returned columns include the column index of the variables in the original
#' data frame (`ID`), column name, variable label (if data is labelled), type
#' of variable, number of missing values, unique values (or value range),
#' value labels (for labelled data), and a frequency table (N for each value).
#' Most columns are formatted as character vectors.
#'
#' @note There are methods to `print()` the data frame in a nicer output, as
#' well methods for printing in markdown or HTML format (`print_md()` and
#' `print_html()`). The `print()` method for text outputs passes arguments in
#' `...` to [`insight::export_table()`].
#'
#' @examples
#' data(iris)
#' data_codebook(iris, select = starts_with("Sepal"))
#'
#' data(efc)
#' data_codebook(efc)
#'
#' # shorten labels
#' data_codebook(efc, variable_label_width = 20, value_label_width = 15)
#'
#' # automatic range for numerics at more than 5 unique values
#' data(mtcars)
#' data_codebook(mtcars, select = starts_with("c"))
#'
#' # force all values to be displayed
#' data_codebook(mtcars, select = starts_with("c"), range_at = 100)
#' @export
data_codebook <- function(
data,
select = NULL,
exclude = NULL,
variable_label_width = NULL,
value_label_width = NULL,
max_values = 10,
range_at = 6,
ignore_case = FALSE,
regex = FALSE,
verbose = TRUE,
...
) {
data_name <- insight::safe_deparse(substitute(data))
# evaluate select/exclude, may be select-helpers
select <- .select_nse(
select,
data,
exclude,
ignore_case,
regex = regex,
verbose = verbose
)
# check for emtpy columns, and remove
empty <- empty_columns(data[select])
if (length(empty)) {
if (verbose) {
insight::format_warning(
sprintf(
"Following %i columns were empty and have been removed:",
length(empty)
),
text_concatenate(names(empty))
)
}
select <- select[-empty]
}
# check if any columns left, or found
if (!length(select) || is.null(select)) {
if (isTRUE(verbose)) {
insight::format_warning(
"No column names that matched the required search pattern were found."
)
}
return(NULL)
}
# needed for % NA
rows <- nrow(data)
max_values <- max_values + 1
out <- lapply(seq_along(select), function(id) {
# variable
x <- data[[select[id]]]
x_na <- is.na(x)
x_inf <- is.infinite(x)
# inital data frame for codebook
d <- data.frame(
ID = which(colnames(data) == select[id]),
Name = select[id],
Type = .variable_type(x),
Missings = sprintf("%g (%.1f%%)", sum(x_na), 100 * (sum(x_na) / rows)),
stringsAsFactors = FALSE,
row.names = NULL,
check.names = FALSE
)
# check if there are variable labels
variable_label <- .extract_variable_labels(x, variable_label_width)
# we may need to remove duplicated value range elements
flag_range <- FALSE
# save value labels
vallab <- attr(x, "labels", exact = TRUE)
# do we have labelled NA values? If so, include labelled NAs in count table
# we do this by converting NA values into character strings
if (anyNA(vallab) && insight::check_if_installed("haven", quietly = TRUE)) {
# get na-tags, i.e. the value labels for the different NA values
na_labels <- haven::na_tag(vallab)
# replace NA in labels with NA tags
vallab[!is.na(na_labels)] <- stats::setNames(
paste0("NA(", na_labels[!is.na(na_labels)], ")"),
names(vallab[!is.na(na_labels)])
)
# replace tagged NAs in variable with their values, tagged as NA(value)
na_values <- haven::na_tag(x)
# need to convert, we still have haven-class, which cannot coerce
x <- as.character(x)
x[!is.na(na_values)] <- paste0("NA(", na_values[!is.na(na_values)], ")")
# update information on NA - we still might have non-labelled (regular) NA
x_na <- is.na(x)
}
# remove NA and Inf, for tabulate(). as.factor() will convert NaN
# to a factor level "NaN", which we don't want here (same for Inf),
# because tabulate() will then return frequencies for that level, too
x <- x[!(x_na | x_inf)]
# get unique values, to remove non labelled data
unique_values <- unique(x)
# coerce to factor, for tabulate(). We will coerce numerics to factor later
# which is required because tabulate() doesn't return frequencies for values
# lower than 1
if (!is.numeric(x) && !is.factor(x)) {
x <- as.factor(x)
}
# for ranges, we don't want the N% value, so use this to flag range-values
is_range <- FALSE
# handle labelled data - check if there are value labels or factor levels,
# and extract values and N
if (!is.null(vallab) && length(vallab)) {
# if not all values are labelled, fill in value labels
if (!all(unique_values %in% vallab)) {
new_vals <- setdiff(unique_values, vallab)
vallab <- c(vallab, stats::setNames(new_vals, new_vals))
}
# if not all value labels are present in the data, remove unused value labels
if (!all(vallab %in% unique_values)) {
not_needed <- setdiff(vallab, unique_values)
# match not needed values in vallab vector - values from labels
# may not be in sorted order (e.g. 1, 2, 3, -9), or may be character
# vectors in case of tagged NAs, so we have to make sure we know which
# values can be removed from vallab
not_needed <- stats::na.omit(match(not_needed, vallab))
vallab <- vallab[-not_needed]
}
# we now should have the same length of value labels and labelled values
# which should also match the numberof unique values in the vector.
# "tabulate" creates frequency tables by sorting by values/levels, so
# we need to make sure that labels are also in sorted order.
value_labels <- names(vallab)[order(unname(vallab))]
values <- sort(unname(vallab))
frq <- tabulate(as.factor(x))
# handle factors
} else if (is.factor(x)) {
values <- levels(x)
value_labels <- NA
frq <- tabulate(x)
# handle numerics
} else {
value_labels <- NA
# only range for too many unique values
if (length(unique_values) >= range_at) {
r <- range(x, na.rm = TRUE)
values <- sprintf("[%g, %g]", round(r[1], 2), round(r[2], 2))
frq <- sum(!x_na)
flag_range <- length(variable_label) > 1
is_range <- TRUE
# if we have few values, we can print whole freq. table
} else {
values <- sort(unique_values)
frq <- tabulate(as.factor(x))
}
}
# tabulate fills 0 for non-existing values, remove those
frq <- frq[frq != 0]
# add Inf values?
if (any(x_inf) && length(frq) <= max_values) {
values <- c(values, Inf)
if (!is.na(value_labels)) {
value_labels <- c(value_labels, "infinite")
}
frq <- c(frq, sum(x_inf))
# Inf are added as value, so don't flag range any more,
# since we now have proportions for the range and the inf values.
is_range <- FALSE
}
# add proportions, but not for ranges, since these are always 100%
if (is_range) {
frq_proportions <- ""
} else {
frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
}
# make sure we have not too long rows, e.g. for variables that
# have dozens of unique values
if (length(value_labels) > max_values) {
value_labels <- value_labels[1:max_values]
value_labels[max_values] <- "(...)"
}
if (length(frq) > max_values) {
frq <- frq[1:max_values]
frq_proportions <- frq_proportions[1:max_values]
frq[max_values] <- NA
frq_proportions[max_values] <- NA
}
if (length(values) > max_values) {
values <- values[1:max_values]
values[max_values] <- "(...)"
}
# make sure length recycling doesn't fail, e.g. if we have split
# variable_label into two lines (i.e. vector of length 2), but we have
# 7 values in "frq", creating the data frame will fail. In this case,
# we have to make sure that recycling shorter vectors works.
if (length(variable_label) > 1 && !flag_range) {
variable_label <- variable_label[seq_along(frq)]
}
# shorten value labels
if (!is.null(value_label_width)) {
value_labels <- insight::format_string(
value_labels,
length = value_label_width
)
}
# add values, value labels and frequencies to data frame
d <- cbind(
d,
data.frame(
variable_label,
values,
value_labels,
frq,
proportions = frq_proportions,
stringsAsFactors = FALSE
)
)
# which columns need to be checked for duplicates?
duplicates <- c("ID", "Name", "Type", "Missings", "variable_label")
if (isTRUE(flag_range)) {
# when we have numeric variables with value range as values, and when
# these variables had long variable labels that have been wrapped,
# the range value is duplicated (due to recycling), so we need to fix
# this here.
duplicates <- c(duplicates, c("values", "frq", "proportions"))
}
# clear duplicates due to recycling
for (i in duplicates) {
d[[i]][duplicated(d[[i]])] <- ""
}
# remove empty rows
d <- remove_empty_rows(d)
# add empty row at the end, as separator
d[nrow(d) + 1, ] <- rep("", ncol(d))
# add row ID
d$.row_id <- id
d
})
# clean-up (column order, rename, ...)
out <- .finalize_result(do.call(rbind, out))
# add attributes
.add_codebook_attributes(out, data_name, data, select)
}
# helper -----------------------
#' @keywords internal
.extract_variable_labels <- function(x, variable_label_width = NULL) {
  label <- attr(x, "label", exact = TRUE)
  # no label attribute found? then return NA
  if (is.null(label) || !length(label)) {
    return(NA)
  }
  # when a maximum width is given and the label exceeds it, wrap the label
  # and split it into a character vector with one element per wrapped line
  if (!is.null(variable_label_width) && nchar(label) > variable_label_width) {
    wrapped <- strsplit(
      text_wrap(label, width = variable_label_width),
      "\n",
      fixed = TRUE
    )
    label <- insight::trim_ws(unlist(wrapped, use.names = FALSE))
  }
  label
}
#' @keywords internal
.finalize_result <- function(out) {
  # map internal column names to their display names
  renaming <- c(
    variable_label = "Label",
    values = "Values",
    value_labels = "Value Labels",
    frq = "N",
    proportions = "Prop"
  )
  needs_rename <- names(out) %in% names(renaming)
  names(out)[needs_rename] <- unname(renaming[names(out)[needs_rename]])
  # drop columns that contain no data at all
  out <- remove_empty_columns(out)
  # known columns come first, in fixed order; any remaining columns follow
  column_order <- c(
    "ID", "Name", "Label", "Type", "Missings",
    "Values", "Value Labels", "N", "Prop", ".row_id"
  )
  out[union(intersect(column_order, names(out)), names(out))]
}
#' @keywords internal
.add_codebook_attributes <- function(out, data_name, data, select) {
  # attach meta information about the original data; used by print methods
  # and `.get_codebook_caption()`
  out <- structure(
    out,
    data_name = data_name,
    n_rows = nrow(data),
    n_cols = ncol(data),
    n_shown = length(select)
  )
  class(out) <- c("data_codebook", "data.frame")
  out
}
# methods ----------------------
#' @export
format.data_codebook <- function(x, format = "text", ...) {
  # format large frequencies with a big mark; use [["N"]] to avoid
  # partial matching (e.g. against a "Name" column)
  large_n <- stats::na.omit(nchar(x[["N"]]) > 5)
  if (any(large_n)) {
    x[["N"]] <- insight::trim_ws(prettyNum(x[["N"]], big.mark = ","))
    x[["N"]][x[["N"]] == "NA" | is.na(x[["N"]])] <- ""
  }
  # merge proportions into the N column, e.g. "5 (50.0%)"
  if (!is.null(x$Prop)) {
    x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- ""
    has_prop <- x$Prop != ""
    # right-align proportions, but only for plain text output
    if (identical(format, "text")) {
      x$Prop[has_prop] <- format(x$Prop[has_prop], justify = "right") # nolint
    }
    x[["N"]][has_prop] <- sprintf(
      "%s (%s)",
      as.character(x[["N"]][has_prop]), # nolint
      x$Prop[has_prop] # nolint
    )
    x$Prop <- NULL
  }
  x
}
#' @export
print.data_codebook <- function(x, ...) {
  # table title, printed in blue
  table_title <- c(.get_codebook_caption(x), "blue")
  # the internal row id is not meant for display
  x$.row_id <- NULL
  formatted_table <- insight::export_table(
    format(x),
    title = table_title,
    empty_line = "-",
    cross = "+",
    align = .get_codebook_align(x),
    ...
  )
  cat(formatted_table)
}
#' @rdname data_codebook
#' @export
print_html.data_codebook <- function(
  x,
  font_size = "100%",
  line_padding = 3,
  row_color = "#eeeeee",
  ...
) {
  caption <- .get_codebook_caption(x)
  attr(x, "table_caption") <- caption
  # since we have each value at its own row, the HTML table contains
  # horizontal borders for each cell/row. We want to remove those borders
  # from rows that actually belong to one variable
  separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint
  # remove separator lines, as we don't need these for HTML tables.
  # FIX: guard against an empty index - "x[-integer(0), ]" would silently
  # drop *all* rows instead of none
  if (length(separator_lines)) {
    x <- x[-separator_lines, ]
  }
  # check row IDs, and find odd rows (used for striped row highlighting)
  odd_rows <- (x$.row_id %% 2 == 1)
  x$.row_id <- NULL
  # create basic table
  backend <- .check_format_backend(...)
  out <- insight::export_table(
    format(x, format = "html"),
    title = caption,
    format = backend,
    align = .get_codebook_align(x)
  )
  # for tiny table output, we don't need to do any further formatting
  if (identical(backend, "tt")) {
    return(out)
  }
  insight::check_if_installed("gt")
  # no border for rows which are not separator lines
  out <- gt::tab_style(
    out,
    style = list(gt::cell_borders(sides = "top", style = "hidden")),
    locations = gt::cells_body(rows = which(x$ID == "")) # nolint
  )
  # highlight odd rows
  if (!is.null(row_color)) {
    out <- gt::tab_style(
      out,
      style = list(gt::cell_fill(color = row_color)),
      locations = gt::cells_body(rows = odd_rows)
    )
  }
  # set up additional HTML options
  gt::tab_options(
    out,
    table.font.size = font_size,
    data_row.padding = gt::px(line_padding)
  )
}
#' @rdname data_codebook
#' @export
display.data_codebook <- function(
  object,
  format = "markdown",
  font_size = "100%",
  line_padding = 3,
  row_color = "#eeeeee",
  ...
) {
  # resolve the requested output format
  format <- .display_default_format(format)
  arguments <- list(
    x = object,
    font_size = font_size,
    line_padding = line_padding,
    row_color = row_color,
    ...
  )
  # render as HTML (gt or tinytable backend), else as markdown
  if (format %in% c("html", "tt")) {
    arguments$backend <- format
    do.call(print_html, arguments)
  } else {
    do.call(print_md, arguments)
  }
}
#' @export
print_md.data_codebook <- function(x, ...) {
  table_title <- .get_codebook_caption(x)
  # the internal row id is not meant for display
  x$.row_id <- NULL
  attr(x, "table_caption") <- table_title
  insight::export_table(
    format(x, format = "markdown"),
    title = table_title,
    format = "markdown",
    align = .get_codebook_align(x)
  )
}
# helper ---------
.get_codebook_caption <- function(x) {
  # caption like "mydata (100 rows and 5 variables, 3 shown)", based on
  # the meta attributes set by `.add_codebook_attributes()`
  meta <- attributes(x)
  row_count <- as.character(meta$n_rows)
  # insert big marks for large row counts
  if (nchar(row_count) > 5) {
    row_count <- prettyNum(row_count, big.mark = ",")
  }
  sprintf(
    "%s (%s rows and %i variables, %i shown)",
    meta$data_name,
    row_count,
    meta$n_cols,
    meta$n_shown
  )
}
.get_codebook_align <- function(x) {
  # "Prop" gets merged into the "N" column during formatting, so drop it
  x$Prop <- NULL
  # left-align text-like columns, right-align numeric-like columns
  column_alignments <- c(
    ID = "l",
    Name = "l",
    Label = "l",
    Type = "l",
    Missings = "r",
    Values = "r",
    `Value Labels` = "l",
    N = "r"
  )
  paste(unname(column_alignments[colnames(x)]), collapse = "")
}
================================================
FILE: R/data_duplicated.R
================================================
#' @title Extract all duplicates
#'
#' @description Extract all duplicates, for visual inspection.
#' Note that it also contains the first occurrence of future
#' duplicates, unlike [duplicated()] or [dplyr::distinct()]. Also
#' contains an additional column reporting the number of missing
#' values for that row, to help in the decision-making when
#' selecting which duplicates to keep.
#'
#' @inheritParams extract_column_names
#'
#' @keywords duplicates
#' @export
#' @seealso
#' [data_unique()]
#' @return A data frame containing all duplicates.
#' @examples
#' df1 <- data.frame(
#' id = c(1, 2, 3, 1, 3),
#' year = c(2022, 2022, 2022, 2022, 2000),
#' item1 = c(NA, 1, 1, 2, 3),
#' item2 = c(NA, 1, 1, 2, 3),
#' item3 = c(NA, 1, 1, 2, 3)
#' )
#'
#' data_duplicated(df1, select = "id")
#'
#' data_duplicated(df1, select = c("id", "year"))
#'
#' # Filter to exclude duplicates
#' df2 <- df1[-c(1, 5), ]
#' df2
#'
data_duplicated <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  # S3 generic; dispatches to the data.frame or grouped_df method
  UseMethod("data_duplicated")
}
#' @export
data_duplicated.data.frame <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  select <- .select_nse(
    select,
    data,
    exclude = exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )
  # build a single key from all selected columns to detect duplicates
  data$temporary_id <- do.call(paste, c(data_select(data, select), sep = "_"))
  # keep track of the original row numbers
  data <- cbind(Row = seq_len(nrow(data)), data)
  # flag every row whose key occurs more than once, including the first
  # occurrence (unlike duplicated())
  duplicated_keys <- data$temporary_id[duplicated(data$temporary_id)]
  out <- data[data$temporary_id %in% duplicated_keys, ]
  # number of missings per row, to help decide which duplicates to keep
  out$count_na <- rowSums(is.na(out))
  out <- data_arrange(out, select)
  data_remove(out, "temporary_id")
}
#' @export
data_duplicated.grouped_df <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE
) {
  select <- .select_nse(
    select,
    data,
    exclude = exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )
  # extract the row indices of each group
  group_rows <- attr(data, "groups", exact = TRUE)[[".rows"]]
  data <- as.data.frame(data)
  # apply the data.frame method group-wise, then stack the results
  results <- lapply(group_rows, function(rows) {
    data_duplicated.data.frame(data[rows, ], select = select)
  })
  do.call(rbind, results)
}
================================================
FILE: R/data_extract.R
================================================
#' Extract one or more columns or elements from an object
#'
#' `data_extract()` (or its alias `extract()`) is similar to `$`. It extracts
#' either a single column or element from an object (e.g., a data frame, list),
#' or multiple columns resp. elements.
#'
#' @param data The object to subset. Methods are currently available for data frames
#' and data frame extensions (e.g., tibbles).
#' @param name An optional argument that specifies the column to be used as
#' names for the vector elements after extraction. Must be specified either
#' as literal variable name (e.g., `column_name`) or as string
#' (`"column_name"`). `name` will be ignored when a data frame is returned.
#' @param extract String, indicating which element will be extracted when `select`
#' matches multiple variables. Can be `"all"` (the default) to return all
#' matched variables, `"first"` or `"last"` to return the first or last match,
#' or `"odd"` and `"even"` to return all odd-numbered or even-numbered
#' matches. Note that `"first"` or `"last"` return a vector (unless
#' `as_data_frame = TRUE`), while `"all"` can return a vector (if only one
#' match was found) *or* a data frame (for more than one match). Type safe
#' return values are only possible when `extract` is `"first"` or `"last"` (will
#' always return a vector) or when `as_data_frame = TRUE` (always returns a
#' data frame).
#' @param as_data_frame Logical, if `TRUE`, will always return a data frame,
#' even if only one variable was matched. If `FALSE`, either returns a vector
#' or a data frame. See `extract` for details.
#' @param verbose Toggle warnings.
#' @param ... For use by future methods.
#'
#' @inheritParams extract_column_names
#'
#' @details `data_extract()` can be used to select multiple variables or pull a
#' single variable from a data frame. Thus, the return value is by default not
#' type safe - `data_extract()` either returns a vector or a data frame.
#' \subsection{Extracting single variables (vectors)}{
#' When `select` is the name of a single column, or when select only matches
#' one column, a vector is returned. A single variable is also returned when
#' `extract` is either `"first"` or `"last"`. Setting `as_data_frame` to `TRUE`
#' overrides this behaviour and *always* returns a data frame.
#' }
#' \subsection{Extracting a data frame of variables}{
#' When `select` is a character vector containing more than one column name (or
#' a numeric vector with more than one valid column indices), or when `select`
#' uses one of the supported select-helpers that match multiple columns, a
#' data frame is returned. Setting `as_data_frame` to `TRUE` *always* returns
#' a data frame.
#' }
#'
#' @return A vector (or a data frame) containing the extracted element, or
#' `NULL` if no matching variable was found.
#' @export
#'
#' @examples
#' # single variable
#' data_extract(mtcars, cyl, name = gear)
#' data_extract(mtcars, "cyl", name = gear)
#' data_extract(mtcars, -1, name = gear)
#' data_extract(mtcars, cyl, name = 0)
#' data_extract(mtcars, cyl, name = "row.names")
#'
#' # selecting multiple variables
#' head(data_extract(iris, starts_with("Sepal")))
#' head(data_extract(iris, ends_with("Width")))
#' head(data_extract(iris, 2:4))
#'
#' # select first of multiple variables
#' data_extract(iris, starts_with("Sepal"), extract = "first")
#'
#' # select first of multiple variables, return as data frame
#' head(data_extract(iris, starts_with("Sepal"), extract = "first", as_data_frame = TRUE))
data_extract <- function(data, select, ...) {
  # S3 generic; see data_extract.data.frame for the main implementation
  UseMethod("data_extract")
}
#' @rdname data_extract
#' @export
data_extract.data.frame <- function(
  data,
  select,
  name = NULL,
  extract = "all",
  as_data_frame = FALSE,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # validate the "extract" option (case-insensitively)
  extract <- match.arg(
    tolower(extract),
    choices = c("all", "first", "last", "odd", "even")
  )
  # evaluate arguments, resolve select-helpers into column indices/names
  select <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case,
    regex = regex,
    verbose = verbose
  )
  # nothing to select? return NULL then
  if (!length(select)) {
    return(NULL)
  }
  # evaluate "name" non-standardly: "nl" maps column names to their
  # positions, so "name" may be given as bare column name, string or index
  nl <- as.list(seq_along(data))
  names(nl) <- names(data)
  name <- eval(substitute(name), nl, parent.frame())
  if (is.numeric(name) && length(name) == 1L) {
    if (name < 0L) {
      # a negative index counts backwards from the last column
      name <- ncol(data) + name + 1L
    } else if (name == 0L) {
      # 0 means: use row names for naming
      name <- rownames(data)
    }
  } else if (is.character(name) && identical(name, "row.names")) {
    # "row.names" also means: use row names for naming
    name <- rownames(data)
  }
  # choose which matched variables to extract
  select <- switch(
    extract,
    first = select[1L],
    last = select[length(select)],
    odd = select[seq(1L, length(select), 2L)],
    even = select[seq(2L, length(select), 2L)],
    select
  )
  # "name" only used for naming elements in a vector, not data frame
  needs_no_names <- isTRUE(as_data_frame) ||
    # more than one variable means data frame, so no name
    length(select) > 1L ||
    # if we have only one variable, but number of observations not equal to
    # length of names, we have no proper match, so no naming, too.
    (length(select) == 1L &&
      length(name) > 1L &&
      length(data[[select]]) != length(name))
  if (needs_no_names) {
    name <- NULL
  }
  # we definitely should have a vector here when name not NULL
  if (is.null(name)) {
    # drop to a vector unless a data frame was explicitly requested
    data[, select, drop = !as_data_frame]
  } else {
    # if name indicates a variable, extract values for naming now
    if (length(name) == 1L) {
      name <- data[[name]]
    }
    stats::setNames(data[[select]], name)
  }
}
================================================
FILE: R/data_group.R
================================================
#' @title Create a grouped data frame
#' @name data_group
#'
#' @description This function is comparable to `dplyr::group_by()`, but just
#' following the **datawizard** function design. `data_ungroup()` removes the
#' grouping information from a grouped data frame.
#'
#' @param data A data frame
#' @inheritParams extract_column_names
#'
#' @return A grouped data frame, i.e. a data frame with additional information
#' about the grouping structure saved as attributes.
#'
#' @examplesIf requireNamespace("poorman")
#' data(efc)
#' suppressPackageStartupMessages(library(poorman, quietly = TRUE))
#'
#' # total mean
#' efc %>%
#' summarize(mean_hours = mean(c12hour, na.rm = TRUE))
#'
#' # mean by educational level
#' efc %>%
#' data_group(c172code) %>%
#' summarize(mean_hours = mean(c12hour, na.rm = TRUE))
#' @export
data_group <- function(
  data,
  select = NULL,
  exclude = NULL,
  ignore_case = FALSE,
  regex = FALSE,
  verbose = TRUE,
  ...
) {
  # variables for grouping
  select <- .select_nse(
    select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )
  # create grid with combinations of all levels
  my_grid <- as.data.frame(expand.grid(lapply(data[select], unique)))
  # sort grid
  my_grid <- my_grid[do.call(order, my_grid), , drop = FALSE]
  # for each combination of group levels, find the matching row indices
  .rows <- lapply(seq_len(nrow(my_grid)), function(i) {
    as.integer(data_match(
      data,
      to = my_grid[i, , drop = FALSE],
      match = "and",
      return_indices = TRUE,
      remove_na = FALSE
    ))
  })
  my_grid[[".rows"]] <- .rows
  # remove data_match attributes
  attr(my_grid, "out.attrs") <- NULL
  attr(my_grid, ".drop") <- TRUE
  attr(data, "groups") <- my_grid
  # FIX: previously, "class(data)" was passed as the second positional
  # argument of unique(), i.e. as "incomparables", which silently discarded
  # any additional classes of the input (e.g. "tbl_df"). Keep the original
  # classes, with "grouped_df" first for correct S3 dispatch.
  class(data) <- unique(c("grouped_df", class(data), "data.frame"))
  data
}
#' @rdname data_group
#' @export
data_ungroup <- function(data, verbose = TRUE, ...) {
  # drop the grouping information and the "grouped_df" class again
  attr(data, "groups") <- NULL
  class(data) <- setdiff(class(data), "grouped_df")
  data
}
================================================
FILE: R/data_match.R
================================================
#' Return filtered or sliced data frame, or row indices
#'
#' Return a filtered (or sliced) data frame or row indices of a data frame that
#' match a specific condition. `data_filter()` works like `data_match()`, but works
#' with logical expressions or row indices of a data frame to specify matching
#' conditions.
#'
#' @param x A data frame.
#' @param to A data frame matching the specified conditions. Note that if
#' `match` is a value other than `"and"`, the original row order might be
#' changed. See 'Details'.
#' @param match String, indicating with which logical operation matching
#' conditions should be combined. Can be `"and"` (or `"&"`), `"or"` (or `"|"`)
#' or `"not"` (or `"!"`).
#' @param return_indices Logical, if `TRUE`, return the vector of rows that
#' can be used to filter the original data frame. If `FALSE` (default),
#' returns directly the filtered data frame instead of the row indices.
#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before
#' filtering the data. This is the default behaviour, however, sometimes when
#' row indices are requested (i.e. `return_indices=TRUE`), it might be useful
#' to preserve `NA` values, so returned row indices match the row indices of
#' the original data frame.
#' @param ... A sequence of logical expressions indicating which rows to keep,
#' or a numeric vector indicating the row indices of rows to keep. Can also be
#' a string representation of a logical expression (e.g. `"x > 4"`), a
#' character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains
#' the string representation of a logical expression. These might be useful
#' when used in packages to avoid defining undefined global variables.
#'
#' @return A filtered data frame, or the row indices that match the specified
#' configuration.
#'
#' @details For `data_match()`, if `match` is either `"or"` or `"not"`, the
#' original row order from `x` might be changed. If preserving row order is
#' required, use `data_filter()` instead.
#'
#' ```
#' # mimics subset() behaviour, preserving original row order
#' head(data_filter(mtcars[c("mpg", "vs", "am")], vs == 0 | am == 1))
#' #> mpg vs am
#' #> Mazda RX4 21.0 0 1
#' #> Mazda RX4 Wag 21.0 0 1
#' #> Datsun 710 22.8 1 1
#' #> Hornet Sportabout 18.7 0 0
#' #> Duster 360 14.3 0 0
#' #> Merc 450SE 16.4 0 0
#'
#' # re-sorting rows
#' head(data_match(mtcars[c("mpg", "vs", "am")],
#' data.frame(vs = 0, am = 1),
#' match = "or"))
#' #> mpg vs am
#' #> Mazda RX4 21.0 0 1
#' #> Mazda RX4 Wag 21.0 0 1
#' #> Hornet Sportabout 18.7 0 0
#' #> Duster 360 14.3 0 0
#' #> Merc 450SE 16.4 0 0
#' #> Merc 450SL 17.3 0 0
#' ```
#'
#' While `data_match()` works with data frames to match conditions against,
#' `data_filter()` is basically a wrapper around `subset(subset = <filter>)`.
#' However, unlike `subset()`, it preserves label attributes and is useful when
#' working with labelled data.
#'
#' @examples
#' data_match(mtcars, data.frame(vs = 0, am = 1))
#' data_match(mtcars, data.frame(vs = 0, am = c(0, 1)))
#'
#' # observations where "vs" is NOT 0 AND "am" is NOT 1
#' data_match(mtcars, data.frame(vs = 0, am = 1), match = "not")
#' # equivalent to
#' data_filter(mtcars, vs != 0 & am != 1)
#'
#' # observations where EITHER "vs" is 0 OR "am" is 1
#' data_match(mtcars, data.frame(vs = 0, am = 1), match = "or")
#' # equivalent to
#' data_filter(mtcars, vs == 0 | am == 1)
#'
#' # slice data frame by row indices
#' data_filter(mtcars, 5:10)
#'
#' # Define a custom function containing data_filter()
#' my_filter <- function(data, variable) {
#' data_filter(data, variable)
#' }
#' my_filter(mtcars, "cyl == 6")
#'
#' # Pass complete filter-condition as string.
#' my_filter <- function(data, condition) {
#' data_filter(data, condition)
#' }
#' my_filter(mtcars, "am != 0")
#'
#' # string can also be used directly as argument
#' data_filter(mtcars, "am != 0")
#'
#' # or as variable
#' fl <- "am != 0"
#' data_filter(mtcars, fl)
#' @inherit data_rename seealso
#' @export
data_match <- function(
  x,
  to,
  match = "and",
  return_indices = FALSE,
  remove_na = TRUE,
  ...
) {
  if (!is.data.frame(to)) {
    to <- as.data.frame(to)
  }
  original_x <- x
  # normalize the matching operator to one of "and", "or" or "not"
  match <- match.arg(
    tolower(match),
    c("and", "&", "&&", "or", "|", "||", "!", "not")
  )
  match <- switch(
    match,
    `&` = ,
    `&&` = ,
    and = "and",
    `!` = ,
    not = "not",
    "or"
  )
  # validation check: matching requires at least one shared column
  shared_columns <- intersect(colnames(x), colnames(to))
  if (is.null(shared_columns) || length(shared_columns) == 0L) {
    insight::format_error(
      "None of the columns from the data frame with matching conditions were found in `x`."
    )
  }
  # only select common columns
  x <- x[shared_columns]
  # initialize row indices: "or" starts empty and grows, while "and" and
  # "not" start with all rows and get filtered down
  if (identical(match, "or")) {
    idx <- vector("numeric", length = 0L)
  } else {
    # remove missings before matching
    if (isTRUE(remove_na)) {
      x <- x[stats::complete.cases(x), , drop = FALSE]
    }
    idx <- seq_len(nrow(x))
  }
  # find matching rows, condition by condition
  for (col in names(to)) {
    column_values <- x[[col]]
    idx <- switch(
      match,
      or = union(idx, which(column_values %in% to[[col]])),
      not = idx[!column_values[idx] %in% to[[col]]],
      idx[column_values[idx] %in% to[[col]]]
    )
  }
  # prepare output
  if (isFALSE(return_indices)) {
    out <- original_x[idx, , drop = FALSE]
    # restore value and variable labels
    for (col in colnames(out)) {
      attr(out[[col]], "label") <- attr(original_x[[col]], "label", exact = TRUE)
      attr(out[[col]], "labels") <- attr(original_x[[col]], "labels", exact = TRUE)
    }
  } else {
    out <- idx
  }
  # add back custom attributes
  .replace_attrs(out, attributes(original_x))
}
#' @rdname data_match
#' @export
data_filter <- function(x, ...) {
  # S3 generic; dispatches to the data.frame or grouped_df method
  UseMethod("data_filter")
}
#' @export
data_filter.data.frame <- function(x, ...) {
  out <- x
  # convert tibble to data.frame
  if (inherits(x, "tbl_df")) {
    out <- as.data.frame(out, stringsAsFactors = FALSE)
    tbl_input <- TRUE
  } else {
    tbl_input <- FALSE
  }
  # capture filter expressions unevaluated
  dots <- match.call(expand.dots = FALSE)[["..."]]
  # named arguments indicate "=" was used where "==" was probably intended
  if (any(nzchar(names(dots), keepNA = TRUE))) {
    insight::format_error(
      "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison."
    )
  }
  # turn character vector (like `c("mpg <= 20", "cyl == 6")`) into symbols
  if (length(dots) == 1) {
    character_vector <- .dynEval(dots[[1]], ifnotfound = NULL)
    if (is.character(character_vector) && length(character_vector) > 1) {
      dots <- lapply(character_vector, str2lang)
    }
  }
  # Check syntax of the filter. Must be done *before* calling subset()
  # (cf easystats/datawizard#237)
  for (.fcondition in dots) {
    .check_filter_syntax(insight::safe_deparse(.fcondition))
  }
  # apply each filter condition (or row index vector) in turn
  for (i in seq_along(dots)) {
    # only proceed when result is still valid
    if (!is.null(out)) {
      symbol <- dots[[i]]
      # evaluate, we may have a variable with filter expression
      eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
      # validation check: is variable named like a function?
      if (is.function(eval_symbol)) {
        eval_symbol <- .dynGet(symbol, ifnotfound = NULL)
      }
      eval_symbol_numeric <- NULL
      if (!is.null(eval_symbol)) {
        # when possible to evaluate, do we have a numeric vector provided
        # as string? (e.g. `"5:10"`) - then try to coerce to numeric
        eval_symbol_numeric <- tryCatch(
          eval(parse(text = eval_symbol)),
          error = function(e) NULL
        )
      }
      # here we go when we have a filter expression, and no numeric vector to slice
      if (
        is.null(eval_symbol) ||
          (!is.numeric(eval_symbol) && !is.numeric(eval_symbol_numeric))
      ) {
        # could be evaluated? Then filter expression is a string and we need
        # to convert into symbol
        if (is.character(eval_symbol)) {
          symbol <- str2lang(eval_symbol)
        }
        # filter data; errors/warnings are captured and inspected below
        out <- tryCatch(
          subset(out, subset = eval(symbol, envir = new.env())),
          warning = function(e) e,
          error = function(e) e
        )
      } else if (is.numeric(eval_symbol)) {
        # if symbol could be evaluated and is numeric, slice
        out <- tryCatch(out[eval_symbol, , drop = FALSE], error = function(e) {
          NULL
        })
      } else if (is.numeric(eval_symbol_numeric)) {
        # if symbol could be evaluated, was string and could be converted to numeric, slice
        out <- tryCatch(
          out[eval_symbol_numeric, , drop = FALSE],
          error = function(e) NULL
        )
      }
      # captured subset() errors: try to give a more informative message
      if (inherits(out, c("simpleError", "objectNotFoundError"))) {
        error_msg <- out$message[1]
        # try to find out which variable was the cause for the error
        if (grepl("object '(.*)' not found", error_msg)) {
          error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
          # some syntax errors do not relate to misspelled variables...
          if (!error_var %in% colnames(x)) {
            insight::format_error(
              paste0(
                "Variable \"",
                error_var,
                "\" was not found in the dataset."
              ),
              .misspelled_string(colnames(x), error_var, "Possibly misspelled?")
            )
          }
        }
        out <- NULL
      }
    }
  }
  # all conditions failed, or the final result is invalid
  if (is.null(out)) {
    insight::format_error(
      "Filtering did not work. Please check the syntax of your conditions."
    )
  }
  # restore value and variable labels
  for (i in colnames(out)) {
    attr(out[[i]], "label") <- attr(x[[i]], "label", exact = TRUE)
    attr(out[[i]], "labels") <- attr(x[[i]], "labels", exact = TRUE)
  }
  # add back custom attributes
  out <- .replace_attrs(out, attributes(x))
  # add back tidyverse attributes
  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }
  out
}
#' @export
data_filter.grouped_df <- function(x, ...) {
  original_x <- x
  # extract the row indices of each group
  grps <- attr(x, "groups", exact = TRUE)
  grps <- grps[[".rows"]]
  # Remove tidyverse attributes, will add them back at the end
  if (inherits(x, "tbl_df")) {
    tbl_input <- TRUE
    x <- as.data.frame(x, stringsAsFactors = FALSE)
  } else {
    tbl_input <- FALSE
  }
  # capture filter expressions unevaluated, so they can be re-used per group
  dots <- match.call(expand.dots = FALSE)[["..."]]
  # apply the data.frame method separately to each group's rows
  out <- lapply(grps, function(grp) {
    arguments <- list(x[grp, ])
    arguments <- c(arguments, dots)
    do.call("data_filter.data.frame", arguments)
  })
  out <- do.call(rbind, out)
  if (!insight::object_has_rownames(x)) {
    rownames(out) <- NULL
  }
  # add back tidyverse attributes
  if (isTRUE(tbl_input)) {
    class(out) <- c("tbl_df", "tbl", "data.frame")
  }
  # add back custom attributes
  out <- .replace_attrs(out, attributes(original_x))
  out
}
# helper -------------------
.check_filter_syntax <- function(.fcondition) {
  # NOTE: We cannot check for `=` when "filter" is not a character vector
  # because the function will then fail in general. I.e.,
  # "data_filter(mtcars, filter = mpg > 10 & cyl = 4)" will not start
  # running this function and never reaches the first code line,
  # but immediately stops...
  # strip all valid comparison operators first, so that any remaining "="
  # is most likely a mistyped "=="
  stripped <- .fcondition
  for (comparison in c("==", "<=", ">=", "!=")) {
    stripped <- gsub(comparison, "", stripped, fixed = TRUE)
  }
  # We want to check whether user used a "=" in the filter syntax. This
  # typically indicates that the comparison "==" is probably wrong by using
  # a "=" instead of `"=="`. However, if a function was provided, we indeed
  # may have "=", e.g. if the pattern was
  # `data_filter(out, grep("pattern", x = value))`. We thus first check if we
  # can identify a function call, and only continue checking for wrong syntax
  # when we have not identified a function.
  called_function <- tryCatch(
    get(gsub("^(.*?)\\((.*)", "\\1", stripped)),
    error = function(e) NULL
  )
  if (!is.function(called_function)) {
    # Give more informative message to users
    # about possible misspelled comparisons / logical conditions
    # check if "=" instead of "==" was used?
    if (any(grepl("=", stripped, fixed = TRUE))) {
      insight::format_error(
        "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison."
      )
    }
    # check if "&&" etc instead of "&" was used?
    logical_operator <- NULL
    if (any(grepl("&&", .fcondition, fixed = TRUE))) {
      logical_operator <- "&&"
    }
    if (any(grepl("||", .fcondition, fixed = TRUE))) {
      logical_operator <- "||"
    }
    if (!is.null(logical_operator)) {
      insight::format_error(
        paste0(
          "Filtering did not work. Please check if you need `",
          substr(logical_operator, 0, 1),
          "` (instead of `",
          logical_operator,
          "`) as logical operator."
        )
      )
    }
  }
}
================================================
FILE: R/data_merge.R
================================================
#' @title Merge (join) two data frames, or a list of data frames
#' @name data_merge
#'
#' @description
#' Merge (join) two data frames, or a list of data frames. However, unlike
#' base R's `merge()`, `data_merge()` offers a few more methods to join data
#' frames, and it does not drop data frame nor column attributes.
#'
#' @param x,y A data frame to merge. `x` may also be a list of data frames
#' that will be merged. Note that the list-method has no `y` argument.
#' @param join Character vector, indicating the method of joining the data frames.
#' Can be `"full"`, `"left"` (default), `"right"`, `"inner"`, `"anti"`, `"semi"`
#' or `"bind"`. See details below.
#' @param by Specifications of the columns used for merging.
#' @param id Optional name for ID column that will be created to indicate the
#' source data frames for appended rows. Only applies if `join = "bind"`.
#' @param verbose Toggle warnings.
#' @param ... Not used.
#'
#' @return
#' A merged data frame.
#'
#' @section Merging data frames:
#'
#' Merging data frames is performed by adding rows (cases), columns
#' (variables) or both from the source data frame (`y`) to the target
#' data frame (`x`). This usually requires one or more variables which
#' are included in both data frames and that are used for merging, typically
#' indicated with the `by` argument. When `by` contains a variable present
#' in both data frames, cases are matched and filtered by identical values
#' of `by` in `x` and `y`.
#'
#' @section Left- and right-joins:
#'
#' Left- and right joins usually don't add new rows (cases), but only new
#' columns (variables) for existing cases in `x`. For `join = "left"` or
#' `join = "right"` to work, `by` *must* indicate one or more columns that
#' are included in both data frames. For `join = "left"`, if `by` is an
#' identifier variable, which is included in both `x` and `y`, all variables
#' from `y` are copied to `x`, but only those cases from `y` that have
#' matching values in their identifier variable in `x` (i.e. all cases
#' in `x` that are also found in `y` get the related values from the new
#' columns in `y`). If there is no match between identifiers in `x` and `y`,
#' the copied variable from `y` will get a `NA` value for this particular
#' case. Other variables that occur both in `x` and `y`, but are not used
#' as identifiers (with `by`), will be renamed to avoid multiple identical
#' variable names. Cases in `y` where values from the identifier have no
#' match in `x`'s identifier are removed. `join = "right"` works in
#' a similar way as `join = "left"`, just that only cases from `x` that
#' have matching values in their identifier variable in `y` are chosen.
#'
#' In base R, these are equivalent to `merge(x, y, all.x = TRUE)` and
#' `merge(x, y, all.y = TRUE)`.
#'
#' @section Full joins:
#'
#' Full joins copy all cases from `y` to `x`. For matching cases in both
#' data frames, values for new variables are copied from `y` to `x`. For
#' cases in `y` not present in `x`, these will be added as new rows to `x`.
#' Thus, full joins not only add new columns (variables), but also might
#' add new rows (cases).
#'
#' In base R, this is equivalent to `merge(x, y, all = TRUE)`.
#'
#' @section Inner joins:
#'
#' Inner joins merge two data frames; however, only those rows (cases)
#' that are present in both data frames are kept. Thus, inner joins usually
#' add new columns (variables), but also remove rows (cases) that only
#' occur in one data frame.
#'
#' In base R, this is equivalent to `merge(x, y)`.
#'
#' @section Binds:
#'
#' `join = "bind"` row-binds the complete second data frame `y` to `x`.
#' Unlike simple `rbind()`, which requires the same columns for both data
#' frames, `join = "bind"` will bind shared columns from `y` to `x`, and
#' add new columns from `y` to `x`.
#'
#' @examples
#'
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4)
#'
#' x
#' y
#'
#' # "by" will default to all shared columns, i.e. "c" and "id". new columns
#' # "d" and "e" will be copied from "y" to "x", but there are only two cases
#' # in "x" that have the same values for "c" and "id" in "y". only those cases
#' # have values in the copied columns, the other case gets "NA".
#' data_merge(x, y, join = "left")
#'
#' # we change the id-value here
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 3:5)
#'
#' x
#' y
#'
#' # no cases in "y" have the same matching "c" and "id" as in "x", thus
#' # copied variables from "y" to "x" copy no values, all get NA.
#' data_merge(x, y, join = "left")
#'
#' # one case in "y" has a match in "id" with "x", thus values for this
#' # case from the remaining variables in "y" are copied to "x", all other
#' # values (cases) in those remaining variables get NA
#' data_merge(x, y, join = "left", by = "id")
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, 4:6]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, add new variables and copy values from y to x,
#' # where "id" values match
#' data_merge(x, y)
#'
#' # right-join, add new variables and copy values from x to y,
#' # where "id" values match
#' data_merge(x, y, join = "right")
#'
#' # full-join
#' data_merge(x, y, join = "full")
#'
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, c(1, 4:5)]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, no matching rows (because columns "id" and "disp" are used)
#' # new variables get all NA values
#' data_merge(x, y)
#'
#' # one common value in "mpg", so one row from y is copied to x
#' data_merge(x, y, by = "mpg")
#'
#' # only keep rows with matching values in by-column
#' data_merge(x, y, join = "semi", by = "mpg")
#'
#' # only keep rows with non-matching values in by-column
#' data_merge(x, y, join = "anti", by = "mpg")
#'
#' # merge list of data frames. can be of different rows
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:31, 3:5]
#' z <- mtcars[11:18, c(1, 3:4, 6:8)]
#' x$id <- 1:5
#' y$id <- 4:7
#' z$id <- 3:10
#' data_merge(list(x, y, z), join = "bind", by = "id", id = "source")
#' @inherit data_rename seealso
#' @export
# S3 generic: dispatches to `data_merge.data.frame()` or `data_merge.list()`
data_merge <- function(x, ...) {
  UseMethod("data_merge")
}
#' @rdname data_merge
#' @export
data_join <- data_merge # `data_join()` is an alias for `data_merge()`
#' @rdname data_merge
#' @export
data_merge.data.frame <- function(
  x,
  y,
  join = "left",
  by = NULL,
  id = NULL,
  verbose = TRUE,
  ...
) {
  # remember original class of "x", so it can be restored on the result
  # (e.g. tibble or grouped-df classes)
  class_x <- class(x)
  # save variable attributes (e.g. variable/value labels), so they can be
  # re-attached after merging. For columns present in both data frames,
  # the attributes from "x" are kept.
  attr_x_vars <- lapply(x, attributes)
  attr_y_vars <- lapply(y, attributes)
  attr_vars <- c(
    attr_x_vars,
    attr_y_vars[names(attr_y_vars)[!names(attr_y_vars) %in% names(attr_x_vars)]]
  )
  # check join-argument ----------------------
  join <- match.arg(
    join,
    choices = c("full", "left", "right", "inner", "semi", "anti", "bind")
  )
  # check id-argument ----------------------
  all_columns <- union(colnames(x), colnames(y))
  if (join == "bind" && !is.null(id) && id %in% all_columns) {
    # ensure unique ID: make.unique() appends a "_<n>" suffix to the
    # requested id-name, because it clashes with an existing column
    id <- make.unique(c(all_columns, id), sep = "_")[length(all_columns) + 1]
    # and also tell user...
    if (isTRUE(verbose)) {
      insight::format_warning(
        sprintf(
          "Value of `id` already exists as column name. ID column was renamed to `%s`.",
          id
        )
      )
    }
  }
  # for "bind", tag every row with its source data frame (1 = x, 2 = y)
  if (!is.null(id) && join == "bind") {
    x[[id]] <- 1
    y[[id]] <- 2
  }
  # check merge columns ("by"-argument) ----------------------
  if (join != "bind") {
    # we need a value for "by". If not provided, use all shared column names
    if (is.null(by)) {
      by <- intersect(colnames(x), colnames(y))
    }
    # If not all column names specified in "by" are present, raise an
    # error that lists the columns missing from each data frame
    if (!all(by %in% colnames(x)) || !all(by %in% colnames(y))) {
      missing_in_x <- setdiff(by, colnames(x))
      missing_in_y <- setdiff(by, colnames(y))
      stop_message <- c(
        "Not all columns specified in `by` were found in the data frames.",
        if (length(missing_in_x) > 0L) {
          paste0(
            "Following columns are in `by` but absent in `x`: ",
            text_concatenate(missing_in_x)
          )
        },
        if (length(missing_in_y) > 0L) {
          paste0(
            "Following columns are in `by` but absent in `y`: ",
            text_concatenate(missing_in_y)
          )
        }
      )
      # NOTE(review): this error is only raised when `verbose = TRUE`; with
      # `verbose = FALSE`, execution continues with an invalid `by` and the
      # later merge() will likely fail - confirm whether this is intended
      if (isTRUE(verbose)) {
        insight::format_error(stop_message)
      }
    }
    # if still both data frames have no common columns, do a full join
    if (!length(by)) {
      if (isTRUE(verbose)) {
        insight::format_warning(
          "Found no matching columns in the data frames. Fully merging both data frames now.",
          "Note that this can lead to unintended results, because rows in `x` and `y` are possibly duplicated.",
          "You probably want to use `data_merge(x, y, join = \"bind\")` instead."
        )
      }
      by <- NULL
      join <- "full"
    }
  }
  # check valid combination of "join" and "by" -----------------------
  # semi/anti joins filter rows of "x" by a single key column only
  if (join %in% c("anti", "semi") && (is.null(by) || length(by) != 1)) {
    insight::format_error(
      sprintf(
        "For `join = \"%s\"`, `by` needs to be a name of only one variable that is present in both data frames.",
        join
      )
    )
  }
  # merge --------------------
  # add temporary row-indices, for later sorting (merge() does not preserve
  # the original row order). IDs in "y" continue after the last ID of "x".
  if (join != "bind") {
    if (nrow(x) > 0L) {
      x$.data_merge_id_x <- seq_len(nrow(x))
    }
    if (nrow(y) > 0L) {
      y$.data_merge_id_y <- (seq_len(nrow(y))) + nrow(x)
    }
  }
  all_columns <- union(colnames(x), colnames(y))
  out <- switch(
    join,
    full = merge(x, y, all = TRUE, sort = FALSE, by = by),
    left = merge(x, y, all.x = TRUE, sort = FALSE, by = by),
    right = merge(x, y, all.y = TRUE, sort = FALSE, by = by),
    inner = merge(x, y, sort = FALSE, by = by),
    semi = x[x[[by]] %in% y[[by]], , drop = FALSE],
    anti = x[!x[[by]] %in% y[[by]], , drop = FALSE],
    bind = .bind_data_frames(x, y)
  )
  # sort rows, add attributes, and return results -------------------------
  if (".data_merge_id_x" %in% colnames(out)) {
    # for full joins, we have no complete sorting id, but NAs for each
    # data frame. we now "merge" the two sorting IDs from each data frame.
    if (anyNA(out$.data_merge_id_x) && ".data_merge_id_y" %in% colnames(out)) {
      out$.data_merge_id_x[is.na(
        out$.data_merge_id_x
      )] <- out$.data_merge_id_y[is.na(out$.data_merge_id_x)]
    }
    # restore original row order and drop the helper columns
    out <- out[order(out$.data_merge_id_x), ]
    out$.data_merge_id_x <- NULL
    out$.data_merge_id_y <- NULL
  }
  # try to restore original column order as good as possible. Therefore, we
  # first take all column names of the original input data frames, then
  # we add all new columns, like duplicated from merging (name.x and name.y,
  # if "name" was in both data frames, but not used in "by"), and then do a
  # final check that all column names are present in "out" (e.g., "name"
  # would no longer be there if we have "name.x" and "name.y").
  all_columns <- c(all_columns, setdiff(colnames(out), all_columns))
  all_columns <- all_columns[all_columns %in% colnames(out)]
  out <- out[all_columns]
  # add back data frame attributes; "x" is applied last, so presumably its
  # attributes take precedence - confirm `.replace_attrs()` semantics
  out <- .replace_attrs(out, attributes(y))
  out <- .replace_attrs(out, attributes(x))
  # restore column attributes (labels etc.) that merge() may have dropped
  for (i in colnames(out)) {
    if (is.list(attr_vars[[i]])) {
      if (is.list(attributes(out[[i]]))) {
        attributes(out[[i]]) <- utils::modifyList(
          attr_vars[[i]],
          attributes(out[[i]])
        )
      } else {
        attributes(out[[i]]) <- attr_vars[[i]]
      }
    }
  }
  # restore original class of "x"
  class(out) <- unique(c(class_x, "data.frame"))
  out
}
#' @rdname data_merge
#' @export
data_merge.list <- function(
  x,
  join = "left",
  by = NULL,
  id = NULL,
  verbose = TRUE,
  ...
) {
  # Merge a list of data frames by successively merging each element into
  # the accumulated result. The "id" column (for `join = "bind"`) is added
  # once at the very end, so it tracks which list element each row came from.
  out <- x[[1]]
  # running source-id per row; element 1 contributes the first rows
  df_id <- rep(1, times = nrow(out))
  # iterate over the remaining list elements. `seq_along(x)[-1]` is empty
  # for a one-element list, whereas the former `2:length(x)` would wrongly
  # iterate over c(2, 1) and fail with an out-of-bounds subscript.
  for (i in seq_along(x)[-1]) {
    out <- data_merge(
      out,
      x[[i]],
      join = join,
      by = by,
      id = NULL, # id-column is handled below, for the complete list
      verbose = verbose,
      ...
    )
    df_id <- c(df_id, rep(i, times = nrow(x[[i]])))
  }
  # we need separate handling for list of data frames and id-variable here
  if (!is.null(id) && join == "bind") {
    if (id %in% colnames(out)) {
      # ensure unique ID column name (append "_<n>" suffix on clash)
      id <- make.unique(c(colnames(out), id), sep = "_")[
        length(colnames(out)) + 1
      ]
      # and also tell user...
      if (isTRUE(verbose)) {
        insight::format_warning(
          sprintf(
            "Value of `id` already exists as column name. ID column was renamed to `%s`.",
            id
          )
        )
      }
    }
    out[[id]] <- df_id
  }
  out
}
.bind_data_frames <- function(x, y) {
  # Row-bind two data frames. When both have exactly the same set of
  # columns, plain rbind() is used, which is faster than merge();
  # otherwise the frames are full-joined on their shared columns.
  identical_columns <- all(colnames(x) %in% colnames(y)) && ncol(x) == ncol(y)
  if (identical_columns) {
    # column order may differ in "y", so align it to "x" first
    result <- rbind(x, y[match(colnames(x), colnames(y))])
  } else {
    # temporary row-index, so the original case order can be restored
    # after merging; indices in "y" continue after the last row of "x"
    n_x <- nrow(x)
    n_y <- nrow(y)
    if (n_x > 0L) {
      x$.data_merge_row <- seq_len(n_x)
    }
    if (n_y > 0L) {
      y$.data_merge_row <- (n_x + 1):(n_x + n_y)
    }
    shared_columns <- intersect(colnames(x), colnames(y))
    result <- merge(x, y, all = TRUE, sort = FALSE, by = shared_columns)
  }
  # for empty df's, merge() may return an empty character vector -
  # make sure a data frame object is returned
  if (!is.data.frame(result)) {
    result <- as.data.frame(result)
  }
  # restore original row order and drop the helper column
  if (".data_merge_row" %in% colnames(result)) {
    result <- result[order(result$.data_merge_row), ]
  }
  result$.data_merge_row <- NULL
  result
}
================================================
FILE: R/data_modify.R
================================================
#' Create new variables in a data frame
#'
#' Create new variables or modify existing variables in a data frame. Unlike `base::transform()`, `data_modify()`
#' can be used on grouped data frames, and newly created variables can be directly
#' used.
#'
#' @param data A data frame
#' @param ... One or more expressions that define the new variable name and the
#' values or recoding of those new variables. These expressions can be one of:
#' - A sequence of named, literal expressions, where the left-hand side refers
#' to the name of the new variable, while the right-hand side represent the
#' values of the new variable. Example: `Sepal.Width = center(Sepal.Width)`.
#' - A vector of length 1 (which will be recycled to match the number of rows
#' in the data), or of same length as the data.
#' - A variable that contains a value to be used. Example:
#' ```r
#' a <- "abc"
#' data_modify(iris, var_abc = a) # var_abc contains "abc"
#' ```
#' - An expression can also be provided as string and wrapped in
#' `as_expr()`. Example:
#' ```r
#' data_modify(iris, as_expr("Sepal.Width = center(Sepal.Width)"))
#' # or
#' a <- "center(Sepal.Width)"
#' data_modify(iris, Sepal.Width = as_expr(a))
#' # or
#' a <- "Sepal.Width = center(Sepal.Width)"
#' data_modify(iris, as_expr(a))
#' ```
#'     Note that `as_expr()` is not a real function and cannot be used outside
#'     of `data_modify()`; hence, it is neither exported nor documented. Rather,
#'     it is only used internally for processing expressions.
#' - Using `NULL` as right-hand side removes a variable from the data frame.
#' Example: `Petal.Width = NULL`.
#' - For data frames (including grouped ones), the function `n()` can be used to
#' count the number of observations and thereby, for instance, create index
#' values by using `id = 1:n()` or `id = 3:(n()+2)` and similar. Note that,
#'   like `as_expr()`, `n()` is also not a true function and cannot be used outside
#' of `data_modify()`.
#'
#' Note that newly created variables can be used in subsequent expressions,
#' including `.at` or `.if`. See also 'Examples'.
#'
#' @param .at A character vector of variable names that should be modified. This
#' argument is used in combination with the `.modify` argument. Note that only one
#' of `.at` or `.if` can be provided, but not both at the same time. Newly created
#' variables in `...` can also be selected, see 'Examples'.
#' @param .if A function that returns `TRUE` for columns in the data frame where
#' `.if` applies. This argument is used in combination with the `.modify` argument.
#' Note that only one of `.at` or `.if` can be provided, but not both at the same
#' time. Newly created variables in `...` can also be selected, see 'Examples'.
#' @param .modify A function that modifies the variables defined in `.at` or `.if`.
#' This argum
gitextract_nyyro61y/
├── .Rbuildignore
├── .dev/
│ ├── _BENCHMARK_RESHAPE.R
│ ├── html5.R
│ ├── revdepcheck.R
│ ├── test-value_at.R
│ └── value_at.R
├── .git-blame-ignore-revs
├── .github/
│ ├── .gitignore
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── FUNDING.yml
│ ├── SUPPORT.md
│ ├── dependabot.yaml
│ └── workflows/
│ ├── R-CMD-check-hard.yaml
│ ├── R-CMD-check.yaml
│ ├── check-all-examples.yaml
│ ├── check-link-rot.yaml
│ ├── check-random-test-order.yaml
│ ├── check-readme.yaml
│ ├── check-spelling.yaml
│ ├── check-styling.yaml
│ ├── check-test-warnings.yaml
│ ├── check-vignette-warnings.yaml
│ ├── html-5-check.yaml
│ ├── lint-changed-files.yaml
│ ├── lint.yaml
│ ├── pkgdown-no-suggests.yaml
│ ├── pkgdown.yaml
│ ├── test-coverage-examples.yaml
│ ├── test-coverage.yaml
│ └── update-to-latest-easystats.yaml
├── .gitignore
├── .lintr
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│ ├── adjust.R
│ ├── assign_labels.R
│ ├── categorize.R
│ ├── center.R
│ ├── contrs.R
│ ├── convert_na_to.R
│ ├── convert_to_na.R
│ ├── data.R
│ ├── data_addprefix.R
│ ├── data_arrange.R
│ ├── data_codebook.R
│ ├── data_duplicated.R
│ ├── data_extract.R
│ ├── data_group.R
│ ├── data_match.R
│ ├── data_merge.R
│ ├── data_modify.R
│ ├── data_partition.R
│ ├── data_peek.R
│ ├── data_read.R
│ ├── data_relocate.R
│ ├── data_remove.R
│ ├── data_rename.R
│ ├── data_replicate.R
│ ├── data_rescale.R
│ ├── data_restoretype.R
│ ├── data_reverse.R
│ ├── data_rotate.R
│ ├── data_seek.R
│ ├── data_select.R
│ ├── data_separate.R
│ ├── data_summary.R
│ ├── data_tabulate.R
│ ├── data_to_long.R
│ ├── data_to_wide.R
│ ├── data_unique.R
│ ├── data_unite.R
│ ├── data_write.R
│ ├── data_xtabulate.R
│ ├── datawizard-package.R
│ ├── demean.R
│ ├── describe_distribution.R
│ ├── descriptives.R
│ ├── extract_column_names.R
│ ├── format.R
│ ├── labels_to_levels.R
│ ├── makepredictcall.R
│ ├── mean_sd.R
│ ├── means_by_group.R
│ ├── normalize.R
│ ├── ranktransform.R
│ ├── recode_into.R
│ ├── recode_values.R
│ ├── remove_empty.R
│ ├── replace_nan_inf.R
│ ├── rescale_weights.R
│ ├── reshape_ci.R
│ ├── row_count.R
│ ├── row_means.R
│ ├── select_nse.R
│ ├── skewness_kurtosis.R
│ ├── slide.R
│ ├── smoothness.R
│ ├── standardize.R
│ ├── standardize.models.R
│ ├── text_format.R
│ ├── to_factor.R
│ ├── to_numeric.R
│ ├── unnormalize.R
│ ├── unstandardize.R
│ ├── utils-cols.R
│ ├── utils-rows.R
│ ├── utils.R
│ ├── utils_labels.R
│ ├── utils_standardize_center.R
│ ├── visualisation_recipe.R
│ ├── weighted_mean_median_sd_mad.R
│ └── winsorize.R
├── README.Rmd
├── README.md
├── air.toml
├── cran-comments.md
├── data/
│ ├── efc.RData
│ └── nhanes_sample.RData
├── datawizard.Rproj
├── datawizard.code-workspace
├── inst/
│ ├── CITATION
│ └── WORDLIST
├── man/
│ ├── adjust.Rd
│ ├── as.prop.table.Rd
│ ├── assign_labels.Rd
│ ├── categorize.Rd
│ ├── center.Rd
│ ├── coef_var.Rd
│ ├── coerce_to_numeric.Rd
│ ├── colnames.Rd
│ ├── contr.deviation.Rd
│ ├── convert_na_to.Rd
│ ├── convert_to_na.Rd
│ ├── data_arrange.Rd
│ ├── data_codebook.Rd
│ ├── data_duplicated.Rd
│ ├── data_extract.Rd
│ ├── data_group.Rd
│ ├── data_match.Rd
│ ├── data_merge.Rd
│ ├── data_modify.Rd
│ ├── data_partition.Rd
│ ├── data_peek.Rd
│ ├── data_prefix_suffix.Rd
│ ├── data_read.Rd
│ ├── data_relocate.Rd
│ ├── data_rename.Rd
│ ├── data_replicate.Rd
│ ├── data_restoretype.Rd
│ ├── data_rotate.Rd
│ ├── data_seek.Rd
│ ├── data_separate.Rd
│ ├── data_summary.Rd
│ ├── data_tabulate.Rd
│ ├── data_to_long.Rd
│ ├── data_to_wide.Rd
│ ├── data_unique.Rd
│ ├── data_unite.Rd
│ ├── datawizard-package.Rd
│ ├── demean.Rd
│ ├── describe_distribution.Rd
│ ├── distribution_mode.Rd
│ ├── efc.Rd
│ ├── extract_column_names.Rd
│ ├── labels_to_levels.Rd
│ ├── makepredictcall.dw_transformer.Rd
│ ├── mean_sd.Rd
│ ├── means_by_group.Rd
│ ├── nhanes_sample.Rd
│ ├── normalize.Rd
│ ├── ranktransform.Rd
│ ├── recode_into.Rd
│ ├── recode_values.Rd
│ ├── reexports.Rd
│ ├── remove_empty.Rd
│ ├── replace_nan_inf.Rd
│ ├── rescale.Rd
│ ├── rescale_weights.Rd
│ ├── reshape_ci.Rd
│ ├── reverse.Rd
│ ├── row_count.Rd
│ ├── row_means.Rd
│ ├── rownames.Rd
│ ├── skewness.Rd
│ ├── slide.Rd
│ ├── smoothness.Rd
│ ├── standardize.Rd
│ ├── standardize.default.Rd
│ ├── text_format.Rd
│ ├── to_factor.Rd
│ ├── to_numeric.Rd
│ ├── visualisation_recipe.Rd
│ ├── weighted_mean.Rd
│ └── winsorize.Rd
├── paper/
│ └── JOSS_files/
│ ├── apa.csl
│ ├── paper.Rmd
│ ├── paper.bib
│ ├── paper.log
│ └── paper.md
├── pkgdown/
│ └── _pkgdown.yaml
├── tests/
│ ├── testthat/
│ │ ├── _snaps/
│ │ │ ├── categorize.md
│ │ │ ├── contr.deviation.md
│ │ │ ├── data_codebook.md
│ │ │ ├── data_modify.md
│ │ │ ├── data_partition.md
│ │ │ ├── data_peek.md
│ │ │ ├── data_read.md
│ │ │ ├── data_rescale.md
│ │ │ ├── data_seek.md
│ │ │ ├── data_separate.md
│ │ │ ├── data_summary.md
│ │ │ ├── data_tabulate.md
│ │ │ ├── data_to_factor.md
│ │ │ ├── data_to_long.md
│ │ │ ├── data_to_numeric.md
│ │ │ ├── demean.md
│ │ │ ├── describe_distribution.md
│ │ │ ├── empty-dataframe.md
│ │ │ ├── means_by_group.md
│ │ │ ├── normalize.md
│ │ │ ├── print.dw_transformer.md
│ │ │ ├── ranktransform.md
│ │ │ ├── rescale_weights.md
│ │ │ ├── reshape_ci.md
│ │ │ ├── skewness-kurtosis.md
│ │ │ ├── smoothness.md
│ │ │ ├── text_format.md
│ │ │ ├── windows/
│ │ │ │ └── means_by_group.md
│ │ │ └── winsorization.md
│ │ ├── helper-state.R
│ │ ├── helper.R
│ │ ├── test-adjust.R
│ │ ├── test-assign_labels.R
│ │ ├── test-attributes-grouped-df.R
│ │ ├── test-attributes.R
│ │ ├── test-categorize.R
│ │ ├── test-center.R
│ │ ├── test-coef_var.R
│ │ ├── test-contr.deviation.R
│ │ ├── test-convert_na_to.R
│ │ ├── test-convert_to_na.R
│ │ ├── test-data_addprefix.R
│ │ ├── test-data_arrange.R
│ │ ├── test-data_codebook.R
│ │ ├── test-data_duplicated.R
│ │ ├── test-data_extract.R
│ │ ├── test-data_group.R
│ │ ├── test-data_match.R
│ │ ├── test-data_merge.R
│ │ ├── test-data_modify.R
│ │ ├── test-data_partition.R
│ │ ├── test-data_peek.R
│ │ ├── test-data_read.R
│ │ ├── test-data_recode.R
│ │ ├── test-data_relocate.R
│ │ ├── test-data_remove.R
│ │ ├── test-data_rename.R
│ │ ├── test-data_reorder.R
│ │ ├── test-data_replicate.R
│ │ ├── test-data_rescale.R
│ │ ├── test-data_restoretype.R
│ │ ├── test-data_reverse.R
│ │ ├── test-data_rotate.R
│ │ ├── test-data_seek.R
│ │ ├── test-data_select.R
│ │ ├── test-data_separate.R
│ │ ├── test-data_shift.R
│ │ ├── test-data_summary.R
│ │ ├── test-data_tabulate.R
│ │ ├── test-data_to_factor.R
│ │ ├── test-data_to_long.R
│ │ ├── test-data_to_numeric.R
│ │ ├── test-data_to_wide.R
│ │ ├── test-data_unique.R
│ │ ├── test-data_unite.R
│ │ ├── test-data_write.R
│ │ ├── test-demean.R
│ │ ├── test-describe_distribution.R
│ │ ├── test-distributions.R
│ │ ├── test-empty-dataframe.R
│ │ ├── test-extract_column_names.R
│ │ ├── test-labelled_data.R
│ │ ├── test-labels_to_levels.R
│ │ ├── test-makepredictcall.R
│ │ ├── test-mean_sd.R
│ │ ├── test-means_by_group.R
│ │ ├── test-normalize.R
│ │ ├── test-print.dw_transformer.R
│ │ ├── test-ranktransform.R
│ │ ├── test-recode_into.R
│ │ ├── test-replace_nan_inf.R
│ │ ├── test-rescale_weights.R
│ │ ├── test-reshape_ci.R
│ │ ├── test-row_count.R
│ │ ├── test-row_means.R
│ │ ├── test-select_nse.R
│ │ ├── test-skewness-kurtosis.R
│ │ ├── test-smoothness.R
│ │ ├── test-standardize-data.R
│ │ ├── test-standardize_datagrid.R
│ │ ├── test-standardize_models.R
│ │ ├── test-std_center.R
│ │ ├── test-std_center_scale_args.R
│ │ ├── test-text_format.R
│ │ ├── test-unnormalize.R
│ │ ├── test-utils.R
│ │ ├── test-utils_cols.R
│ │ ├── test-utils_rows.R
│ │ ├── test-weighted-stats.R
│ │ └── test-winsorization.R
│ └── testthat.R
└── vignettes/
├── .gitignore
├── bibliography.bib
├── overview_of_vignettes.Rmd
├── selection_syntax.Rmd
├── standardize_data.Rmd
└── tidyverse_translation.Rmd
Condensed preview — 321 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,012K chars).
[
{
"path": ".Rbuildignore",
"chars": 806,
"preview": "^\\cache$\n^codemeta\\.json$\n^Meta$\n^doc$\n^.*\\.Rproj$\n^\\.Rproj\\.user$\n^README\\.Rmd$\n^Rplots.pdf$\n^README-.*\\.png$\n^CONDUCT\\"
},
{
"path": ".dev/_BENCHMARK_RESHAPE.R",
"chars": 5787,
"preview": "library(tidyr)\nlibrary(dplyr)\nlibrary(datawizard)\n\n### DATA_TO_LONG ==========================================\n\n\n# SLOW "
},
{
"path": ".dev/html5.R",
"chars": 393,
"preview": "Sys.setenv(\"_R_CHECK_RD_VALIDATE_RD2HTML_\" = \"true\")\nSys.setenv(\"_R_CHECK_CRAN_INCOMING_REMOTE_\" = \"false\")\nSys.setenv(\""
},
{
"path": ".dev/revdepcheck.R",
"chars": 83,
"preview": "library(revdepcheck)\n\nrevdep_check(num_workers = 4)\nrevdep_report()\nrevdep_reset()\n"
},
{
"path": ".dev/test-value_at.R",
"chars": 675,
"preview": "test_that(\"value_at\", {\n data(efc, package = \"datawizard\")\n expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRU"
},
{
"path": ".dev/value_at.R",
"chars": 1610,
"preview": "#' @title Find the value(s) at a specific position in a variable\n#' @name value_at\n#'\n#' @description This function can "
},
{
"path": ".git-blame-ignore-revs",
"chars": 57,
"preview": "# Air formatting\n5bd245e0bc12d2eecbcfa480a231b6df3ab7d684"
},
{
"path": ".github/.gitignore",
"chars": 7,
"preview": "*.html\n"
},
{
"path": ".github/CODE_OF_CONDUCT.md",
"chars": 5254,
"preview": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participa"
},
{
"path": ".github/CONTRIBUTING.md",
"chars": 1782,
"preview": "# Contributing to datawizard\n\nThis outlines how to propose a change to **datawizard**. \n\n## Fixing typos\n\nSmall typos or"
},
{
"path": ".github/FUNDING.yml",
"chars": 65,
"preview": "# These are supported funding model platforms\n\ngithub: easystats\n"
},
{
"path": ".github/SUPPORT.md",
"chars": 1636,
"preview": "# Getting help with `{datawizard}`\n\nThanks for using `{datawizard}`. Before filing an issue, there are a few places\nto e"
},
{
"path": ".github/dependabot.yaml",
"chars": 171,
"preview": "version: 2\n\nupdates:\n # Keep dependencies for GitHub Actions up-to-date\n - package-ecosystem: \"github-actions\"\n dir"
},
{
"path": ".github/workflows/R-CMD-check-hard.yaml",
"chars": 717,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/R-CMD-check.yaml",
"chars": 543,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/check-all-examples.yaml",
"chars": 602,
"preview": "# Make sure all examples run successfully, even the ones that are not supposed\n# to be run or tested on CRAN machines by"
},
{
"path": ".github/workflows/check-link-rot.yaml",
"chars": 206,
"preview": "on:\n push:\n branches: [main, master]\n pull_request:\n branches: [main, master]\n\nname: check-link-rot\n\njobs:\n che"
},
{
"path": ".github/workflows/check-random-test-order.yaml",
"chars": 261,
"preview": "# Run tests in random order\non:\n push:\n branches: [main, master]\n pull_request:\n branches: [main, master]\n\nname:"
},
{
"path": ".github/workflows/check-readme.yaml",
"chars": 374,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/check-spelling.yaml",
"chars": 206,
"preview": "on:\n push:\n branches: [main, master]\n pull_request:\n branches: [main, master]\n\nname: check-spelling\n\njobs:\n che"
},
{
"path": ".github/workflows/check-styling.yaml",
"chars": 203,
"preview": "on:\n push:\n branches: [main, master]\n pull_request:\n branches: [main, master]\n\nname: check-styling\n\njobs:\n chec"
},
{
"path": ".github/workflows/check-test-warnings.yaml",
"chars": 285,
"preview": "# Running tests with options(warn = 2) to fail on test warnings\non:\n push:\n branches: [main, master]\n pull_request:"
},
{
"path": ".github/workflows/check-vignette-warnings.yaml",
"chars": 297,
"preview": "# Running tests with options(warn = 2) to fail on test warnings\non:\n push:\n branches: [main, master]\n pull_request:"
},
{
"path": ".github/workflows/html-5-check.yaml",
"chars": 373,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/lint-changed-files.yaml",
"chars": 354,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/lint.yaml",
"chars": 349,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/pkgdown-no-suggests.yaml",
"chars": 221,
"preview": "on:\n push:\n branches: [main, master]\n pull_request:\n branches: [main, master]\n\nname: pkgdown-no-suggests\n\njobs:\n"
},
{
"path": ".github/workflows/pkgdown.yaml",
"chars": 413,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/test-coverage-examples.yaml",
"chars": 403,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/test-coverage.yaml",
"chars": 376,
"preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
},
{
"path": ".github/workflows/update-to-latest-easystats.yaml",
"chars": 244,
"preview": "on:\n schedule:\n # Check for dependency updates once a month\n - cron: \"0 0 1 * *\"\n\nname: update-to-latest-easystat"
},
{
"path": ".gitignore",
"chars": 989,
"preview": "# History files\n.Rhistory\n.Rapp.history\n\n# Session Data files\n.RData\n\n# Example code in package build process\n*-Ex.R\n\n# "
},
{
"path": ".lintr",
"chars": 747,
"preview": "linters: all_linters(\n absolute_path_linter = NULL,\n assignment_linter = NULL,\n commented_code_linter = NULL,\n "
},
{
"path": "DESCRIPTION",
"chars": 2707,
"preview": "Type: Package\nPackage: datawizard\nTitle: Easy Data Wrangling and Statistical Transformations\nVersion: 1.3.1\nAuthors@R: c"
},
{
"path": "LICENSE",
"chars": 48,
"preview": "YEAR: 2023\nCOPYRIGHT HOLDER: datawizard authors\n"
},
{
"path": "LICENSE.md",
"chars": 1077,
"preview": "# MIT License\n\nCopyright (c) 2023 datawizard authors\n\nPermission is hereby granted, free of charge, to any person obtain"
},
{
"path": "NAMESPACE",
"chars": 10051,
"preview": "# Generated by roxygen2: do not edit by hand\n\nS3method(as.data.frame,datawizard_crosstabs)\nS3method(as.data.frame,datawi"
},
{
"path": "NEWS.md",
"chars": 39197,
"preview": "# datawizard 1.3.1\n\nCHANGES\n\n* `data_summary()` now allows expressions to return more than one summary\n value. For each"
},
{
"path": "R/adjust.R",
"chars": 7148,
"preview": "#' Adjust data for the effect of other variable(s)\n#'\n#' This function can be used to adjust the data for the effect of "
},
{
"path": "R/assign_labels.R",
"chars": 4679,
"preview": "#' @title Assign variable and value labels\n#' @name assign_labels\n#'\n#' @description\n#' Assign variable and values label"
},
{
"path": "R/categorize.R",
"chars": 14976,
"preview": "#' @title Recode (or \"cut\" / \"bin\") data into groups of values.\n#' @name categorize\n#'\n#' @description\n#' This functions"
},
{
"path": "R/center.R",
"chars": 9025,
"preview": "#' Centering (Grand-Mean Centering)\n#'\n#' Performs a grand-mean centering of data.\n#'\n#' @param x A (grouped) data frame"
},
{
"path": "R/contrs.R",
"chars": 4168,
"preview": "#' Deviation Contrast Matrix\n#'\n#' Build a deviation contrast matrix, a type of _effects contrast_ matrix.\n#'\n#' @inheri"
},
{
"path": "R/convert_na_to.R",
"chars": 4940,
"preview": "#' @title Replace missing values in a variable or a data frame.\n#' @name convert_na_to\n#'\n#' @description\n#' Replace mis"
},
{
"path": "R/convert_to_na.R",
"chars": 5555,
"preview": "#' @title Convert non-missing values in a variable into missing values.\n#' @name convert_to_na\n#'\n#' @description\n#' Con"
},
{
"path": "R/data.R",
"chars": 718,
"preview": "#' @docType data\n#' @title Sample dataset from the National Health and Nutrition Examination Survey\n#' @name nhanes_samp"
},
{
"path": "R/data_addprefix.R",
"chars": 1448,
"preview": "#' Add a prefix or suffix to column names\n#'\n#' @rdname data_prefix_suffix\n#' @inheritParams extract_column_names\n#' @pa"
},
{
"path": "R/data_arrange.R",
"chars": 3787,
"preview": "#' Arrange rows by column values\n#'\n#' `data_arrange()` orders the rows of a data frame by the values of selected\n#' col"
},
{
"path": "R/data_codebook.R",
"chars": 17237,
"preview": "#' Generate a codebook of a data frame.\n#'\n#' `data_codebook()` generates codebooks from data frames, i.e. overviews\n#' "
},
{
"path": "R/data_duplicated.R",
"chars": 2323,
"preview": "#' @title Extract all duplicates\n#'\n#' @description Extract all duplicates, for visual inspection.\n#' Note that it also "
},
{
"path": "R/data_extract.R",
"chars": 5589,
"preview": "#' Extract one or more columns or elements from an object\n#'\n#' `data_extract()` (or its alias `extract()`) is similar t"
},
{
"path": "R/data_group.R",
"chars": 1979,
"preview": "#' @title Create a grouped data frame\n#' @name data_group\n#'\n#' @description This function is comparable to `dplyr::grou"
},
{
"path": "R/data_match.R",
"chars": 13132,
"preview": "#' Return filtered or sliced data frame, or row indices\n#'\n#' Return a filtered (or sliced) data frame or row indices of"
},
{
"path": "R/data_merge.R",
"chars": 13734,
"preview": "#' @title Merge (join) two data frames, or a list of data frames\n#' @name data_merge\n#'\n#' @description\n#' Merge (join) "
},
{
"path": "R/data_modify.R",
"chars": 23421,
"preview": "#' Create new variables in a data frame\n#'\n#' Create new variables or modify existing variables in a data frame. Unlike "
},
{
"path": "R/data_partition.R",
"chars": 5293,
"preview": "#' Partition data\n#'\n#' Creates data partitions (for instance, a training and a test set) based on a\n#' data frame that "
},
{
"path": "R/data_peek.R",
"chars": 3548,
"preview": "#' @title Peek at values and type of variables in a data frame\n#' @name data_peek\n#'\n#' @description This function creat"
},
{
"path": "R/data_read.R",
"chars": 19898,
"preview": "#' @title Read (import) data files from various sources\n#' @name data_read\n#'\n#' @description\n#' This functions imports "
},
{
"path": "R/data_relocate.R",
"chars": 5117,
"preview": "#' @title Relocate (reorder) columns of a data frame\n#' @name data_relocate\n#'\n#' @description\n#' `data_relocate()` will"
},
{
"path": "R/data_remove.R",
"chars": 733,
"preview": "#' @inheritParams extract_column_names\n#' @rdname data_relocate\n#' @examples\n#' # Remove columns\n#' head(data_remove(iri"
},
{
"path": "R/data_rename.R",
"chars": 10051,
"preview": "#' @title Rename columns and variable names\n#' @name data_rename\n#'\n#' @description Safe and intuitive functions to rena"
},
{
"path": "R/data_replicate.R",
"chars": 3692,
"preview": "#' @title Expand (i.e. replicate rows) a data frame\n#' @name data_replicate\n#'\n#' @description\n#' Expand a data frame by"
},
{
"path": "R/data_rescale.R",
"chars": 9853,
"preview": "#' @title Rescale Variables to a New Range\n#' @name rescale\n#'\n#' @description\n#' Rescale variables to a new range. Can "
},
{
"path": "R/data_restoretype.R",
"chars": 1598,
"preview": "#' Restore the type of columns according to a reference data frame\n#'\n#' @param data A data frame for which to restore t"
},
{
"path": "R/data_reverse.R",
"chars": 8004,
"preview": "#' Reverse-Score Variables\n#'\n#' Reverse-score variables (change the keying/scoring direction).\n#'\n#' @param range Range"
},
{
"path": "R/data_rotate.R",
"chars": 2917,
"preview": "#' @title Rotate a data frame\n#' @name data_rotate\n#'\n#' @description\n#' This function rotates a data frame, i.e. column"
},
{
"path": "R/data_seek.R",
"chars": 5889,
"preview": "#' @title Find variables by their names, variable or value labels\n#' @name data_seek\n#'\n#' @description This functions s"
},
{
"path": "R/data_select.R",
"chars": 852,
"preview": "#' @rdname extract_column_names\n#' @export\ndata_select <- function(\n data,\n select = NULL,\n exclude = NULL,\n ignore_"
},
{
"path": "R/data_separate.R",
"chars": 13968,
"preview": "#' @title Separate single variable into multiple variables\n#' @name data_separate\n#'\n#' @description\n#' Separates a sing"
},
{
"path": "R/data_summary.R",
"chars": 13765,
"preview": "#' @title Summarize data\n#' @name data_summary\n#'\n#' @description This function can be used to compute summary statistic"
},
{
"path": "R/data_tabulate.R",
"chars": 32686,
"preview": "#' @title Create frequency and crosstables of variables\n#' @name data_tabulate\n#'\n#' @description This function creates "
},
{
"path": "R/data_to_long.R",
"chars": 12162,
"preview": "#' @title Reshape (pivot) data from wide to long\n#' @name data_to_long\n#'\n#' @description\n#' This function \"lengthens\" d"
},
{
"path": "R/data_to_wide.R",
"chars": 15246,
"preview": "#' Reshape (pivot) data from long to wide\n#'\n#' This function \"widens\" data, increasing the number of columns and decrea"
},
{
"path": "R/data_unique.R",
"chars": 3568,
"preview": "#' @title Keep only one row from all with duplicated IDs\n#'\n#' @description From all rows with at least one duplicated I"
},
{
"path": "R/data_unite.R",
"chars": 3242,
"preview": "#' @title Unite (\"merge\") multiple variables\n#' @name data_unite\n#'\n#' @description\n#' Merge values of multiple variable"
},
{
"path": "R/data_write.R",
"chars": 11611,
"preview": "#' @param data The data frame that should be written to a file.\n#' @param delimiter For CSV-files, specifies the delimit"
},
{
"path": "R/data_xtabulate.R",
"chars": 18471,
"preview": "# helper to compute crosstables --------------\n\n.crosstable <- function(\n x,\n by,\n weights = NULL,\n remove_na = FALS"
},
{
"path": "R/datawizard-package.R",
"chars": 684,
"preview": "#' `datawizard`\n#'\n#' @title datawizard: Easy Data Wrangling and Statistical Transformations\n#'\n#' @description\n#'\n#' A "
},
{
"path": "R/demean.R",
"chars": 23859,
"preview": "#' Compute group-meaned and de-meaned variables\n#'\n#' @description\n#'\n#' `demean()` computes group- and de-meaned versio"
},
{
"path": "R/describe_distribution.R",
"chars": 19167,
"preview": "#' Describe a distribution\n#'\n#' This function describes a distribution by a set of indices (e.g., measures of\n#' centra"
},
{
"path": "R/descriptives.R",
"chars": 6415,
"preview": "# distribution_mode ----------------------------------\n\n#' Compute mode for a statistical distribution\n#'\n#' @param x An"
},
{
"path": "R/extract_column_names.R",
"chars": 7042,
"preview": "#' @title Find or get columns in a data frame based on search patterns\n#' @name extract_column_names\n#'\n#' @description "
},
{
"path": "R/format.R",
"chars": 2783,
"preview": "# distribution ---------------------------------\n\n#' @export\nformat.parameters_distribution <- function(\n x,\n digits ="
},
{
"path": "R/labels_to_levels.R",
"chars": 2496,
"preview": "#' @title Convert value labels into factor levels\n#' @name labels_to_levels\n#'\n#' @details\n#' `labels_to_levels()` allow"
},
{
"path": "R/makepredictcall.R",
"chars": 2450,
"preview": "#' Utility Function for Safe Prediction with `datawizard` transformers\n#'\n#' This function allows for the use of (some o"
},
{
"path": "R/mean_sd.R",
"chars": 2076,
"preview": "#' Summary Helpers\n#'\n#' @param x A numeric vector (or one that can be coerced to one via\n#' `as.numeric()`) to be sum"
},
{
"path": "R/means_by_group.R",
"chars": 8562,
"preview": "#' @title Summary of mean values by group\n#' @name means_by_group\n#'\n#' @description Computes summary table of means by "
},
{
"path": "R/normalize.R",
"chars": 8381,
"preview": "#' Normalize numeric variable to 0-1 range\n#'\n#' Performs a normalization of data, i.e., it scales variables in the rang"
},
{
"path": "R/ranktransform.R",
"chars": 4501,
"preview": "#' (Signed) rank transformation\n#'\n#' Transform numeric values with the integers of their rank (i.e., 1st smallest,\n#' 2"
},
{
"path": "R/recode_into.R",
"chars": 9877,
"preview": "#' @title Recode values from one or more variables into a new variable\n#' @name recode_into\n#'\n#' @description\n#' This f"
},
{
"path": "R/recode_values.R",
"chars": 15621,
"preview": "#' @title Recode old values of variables into new values\n#' @name recode_values\n#'\n#' @description\n#' This functions rec"
},
{
"path": "R/remove_empty.R",
"chars": 3081,
"preview": "#' @title Return or remove variables or observations that are completely missing\n#' @name remove_empty\n#' @rdname remove"
},
{
"path": "R/replace_nan_inf.R",
"chars": 1193,
"preview": "#' @title Convert infinite or `NaN` values into `NA`\n#' @name replace_nan_inf\n#'\n#' @description\n#' Replaces all infinit"
},
{
"path": "R/rescale_weights.R",
"chars": 15104,
"preview": "#' @title Rescale design weights for multilevel analysis\n#' @name rescale_weights\n#'\n#' @description Most functions to f"
},
{
"path": "R/reshape_ci.R",
"chars": 4132,
"preview": "#' Reshape CI between wide/long formats\n#'\n#' Reshape CI between wide/long formats.\n#'\n#' @param x A data frame containi"
},
{
"path": "R/row_count.R",
"chars": 4425,
"preview": "#' @title Count specific values row-wise\n#' @name row_count\n#' @description `row_count()` mimics base R's `rowSums()`, w"
},
{
"path": "R/row_means.R",
"chars": 6477,
"preview": "#' @title Row means or sums (optionally with minimum amount of valid values)\n#' @name row_means\n#' @description This fun"
},
{
"path": "R/select_nse.R",
"chars": 20459,
"preview": "# Code adapted from {poorman} by Nathan Eastwood [License: MIT]\n# https://github.com/nathaneastwood/poorman/blob/master/"
},
{
"path": "R/skewness_kurtosis.R",
"chars": 11449,
"preview": "#' Compute Skewness and (Excess) Kurtosis\n#'\n#' @param x A numeric vector or data.frame.\n#' @param type Type of algorith"
},
{
"path": "R/slide.R",
"chars": 2160,
"preview": "#' @title Shift numeric value range\n#' @name slide\n#'\n#' @description\n#' This functions shifts the value range of a nume"
},
{
"path": "R/smoothness.R",
"chars": 3018,
"preview": "#' Quantify the smoothness of a vector\n#'\n#' @param x Numeric vector (similar to a time series).\n#' @param method Can be"
},
{
"path": "R/standardize.R",
"chars": 12848,
"preview": "#' Standardization (Z-scoring)\n#'\n#' Performs a standardization of data (z-scoring), i.e., centering and scaling,\n#' so "
},
{
"path": "R/standardize.models.R",
"chars": 16389,
"preview": "#' Re-fit a model with standardized data\n#'\n#' Performs a standardization of data (z-scoring) using\n#' [`standardize()`]"
},
{
"path": "R/text_format.R",
"chars": 4000,
"preview": "#' Convenient text formatting functionalities\n#'\n#' Convenience functions to manipulate and format text.\n#'\n#' @param te"
},
{
"path": "R/to_factor.R",
"chars": 3371,
"preview": "#' @title Convert data to factors\n#' @name to_factor\n#'\n#' @details\n#' Convert variables or data into factors. If the da"
},
{
"path": "R/to_numeric.R",
"chars": 9007,
"preview": "#' Convert data to numeric\n#'\n#' Convert data to numeric by converting characters to factors and factors to\n#' either nu"
},
{
"path": "R/unnormalize.R",
"chars": 3946,
"preview": "#' @rdname normalize\n#' @export\nunnormalize <- function(x, ...) {\n UseMethod(\"unnormalize\")\n}\n\n\n#' @export\nunnormalize."
},
{
"path": "R/unstandardize.R",
"chars": 7676,
"preview": "#' @rdname standardize\n#' @export\nunstandardize <- function(x, ...) {\n UseMethod(\"unstandardize\")\n}\n\n#' @rdname standar"
},
{
"path": "R/utils-cols.R",
"chars": 2454,
"preview": "#' Tools for working with column names\n#'\n#' @param x A data frame.\n#' @param row Row to use as column names.\n#' @param "
},
{
"path": "R/utils-rows.R",
"chars": 4022,
"preview": "#' Tools for working with row names or row ids\n#'\n#' @param x A data frame.\n#' @param var Name of column to use for row "
},
{
"path": "R/utils.R",
"chars": 6616,
"preview": "#' @keywords internal\n.get_model_info <- function(model, model_info = NULL, ...) {\n if (is.null(model_info)) {\n mode"
},
{
"path": "R/utils_labels.R",
"chars": 3103,
"preview": "# after data transformation, label attributes get lost. This function\n# extracts label attributes from the original vect"
},
{
"path": "R/utils_standardize_center.R",
"chars": 12438,
"preview": "# preparation for standardize and center ----\n#\n# Performs some preparation when standardizing or centering variables,\n#"
},
{
"path": "R/visualisation_recipe.R",
"chars": 2914,
"preview": "#' Prepare objects for visualisation\n#'\n#' @description This function prepares objects for visualisation by returning a "
},
{
"path": "R/weighted_mean_median_sd_mad.R",
"chars": 3694,
"preview": "#' Weighted Mean, Median, SD, and MAD\n#'\n#' @inheritParams stats::weighted.mean\n#' @inheritParams stats::mad\n#' @param w"
},
{
"path": "R/winsorize.R",
"chars": 4284,
"preview": "#' Winsorize data\n#'\n#' @details\n#'\n#' Winsorizing or winsorization is the transformation of statistics by limiting\n#' e"
},
{
"path": "README.Rmd",
"chars": 9639,
"preview": "---\noutput: github_document\n---\n\n# `datawizard`: Easy Data Wrangling and Statistical Transformations <img src='man/figur"
},
{
"path": "README.md",
"chars": 19979,
"preview": "\n# `datawizard`: Easy Data Wrangling and Statistical Transformations <img src='man/figures/logo.png' align=\"right\" heigh"
},
{
"path": "air.toml",
"chars": 133,
"preview": "[format]\nline-width = 80\nindent-width = 2\nindent-style = \"space\"\nline-ending = \"lf\"\npersistent-line-breaks = true\nskip ="
},
{
"path": "cran-comments.md",
"chars": 50,
"preview": "This fixes R-devel errors reported on 2026-04-23.\n"
},
{
"path": "datawizard.Rproj",
"chars": 438,
"preview": "Version: 1.0\n\nRestoreWorkspace: No\nSaveWorkspace: No\nAlwaysSaveHistory: No\n\nEnableCodeIndexing: Yes\nUseSpacesForTab: Yes"
},
{
"path": "datawizard.code-workspace",
"chars": 296,
"preview": "{\n\t\"folders\": [\n\t\t{\n\t\t\t\"path\": \".\"\n\t\t}\n ],\n \"launch\": {\n \"version\": \"0.2.0\",\n \"configurations\": [\n {\n "
},
{
"path": "inst/CITATION",
"chars": 675,
"preview": "bibentry(\n bibtype=\"Article\",\n title=\"{datawizard}: An {R} Package for Easy Data Preparation and Statistical Transform"
},
{
"path": "inst/WORDLIST",
"chars": 973,
"preview": "AES\nAnalysing\nAsparouhov\nBMC\nBafumi\nBrincks\nBulotsky\nCMD\nCarle\nCatran\nCrosstables\nDEPRECATIONS\nDe\nDhaliwal\nDisaggregatin"
},
{
"path": "man/adjust.Rd",
"chars": 7136,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/adjust.R\n\\name{adjust}\n\\alias{adjust}\n\\ali"
},
{
"path": "man/as.prop.table.Rd",
"chars": 3723,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_tabulate.R\n\\name{as.prop.table}\n\\alia"
},
{
"path": "man/assign_labels.Rd",
"chars": 5997,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/assign_labels.R\n\\name{assign_labels}\n\\alia"
},
{
"path": "man/categorize.Rd",
"chars": 13302,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/categorize.R\n\\name{categorize}\n\\alias{cate"
},
{
"path": "man/center.Rd",
"chars": 8354,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/center.R\n\\name{center}\n\\alias{center}\n\\ali"
},
{
"path": "man/coef_var.Rd",
"chars": 3584,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/descriptives.R\n\\name{coef_var}\n\\alias{coef"
},
{
"path": "man/coerce_to_numeric.Rd",
"chars": 502,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/to_numeric.R\n\\name{coerce_to_numeric}\n\\ali"
},
{
"path": "man/colnames.Rd",
"chars": 1263,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/utils-cols.R\n\\name{row_to_colnames}\n\\alias"
},
{
"path": "man/contr.deviation.Rd",
"chars": 4629,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/contrs.R\n\\name{contr.deviation}\n\\alias{con"
},
{
"path": "man/convert_na_to.Rd",
"chars": 6343,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/convert_na_to.R\n\\name{convert_na_to}\n\\alia"
},
{
"path": "man/convert_to_na.Rd",
"chars": 5431,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/convert_to_na.R\n\\name{convert_to_na}\n\\alia"
},
{
"path": "man/data_arrange.Rd",
"chars": 968,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_arrange.R\n\\name{data_arrange}\n\\alias{"
},
{
"path": "man/data_codebook.Rd",
"chars": 7467,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_codebook.R\n\\name{data_codebook}\n\\alia"
},
{
"path": "man/data_duplicated.Rd",
"chars": 4696,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_duplicated.R\n\\name{data_duplicated}\n\\"
},
{
"path": "man/data_extract.Rd",
"chars": 7165,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_extract.R\n\\name{data_extract}\n\\alias{"
},
{
"path": "man/data_group.Rd",
"chars": 4708,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_group.R\n\\name{data_group}\n\\alias{data"
},
{
"path": "man/data_match.Rd",
"chars": 5762,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_match.R\n\\name{data_match}\n\\alias{data"
},
{
"path": "man/data_merge.Rd",
"chars": 8009,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_merge.R\n\\name{data_merge}\n\\alias{data"
},
{
"path": "man/data_modify.Rd",
"chars": 6332,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_modify.R\n\\name{data_modify}\n\\alias{da"
},
{
"path": "man/data_partition.Rd",
"chars": 3519,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_partition.R\n\\name{data_partition}\n\\al"
},
{
"path": "man/data_peek.Rd",
"chars": 4630,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_peek.R\n\\name{data_peek}\n\\alias{data_p"
},
{
"path": "man/data_prefix_suffix.Rd",
"chars": 4467,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_addprefix.R\n\\name{data_addprefix}\n\\al"
},
{
"path": "man/data_read.Rd",
"chars": 7397,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_read.R, R/data_write.R\n\\name{data_rea"
},
{
"path": "man/data_relocate.Rd",
"chars": 7189,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_relocate.R, R/data_remove.R\n\\name{dat"
},
{
"path": "man/data_rename.Rd",
"chars": 7066,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_rename.R\n\\name{data_rename}\n\\alias{da"
},
{
"path": "man/data_replicate.Rd",
"chars": 4689,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_replicate.R\n\\name{data_replicate}\n\\al"
},
{
"path": "man/data_restoretype.Rd",
"chars": 1046,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_restoretype.R\n\\name{data_restoretype}"
},
{
"path": "man/data_rotate.Rd",
"chars": 2937,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_rotate.R\n\\name{data_rotate}\n\\alias{da"
},
{
"path": "man/data_seek.Rd",
"chars": 2694,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_seek.R\n\\name{data_seek}\n\\alias{data_s"
},
{
"path": "man/data_separate.Rd",
"chars": 8742,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_separate.R\n\\name{data_separate}\n\\alia"
},
{
"path": "man/data_summary.Rd",
"chars": 4054,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_summary.R\n\\name{data_summary}\n\\alias{"
},
{
"path": "man/data_tabulate.Rd",
"chars": 10342,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_tabulate.R\n\\name{data_tabulate}\n\\alia"
},
{
"path": "man/data_to_long.Rd",
"chars": 10494,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_to_long.R\n\\name{data_to_long}\n\\alias{"
},
{
"path": "man/data_to_wide.Rd",
"chars": 9988,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_to_wide.R\n\\name{data_to_wide}\n\\alias{"
},
{
"path": "man/data_unique.Rd",
"chars": 4760,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_unique.R\n\\name{data_unique}\n\\alias{da"
},
{
"path": "man/data_unite.Rd",
"chars": 4942,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_unite.R\n\\name{data_unite}\n\\alias{data"
},
{
"path": "man/datawizard-package.Rd",
"chars": 1952,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/datawizard-package.R\n\\docType{package}\n\\na"
},
{
"path": "man/demean.Rd",
"chars": 15561,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/demean.R\n\\name{demean}\n\\alias{demean}\n\\ali"
},
{
"path": "man/describe_distribution.Rd",
"chars": 8104,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/describe_distribution.R\n\\name{describe_dis"
},
{
"path": "man/distribution_mode.Rd",
"chars": 856,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/descriptives.R\n\\name{distribution_mode}\n\\a"
},
{
"path": "man/efc.Rd",
"chars": 384,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data.R\n\\docType{data}\n\\name{efc}\n\\alias{ef"
},
{
"path": "man/extract_column_names.Rd",
"chars": 8537,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_select.R, R/extract_column_names.R\n\\n"
},
{
"path": "man/labels_to_levels.Rd",
"chars": 5315,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/labels_to_levels.R\n\\name{labels_to_levels}"
},
{
"path": "man/makepredictcall.dw_transformer.Rd",
"chars": 1745,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/makepredictcall.R\n\\name{makepredictcall.dw"
},
{
"path": "man/mean_sd.Rd",
"chars": 1077,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/mean_sd.R\n\\name{mean_sd}\n\\alias{mean_sd}\n\\"
},
{
"path": "man/means_by_group.Rd",
"chars": 5952,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/means_by_group.R\n\\name{means_by_group}\n\\al"
},
{
"path": "man/nhanes_sample.Rd",
"chars": 477,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data.R\n\\docType{data}\n\\name{nhanes_sample}"
},
{
"path": "man/normalize.Rd",
"chars": 7660,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/normalize.R, R/unnormalize.R\n\\name{normali"
},
{
"path": "man/ranktransform.Rd",
"chars": 5985,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/ranktransform.R\n\\name{ranktransform}\n\\alia"
},
{
"path": "man/recode_into.Rd",
"chars": 3590,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/recode_into.R\n\\name{recode_into}\n\\alias{re"
},
{
"path": "man/recode_values.Rd",
"chars": 12293,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/recode_values.R\n\\name{recode_values}\n\\alia"
},
{
"path": "man/reexports.Rd",
"chars": 524,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_tabulate.R\n\\docType{import}\n\\name{ree"
},
{
"path": "man/remove_empty.Rd",
"chars": 1971,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/remove_empty.R\n\\name{remove_empty}\n\\alias{"
},
{
"path": "man/replace_nan_inf.Rd",
"chars": 748,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/replace_nan_inf.R\n\\name{replace_nan_inf}\n\\"
},
{
"path": "man/rescale.Rd",
"chars": 7996,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_rescale.R\n\\name{rescale}\n\\alias{resca"
},
{
"path": "man/rescale_weights.Rd",
"chars": 6962,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/rescale_weights.R\n\\name{rescale_weights}\n\\"
},
{
"path": "man/reshape_ci.Rd",
"chars": 1255,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/reshape_ci.R\n\\name{reshape_ci}\n\\alias{resh"
},
{
"path": "man/reverse.Rd",
"chars": 6611,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/data_reverse.R\n\\name{reverse}\n\\alias{rever"
},
{
"path": "man/row_count.Rd",
"chars": 5901,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/row_count.R\n\\name{row_count}\n\\alias{row_co"
},
{
"path": "man/row_means.Rd",
"chars": 7167,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/row_means.R\n\\name{row_means}\n\\alias{row_me"
},
{
"path": "man/rownames.Rd",
"chars": 1681,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/utils-rows.R\n\\name{rownames_as_column}\n\\al"
},
{
"path": "man/skewness.Rd",
"chars": 4880,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/skewness_kurtosis.R\n\\name{skewness}\n\\alias"
},
{
"path": "man/slide.Rd",
"chars": 7125,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/slide.R\n\\name{slide}\n\\alias{slide}\n\\alias{"
},
{
"path": "man/smoothness.Rd",
"chars": 1131,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/smoothness.R\n\\name{smoothness}\n\\alias{smoo"
},
{
"path": "man/standardize.Rd",
"chars": 11472,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/standardize.R, R/unstandardize.R\n\\name{sta"
},
{
"path": "man/standardize.default.Rd",
"chars": 4303,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/standardize.models.R\n\\name{standardize.def"
},
{
"path": "man/text_format.Rd",
"chars": 1971,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/text_format.R\n\\name{text_format}\n\\alias{te"
},
{
"path": "man/to_factor.Rd",
"chars": 6080,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/to_factor.R\n\\name{to_factor}\n\\alias{to_fac"
},
{
"path": "man/to_numeric.Rd",
"chars": 6642,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/to_numeric.R\n\\name{to_numeric}\n\\alias{to_n"
},
{
"path": "man/visualisation_recipe.Rd",
"chars": 1226,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/visualisation_recipe.R\n\\name{visualisation"
},
{
"path": "man/weighted_mean.Rd",
"chars": 1407,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/weighted_mean_median_sd_mad.R\n\\name{weight"
},
{
"path": "man/winsorize.Rd",
"chars": 4072,
"preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/winsorize.R\n\\name{winsorize}\n\\alias{winsor"
},
{
"path": "paper/JOSS_files/apa.csl",
"chars": 69612,
"preview": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<style xmlns=\"http://purl.org/net/xbiblio/csl\" class=\"in-text\" version=\"1.0\" demo"
},
{
"path": "paper/JOSS_files/paper.Rmd",
"chars": 8107,
"preview": "---\ntitle: \"datawizard: An R Package for Easy Data Preparation and Statistical Transformations\"\ntags:\n - R\n - easystat"
},
{
"path": "paper/JOSS_files/paper.bib",
"chars": 5055,
"preview": "@Article{Ben-Shachar2020,\n title = {{e}ffectsize: Estimation of Effect Size Indices and Standardized Parameters},\n "
},
{
"path": "paper/JOSS_files/paper.log",
"chars": 52211,
"preview": "This is XeTeX, Version 3.141592653-2.6-0.999994 (TeX Live 2022) (preloaded format=xelatex 2022.9.27) 4 OCT 2022 17:54\ne"
}
]
// ... and 121 more files (download for full content)
About this extraction
This page contains the full source code of the easystats/datawizard GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 321 files (1.8 MB), approximately 572.3k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.