Full Code of tidymodels/infer for AI

main 1e5cabee6f83 cached
155 files
768.6 KB
222.0k tokens
1 requests
Download .txt
Showing preview only (812K chars total). Download the full file or copy to clipboard to get everything.
Repository: tidymodels/infer
Branch: main
Commit: 1e5cabee6f83
Files: 155
Total size: 768.6 KB

Directory structure:
gitextract__66l93tt/

├── .Rbuildignore
├── .gitattributes
├── .github/
│   ├── .gitignore
│   ├── CODE_OF_CONDUCT.md
│   └── workflows/
│       ├── R-CMD-check.yaml
│       ├── check-hard.yaml
│       ├── lock.yaml
│       ├── pkgdown.yaml
│       ├── pr-commands.yaml
│       └── test-coverage.yaml
├── .gitignore
├── .vscode/
│   ├── extensions.json
│   └── settings.json
├── CONTRIBUTING.md
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│   ├── assume.R
│   ├── calculate.R
│   ├── deprecated.R
│   ├── fit.R
│   ├── generate.R
│   ├── get_confidence_interval.R
│   ├── get_p_value.R
│   ├── gss.R
│   ├── hypothesize.R
│   ├── infer.R
│   ├── observe.R
│   ├── pipe.R
│   ├── print_methods.R
│   ├── rep_sample_n.R
│   ├── set_params.R
│   ├── shade_confidence_interval.R
│   ├── shade_p_value.R
│   ├── specify.R
│   ├── utils.R
│   ├── visualize.R
│   └── wrappers.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── air.toml
├── codecov.yml
├── data/
│   └── gss.rda
├── data-raw/
│   └── save_gss.R
├── figs/
│   ├── paper/
│   │   ├── apa.csl
│   │   ├── columns.tex
│   │   ├── paper.Rmd
│   │   ├── paper.bib
│   │   ├── paper.log
│   │   └── paper.md
│   └── rethinking-inference.key
├── infer.Rproj
├── inst/
│   └── CITATION
├── man/
│   ├── assume.Rd
│   ├── calculate.Rd
│   ├── chisq_stat.Rd
│   ├── chisq_test.Rd
│   ├── deprecated.Rd
│   ├── fit.infer.Rd
│   ├── generate.Rd
│   ├── get_confidence_interval.Rd
│   ├── get_p_value.Rd
│   ├── gss.Rd
│   ├── hypothesize.Rd
│   ├── infer.Rd
│   ├── observe.Rd
│   ├── pipe.Rd
│   ├── print.infer.Rd
│   ├── prop_test.Rd
│   ├── reexports.Rd
│   ├── rep_sample_n.Rd
│   ├── shade_confidence_interval.Rd
│   ├── shade_p_value.Rd
│   ├── specify.Rd
│   ├── t_stat.Rd
│   ├── t_test.Rd
│   └── visualize.Rd
├── man-roxygen/
│   └── seeds.Rmd
├── tests/
│   ├── testthat/
│   │   ├── _snaps/
│   │   │   ├── aliases.md
│   │   │   ├── assume.md
│   │   │   ├── calculate.md
│   │   │   ├── fit.md
│   │   │   ├── generate.md
│   │   │   ├── get_confidence_interval.md
│   │   │   ├── get_p_value.md
│   │   │   ├── hypothesize.md
│   │   │   ├── observe.md
│   │   │   ├── print.md
│   │   │   ├── rep_sample_n.md
│   │   │   ├── shade_confidence_interval.md
│   │   │   ├── shade_p_value.md
│   │   │   ├── specify.md
│   │   │   ├── utils.md
│   │   │   ├── visualize.md
│   │   │   └── wrappers.md
│   │   ├── helper-data.R
│   │   ├── setup.R
│   │   ├── test-aliases.R
│   │   ├── test-assume.R
│   │   ├── test-calculate.R
│   │   ├── test-fit.R
│   │   ├── test-generate.R
│   │   ├── test-get_confidence_interval.R
│   │   ├── test-get_p_value.R
│   │   ├── test-hypothesize.R
│   │   ├── test-observe.R
│   │   ├── test-print.R
│   │   ├── test-rep_sample_n.R
│   │   ├── test-shade_confidence_interval.R
│   │   ├── test-shade_p_value.R
│   │   ├── test-specify.R
│   │   ├── test-utils.R
│   │   ├── test-visualize.R
│   │   └── test-wrappers.R
│   └── testthat.R
└── vignettes/
    ├── anova.Rmd
    ├── chi_squared.Rmd
    ├── infer.Rmd
    ├── infer_cache/
    │   └── html/
    │       ├── __packages
    │       ├── calculate-point_94c073b633c3cf7bef3252dcad544ee2.RData
    │       ├── calculate-point_94c073b633c3cf7bef3252dcad544ee2.rdb
    │       ├── calculate-point_94c073b633c3cf7bef3252dcad544ee2.rdx
    │       ├── generate-permute_21b25928d642a97a30057306d51f1b23.RData
    │       ├── generate-permute_21b25928d642a97a30057306d51f1b23.rdb
    │       ├── generate-permute_21b25928d642a97a30057306d51f1b23.rdx
    │       ├── generate-point_d562524427be20dbb4736ca1ea29b04b.RData
    │       ├── generate-point_d562524427be20dbb4736ca1ea29b04b.rdb
    │       ├── generate-point_d562524427be20dbb4736ca1ea29b04b.rdx
    │       ├── hypothesize-40-hr-week_c8e33c404efa90c2ca0b2eacad95b06c.RData
    │       ├── hypothesize-40-hr-week_c8e33c404efa90c2ca0b2eacad95b06c.rdb
    │       ├── hypothesize-40-hr-week_c8e33c404efa90c2ca0b2eacad95b06c.rdx
    │       ├── hypothesize-independence_fe1c79b9f1dc0df488828fdd34c8145f.RData
    │       ├── hypothesize-independence_fe1c79b9f1dc0df488828fdd34c8145f.rdb
    │       ├── hypothesize-independence_fe1c79b9f1dc0df488828fdd34c8145f.rdx
    │       ├── specify-diff-in-means_e4103c4c3e3daedd5c1429b7a1bc8727.RData
    │       ├── specify-diff-in-means_e4103c4c3e3daedd5c1429b7a1bc8727.rdb
    │       ├── specify-diff-in-means_e4103c4c3e3daedd5c1429b7a1bc8727.rdx
    │       ├── specify-example_3ea3cfa390233b127dc25b05b0354bcf.RData
    │       ├── specify-example_3ea3cfa390233b127dc25b05b0354bcf.rdb
    │       ├── specify-example_3ea3cfa390233b127dc25b05b0354bcf.rdx
    │       ├── specify-one_149be66261b0606b7ddb80efd10fa81d.RData
    │       ├── specify-one_149be66261b0606b7ddb80efd10fa81d.rdb
    │       ├── specify-one_149be66261b0606b7ddb80efd10fa81d.rdx
    │       ├── specify-success_e8eb15e9f621ccf60cb6527a6bccdb4b.RData
    │       ├── specify-success_e8eb15e9f621ccf60cb6527a6bccdb4b.rdb
    │       ├── specify-success_e8eb15e9f621ccf60cb6527a6bccdb4b.rdx
    │       ├── specify-two_20085531c110a936ee691162f225333b.RData
    │       ├── specify-two_20085531c110a936ee691162f225333b.rdb
    │       └── specify-two_20085531c110a936ee691162f225333b.rdx
    ├── observed_stat_examples.Rmd
    ├── paired.Rmd
    └── t_test.Rmd

================================================
FILE CONTENTS
================================================

================================================
FILE: .Rbuildignore
================================================
^CRAN-RELEASE$
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
^figs$
^profiles*
^examples*
^codecov\.yml$
^docs*
^CONDUCT\.md$
^cran-comments\.md$
^_build\.sh$
^appveyor\.yml$
^\.implement_new_methods\.md
^CONTRIBUTING\.md$
^TO-DO\.md$
^\.httr-oauth$
^_pkgdown\.yml$
^docs$
^data-raw*
^doc$
^Meta$
README_files/
^pkgdown$
^\.github$
^LICENSE\.md$
^man-roxygen$
^[\.]?air\.toml$
^\.vscode$
inst/hex/


================================================
FILE: .gitattributes
================================================
* text=auto
data/* binary
src/* text=lf
R/* text=lf

================================================
FILE: .github/.gitignore
================================================
*.html


================================================
FILE: .github/CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at codeofconduct@posit.co. 
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
<https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>.

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).

For answers to common questions about this code of conduct, see the FAQ at
<https://www.contributor-covenant.org/faq>. Translations are available at <https://www.contributor-covenant.org/translations>.

[homepage]: https://www.contributor-covenant.org


================================================
FILE: .github/workflows/R-CMD-check.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow is overkill for most R packages and
# check-standard.yaml is likely a better choice.
# usethis::use_github_action("check-standard") will install it.
on:
  push:
    branches: [main, master]
  pull_request:

name: R-CMD-check.yaml

permissions: read-all

jobs:
  R-CMD-check:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: macos-latest,   r: 'release'}

          - {os: windows-latest, r: 'release'}

          - {os: ubuntu-latest,  r: 'devel', http-user-agent: 'release'}
          - {os: ubuntu-latest,  r: 'release'}
          - {os: ubuntu-latest,  r: 'oldrel-1'}
          - {os: ubuntu-latest,  r: 'oldrel-2'}
          - {os: ubuntu-latest,  r: 'oldrel-3'}
          - {os: ubuntu-latest,  r: 'oldrel-4'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::rcmdcheck
          needs: check

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'


================================================
FILE: .github/workflows/check-hard.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow only directly installs "hard" dependencies, i.e. Depends,
# Imports, and LinkingTo dependencies. Notably, Suggests dependencies are never
# installed, with the exception of testthat, knitr, and rmarkdown. The cache is
# never used to avoid accidentally restoring a cache containing a suggested
# dependency.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

name: R-CMD-check-hard

jobs:
  R-CMD-check:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: ubuntu-latest,   r: 'release'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes

    steps:
      - uses: actions/checkout@v2

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          dependencies: '"hard"'
          cache: false
          extra-packages: |
            any::rcmdcheck
            any::testthat
            any::knitr
            any::rmarkdown
          needs: check

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true

================================================
FILE: .github/workflows/lock.yaml
================================================
name: 'Lock Threads'

on:
  schedule:
    - cron: '0 0 * * *'

jobs:
  lock:
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v2
        with:
          github-token: ${{ github.token }}
          issue-lock-inactive-days: '14'
#          issue-exclude-labels: ''
#          issue-lock-labels: 'outdated'
          issue-lock-comment: >
            This issue has been automatically locked. If you believe you have
            found a related problem, please file a new issue (with a reprex:
            <https://reprex.tidyverse.org>) and link to this issue.
          issue-lock-reason: ''
          pr-lock-inactive-days: '14'
#          pr-exclude-labels: 'wip'
          pr-lock-labels: ''
          pr-lock-comment: >
            This pull request has been automatically locked. If you believe you
            have found a related problem, please file a new issue (with a reprex:
            <https://reprex.tidyverse.org>) and link to this issue.
          pr-lock-reason: ''
#          process-only: 'issues'


================================================
FILE: .github/workflows/pkgdown.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
  release:
    types: [published]
  workflow_dispatch:

name: pkgdown.yaml

permissions: read-all

jobs:
  pkgdown:
    runs-on: ubuntu-latest
    # Only restrict concurrency for non-PR jobs
    concurrency:
      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::pkgdown, local::.
          needs: website

      - name: Build site
        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
        shell: Rscript {0}

      - name: Deploy to GitHub pages 🚀
        if: github.event_name != 'pull_request'
        uses: JamesIves/github-pages-deploy-action@v4.5.0
        with:
          clean: false
          branch: gh-pages
          folder: docs


================================================
FILE: .github/workflows/pr-commands.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  issue_comment:
    types: [created]

name: pr-commands.yaml

permissions: read-all

jobs:
  document:
    if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }}
    name: document
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/pr-fetch@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::roxygen2
          needs: pr-document

      - name: Document
        run: roxygen2::roxygenise()
        shell: Rscript {0}

      - name: commit
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git add man/\* NAMESPACE
          git commit -m 'Document'

      - uses: r-lib/actions/pr-push@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

  style:
    if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }}
    name: style
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/pr-fetch@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - uses: r-lib/actions/setup-r@v2

      - name: Install dependencies
        run: install.packages("styler")
        shell: Rscript {0}

      - name: Style
        run: styler::style_pkg()
        shell: Rscript {0}

      - name: commit
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git add \*.R
          git commit -m 'Style'

      - uses: r-lib/actions/pr-push@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .github/workflows/test-coverage.yaml
================================================
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:

name: test-coverage.yaml

permissions: read-all

jobs:
  test-coverage:
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::covr, any::xml2
          needs: coverage

      - name: Test coverage
        run: |
          cov <- covr::package_coverage(
            quiet = FALSE,
            clean = FALSE,
            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
          )
          print(cov)
          covr::to_cobertura(cov)
        shell: Rscript {0}

      - uses: codecov/codecov-action@v5
        with:
          # Fail if error if not on PR, or if on PR and token is given
          fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
          files: ./cobertura.xml
          plugins: noop
          disable_search: true
          token: ${{ secrets.CODECOV_TOKEN }}

      - name: Show testthat output
        if: always()
        run: |
          ## --------------------------------------------------------------------
          find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
        shell: bash

      - name: Upload test results
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: coverage-test-failures
          path: ${{ runner.temp }}/package


================================================
FILE: .gitignore
================================================
.Rproj.user
.Rhistory
.RData
.Ruserdata
.DS_Store
.httr-oauth
doc
Meta
docs
*.psd


================================================
FILE: .vscode/extensions.json
================================================
{
    "recommendations": [
        "Posit.air-vscode"
    ]
}


================================================
FILE: .vscode/settings.json
================================================
{
    "[r]": {
        "editor.formatOnSave": true,
        "editor.defaultFormatter": "Posit.air-vscode"
    }
}


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing

Contributions to the `infer` package, whether in the form of bug fixes, issue
reports, new code, or documentation improvements, are encouraged and welcome.
We welcome novices who may have never contributed to a package before as well
as friendly veterans looking to help us improve the package for users. We are
eager to include contributions from everyone who meets our
[code of conduct](.github/CODE_OF_CONDUCT.md)

Please use GitHub issues. For any pull request, please link to or open a
corresponding issue in GitHub issues. Please ensure that you have notifications
turned on and respond to questions, comments, or requested changes promptly.

##  Tests

`infer` uses `testthat` for testing. Please try to provide 100% test coverage
for any submitted code and always check that existing tests continue to pass.
If you are a beginner and need help with writing a test, mention this
in the issue and we will try to help.

It's also helpful to run `goodpractice::gp()` to ensure that lines of code are
not over 80 characters and that all lines of code have tests written. Please do
so prior to submitting any pull request and fix any suggestions from there.
Reach out to us if you need any assistance there too.

## Code style

Please use snake case (such as `rep_sample_n`) for function names.
Beyond that, in general, follow the
[tidyverse style guide](https://style.tidyverse.org/) for R.

## Code of Conduct

When contributing to the `infer` package you must follow the code of 
conduct defined in [CONDUCT](.github/CODE_OF_CONDUCT.md).


================================================
FILE: DESCRIPTION
================================================
Type: Package
Package: infer
Title: Tidy Statistical Inference
Version: 1.1.0.9000
Authors@R: c(
    person("Andrew", "Bray", , "abray@reed.edu", role = "aut"),
    person("Chester", "Ismay", , "chester.ismay@gmail.com", role = "aut",
           comment = c(ORCID = "0000-0003-2820-2547")),
    person("Evgeni", "Chasnovski", , "evgeni.chasnovski@gmail.com", role = "aut",
           comment = c(ORCID = "0000-0002-1617-4019")),
    person("Simon", "Couch", , "simon.couch@posit.co", role = c("aut", "cre"),
           comment = c(ORCID = "0000-0001-5676-5107")),
    person("Ben", "Baumer", , "ben.baumer@gmail.com", role = "aut",
           comment = c(ORCID = "0000-0002-3279-0516")),
    person("Mine", "Cetinkaya-Rundel", , "mine@stat.duke.edu", role = "aut",
           comment = c(ORCID = "0000-0001-6452-2420")),
    person("Ted", "Laderas", , "tedladeras@gmail.com", role = "ctb",
           comment = c(ORCID = "0000-0002-6207-7068")),
    person("Nick", "Solomon", , "nick.solomon@datacamp.com", role = "ctb"),
    person("Johanna", "Hardin", , "Jo.Hardin@pomona.edu", role = "ctb"),
    person("Albert Y.", "Kim", , "albert.ys.kim@gmail.com", role = "ctb",
           comment = c(ORCID = "0000-0001-7824-306X")),
    person("Neal", "Fultz", , "nfultz@gmail.com", role = "ctb"),
    person("Doug", "Friedman", , "doug.nhp@gmail.com", role = "ctb"),
    person("Richie", "Cotton", , "richie@datacamp.com", role = "ctb",
           comment = c(ORCID = "0000-0003-2504-802X")),
    person("Brian", "Fannin", , "captain@pirategrunt.com", role = "ctb")
  )
Description: The objective of this package is to perform inference using
    an expressive statistical grammar that coheres with the tidy design
    framework.
License: MIT + file LICENSE
URL: https://github.com/tidymodels/infer, https://infer.tidymodels.org/
BugReports: https://github.com/tidymodels/infer/issues
Depends:
    R (>= 4.1)
Imports:
    broom,
    cli,
    dplyr (>= 0.7.0),
    generics,
    ggplot2 (>= 3.5.2),
    glue (>= 1.3.0),
    grDevices,
    lifecycle,
    magrittr,
    methods,
    patchwork,
    purrr,
    rlang (>= 0.2.0),
    tibble,
    tidyr,
    vctrs (>= 0.6.5),
    withr
Suggests:
    covr,
    devtools (>= 1.12.0),
    fs,
    knitr,
    nycflights13,
    parsnip,
    rmarkdown,
    stringr,
    testthat (>= 3.0.0),
    vdiffr (>= 1.0.0)
VignetteBuilder:
    knitr
Config/Needs/website: tidyverse/tidytemplate
Config/testthat/edition: 3
Config/usethis/last-upkeep: 2025-04-25
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.3


================================================
FILE: LICENSE
================================================
YEAR: 2025
COPYRIGHT HOLDER: infer authors


================================================
FILE: LICENSE.md
================================================
# MIT License

Copyright (c) 2025 infer authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: NAMESPACE
================================================
# Generated by roxygen2: do not edit by hand

S3method(calc_impl,"function")
S3method(calc_impl,Chisq)
S3method(calc_impl,F)
S3method(calc_impl,correlation)
S3method(calc_impl,count)
S3method(calc_impl,diff_in_means)
S3method(calc_impl,diff_in_medians)
S3method(calc_impl,diff_in_props)
S3method(calc_impl,function_of_props)
S3method(calc_impl,mean)
S3method(calc_impl,median)
S3method(calc_impl,odds_ratio)
S3method(calc_impl,prop)
S3method(calc_impl,ratio_of_means)
S3method(calc_impl,ratio_of_props)
S3method(calc_impl,sd)
S3method(calc_impl,slope)
S3method(calc_impl,sum)
S3method(calc_impl,t)
S3method(calc_impl,z)
S3method(fit,infer)
S3method(get_p_value,default)
S3method(get_p_value,infer_dist)
S3method(ggplot_add,infer_layer)
S3method(print,infer)
S3method(print,infer_dist)
S3method(print,infer_layer)
export("%>%")
export(assume)
export(calculate)
export(chisq_stat)
export(chisq_test)
export(conf_int)
export(fit)
export(fit.infer)
export(generate)
export(get_ci)
export(get_confidence_interval)
export(get_p_value)
export(get_pvalue)
export(ggplot_add)
export(hypothesise)
export(hypothesize)
export(observe)
export(p_value)
export(prop_test)
export(rep_sample_n)
export(rep_slice_sample)
export(shade_ci)
export(shade_confidence_interval)
export(shade_p_value)
export(shade_pvalue)
export(specify)
export(t_stat)
export(t_test)
export(visualise)
export(visualize)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_warn)
importFrom(cli,no)
importFrom(cli,qty)
importFrom(dplyr,across)
importFrom(dplyr,any_of)
importFrom(dplyr,bind_rows)
importFrom(dplyr,group_by)
importFrom(dplyr,n)
importFrom(dplyr,pull)
importFrom(dplyr,select)
importFrom(dplyr,summarize)
importFrom(generics,fit)
importFrom(ggplot2,aes)
importFrom(ggplot2,geom_bar)
importFrom(ggplot2,geom_histogram)
importFrom(ggplot2,geom_rect)
importFrom(ggplot2,geom_vline)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggplot_add)
importFrom(ggplot2,labs)
importFrom(glue,glue)
importFrom(glue,glue_collapse)
importFrom(magrittr,"%>%")
importFrom(methods,hasArg)
importFrom(purrr,compact)
importFrom(rlang,"!!")
importFrom(rlang,":=")
importFrom(rlang,caller_env)
importFrom(rlang,enquo)
importFrom(rlang,eval_tidy)
importFrom(rlang,f_lhs)
importFrom(rlang,f_rhs)
importFrom(rlang,get_expr)
importFrom(rlang,new_formula)
importFrom(rlang,quo)
importFrom(rlang,sym)
importFrom(stats,as.formula)
importFrom(stats,dchisq)
importFrom(stats,df)
importFrom(stats,dnorm)
importFrom(stats,dt)
importFrom(stats,qchisq)
importFrom(stats,qf)
importFrom(stats,qnorm)
importFrom(stats,qt)
importFrom(tibble,tibble)


================================================
FILE: NEWS.md
================================================
# infer (development version)

# infer 1.1.0

* Fixed bug where adding `shade_confidence_interval(NULL)` or `shade_p_value(NULL)` 
  to plots resulted in `list()` rather than the unmodified plot (#566).

* Introduced support for arbitrary test statistics in `calculate()`. In addition
  to the pre-implemented `calculate(stat)` options, taken as strings, users can
  now supply a function defining any scalar-valued test statistic. See
  `?calculate()` to learn more (#542).

# infer 1.0.9

* Replaced usage of deprecated functions ahead of a new release of the ggplot2 package (#557).

* Addressed narrative mistakes in the `t_test` vignette (#556).

* Increased the minimum required R version to R 4.1

# infer 1.0.8

* The infer print method now truncates output when descriptions of explanatory or response variables exceed the console width (#543).

* Added missing commas and addressed formatting issues throughout the vignettes and articles. Backticks for package names were removed and missing parentheses for functions were added (@Joscelinrocha).


# infer 1.0.7

* The aliases `p_value()` and `conf_int()`, first deprecated 6 years ago, now
  return an error (#530).
  
* Addresses ggplot2 warnings when shading p-values for test statistics
  that are outside of the range of the generated distribution (#528).

* Fixed bug in `shade_p_value()` and `shade_confidence_interval()` where `fill = NULL` was ignored when it was documented as preventing any shading (#525).

# infer v1.0.6

* Updated infrastructure for errors, warnings, and messages (#513). Most of these changes will not be visible to users, though:
     - Many longer error messages are now broken up into several lines.
     - For references to help-files, users can now click on the error message's text to navigate to the cited documentation.
     
* Various improvements to documentation (#501, #504, #508, #512).

* Fixed bug where `get_confidence_interval()` would error uninformatively when the supplied distribution of estimates contained missing values. The function will now warn and return a confidence interval calculated using the non-missing estimates (#521).

* Fixed bug where `generate()` could not be used without first `specify()`ing variables, even in cases where that specification would not affect resampling/simulation (#448). 

# infer v1.0.5

* Implemented support for permutation hypothesis tests for paired data via the 
  argument value `null = "paired independence"` in `hypothesize()` (#487).

* The `weight_by` argument to `rep_slice_sample()` can now be passed either as a vector of numeric weights or an unquoted column name in `.data` (#480).

* Newly accommodates variables with spaces in names in the wrapper functions `t_test()` and `prop_test()` (#472).

* Fixed bug in two-sample `prop_test()` where the response and explanatory 
  variable were passed in place of each other to `prop.test()`. This enables
  using `prop_test()` with explanatory variables with greater than 2 levels and,
  in the process, addresses a bug where `prop_test()` collapsed levels other than
  the `success` when the response variable had more than 2 levels.

# infer v1.0.4

* Fixed bug in p-value shading where shaded regions no longer correctly overlaid
  histogram bars.
* Addressed deprecation warning ahead of upcoming dplyr release.

# infer v1.0.3

* Fix R-devel HTML5 NOTEs.

# infer v1.0.2

* Fix p-value shading when the calculated statistic falls exactly on the boundaries of a histogram bin (#424).
* Fix `generate()` errors when columns are named `x` (#431).
* Fix error from `visualize` when passed `generate()`d `infer_dist` objects that had not been passed to `hypothesize()` (#432). 
* Update visual checks for `visualize` output to align with the R 4.1.0+ graphics engine (#438).
* `specify()` and wrapper functions now appropriately handle ordered factors (#439).
* Clarify error when incompatible statistics and hypotheses are supplied (#441).
* Updated `generate()` unexpected `type` warnings to be more permissive—the 
warning will be raised less often when `type = "bootstrap"` (#425).
* Allow passing additional arguments to `stats::chisq.test` via `...` in 
`calculate()`. Ellipses are now always passed to the applicable base R
hypothesis testing function, when applicable (#414)!
* The package will now set the levels of logical variables on conversion to factor
so that the first level (regarded as `success` by default) is `TRUE`. Core verbs
have warned without an explicit `success` value already, and this change makes
behavior consistent with the functions being wrapped by shorthand test 
wrappers (#440).
* Added new statistic `stat = "ratio of means"` (#452).

# infer v1.0.1 (GitHub Only)

This release reflects the infer version accepted to the Journal of Open Source Software.

* Re-licensed the package from CC0 to MIT. See the `LICENSE` and `LICENSE.md` files.
* Contributed a paper to the Journal of Open Source Software, a draft of which is available in `/figs/paper`.
* Various improvements to documentation (#417, #418).

# infer 1.0.0

v1.0.0 is the first major release of the {infer} package! By and large, the core verbs `specify()`, `hypothesize()`, `generate()`, and `calculate()` will interface as they did before. This release makes several improvements to behavioral consistency of the package and introduces support for theory-based inference as well as randomization-based inference with multiple explanatory variables.

## Behavioral consistency

A major change to the package in this release is a set of standards for behavioral consistency of `calculate()` (#356). Namely, the package will now

* supply a consistent error when the supplied `stat` argument isn't well-defined
for the variables `specify()`d

``` r
gss %>%
  specify(response = hours) %>%
  calculate(stat = "diff in means")
#> Error: A difference in means is not well-defined for a 
#> numeric response variable (hours) and no explanatory variable.
```

or

``` r
gss %>%
  specify(college ~ partyid, success = "degree") %>%
  calculate(stat = "diff in props")
#> Error: A difference in proportions is not well-defined for a dichotomous categorical 
#> response variable (college) and a multinomial categorical explanatory variable (partyid).
```

* supply a consistent message when the user supplies unneeded information via `hypothesize()` to `calculate()` an observed statistic

``` r
# supply mu = 40 when it's not needed
gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  calculate(stat = "mean")
#> Message: The point null hypothesis `mu = 40` does not inform calculation of 
#> the observed statistic (a mean) and will be ignored.
#> # A tibble: 1 x 1
#>    stat
#>   <dbl>
#> 1  41.4
```

and

* supply a consistent warning and assume a reasonable null value when the user does not supply sufficient information to calculate an observed statistic

``` r
# don't hypothesize `p` when it's needed
gss %>%
    specify(response = sex, success = "female") %>%
    calculate(stat = "z")
#> # A tibble: 1 x 1
#>    stat
#>   <dbl>
#> 1 -1.16
#> Warning message:
#> A z statistic requires a null hypothesis to calculate the observed statistic. 
#> Output assumes the following null value: `p = .5`. 
```

or

``` r
# don't hypothesize `p` when it's needed
gss %>%
  specify(response = partyid) %>%
  calculate(stat = "Chisq")
#> # A tibble: 1 x 1
#>    stat
#>  <dbl>
#> 1  334.
#> Warning message:
#> A chi-square statistic requires a null hypothesis to calculate the observed statistic. 
#> Output assumes the following null values: `p = c(dem = 0.2, ind = 0.2, rep = 0.2, other = 0.2, DK = 0.2)`.
```

To accommodate this behavior, a number of new `calculate` methods were added or improved. Namely:

- Implemented the standardized proportion $z$ statistic for one categorical variable
- Extended `calculate()` with `stat = "t"` by passing `mu` to the `calculate()` method for `stat = "t"` to allow for calculation of `t` statistics for one numeric variable with hypothesized mean
- Extended `calculate()` to allow lowercase aliases for `stat` arguments (#373).
- Fixed bugs in `calculate()` to allow for programmatic calculation of statistics

This behavioral consistency also allowed for the implementation of `observe()`, a wrapper function around `specify()`, `hypothesize()`, and `calculate()`, to calculate observed statistics. The function provides a shorthand alternative to calculating observed statistics from data:

``` r
# calculating the observed mean number of hours worked per week
gss %>%
  observe(hours ~ NULL, stat = "mean")
#> # A tibble: 1 x 1
#>    stat
#>   <dbl>
#> 1  41.4

# equivalently, calculating the same statistic with the core verbs
gss %>%
  specify(response = hours) %>%
  calculate(stat = "mean")
#> # A tibble: 1 x 1
#>    stat
#>   <dbl>
#> 1  41.4

# calculating a t statistic for hypothesized mu = 40 hours worked/week
gss %>%
  observe(hours ~ NULL, stat = "t", null = "point", mu = 40)
#> # A tibble: 1 x 1
#>    stat
#>   <dbl>
#> 1  2.09

# equivalently, calculating the same statistic with the core verbs
gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  calculate(stat = "t")
#> # A tibble: 1 x 1
#>    stat
#>   <dbl>
#> 1  2.09
```

We don't anticipate that these changes are "breaking" in the sense that code that previously worked will continue to, though it may now message or warn in a way that it did not used to or error with a different (and hopefully more informative) message.

## A framework for theoretical inference

This release also introduces a more complete and principled interface for theoretical inference. While the package previously supplied some methods for visualization of theory-based curves, the interface did not provide any object that was explicitly a "null distribution" that could be supplied to helper functions like `get_p_value()` and `get_confidence_interval()`. The new interface is based on a new verb, `assume()`, that returns a null distribution that can be interfaced in the same way that simulation-based null distributions can be interfaced with.

As an example, we'll work through a full infer pipeline for inference on a mean using infer's `gss` dataset. Suppose that we believe the true mean number of hours worked by Americans in the past week is 40.

First, calculating the observed `t`-statistic:

``` r
obs_stat <- gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  calculate(stat = "t")

obs_stat
#> Response: hours (numeric)
#> Null Hypothesis: point
#> # A tibble: 1 x 1
#>    stat
#>   <dbl>
#> 1  2.09
```

The code to define the null distribution is very similar to that required to calculate a theorized observed statistic, switching out `calculate()` for `assume()` and replacing arguments as needed.

``` r
null_dist <- gss %>%
  specify(response = hours) %>%
  assume(distribution = "t")

null_dist 
#> A T distribution with 499 degrees of freedom.
```

This null distribution can now be interfaced with in the same way as a simulation-based null distribution elsewhere in the package. For example, calculating a p-value by juxtaposing the observed statistic and null distribution:

``` r
get_p_value(null_dist, obs_stat, direction = "both")
#> # A tibble: 1 x 1
#>   p_value
#>     <dbl>
#> 1  0.0376
```

…or visualizing the null distribution alone:

``` r
visualize(null_dist)
```

![](https://i.imgur.com/g3B5coD.png)

…or juxtaposing the two visually:

``` r
visualize(null_dist) + 
  shade_p_value(obs_stat, direction = "both")
```

![](https://i.imgur.com/3C66kgK.png)

Confidence intervals lie in data space rather than the standardized scale of the theoretical distributions. Calculating a mean rather than the standardized `t`-statistic:

``` r
obs_mean <- gss %>%
  specify(response = hours) %>%
  calculate(stat = "mean")
```

The null distribution here just defines the spread for the standard error calculation.

``` r
ci <- 
  get_confidence_interval(
    null_dist,
    level = .95,
    point_estimate = obs_mean
  )

ci
#> # A tibble: 1 x 2
#>   lower_ci upper_ci
#>      <dbl>    <dbl>
#> 1     40.1     42.7
```

Visualizing the confidence interval results in the theoretical distribution being recentered and rescaled to align with the scale of the observed data:

``` r
visualize(null_dist) + 
  shade_confidence_interval(ci)
```

![](https://i.imgur.com/4akSCY3.png)

Previous methods for interfacing with theoretical distributions are superseded—they will continue to be supported, though documentation will forefront the `assume()` interface.

## Support for multiple regression

The 2016 "Guidelines for Assessment and Instruction in Statistics Education" [1] state that, in introductory statistics courses, "[s]tudents should gain experience with how statistical models, including multivariable models, are used." In line with this recommendation, we introduce support for randomization-based inference with multiple explanatory variables via a new `fit.infer` core verb.

If passed an `infer` object, the method will parse a formula out of the `formula` or `response` and `explanatory` arguments, and pass both it and `data` to a `stats::glm` call.

``` r
gss %>%
  specify(hours ~ age + college) %>%
  fit()
#> # A tibble: 3 x 2
#>   term          estimate
#>   <chr>            <dbl>
#> 1 intercept     40.6    
#> 2 age            0.00596
#> 3 collegedegree  1.53
```

Note that the function returns the model coefficients as `estimate` rather than their associated `t`-statistics as `stat`.

If passed a `generate()`d object, the model will be fitted to each replicate.

``` r
gss %>%
  specify(hours ~ age + college) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  fit()
#> # A tibble: 300 x 3
#> # Groups:   replicate [100]
#>    replicate term          estimate
#>        <int> <chr>            <dbl>
#>  1         1 intercept     44.4    
#>  2         1 age           -0.0767 
#>  3         1 collegedegree  0.121  
#>  4         2 intercept     41.8    
#>  5         2 age            0.00344
#>  6         2 collegedegree -1.59   
#>  7         3 intercept     38.3    
#>  8         3 age            0.0761 
#>  9         3 collegedegree  0.136  
#> 10         4 intercept     43.1    
#> # … with 290 more rows
```

If `type = "permute"`, a set of unquoted column names in the data to permute (independently of each other) can be passed via the `variables` argument to `generate`. It defaults to only the response variable.

``` r
gss %>%
  specify(hours ~ age + college) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute", variables = c(age, college)) %>%
  fit()
#> # A tibble: 300 x 3
#> # Groups:   replicate [100]
#>    replicate term          estimate
#>        <int> <chr>            <dbl>
#>  1         1 intercept      39.4   
#>  2         1 age             0.0748
#>  3         1 collegedegree  -2.98  
#>  4         2 intercept      42.8   
#>  5         2 age            -0.0190
#>  6         2 collegedegree  -1.83  
#>  7         3 intercept      40.4   
#>  8         3 age             0.0354
#>  9         3 collegedegree  -1.31  
#> 10         4 intercept      40.9   
#> # … with 290 more rows
```

This feature allows for more detailed exploration of the effect of disrupting the correlation structure among explanatory variables on outputted model coefficients.

Each of the auxiliary functions `get_p_value()`, `get_confidence_interval()`, `visualize()`, `shade_p_value()`, and `shade_confidence_interval()` have methods to handle `fit()` output! See their help-files for example usage. Note that `shade_*` functions now delay evaluation until they are added to an existing ggplot (e.g. that outputted by `visualize()`) with `+`.

## Improvements

- Following extensive discussion, the `generate()` type `type = "simulate"` has been renamed to the more evocative `type = "draw"`. We will continue to support `type = "simulate"` indefinitely, though supplying that argument will now prompt a message notifying the user of its preferred alias. (#233, #390)
- Fixed several bugs related to factors with unused levels. `specify()` will now drop unused factor levels and message that it has done so. (#374, #375, #397, #380)
- Added `two.sided` as an acceptable alias for `two_sided` for the `direction` argument in `get_p_value()` and `shade_p_value()`. (#355)
- Various improvements to documentation, including extending example sections in help-files, re-organizing the function reference in the {pkgdown} site, and linking more extensively among help-files.

## Breaking changes

We don't anticipate that any changes made in this release are "breaking" in the sense that code that previously worked will continue to, though it may now message or warn in a way that it did not used to or error with a different (and hopefully more informative) message. If you currently teach or research with infer, we recommend re-running your materials and noting any changes in messaging and warning.

- Move forward with a number of planned deprecations. Namely, the `GENERATION_TYPES` object is now fully deprecated, and arguments that were relocated from `visualize()` to `shade_p_value()` and `shade_confidence_interval()` are now fully deprecated in `visualize()`. If supplied a deprecated argument, `visualize()` will warn the user and ignore the argument.
- Added a `prop` argument to `rep_slice_sample()` as an alternative to the `n`
argument for specifying the proportion of rows in the supplied data to sample
per replicate (#361, #362, #363). This changes order of arguments of
`rep_slice_sample()` (in order to be more aligned with `dplyr::slice_sample()`)
which might break code if it didn't use named arguments (like
`rep_slice_sample(df, 5, TRUE)`). To fix this, use named arguments (like
`rep_slice_sample(df, 5, replace = TRUE)`).

## Other

- Added Simon P. Couch as an author. Long deserved for his reliable maintenance and improvements of the package.

[1]: GAISE College Report ASA Revision Committee, "Guidelines for Assessment and Instruction in Statistics Education College Report 2016," http://www.amstat.org/education/gaise. 

# infer 0.5.4

- `rep_sample_n()` no longer errors when supplied a `prob` argument (#279)
- Added `rep_slice_sample()`, a light wrapper around `rep_sample_n()`, that
more closely resembles `dplyr::slice_sample()` (the function that supersedes
`dplyr::sample_n()`) (#325)
- Added a `success`, `correct`, and `z` argument to `prop_test()` 
(#343, #347, #353)
- Implemented observed statistic calculation for the standardized proportion 
$z$ statistic (#351, #353)
- Various bug fixes and improvements to documentation and errors.

# infer 0.5.3

## Breaking changes

- `get_confidence_interval()` now uses column names ('lower_ci' and 'upper_ci') 
in output that are consistent with other infer functionality (#317).

## New functionality

- `get_confidence_interval()` can now produce bias-corrected confidence intervals
by setting `type = "bias-corrected"`. Thanks to @davidbaniadam for the 
initial implementation (#237, #318)!

## Other

- Fix CRAN check failures related to long double errors.

# infer 0.5.2

- Warn the user when a p-value of 0 is reported (#257, #273)
- Added new vignettes: `chi_squared` and `anova` (#268)
- Updates to documentation and existing vignettes (#268)
- Add alias for `hypothesize()` (`hypothesise()`) (#271)
- Subtraction order no longer required for difference-based tests--a warning will be raised in the case that the user doesn't supply an `order` argument (#275, #281)
- Add new messages for common errors (#277)
- Increase coverage of theoretical methods in documentation (#278, #280)
- Drop missing values and reduce size of `gss` dataset used in examples (#282)
- Add `stat = "ratio of props"` and `stat = "odds ratio"` to `calculate` (#285)
- Add `prop_test()`, a tidy interface to `prop.test()` (#284, #287)
- Updates to `visualize()` for compatibility with `ggplot2` v3.3.0 (#289)
- Fix error when bootstrapping with small samples and raise warnings/errors 
when appropriate (#239, #244, #291)
- Fix unit test failures resulting from breaking changes in `dplyr` v1.0.0
- Fix error in `generate()` when response variable is named `x` (#299)
- Add `two-sided` and `two sided` as aliases for `two_sided` for the 
`direction` argument in `get_p_value()` and `shade_p_value()` (#302)
- Fix `t_test()` and `t_stat()` ignoring the `order` argument (#310)

# infer 0.5.1

- Updates to documentation and other tweaks

# infer 0.5.0

## Breaking changes

- `shade_confidence_interval()` now plots vertical lines starting from zero (previously - from the bottom of a plot) (#234).
- `shade_p_value()` now uses "area under the curve" approach to shading (#229).

## Other

- Updated `chisq_test()` to take arguments in a response/explanatory format, perform goodness of fit tests, and default to the approximation approach (#241).
- Updated `chisq_stat()` to do goodness of fit (#241).
- Make interface to `hypothesize()` clearer by adding the options for the point null parameters to the function signature (#242).
- Manage `infer` class more systematically (#219).
- Use `vdiffr` for plot testing (#221).

# infer 0.4.1

- Added Evgeni Chasnovski as author for his incredible work on refactoring the package and providing excellent support.

# infer 0.4.0

## Breaking changes

- Changed method of computing two-sided p-value to a more conventional one. It also makes `get_pvalue()` and `visualize()` more aligned (#205).

## Deprecation changes

- Deprecated `p_value()` (use `get_p_value()` instead) (#180).
- Deprecated `conf_int()` (use `get_confidence_interval()` instead) (#180).
- Deprecated (via warnings) plotting p-value and confidence interval in `visualize()` (use new functions `shade_p_value()` and `shade_confidence_interval()` instead) (#178).

## New functions

- `shade_p_value()` - {ggplot2}-like layer function to add information about p-value region to `visualize()` output. Has alias `shade_pvalue()`.
- `shade_confidence_interval()` - {ggplot2}-like layer function to add information about confidence interval region to `visualize()` output. Has alias `shade_ci()`.

## Other

- Account for `NULL` value in left hand side of formula in `specify()` (#156) and `type` in `generate()` (#157).
- Update documentation code to follow tidyverse style guide (#159).
- Remove help page for internal `set_params()` (#165).
- Fully use {tibble} (#166).
- Fix `calculate()` to not depend on order of `p` for `type = "simulate"` (#122).
- Reduce code duplication (#173).
- Make transparency in `visualize()` to not depend on method and data volume.
- Make `visualize()` work for "One sample t" theoretical type with `method = "both"`.
- Add `stat = "sum"` and `stat = "count"` options to `calculate()` (#50).

# infer 0.3.1

- Stop using package {assertive} in favor of custom type checks (#149)
- Fixed `t_stat()` to use `...` so `var.equal` works
- With the help of @echasnovski, fixed `var.equal = TRUE` for `specify() %>% calculate(stat = "t")`
- Use custom functions for error, warning, message, and `paste()` handling (#155)

# infer 0.3.0

- Added `conf_int` logical argument and `conf_level` argument to `t_test()`
- Switched `shade_color` argument in `visualize()` to be `pvalue_fill` instead
since fill color for confidence intervals is also added now
- Shading for Confidence Intervals in `visualize()` 
    - Green is default color for CI and red for p-values
    - `direction = "between"` to get the green shading
    - Currently working only for simulation-based methods
- Implemented `conf_int()` function for computing confidence interval provided a simulation-based method with a `stat` variable
    - `get_ci()` and `get_confidence_interval()` are aliases for `conf_int()`
    - Converted longer confidence interval calculation code in vignettes to use `get_ci()` instead    
- Implemented `p_value()` function for computing p-value provided a simulation-based method with a `stat` variable
    - `get_pvalue()` is an alias for `p_value()`
    - Converted longer p-value calculation code in vignettes to use `get_pvalue()` instead
- Implemented Chi-square Goodness of Fit observed stat depending on `params` being set in `hypothesize` with `specify() %>% calculate()` shortcut
- Removed "standardized" slope $t$ since its formula is different than "standardized" correlation and there is no way currently to give one over the other
- Implemented correlation with bootstrap CI and permutation hypothesis test
- Filled the `type` argument automatically in `generate()` based
on `specify()` and `hypothesize()`
    - Added message if `type` is given differently than expected
- Implemented `specify() %>% calculate()` for getting observed
statistics.
    - `visualize()` works with either a 1x1 data frame or a vector
    for its `obs_stat` argument
    - Got `stat = "t"` working
- Refactored `calculate()` into smaller functions to reduce complexity
- Produced error if `mu` is given in `hypothesize()` but `stat = "median"`
is provided in `calculate()` and other similar mis-specifications
- Tweaked `chisq_stat()` and `t_stat()` to match with `specify() %>% calculate()` framework
    - Both work in the one sample and two sample cases by providing `formula`
    - Added `order` argument to `t_stat()`
- Added implementation of one sample `t_test()` by passing in the `mu` argument to `t.test`
from `hypothesize()`
- Tweaked `pkgdown` page to include ToDo's using [{dplyr}](https://github.com/tidyverse/dplyr/blob/master/_pkgdown.yml) example

# infer 0.2.0

- Switched to `!!` instead of `UQ()` since `UQ()` is deprecated in 
{rlang} 0.2.0
- Added many new files: `CONDUCT.md`, `CONTRIBUTING.md`, and `TO-DO.md`
- Updated README file with more development information
- Added wrapper functions `t_test()` and `chisq_test()` that use a
formula interface and provide an intuitive wrapper to `t.test()` and
`chisq.test()`
- Created `stat = "z"` and `stat = "t"` options
- Added many new arguments to `visualize()` to prescribe colors to shade and 
use for observed statistics and theoretical density curves
- Added a check so that a bar graph is created with `visualize()` if the number of 
unique values for generated statistics is small
- Added shading for `method = "theoretical"` 
- Implemented shading for simulation methods w/o a traditional distribution
    - Use percentiles to determine two-tailed shading
- Changed `method = "randomization"` to `method = "simulation"`
- Added warning when theoretical distribution is used that 
  assumptions should be checked  
- Added theoretical distributions to `visualize()` alone and as overlay with
current implementations being
    - Two sample t
    - ANOVA F
    - One proportion z
    - Two proportion z
    - Chi-square test of independence
    - Chi-square Goodness of Fit test
    - Standardized slope (t)
    
# infer 0.1.1
- Added additional tests
- Added `order` argument in `calculate()`
- Fixed bugs post-CRAN release
- Automated travis build of pkgdown to gh-pages branch

# infer 0.1.0
- Altered the way that successes are indicated in an infer pipeline. 
They now live in `specify()`.
- Updated documentation with examples
- Created `pkgdown` site materials
    - Deployed to https://infer.tidymodels.org/


# infer 0.0.1
- Implemented the "intro stats" examples for randomization methods


================================================
FILE: R/assume.R
================================================
#' Define a theoretical distribution
#'
#' @description
#'
#' This function allows the user to define a null distribution based on
#' theoretical methods. In many infer pipelines, `assume()` can be
#' used in place of [generate()] and [calculate()] to create a null
#' distribution. Rather than outputting a data frame containing a
#' distribution of test statistics calculated from resamples of the observed
#' data, `assume()` outputs a more abstract type of object just containing
#' the distributional details supplied in the `distribution` and `df` arguments.
#' However, `assume()` output can be passed to [visualize()], [get_p_value()],
#' and [get_confidence_interval()] in the same way that simulation-based
#' distributions can.
#'
#' To define a theoretical null distribution (for use in hypothesis testing),
#' be sure to provide a null hypothesis via [hypothesize()]. To define a
#' theoretical sampling distribution (for use in confidence intervals),
#' provide the output of [specify()]. Sampling distributions (only
#' implemented for `t` and `z`) lie on the scale of the data, and will be
#' recentered and rescaled to match the corresponding `stat` given in
#' [calculate()] to calculate the observed statistic.
#'
#' @param x The output of [specify()] or [hypothesize()], giving the
#'   observed data, variable(s) of interest, and (optionally) null hypothesis.
#' @param distribution The distribution in question, as a string. One of
#'   `"F"`, `"Chisq"`, `"t"`, or `"z"`.
#' @param df Optional. The degrees of freedom parameter(s) for the `distribution`
#'   supplied, as a numeric vector. For `distribution = "F"`, this should have
#'   length two (e.g. `c(10, 3)`). For `distribution = "Chisq"` or
#'   `distribution = "t"`, this should have length one. For
#'   `distribution = "z"`, this argument is not required. The package
#'   will supply a message if the supplied `df` argument is different from
#'   recognized values. See the Details section below for more information.
#' @param ... Currently ignored.
#'
#' @return An infer theoretical distribution that can be passed to helpers
#'   like [visualize()], [get_p_value()], and [get_confidence_interval()].
#'
#' @details
#'
#' Note that the assumption being expressed here, for use in theory-based
#' inference, only extends to _distributional_ assumptions: the null
#' distribution in question and its parameters. Statistical inference with
#' infer, whether carried out via simulation (i.e. based on pipelines
#' using [generate()] and [calculate()]) or theory (i.e. with `assume()`),
#' always involves the condition that observations are independent of
#' each other.
#'
#' `infer` only supports theoretical tests on one or two means via the
#' `t` distribution and one or two proportions via the `z`.
#'
#' For tests comparing two means, if `n1` is the group size for one level of
#' the explanatory variable, and `n2` is that for the other level, `infer`
#' will recognize the following degrees of freedom (`df`) arguments:
#'
#' * `min(n1 - 1, n2 - 1)`
#' * `n1 + n2 - 2`
#' * The `"parameter"` entry of the analogous `stats::t.test()` call
#' * The `"parameter"` entry of the analogous `stats::t.test()` call with `var.equal = TRUE`
#'
#' By default, the package will use the `"parameter"` entry of the analogous
#' `stats::t.test()` call with `var.equal = FALSE` (the default).
#'
#' @examples
#' # construct theoretical distributions ---------------------------------
#'
#' # F distribution
#' # with the `partyid` explanatory variable
#' gss |>
#'   specify(age ~ partyid) |>
#'   assume(distribution = "F")
#'
#' # Chi-squared goodness of fit distribution
#' # on the `finrela` variable
#' gss |>
#'   specify(response = finrela) |>
#'   hypothesize(null = "point",
#'               p = c("far below average" = 1/6,
#'                     "below average" = 1/6,
#'                     "average" = 1/6,
#'                     "above average" = 1/6,
#'                     "far above average" = 1/6,
#'                     "DK" = 1/6)) |>
#'   assume("Chisq")
#'
#' # Chi-squared test of independence
#' # on the `finrela` and `sex` variables
#' gss |>
#'   specify(formula = finrela ~ sex) |>
#'   assume(distribution = "Chisq")
#'
#' # T distribution
#' gss |>
#'   specify(age ~ college) |>
#'   assume("t")
#'
#' # Z distribution
#' gss |>
#'   specify(response = sex, success = "female") |>
#'   assume("z")
#'
#' \dontrun{
#' # each of these distributions can be passed to infer helper
#' # functions alongside observed statistics!
#'
#' # for example, a 1-sample t-test -------------------------------------
#'
#' # calculate the observed statistic
#' obs_stat <- gss |>
#'   specify(response = hours) |>
#'   hypothesize(null = "point", mu = 40) |>
#'   calculate(stat = "t")
#'
#' # construct a null distribution
#' null_dist <- gss |>
#'   specify(response = hours) |>
#'   assume("t")
#'
#' # juxtapose them visually
#' visualize(null_dist) +
#'   shade_p_value(obs_stat, direction = "both")
#'
#' # calculate a p-value
#' get_p_value(null_dist, obs_stat, direction = "both")
#'
#' # or, an F test ------------------------------------------------------
#'
#' # calculate the observed statistic
#' obs_stat <- gss |>
#'   specify(age ~ partyid) |>
#'   hypothesize(null = "independence") |>
#'   calculate(stat = "F")
#'
#' # construct a null distribution
#' null_dist <- gss |>
#'   specify(age ~ partyid) |>
#'   assume(distribution = "F")
#'
#' # juxtapose them visually
#' visualize(null_dist) +
#'   shade_p_value(obs_stat, direction = "both")
#'
#' # calculate a p-value
#' get_p_value(null_dist, obs_stat, direction = "both")
#' }
#'
#' @export
assume <- function(x, distribution, df = NULL, ...) {
  # only outputs of specify()/hypothesize() carry the "infer" class and
  # the attributes (theory_type, params, ...) this function relies on
  if (!inherits(x, "infer")) {
    cli_abort(
      "The {.arg x} argument must be the output of a core infer function, \\
       likely {.fun specify} or {.fun hypothesize}."
    )
  }

  # check that `distribution` aligns with what is expected from
  # `x` and that `distribution` and `df` are consistent with each other;
  # also resolves df when the user supplied NULL
  df <- check_distribution(x, distribution, df, ...)

  # the return value is a glue string describing the distribution, with
  # all distributional details stored as attributes for downstream
  # helpers (visualize(), get_p_value(), get_confidence_interval())
  structure(
    glue(
      "{distribution_desc(distribution)} distribution{df_desc(df)}.",
      .null = "NULL"
    ),
    # store distribution as the suffix to p* in dist function
    distribution = dist_fn(distribution),
    dist_ = distribution,
    # store df for easier passing to p* functions
    df = df,
    # store df in `specify`-esque format for use in `visualize`
    distr_param = if (length(df) > 0) {
      df[1]
    } else {
      NULL
    },
    distr_param2 = if (length(df) == 2) {
      df[2]
    } else {
      NULL
    },
    # bring along x attributes
    theory_type = attr(x, "theory_type"),
    params = attr(x, "params"),
    hypothesized = attr(x, "hypothesized"),
    # bring along dots
    dots = list(...),
    # append class
    class = "infer_dist"
  )
}

# check that the distribution is well-specified
check_distribution <- function(x, distribution, df, ..., call = caller_env()) {
  dist <- tolower(distribution)

  if (!dist %in% c("f", "chisq", "t", "z")) {
    cli_abort(
      'The distribution argument must be one of "Chisq", "F", "t", or "z".',
      call = call
    )
  }

  if (
    (dist == "f" && attr(x, "theory_type") != "ANOVA") ||
      (dist == "chisq" &&
        !attr(x, "theory_type") %in%
          c("Chi-square test of indep", "Chi-square Goodness of Fit")) ||
      (dist == "t" &&
        !attr(x, "theory_type") %in% c("One sample t", "Two sample t")) ||
      (dist == "z" &&
        !attr(x, "theory_type") %in%
          c("One sample prop z", "Two sample props z"))
  ) {
    if (has_explanatory(x)) {
      msg_tail <- glue(
        "a {get_stat_type_desc(attr(x, 'type_desc_explanatory'))} ",
        "explanatory variable ({explanatory_name(x)}).",
        .null = "NULL"
      )
    } else {
      msg_tail <- "no explanatory variable."
    }

    cli_abort(
      'The supplied distribution {.val {distribution}} is not well-defined for a \\
      {get_stat_type_desc(attr(x, "type_desc_response"))} response \\
      variable ({response_name(x)}) and {msg_tail}',
      call = call
    )
  }

  if (!is.numeric(df) && !is.null(df)) {
    cli_abort(
      "{.fun assume} expects the {.arg df} argument to be a numeric vector, \\
       but you supplied a {list(class(df))} object.",
      call = call
    )
  }

  if (length(list(...)) != 0) {
    dots <- list(...)

    cli_abort(
      c(
        "{.fun assume} ignores the dots `...` argument, though the \\
       {qty(dots)}argument{?s} {.field {names(dots)}} {?was/were} supplied. ",
        i = "Did you forget to concatenate the {.arg df} argument with {.fun c}?"
      ),
      call = call
    )
  }

  if (dist_df_length(distribution) != length(df) && !is.null(df)) {
    cli_abort(
      '{distribution_desc(distribution)} distribution requires \\
       {dist_df_length(distribution)} degrees of freedom argument{?s}, \\
       but {length(df)} {?was/were} supplied.',
      call = call
    )
  }

  df <- determine_df(x, dist, df)

  return(df)
}

# convert the distribution argument to its r distribution function suffix
dist_fn <- function(distribution) {
  switch(
    tolower(distribution),
    `f` = "f",
    `chisq` = "chisq",
    `t` = "t",
    `z` = "norm"
  )
}

# return expected degrees of freedom length
dist_df_length <- function(distribution) {
  switch(
    tolower(distribution),
    `f` = 2,
    `chisq` = ,
    `t` = 1,
    `z` = 0
  )
}

# describe the distribution
distribution_desc <- function(distribution) {
  switch(
    tolower(distribution),
    `f` = "An F",
    `chisq` = "A Chi-squared",
    `t` = "A T",
    `z` = "A Z"
  )
}

# describe the degrees of freedom
df_desc <- function(df) {
  if (is.null(df)) {
    ""
  } else {
    plural <- length(df) != 1

    paste0(
      ' with ',
      if (plural) {
        paste0(round(df), collapse = " and ")
      } else {
        round(df)
      },
      ' degree',
      if (!plural && df == 1) {
        ''
      } else {
        's'
      },
      ' of freedom'
    )
  }
}

# process df for passing to p* functions
process_df <- function(df) {
  switch(
    as.character(length(df)),
    "0" = list(),
    "1" = list(df = df),
    "2" = list(df1 = df[1], df2 = df[2])
  )
}

# generate an automatic "df" value using logic from
# hypothesize and, if it doesn't match the
# supplied one, raise a message
determine_df <- function(x, dist, df) {
  if (!is.null(df) && !all(round(df) %in% round(acceptable_dfs(x)))) {
    cli_inform(
      "Message: The supplied {.arg df} argument does not match its \\
       expected value. If this is unexpected, ensure that your calculation \\
       for {.arg df} is correct (see {.help [{.fun assume}](infer::assume)} for \\
       recognized values) or supply {.code df = NULL} to {.fun assume}."
    )

    return(df)
  }

  if (is.null(df)) {
    df <- acceptable_dfs(x)
  }

  if (attr(x, "theory_type") == "Two sample t") {
    df <- df[1]
  }

  df
}

# return a vector of dfs recognized by `assume`
acceptable_dfs <- function(x) {
  # base R pipe doesn't support operators or anonymous functions
  # in piped expressions (#553)
  minus_one <- function(x) {x - 1}
  minus_two <- function(x) {x - 2}

  if (attr(x, "theory_type") == "Two sample t") {
    c(
      # t.test param with var.equal = FALSE
      unname(
        unlist(
          attr(x, "distr_param") <-
            stats::t.test(response_variable(x) ~ explanatory_variable(x))[[
              "parameter"
            ]]
        )
      ),
      # t.test param with var.equal = TRUE
      unname(
        unlist(
          attr(x, "distr_param") <-
            stats::t.test(
              response_variable(x) ~ explanatory_variable(x),
              var.equal = TRUE
            )[["parameter"]]
        )
      ),
      # min(n1 - 1, n2 - 1)
      x |>
        dplyr::count(!!explanatory_expr(x)) |>
        dplyr::pull(n) |>
        min() |>
        minus_one(),
      # n1 + n2 - 2
      x |>
        dplyr::count(!!explanatory_expr(x)) |>
        dplyr::pull(n) |>
        sum() |>
        minus_two()
    )
  } else {
    c(
      unname(unlist(attr(x, "distr_param"))),
      unname(unlist(attr(x, "distr_param2")))
    )
  }
}


================================================
FILE: R/calculate.R
================================================
#' Calculate summary statistics
#'
#' @description
#'
#' Given the output of [specify()] and/or [hypothesize()], this function will
#' return the observed statistic specified with the `stat` argument. Some test
#' statistics, such as `Chisq`, `t`, and `z`, require a null hypothesis. If
#' provided the output of [generate()], the function will calculate the
#' supplied `stat` for each `replicate`.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param x The output from [generate()] for computation-based inference or the
#'   output from [hypothesize()] piped in to here for theory-based inference.
#' @param stat A string giving the type of the statistic to calculate or a
#'   function that takes in a replicate of `x` and returns a scalar value. Current
#'   options include `"mean"`, `"median"`, `"sum"`, `"sd"`, `"prop"`, `"count"`,
#'   `"diff in means"`, `"diff in medians"`, `"diff in props"`, `"Chisq"` (or
#'   `"chisq"`), `"F"` (or `"f"`), `"t"`, `"z"`, `"ratio of props"`, `"slope"`,
#'   `"odds ratio"`, `"ratio of means"`, and `"correlation"`. `infer` only
#'   supports theoretical tests on one or two means via the `"t"` distribution
#'   and one or two proportions via the `"z"`. See the "Arbitrary test statistics"
#'   section below for more on how to define a custom statistic.
#' @param order A string vector of specifying the order in which the levels of
#'   the explanatory variable should be ordered for subtraction (or division
#'   for ratio-based statistics), where `order = c("first", "second")` means
#'   `("first" - "second")`, or the analogue for ratios. Needed for inference on
#'   difference in means, medians, proportions, ratios, t, and z statistics.
#' @param ... To pass options like `na.rm = TRUE` into functions like
#'   [mean()][base::mean()], [sd()][stats::sd()], etc. Can also be used to
#'   supply hypothesized null values for the `"t"` statistic or additional
#'   arguments to [stats::chisq.test()].
#'
#' @return A tibble containing a `stat` column of calculated statistics.
#'
#' @section Arbitrary test statistics:
#'
#' In addition to the pre-implemented statistics documented in `stat`, users can
#' supply an arbitrary test statistic by supplying a function to the `stat`
#' argument.
#'
#' The function should have arguments `stat(x, order, ...)`, where `x` is one
#' replicate's worth of `x`. The `order` argument and ellipses will be supplied
#' directly to the `stat` function. Internally, `calculate()` will split `x` up
#' into data frames by replicate and pass them one-by-one to the supplied `stat`.
#' For example, to implement `stat = "mean"` as a function, one could write:
#'
#' ```r
#' stat_mean <- function(x, order, ...) {mean(x$hours)}
#' obs_mean <-
#'   gss %>%
#'   specify(response = hours) %>%
#'   calculate(stat = stat_mean)
#'
#' set.seed(1)
#' null_dist_mean <-
#'   gss %>%
#'   specify(response = hours) %>%
#'   hypothesize(null = "point", mu = 40) %>%
#'   generate(reps = 5, type = "bootstrap") %>%
#'   calculate(stat = stat_mean)
#' ```
#'
#' Note that the same `stat_mean` function is supplied to both `generate()`d and
#' non-`generate()`d infer objects--no need to implement support for grouping
#' by `replicate` yourself.
#'
#' @section Missing levels in small samples:
#' In some cases, when bootstrapping with small samples, some generated
#' bootstrap samples will have only one level of the explanatory variable
#' present. For some test statistics, the calculated statistic in these
#' cases will be NaN. The package will omit non-finite values from
#' visualizations (with a warning) and raise an error in p-value calculations.
#'
#' @includeRmd man-roxygen/seeds.Rmd
#'
#' @examples
#'
#' # calculate a null distribution of hours worked per week under
#' # the null hypothesis that the mean is 40
#' gss |>
#'   specify(response = hours) |>
#'   hypothesize(null = "point", mu = 40) |>
#'   generate(reps = 200, type = "bootstrap") |>
#'   calculate(stat = "mean")
#'
#' # calculate the corresponding observed statistic
#' gss |>
#'   specify(response = hours) |>
#'   calculate(stat = "mean")
#'
#' # calculate a null distribution assuming independence between age
#' # of respondent and whether they have a college degree
#' gss |>
#'   specify(age ~ college) |>
#'   hypothesize(null = "independence") |>
#'   generate(reps = 200, type = "permute") |>
#'   calculate("diff in means", order = c("degree", "no degree"))
#'
#' # calculate the corresponding observed statistic
#' gss |>
#'   specify(age ~ college) |>
#'   calculate("diff in means", order = c("degree", "no degree"))
#'
#' # some statistics require a null hypothesis
#'  gss |>
#'    specify(response = hours) |>
#'    hypothesize(null = "point", mu = 40) |>
#'    calculate(stat = "t")
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @seealso [visualize()], [get_p_value()], and [get_confidence_interval()]
#' to extract value from this function's outputs.
#'
#' @importFrom dplyr group_by summarize n
#' @importFrom rlang !! sym quo enquo eval_tidy
#' @family core functions
#' @export
calculate <- function(
  x,
  stat = c(
    "mean",
    "median",
    "sum",
    "sd",
    "prop",
    "count",
    "diff in means",
    "diff in medians",
    "diff in props",
    "Chisq",
    "F",
    "slope",
    "correlation",
    "t",
    "z",
    "ratio of props",
    "odds ratio",
    "ratio of means"
  ),
  order = NULL,
  ...
) {
  # validate input: x must be a tibble from the infer pipeline with at
  # most one explanatory variable (multiple-regression fits use fit())
  check_type(x, tibble::is_tibble)
  check_if_mlr(x, "calculate")
  # `stat` may be a string or a function; normalize to a string label
  # ("function" for function inputs) and resolve aliases like "chisq"
  stat_chr <- stat_chr(stat)
  stat_chr <- check_calculate_stat(stat_chr)
  # ensure the statistic is well-defined for the supplied variable types
  # and consistent with any declared null hypothesis / point parameters
  check_input_vs_stat(x, stat_chr)
  check_point_params(x, stat_chr)

  order <- check_order(x, order, in_calculate = TRUE, stat_chr)

  # treat un-generate()d (observed) data as a single replicate so the
  # same grouped code paths apply below
  if (!is_generated(x)) {
    x$replicate <- 1L
  }

  # surface mismatches between the hypothesis and the statistic:
  # an irrelevant null is messaged and ignored; a missing-but-required
  # null is assumed (with a warning) by warn_on_insufficient_null()
  x <- message_on_excessive_null(x, stat = stat_chr, fn = "calculate")
  x <- warn_on_insufficient_null(x, stat_chr, ...)

  # Use S3 method to match correct calculation; the statistic name
  # (spaces replaced by underscores) becomes the dispatch class
  result <- calc_impl(
    structure(stat, class = gsub(" ", "_", stat_chr)),
    x,
    order,
    ...
  )

  # carry infer attributes (response, theory_type, ...) onto the result
  result <- copy_attrs(to = result, from = x)
  attr(result, "stat") <- stat

  # For returning a 1x1 observed statistic value
  if (nrow(result) == 1) {
    result <- select(result, stat)
  }

  append_infer_class(result)
}

# abort when x has multiple explanatory variables, which `calculate()`
# and friends do not support; `fn` names the calling function for the
# error message. No-op (invisible NULL) otherwise.
check_if_mlr <- function(x, fn, call = caller_env()) {
  # nothing to do for single-explanatory-variable (or no-explanatory) input
  if (!is_mlr(x)) {
    return(invisible(NULL))
  }

  # calculate() has a drop-in alternative to suggest; other callers don't
  suggestion <- if (fn == "calculate") {
    "When working with multiple explanatory variables, use \\
        {.help [{.fun fit}](infer::fit.infer)} instead."
  } else {
    ""
  }

  cli_abort(
    c(
      "Multiple explanatory variables are not supported in {.fun {fn}}.",
      i = suggestion
    ),
    call = call
  )
}

# normalize the `stat` argument to a string label: function inputs map to
# the sentinel "function"; string inputs pass through unchanged
stat_chr <- function(stat) {
  if (rlang::is_function(stat)) "function" else stat
}

# validate the normalized `stat` string: the sentinel "function" passes
# through, aliases (e.g. "chisq" -> "Chisq") are resolved against
# `implemented_stats_aliases`, and anything else must match
# `implemented_stats` exactly. Returns the canonical stat string.
check_calculate_stat <- function(stat, call = caller_env()) {
  check_type(stat, rlang::is_string, call = call)
  # user-supplied stat functions bypass the alias/whitelist checks
  if (identical(stat, "function")) {
    return(stat)
  }

  # Check for possible `stat` aliases
  alias_match_id <- match(stat, implemented_stats_aliases[["alias"]])
  if (!is.na(alias_match_id)) {
    stat <- implemented_stats_aliases[["target"]][alias_match_id]
  } else {
    # errors with suggestions when stat isn't implemented
    rlang::arg_match(stat, implemented_stats)
  }

  stat
}

# Raise an error if the user supplies a test statistic that doesn't
# make sense given the variable and hypothesis specified
check_input_vs_stat <- function(x, stat, call = caller_env()) {
  response_type <- attr(x, "type_desc_response")
  explanatory_type <- attr(x, "type_desc_explanatory")

  possible_stats <- stat_types |>
    dplyr::filter(resp == response_type & exp == explanatory_type) |>
    dplyr::pull(stats) |>
    unlist()

  if (is.null(possible_stats)) {
    cli_abort(
      "The infer team has not implemented test statistics for the \\
       supplied variable types.",
      call = call
    )
  }

  if (identical(stat, "function")) {
     return(x)
  }

  if (!stat %in% possible_stats) {
    if (has_explanatory(x)) {
      msg_tail <- glue(
        "a {get_stat_type_desc(explanatory_type)} explanatory variable ",
        "({explanatory_name(x)}).",
        .null = "NULL"
      )
    } else {
      msg_tail <- "no explanatory variable."
    }

    cli_abort(
      "{get_stat_desc(stat)} is not well-defined for a \\
       {get_stat_type_desc(response_type)} response variable \\
       ({response_name(x)}) and {msg_tail}",
      call = call
    )
  }

  if (is_hypothesized(x)) {
    stat_nulls <- stat_hypotheses |>
      dplyr::filter(
        stat == !!stat &
          hypothesis == attr(x, "null")
      )

    if (nrow(stat_nulls) == 0) {
      cli_abort(
        'The supplied statistic `stat = "{stat}"` is incompatible with the \\
         supplied hypothesis `null = "{attr(x, "null")}"`.',
        call = call
      )
    }
  }

  x
}

# When given no hypothesis for a theorized statistic, supply a reasonable value
# When given no hypothesis for a theorized statistic, supply a reasonable value
# helper: extract the first element (keeps the pipe below free of `[[`)
.subset_1 <- function(x) {x[[1]]}
# look up the null-value-generating function for `stat_` in the package's
# `theorized_nulls` table and apply it to x, returning the assumed
# null parameter(s)
assume_null <- function(x, stat_) {
  null_fn <- theorized_nulls |>
    dplyr::filter(stat == stat_) |>
    dplyr::pull(null_fn) |>
    .subset_1()

  null_fn(x)
}


# User supplied "too much" information - hypothesized a value for a point
# estimate that isn't relevant to the statistic calculation
#
# The `stat = "mean"` default ensures that `stat %in% untheorized_stats`
# when called in non-`calculate` functions
# inform the user when a declared null hypothesis has no bearing on the
# requested observed statistic (e.g. `mu = 40` with stat = "mean") and
# will be ignored; x is returned unchanged. Only fires for observed
# (non-generate()d) data and untheorized statistics.
message_on_excessive_null <- function(x, stat = "mean", fn) {
  if (!is_generated(x) && is_hypothesized(x) && stat %in% untheorized_stats) {
    null_type <- attr(x, "null")
    null_param <- attr(x, "params")

    # point nulls are echoed back as `name = value`; independence nulls
    # have no parameter to show
    cli_inform(
      "Message: The {null_type} null hypothesis \\
       {if (null_type == 'point') {paste0('`', names(null_param), ' = ', unname(null_param), '` ')} else {''}} \\
       does not inform calculation of the observed \\
       {if (fn == 'calculate') {paste0('statistic (', tolower(get_stat_desc(stat)), ') ')} else {'fit '}} \\
       and will be ignored."
    )
  }

  x
}

# User didn't supply "enough" information - no hypothesis for a theorized
# statistic on a point estimate, so warn that a reasonable value was assumed.
warn_on_insufficient_null <- function(x, stat, ...) {
  if (
    !is_hypothesized(x) &&
      !has_explanatory(x) &&
      !stat %in% c(untheorized_stats, "function") &&
      !(stat == "t" && "mu" %in% names(list(...)))
  ) {
    attr(x, "null") <- "point"
    attr(x, "params") <- assume_null(x, stat)

    cli_warn(c(
      "{get_stat_desc(stat)} requires a null \\
       hypothesis to calculate the observed statistic.",
      "Output assumes the following null value{print_params(x)}."
    ))
  }

  x
}

# S3 generic dispatching to the statistic-specific implementation;
# `type` carries the statistic name as its class (set in calculate(),
# spaces replaced with underscores, e.g. "diff_in_means")
calc_impl <- function(type, x, order, ...) {
  UseMethod("calc_impl", type)
}

# factory for single-variable summary statistics (mean, median, sum, sd):
# returns a calc_impl method that applies `f` to the response column
# within each replicate
calc_impl_one_f <- function(f) {
  function(type, x, order, ...) {
    # the only non-replicate column is the response variable
    col <- base::setdiff(names(x), "replicate")

    if (!identical(dplyr::group_vars(x), "replicate")) {
      x <- dplyr::group_by(x, replicate)
    }

    res <- x |>
      dplyr::summarize(stat = f(!!(sym(col)), ...))

    # calculate SE for confidence intervals
    # (observed data only: a single replicate, so nrow(x) is the sample size)
    if (!is_generated(x)) {
      sample_sd <- x |>
        dplyr::summarize(stats::sd(!!(sym(col)))) |>
        dplyr::pull()

      attr(res, "se") <- sample_sd / sqrt(nrow(x))
    }

    res
  }
}

# single-variable summary statistic methods, all built from the same factory

#' @export
calc_impl.mean <- calc_impl_one_f(mean)

#' @export
calc_impl.median <- calc_impl_one_f(stats::median)

#' @export
calc_impl.sum <- calc_impl_one_f(sum)

#' @export
calc_impl.sd <- calc_impl_one_f(stats::sd)

# factory for success-based statistics (prop, count): returns a
# calc_impl method that applies `f(response, success)` within each
# replicate, where `success` is the level declared in specify()
calc_impl_success_f <- function(f, output_name) {
  function(type, x, order, ...) {
    # the only non-replicate column is the response variable
    col <- base::setdiff(names(x), "replicate")

    success <- attr(x, "success")

    if (!identical(dplyr::group_vars(x), "replicate")) {
      x <- dplyr::group_by(x, replicate)
    }

    res <- x |>
      dplyr::summarize(stat = f(!!sym(col), success))

    # calculate SE for confidence intervals
    # (observed proportions only; counts get no SE)
    if (!is_generated(x) && output_name == "proportion") {
      prop <- res[["stat"]]

      attr(res, "se") <- sqrt((prop * (1 - prop)) / nrow(x))
    }

    res
  }
}

# proportion of responses equal to the declared success level
#' @export
calc_impl.prop <- calc_impl_success_f(
  f = function(response, success, ...) {
    mean(response == success, ...)
  },
  output_name = "proportion"
)

# count of responses equal to the declared success level
#' @export
calc_impl.count <- calc_impl_success_f(
  f = function(response, success, ...) {
    sum(response == success, ...)
  },
  output_name = "count"
)

# ANOVA F statistic: the first F value from anova() of the one-way
# lm(response ~ explanatory), computed per replicate
#' @export
calc_impl.F <- function(type, x, order, ...) {
  x |>
    dplyr::summarize(
      stat = stats::anova(
        stats::lm(!!(response_expr(x)) ~ !!(explanatory_expr(x)))
      )$`F value`[1]
    )
}

# regression slope: the second coefficient (after the intercept) of
# lm(response ~ explanatory), computed per replicate
#' @export
calc_impl.slope <- function(type, x, order, ...) {
  x |>
    dplyr::summarize(
      stat = stats::coef(
        stats::lm(!!(response_expr(x)) ~ !!(explanatory_expr(x)))
      )[2]
    )
}

# Pearson correlation between the explanatory and response variables,
# computed per replicate
#' @export
calc_impl.correlation <- function(type, x, order, ...) {
  x |>
    dplyr::summarize(
      stat = stats::cor(!!explanatory_expr(x), !!response_expr(x))
    )
}

# factory for two-group comparison statistics: summarizes the response
# with `f` in each group, then combines the two group values with
# `operator` (`-` for differences, `/` for ratios) in the order given
# by the user's `order` argument
calc_impl_diff_f <- function(f, operator) {
  function(type, x, order, ...) {
    # .drop = FALSE keeps empty factor levels so both group values exist
    res <- x |>
      dplyr::group_by(replicate, !!explanatory_expr(x), .drop = FALSE) |>
      dplyr::summarize(value = f(!!response_expr(x), ...)) |>
      dplyr::group_by(replicate) |>
      dplyr::summarize(
        stat = operator(
          value[!!(explanatory_expr(x)) == order[1]],
          value[!!(explanatory_expr(x)) == order[2]]
        )
      )

    # calculate SE for confidence intervals
    # (observed differences only; ratios get no SE)
    if (!is_generated(x) && identical(operator, `-`)) {
      sample_sds <- x |>
        dplyr::group_by(replicate, !!explanatory_expr(x), .drop = FALSE) |>
        dplyr::summarize(stats::sd(!!response_expr(x))) |>
        dplyr::pull()

      sample_counts <- x |>
        dplyr::count(!!explanatory_expr(x), .drop = FALSE) |>
        dplyr::pull()

      # SE of a difference in means: sqrt(s1^2/n1 + s2^2/n2)
      attr(res, "se") <-
        sqrt(
          sum(
            (sample_sds[1] / sqrt(sample_counts[1]))^2,
            (sample_sds[2] / sqrt(sample_counts[2]))^2
          )
        )
    }

    res
  }
}

# two-group comparison methods built from the same factory

#' @export
calc_impl.diff_in_means <- calc_impl_diff_f(mean, operator = `-`)

#' @export
calc_impl.diff_in_medians <- calc_impl_diff_f(stats::median, operator = `-`)

#' @export
calc_impl.ratio_of_means <- calc_impl_diff_f(mean, operator = `/`)

# Chi-squared statistic per replicate: goodness of fit when there is no
# explanatory variable (tested against the hypothesized probabilities in
# the "params" attribute), test of independence otherwise
#' @export
calc_impl.Chisq <- function(type, x, order, ...) {
  resp_var <- response_name(x)

  if (!has_attr(x, "explanatory")) {
    # Chi-Square Goodness of Fit
    # level order from the params attribute, so observed counts line up
    # with the hypothesized probabilities passed as `p`
    p_levels <- get_par_levels(x)
    chisq_gof <- function(df) {
      chisq <- suppressWarnings(stats::chisq.test(
        # Ensure correct ordering of parameters
        table(df[[resp_var]])[p_levels],
        p = attr(x, "params"),
        ...
      ))

      unname(chisq[["statistic"]])
    }

    # nest each replicate's rows and compute the statistic replicate-wise
    result <- x |>
      dplyr::nest_by(.key = "data") |>
      dplyr::summarise(stat = chisq_gof(data), .groups = "drop")
  } else {
    # Chi-Square Test of Independence
    expl_var <- explanatory_name(x)
    chisq_indep <- function(df) {
      res <- suppressWarnings(stats::chisq.test(
        x = df[[expl_var]],
        y = df[[resp_var]],
        ...
      ))

      res[["statistic"]]
    }

    # Compute result
    result <- x |>
      dplyr::nest_by(.key = "data") |>
      dplyr::summarise(stat = chisq_indep(data), .groups = "drop")
  }

  # keep the replicate column only for generate()d input
  if (is_generated(x)) {
    result <- result |> dplyr::select(replicate, stat)
  } else {
    result <- result |> dplyr::select(stat)
  }

  # copy a fixed subset of infer attributes onto the result
  copy_attrs(
    to = result,
    from = x,
    attrs = c(
      "response",
      "success",
      "explanatory",
      "response_type",
      "explanatory_type",
      "distr_param",
      "distr_param2",
      "theory_type",
      "type_desc_response",
      "type_desc_explanatory"
    )
  )
}

# shared implementation for two-group proportion statistics: computes the
# success proportion in each group per replicate, then combines the two
# with `operator` (`-` for diff in props, `/` for ratio of props)
#' @export
calc_impl.function_of_props <- function(type, x, order, operator, ...) {
  col <- response_expr(x)
  success <- attr(x, "success")

  # .drop = FALSE keeps empty factor levels so both group props exist
  res <- x |>
    dplyr::group_by(replicate, !!explanatory_expr(x), .drop = FALSE) |>
    dplyr::summarize(prop = mean(!!sym(col) == success, ...)) |>
    dplyr::summarize(
      stat = operator(
        prop[!!explanatory_expr(x) == order[1]],
        prop[!!explanatory_expr(x) == order[2]]
      )
    )

  # calculate SE for confidence intervals (observed data only)
  if (!is_generated(x)) {
    props <- x |>
      dplyr::group_by(!!explanatory_expr(x), .drop = FALSE) |>
      dplyr::summarize(prop = mean(!!sym(col) == success, ...)) |>
      dplyr::pull()

    counts <- x |>
      dplyr::count(!!explanatory_expr(x), .drop = FALSE) |>
      dplyr::pull()

    # SE of a difference in proportions: sqrt(p1(1-p1)/n1 + p2(1-p2)/n2)
    attr(res, "se") <-
      sqrt(
        sum(
          abs((props[1] * (1 - props[1])) / counts[1]),
          abs((props[2] * (1 - props[2])) / counts[2])
        )
      )
  }

  res
}

# thin wrappers fixing the operator for the shared proportion implementation

#' @export
calc_impl.diff_in_props <- function(type, x, order, ...) {
  calc_impl.function_of_props(type, x, order, operator = `-`, ...)
}

#' @export
calc_impl.ratio_of_props <- function(type, x, order, ...) {
  calc_impl.function_of_props(type, x, order, operator = `/`, ...)
}

# odds ratio per replicate: the odds of success in the first `order`
# group divided by the odds of success in the second
#' @export
calc_impl.odds_ratio <- function(type, x, order, ...) {
  col <- response_expr(x)
  success <- attr(x, "success")

  x |>
    dplyr::group_by(replicate, !!explanatory_expr(x), .drop = FALSE) |>
    dplyr::summarize(prop = mean(!!sym(col) == success, ...)) |>
    dplyr::summarize(
      prop_1 = prop[!!explanatory_expr(x) == order[1]],
      prop_2 = prop[!!explanatory_expr(x) == order[2]],
      # (p1/p2) / ((1-p1)/(1-p2)) is algebraically the odds ratio
      stat = (prop_1 / prop_2) / ((1 - prop_1) / (1 - prop_2))
    ) |>
    dplyr::select(stat)
}

# t statistic per replicate, delegating to stats::t.test(): two-sample
# when an explanatory variable is present, one-sample otherwise
#' @export
calc_impl.t <- function(type, x, order, ...) {
  if (theory_type(x) == "Two sample t") {
    # relevel the explanatory variable so t.test() subtracts groups in
    # the user-requested `order`
    x <- reorder_explanatory(x, order)

    df_out <- x |>
      dplyr::summarize(
        stat = stats::t.test(
          !!response_expr(x) ~ !!explanatory_expr(x),
          ...
        )[["statistic"]]
      )
  } else if (theory_type(x) == "One sample t") {
    if (!is_hypothesized(x)) {
      # For bootstrap: no null value, so t.test()'s default mu is used
      df_out <- x |>
        dplyr::summarize(
          stat = stats::t.test(!!response_expr(x), ...)[["statistic"]]
        )
    } else {
      # For hypothesis testing: pass the hypothesized mean (stored by
      # hypothesize() in the "params" attribute) as `mu`
      # NOTE(review): `attr(!!x, "params")` unquotes x inside the tidy-eval
      # expression -- looks equivalent to plain `attr(x, "params")`; confirm
      df_out <- x |>
        dplyr::summarize(
          stat = stats::t.test(
            !!response_expr(x),
            mu = attr(!!x, "params"),
            ...
          )[["statistic"]]
        )
    }
  }
  df_out
}

#' @export
calc_impl.z <- function(type, x, order, ...) {
  # Two sample proportions
  if (theory_type(x) == "Two sample props z") {
    col <- response_expr(x)
    success <- attr(x, "success")

    # order the explanatory levels per `order` so the difference in
    # proportions is taken in the requested direction
    x$explan <- factor(
      explanatory_variable(x),
      levels = c(order[1], order[2])
    )

    # per replicate and group: group size, proportion of successes, and
    # count of successes
    aggregated <- x |>
      dplyr::group_by(replicate, explan) |>
      dplyr::summarize(
        group_num = dplyr::n(),
        prop = mean(rlang::eval_tidy(col) == rlang::eval_tidy(success)),
        num_suc = sum(rlang::eval_tidy(col) == rlang::eval_tidy(success))
      )

    # two-sample z: difference in proportions over the pooled standard error
    df_out <- aggregated |>
      dplyr::summarize(
        diff_prop = prop[explan == order[1]] - prop[explan == order[2]],
        total_suc = sum(num_suc),
        n1 = group_num[1],
        n2 = group_num[2],
        p_hat = total_suc / (n1 + n2),
        denom = sqrt(p_hat * (1 - p_hat) / n1 + p_hat * (1 - p_hat) / n2),
        stat = diff_prop / denom
      ) |>
      dplyr::select(stat)

    df_out
  } else if (theory_type(x) == "One sample prop z") {
    # One sample proportion

    # When `hypothesize()` has been called
    success <- attr(x, "success")
    col <- response_expr(x)
    # hypothesized proportion of successes
    p0 <- unname(attr(x, "params")[1])
    # per-replicate sample size
    num_rows <- nrow(x) / length(unique(x$replicate))

    # one-sample z: (sample proportion - p0) / sqrt(p0 * (1 - p0) / n)
    df_out <- x |>
      dplyr::summarize(
        stat = (mean(rlang::eval_tidy(col) == rlang::eval_tidy(success), ...) -
          p0) /
          sqrt((p0 * (1 - p0)) / num_rows)
      )

    df_out
  }
}

#' @export
calc_impl.function <- function(type, x, order, ..., call = rlang::caller_env()) {
  # `type` holds the user-supplied statistic function; apply it to each
  # replicate, rethrowing any error or warning it raises with added context.
  rlang::try_fetch(
    {
      if (!identical(dplyr::group_vars(x), "replicate")) {
         x <- dplyr::group_by(x, replicate)
      }
      x_by_replicate <- dplyr::group_split(x)
      # NOTE(review): inside a `~` lambda, `...` refers to the lambda's own
      # dots rather than to this function's dots -- confirm that extra
      # arguments supplied to calculate() are forwarded as intended here.
      res <- purrr::map(x_by_replicate, ~type(.x, order, ...))
    },
    error = function(cnd) {rethrow_stat_cnd(cnd, call = call)},
    warning = function(cnd) {rethrow_stat_cnd(cnd, call = call)}
  )

  # each per-replicate result must be a single scalar value
  if (!rlang::is_scalar_atomic(res[[1]])) {
    cli::cli_abort(
      c(
        "The supplied {.arg stat} function must return a scalar value.",
        "i" = "It returned {.obj_type_friendly {res[[1]]}}."
      ),
      call = call
    )
  }

  tibble::new_tibble(list(stat = unlist(res)))
}

# Re-raise a condition signalled by a user-supplied `stat` function, chaining
# the original condition via `parent` so the user sees both messages.
#
# `cnd` is the captured condition; `call` is the environment reported as the
# error's origin. The previous default, `call = call`, was self-referential:
# forcing it when the argument was not supplied would fail with a
# "promise already under evaluation" error rather than a useful default.
rethrow_stat_cnd <- function(cnd, call = rlang::caller_env()) {
  cli::cli_abort(
    "The supplied {.arg stat} function encountered an issue.",
    parent = cnd,
    call = call
  )
}


================================================
FILE: R/deprecated.R
================================================
#' Deprecated functions and objects
#'
#' These functions and objects should no longer be used. They will be removed
#' in a future release of infer.
#' @param x See the non-deprecated function.
#' @param level See the non-deprecated function.
#' @param type See the non-deprecated function.
#' @param point_estimate See the non-deprecated function.
#' @param obs_stat See the non-deprecated function.
#' @param direction See the non-deprecated function.
#' @seealso [get_p_value()], [get_confidence_interval()], [generate()]
#' @name deprecated
NULL


#' @rdname deprecated
#' @export
conf_int <- function(
  x,
  level = 0.95,
  type = "percentile",
  point_estimate = NULL
) {
  # Hard-deprecated since 0.4.0: unconditionally errors, pointing users to
  # the replacement function.
  lifecycle::deprecate_stop("0.4.0", "conf_int()", "get_confidence_interval()")
}


#' @rdname deprecated
#' @export
p_value <- function(x, obs_stat, direction) {
  # Hard-deprecated since 0.4.0: unconditionally errors, pointing users to
  # the replacement function. The `what` argument previously read
  # "conf_int()", which misreported the deprecated function's name in the
  # error message; it must name this function, `p_value()`.
  lifecycle::deprecate_stop("0.4.0", "p_value()", "get_p_value()")
}


================================================
FILE: R/fit.R
================================================
#' @importFrom generics fit
#' @details
#' Read more about infer's [fit][fit.infer()] function [here][fit.infer()] or
#' by running `?fit.infer` in your console.
#'
#' @export
generics::fit


#' Fit linear models to infer objects
#'
#' @description
#' Given the output of an infer core function, this function will fit
#' a linear model using [stats::glm()] according to the formula and data supplied
#' earlier in the pipeline. If passed the output of [specify()] or
#' [hypothesize()], the function will fit one model. If passed the output
#' of [generate()], it will fit a model to each data resample, denoted in
#' the `replicate` column. The family of the fitted model depends on the type
#' of the response variable. If the response is numeric, `fit()` will use
#' `family = "gaussian"` (linear regression). If the response is a 2-level
#' factor or character, `fit()` will use `family = "binomial"` (logistic
#' regression). To fit character or factor response variables with more than
#' two levels, we recommend [parsnip::multinom_reg()].
#'
#' infer provides a fit "method" for infer objects, which is a way of carrying
#' out model fitting as applied to infer output. The "generic," imported from
#' the generics package and re-exported from this package, provides the
#' general form of `fit()` that points to infer's method when called on an
#' infer object. That generic is also documented here.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param object Output from an infer function---likely [generate()] or
#' [specify()]---which specifies the formula and data to fit a model to.
#' @param ... Any optional arguments to pass along to the model fitting
#' function. See [stats::glm()] for more information.
#'
#' @return A [tibble][tibble::tibble] containing the following columns:
#'
#' \itemize{
#'   \item `replicate`: Only supplied if the input object had been previously
#'     passed to [generate()]. A number corresponding to which resample of the
#'     original data set the model was fitted to.
#'   \item `term`: The explanatory variable (or intercept) in question.
#'   \item `estimate`: The model coefficient for the given resample (`replicate`) and
#'     explanatory variable (`term`).
#' }
#'
#' @details
#'
#' Randomization-based statistical inference with multiple explanatory
#' variables requires careful consideration of the null hypothesis in question
#' and its implications for permutation procedures. Inference for partial
#' regression coefficients via the permutation method implemented in
#' [generate()] for multiple explanatory variables, consistent with its meaning
#' elsewhere in the package, is subject to additional distributional assumptions
#' beyond those required for one explanatory variable. Namely, the distribution
#' of the response variable must be similar to the distribution of the errors
#' under the null hypothesis' specification of a fixed effect of the explanatory
#' variables. (This null hypothesis is reflected in the `variables` argument to
#' [generate()]. By default, all of the explanatory variables are treated
#' as fixed.) A general rule of thumb here is, if there are large outliers
#' in the distributions of any of the explanatory variables, this distributional
#' assumption will not be satisfied; when the response variable is permuted,
#' the (presumably outlying) value of the response will no longer be paired
#' with the outlier in the explanatory variable, causing an outsize effect
#' on the resulting slope coefficient for that explanatory variable.
#'
#' More sophisticated methods that are outside of the scope of this package
#' requiring fewer---or less strict---distributional assumptions
#' exist. For an overview, see "Permutation tests for univariate or
#' multivariate analysis of variance and regression" (Marti J. Anderson,
#' 2001), \doi{10.1139/cjfas-58-3-626}.
#'
#' @includeRmd man-roxygen/seeds.Rmd
#'
#' @examples
#' # fit a linear model predicting number of hours worked per
#' # week using respondent age and degree status.
#' observed_fit <- gss |>
#'   specify(hours ~ age + college) |>
#'   fit()
#'
#' observed_fit
#'
#' # fit 100 models to resamples of the gss dataset, where the response
#' # `hours` is permuted in each. note that this code is the same as
#' # the above except for the addition of the `generate` step.
#' null_fits <- gss |>
#'   specify(hours ~ age + college) |>
#'   hypothesize(null = "independence") |>
#'   generate(reps = 100, type = "permute") |>
#'   fit()
#'
#' null_fits
#'
#' # for logistic regression, just supply a binary response variable!
#' # (this can also be made explicit via the `family` argument in ...)
#' gss |>
#'   specify(college ~ age + hours) |>
#'   fit()
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @rdname fit.infer
#' @method fit infer
#' @export fit.infer
#' @export
fit.infer <- function(object, ...) {
  # message about a superfluous null hypothesis, if one was set upstream
  # (helper defined elsewhere in the package)
  message_on_excessive_null(object, fn = "fit")

  # Confirm that the family, possibly supplied via
  # `family` in ..., takes precedence over the default.
  # Return a processed version of the ellipses
  dots <- check_family(object, ...)

  # Relevel the response based on the success attribute
  # so that the reference level is reflected in the fit
  object <- relevel_response(object)

  # Extract the formula if it was supplied to specify, otherwise
  # construct it out of the explanatory and response arguments
  formula <- get_formula(object)

  if (is_generated(object)) {
    # fit one model per replicate and stack the tidied coefficients,
    # keeping the replicate identifier alongside each term/estimate pair
    x <- object |>
      tidyr::nest(data = -replicate) |>
      dplyr::rowwise() |>
      dplyr::mutate(
        model = list(
          do.call(
            fit_linear_model,
            c(
              list(object = data, formula = formula),
              dots
            )
          )
        )
      ) |>
      dplyr::select(replicate, model) |>
      tidyr::unnest(model)
  } else {
    # single fit on the observed data
    x <- do.call(
      fit_linear_model,
      c(
        list(object, formula),
        dots
      )
    )
  }

  # carry the infer attributes forward and flag the output as fitted
  x <- copy_attrs(x, object)
  attr(x, "fitted") <- TRUE

  x
}

# Determine the model family for fit.infer(), honoring a user-supplied
# `family` in `...` and otherwise defaulting based on the response type
# recorded on `object`: binomial for binary responses, gaussian otherwise.
# Aborts for multi-level categorical responses, which infer does not
# support. Returns the processed list of dots.
check_family <- function(object, ..., call = caller_env()) {
  resp_type <- attr(object, "type_desc_response")

  if (resp_type == "mult") {
    cli_abort(
      c(
        "infer does not support fitting models for categorical response variables \\
       with more than two levels.",
        i = "Please see {.fun multinom_reg} from the parsnip package."
      ),
      call = call
    )
  }

  args <- list(...)

  # an explicit `family` passed by the user takes precedence over defaults
  if (!"family" %in% names(args)) {
    args[["family"]] <- if (resp_type == "bin") stats::binomial else stats::gaussian
  }

  args
}

# If a `success` level was declared in specify(), make it the reference
# level of the response factor so that fitted coefficients are interpreted
# relative to it. Otherwise return `x` untouched.
relevel_response <- function(x) {
  success_level <- attr(x, "success")

  if (is.null(success_level)) {
    return(x)
  }

  x[[response_name(x)]] <- stats::relevel(
    response_variable(x),
    ref = success_level
  )

  x
}

# Retrieve the model formula for fit.infer(): use the formula stored by an
# earlier step when one exists, otherwise build `response ~ explanatory`
# from the stored variable names.
get_formula <- function(x) {
  if (has_attr(x, "formula")) {
    return(attr(x, "formula"))
  } else {
    # join multiple explanatory variables with `+`; empty when there are none
    exp <- paste0(explanatory_name(x), collapse = " + ")

    # `.null = "NULL"` makes glue render the literal text "NULL" when there
    # is no explanatory variable, yielding e.g. `hours ~ NULL`
    as.formula(
      glue(
        '{response_name(x)} ~
         {if (exp == "") NULL else exp}',
        .null = "NULL"
      )
    )
  }
}

# Fit a stats::glm() with the supplied formula and data, then return a tidy
# tibble of coefficient names (`term`) and values (`estimate`), with
# "(Intercept)" renamed to "intercept".
fit_linear_model <- function(object, formula, ...) {
  fitted <- stats::glm(
    formula = formula,
    data = object,
    ...
  )

  tidied <- dplyr::select(broom::tidy(fitted), term, estimate)

  dplyr::mutate(
    tidied,
    term = dplyr::case_when(
      term == "(Intercept)" ~ "intercept",
      TRUE ~ term
    )
  )
}


================================================
FILE: R/generate.R
================================================
#' Generate resamples, permutations, or simulations
#'
#' @description
#'
#' Generation creates a simulated distribution from `specify()`.
#' In the context of confidence intervals, this is a bootstrap distribution
#' based on the result of `specify()`. In the context of hypothesis testing,
#' this is a null distribution based on the result of `specify()` and
#' `hypothesize().`
#'
#' Learn more in `vignette("infer")`.
#'
#' @param x A data frame that can be coerced into a [tibble][tibble::tibble].
#' @param reps The number of resamples to generate.
#' @param type The method used to generate resamples of the observed
#'   data reflecting the null hypothesis. Currently one of
#'   `"bootstrap"`, `"permute"`, or `"draw"` (see below).
#' @param variables If `type = "permute"`, a set of unquoted column names in the
#'   data to permute (independently of each other). Defaults to only the
#'   response variable. Note that any derived effects that depend on these
#'   columns (e.g., interaction effects) will also be affected.
#' @param ... Currently ignored.
#'
#' @return A tibble containing `reps` generated datasets, indicated by the
#'   `replicate` column.
#'
#' @section Generation Types:
#'
#' The `type` argument determines the method used to create the null
#' distribution.
#'
#' \itemize{
#'   \item `bootstrap`: A bootstrap sample will be drawn for each replicate,
#'   where a sample of size equal to the input sample size is drawn (with
#'   replacement) from the input sample data.
#'   \item `permute`: For each replicate, each input value will be randomly
#'   reassigned (without replacement) to a new output value in the sample.
#'   \item `draw`: A value will be sampled from a theoretical distribution
#'   with parameter `p` specified in [hypothesize()] for each replicate. This
#'   option is currently only applicable for testing on one proportion. This
#'   generation type was previously called `"simulate"`, which has been
#'   superseded.
#' }
#'
#' @includeRmd man-roxygen/seeds.Rmd
#'
#' @examples
#' # generate a null distribution by taking 200 bootstrap samples
#' gss |>
#'  specify(response = hours) |>
#'  hypothesize(null = "point", mu = 40) |>
#'  generate(reps = 200, type = "bootstrap")
#'
#' # generate a null distribution for the independence of
#' # two variables by permuting their values 200 times
#' gss |>
#'  specify(partyid ~ age) |>
#'  hypothesize(null = "independence") |>
#'  generate(reps = 200, type = "permute")
#'
#' # generate a null distribution via sampling from a
#' # binomial distribution 200 times
#' gss |>
#'   specify(response = sex, success = "female") |>
#'   hypothesize(null = "point", p = .5) |>
#'   generate(reps = 200, type = "draw") |>
#'   calculate(stat = "z")
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @importFrom dplyr group_by
#' @family core functions
#' @export
generate <- function(
  x,
  reps = 1,
  type = NULL,
  variables = !!response_expr(x),
  ...
) {
  # Check type argument, warning if necessary
  type <- sanitize_generation_type(type)
  # the generation type implied by the earlier specify()/hypothesize() steps
  auto_type <- sanitize_generation_type(attr(x, "type"))
  # prefer an explicitly supplied type (warning on a mismatch); otherwise
  # fall back to the inferred one
  type <- if (!is.null(type)) {
    compare_type_vs_auto_type(type, auto_type, x)
  } else {
    use_auto_type(auto_type)
  }
  attr(x, "type") <- type

  # `missing(variables)` distinguishes the default from a user-supplied value
  check_cols(x, rlang::enquo(variables), type, missing(variables))

  attr(x, "generated") <- TRUE

  # "simulate" is a superseded alias for "draw"
  switch(
    type,
    bootstrap = bootstrap(x, reps, ...),
    permute = {
      check_permutation_attributes(x)
      permute(x, reps, rlang::enquo(variables), ...)
    },
    draw = draw(x, reps, ...),
    simulate = draw(x, reps, ...)
  )
}

# Validate the `type` argument to generate(). `NULL` passes through
# untouched, unknown types abort, and the superseded "simulate" alias is
# accepted with a message pointing users to "draw". Returns `x` unchanged.
sanitize_generation_type <- function(x, call = caller_env()) {
  if (is.null(x)) {
    return(x)
  }

  check_type(x, is.character, call = call)

  known_types <- c("bootstrap", "permute", "simulate", "draw")

  if (!x %in% known_types) {
    cli_abort(
      'The `type` argument should be one of "bootstrap", "permute", \\
       or "draw". See {.help [{.fun generate}](infer::generate)} for more details.',
      call = call
    )
  }

  if (identical(x, "simulate")) {
    cli_inform(
      'The `"simulate"` generation type has been renamed to `"draw"`. \\
       Use `type = "draw"` instead to quiet this message.'
    )
  }

  x
}

# Ensure that the supplied type matches what would be assumed from input
compare_type_vs_auto_type <- function(type, auto_type, x) {
  # nothing to compare against when no type could be inferred
  if (is.null(auto_type)) {
    return(type)
  }

  # warn when the user's choice disagrees with the inferred type: either
  # "bootstrap" was requested despite hypothesized proportion parameters,
  # or a non-bootstrap type differs from the inferred one in a way that is
  # not just the draw/simulate aliasing
  if (
    (type == "bootstrap" && has_p_param(x)) ||
      (type != "bootstrap" &&
        auto_type != type &&
        # make sure auto_type vs type difference isn't just an alias
        (any(!c(auto_type, type) %in% c("draw", "simulate"))))
  ) {
    cli_warn(
      "You have given `type = \"{type}\"`, but `type` is expected \\
         to be `\"{auto_type}\"`. This workflow is untested and \\
         the results may not mean what you think they mean."
    )
  }

  # the user-supplied type always wins; the warning above is advisory only
  type
}

# Does `x` carry hypothesized parameters that are all proportions, i.e.
# every element of its "params" attribute is named with a "p." prefix?
has_p_param <- function(x) {
  if (!has_attr(x, "params")) {
    return(FALSE)
  }

  param_names <- names(attr(x, "params"))

  all(grepl("^p\\.", param_names))
}

use_auto_type <- function(auto_type) {
  # no explicit `type` was supplied to generate(): announce the inferred
  # generation type (unless infer messages are suppressed) and use it
  if (!suppress_infer_messages()) {
    cli_inform('Setting `type = "{auto_type}"` in `generate()`.')
  }
  auto_type
}

# Permuting requires both a response and an explanatory variable, unless
# the null is "paired independence" (which needs only a response). Abort
# otherwise.
check_permutation_attributes <- function(x, call = caller_env()) {
  missing_variable <- !has_attr(x, "response") || !has_attr(x, "explanatory")
  paired_null <- identical(attr(x, "null"), "paired independence")

  if (missing_variable && !paired_null) {
    cli_abort(
      "Please {.fun specify} an explanatory and a response variable \\
        when permuting.",
      call = call
    )
  }
}

# Validate the `variables` quosure passed to generate(): it must contain
# unquoted names when permuting, is ignored (with a warning) for other
# generation types, and may only reference columns present in `x`.
check_cols <- function(
  x,
  variables,
  type,
  missing,
  arg_name = "variables",
  call = caller_env()
) {
  # quoted strings parse as non-symbolic expressions, which we reject
  if (!rlang::is_symbolic(rlang::get_expr(variables)) && type == "permute") {
    cli_abort(
      "The {.arg {arg_name}} argument should be one or more unquoted variable names \\
        (not strings in quotation marks).",
      call = call
    )
  }

  # `missing` is TRUE when the caller did not supply `variables` explicitly
  if (!missing && type != "permute") {
    cli_warn(
      'The {.arg {arg_name}} argument is only relevant for the "permute" \\
       generation type and will be ignored.'
    )

    # already warned once; skip the interaction-term prompt downstream
    should_prompt <- FALSE
  } else {
    should_prompt <- TRUE
  }

  col_names <- process_variables(variables, should_prompt)

  if (any(!col_names %in% colnames(x))) {
    bad_cols <- col_names[!col_names %in% colnames(x)]

    cli_abort(
      '{qty(bad_cols)}The column{?s} {.field {bad_cols}} provided to \\
       the {.arg {arg_name}} argument{qty(bad_cols)} {?is/are} not in the supplied data.',
      call = call
    )
  }
}

# Generate `reps` bootstrap resamples of `x`, each of size nrow(x), drawn
# with replacement. Under a point null on the mean or median, the response
# is first shifted so the resampling distribution is centered at the
# hypothesized value.
bootstrap <- function(x, reps = 1, ...) {
  # Check if hypothesis test chosen
  if (is_hypothesized(x)) {
    # If so, shift the variable chosen to have a mean corresponding
    # to that specified in `hypothesize`
    if (!is.null(attr(attr(x, "params"), "names"))) {
      if (identical(attr(attr(x, "params"), "names"), "mu")) {
        col <- response_name(x)
        x[[col]] <- x[[col]] - mean(x[[col]], na.rm = TRUE) + attr(x, "params")
      } # Similarly for median
      else if (identical(attr(attr(x, "params"), "names"), "med")) {
        col <- response_name(x)
        x[[col]] <- x[[col]] -
          stats::median(x[[col]], na.rm = TRUE) +
          attr(x, "params")
      }
    }
  }

  # draw the resamples, then restore the infer attributes on the result
  result <- rep_sample_n(x, size = nrow(x), replace = TRUE, reps = reps)
  result <- copy_attrs(to = result, from = x)

  append_infer_class(result)
}

#' @importFrom dplyr bind_rows group_by
permute <- function(x, reps = 1, variables, ..., call = caller_env()) {
  # Build each permuted replicate separately, stack them, and label each
  # block of nrow(x) rows with its replicate number.
  nrow_x <- nrow(x)
  df_out <- replicate(
    reps,
    permute_once(x, variables, call = call),
    simplify = FALSE
  ) |>
    dplyr::bind_rows() |>
    dplyr::mutate(replicate = rep(1:reps, each = !!nrow_x)) |>
    group_by_replicate(reps, nrow_x)

  # restore the infer attributes on the stacked result
  df_out <- copy_attrs(to = df_out, from = x)

  append_infer_class(df_out)
}

# Produce one permuted copy of `x` under the null hypothesis stored on it.
#
# For `null = "independence"`, each column named in `variables` is shuffled
# independently and all other columns are left as-is. For
# `null = "paired independence"`, the column in `variables` is instead
# multiplied by random +/-1 signs. Aborts if `x` carries no independence-type
# null hypothesis.
permute_once <- function(x, variables, ..., call = caller_env()) {
  null <- attr(x, "null")

  if (
    !is_hypothesized(x) ||
      !null %in% c("independence", "paired independence")
  ) {
    cli_abort(
      "Permuting should be done only when doing an independence \\
        hypothesis test. See {.help [{.fun hypothesize}](infer::hypothesize)}.",
      call = call
    )
  }

  variables <- process_variables(variables, FALSE)
  if (null == "independence") {
    # for each column, determine whether it should be permuted
    needs_permuting <- colnames(x) %in% variables

    # pass each to permute_column with its associated logical
    out <- purrr::map2(x, needs_permuting, permute_column)
    out <- tibble::new_tibble(out)
  } else {
    out <- x
    # randomly flip the sign of each (paired) response value
    signs <- sample(c(-1, 1), nrow(x), replace = TRUE, prob = c(.5, .5))
    out[[variables]] <- x[[variables]] * signs
  }

  # Previously the result of `copy_attrs()` was discarded (R's copy
  # semantics make an unassigned call a no-op), so the infer attributes
  # were never actually re-attached here; assign the result so this
  # function's output is complete on its own. Also removed the unused
  # `dots <- list(...)` local.
  out <- copy_attrs(out, x)

  out
}

# Convert a quosure of column names (possibly wrapped in `c()`) into
# column-name strings, dropping any interaction terms (and, when
# `should_prompt` is TRUE, messaging about them).
process_variables <- function(variables, should_prompt) {
  # extract the expression and convert each element to string
  out <- rlang::get_expr(variables)

  if (length(out) == 1) {
    out <- as.character(out)
  } else {
    out <- as.list(out)
    out <- purrr::map(out, as.character)
  }

  # drop c()
  out[out == "c"] <- NULL

  # drop interactions and message
  # an element like `a * b` converts to c("*", "a", "b"), so an element
  # containing "*" marks an interaction term
  interactions <- purrr::map_lgl(out, `%in%`, x = "*")

  if (any(interactions) && should_prompt) {
    cli_inform(
      "Message: Please supply only data columns to the {.arg variables} argument. \\
       Note that any derived effects that depend on these columns will also \\
       be affected."
    )
  }

  out <- out[!interactions]

  out
}

# Shuffle `col` (sampling without replacement) when `permute` is TRUE;
# otherwise return it untouched.
permute_column <- function(col, permute) {
  if (!permute) {
    return(col)
  }

  sample(col, size = length(col), replace = FALSE)
}

#' @importFrom dplyr pull
#' @importFrom tibble tibble
#' @importFrom rlang :=
draw <- function(x, reps = 1, ...) {
  # Simulate `reps` samples of the response by drawing from its observed
  # levels with the probabilities stored by hypothesize() (via
  # format_params()).
  fct_levels <- as.character(unique(response_variable(x)))

  probs <- format_params(x)
  # one draw of size nrow(x) per replicate, concatenated end to end
  col_simmed <- unlist(replicate(
    reps,
    sample(fct_levels, size = nrow(x), replace = TRUE, prob = probs),
    simplify = FALSE
  ))

  x_nrow <- nrow(x)
  rep_tbl <- tibble::tibble(
    !!response_expr(x) := as.factor(col_simmed),
    replicate = as.factor(rep(1:reps, rep(x_nrow, reps)))
  )

  rep_tbl <- copy_attrs(to = rep_tbl, from = x)

  rep_tbl <- group_by_replicate(rep_tbl, reps, nrow(x))

  append_infer_class(rep_tbl)
}


================================================
FILE: R/get_confidence_interval.R
================================================
#' Compute confidence interval
#'
#' @description
#'
#' Compute a confidence interval around a summary statistic. Both
#' simulation-based and theoretical methods are supported, though only
#' `type = "se"` is supported for theoretical methods.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param x A distribution. For simulation-based inference, a data frame
#'   containing a distribution of [calculate()]d statistics
#'   or [`fit()`][fit.infer()]ted coefficient estimates. This object should
#'   have been passed to [generate()] before being supplied or
#'   [calculate()] to [`fit()`][fit.infer()]. For theory-based inference,
#'   output of [assume()]. Distributions for confidence intervals do not
#'   require a null hypothesis via [hypothesize()].
#' @param level A numerical value between 0 and 1 giving the confidence level.
#'   Default value is 0.95.
#' @param type A string giving which method should be used for creating the
#'   confidence interval. The default is `"percentile"` with `"se"`
#'   corresponding to (multiplier * standard error) and `"bias-corrected"` for
#'   bias-corrected interval as other options.
#' @param point_estimate A data frame containing the observed statistic (in a
#'   [calculate()]-based workflow) or observed fit (in a
#'   [`fit()`][fit.infer()]-based workflow). This object is likely the output
#'   of [calculate()] or [`fit()`][fit.infer()] and need not
#'   have been passed to [generate()]. Set to `NULL` by
#'   default. Must be provided if `type` is `"se"` or `"bias-corrected"`.
#'
#' @return A [tibble][tibble::tibble] containing the following columns:
#'
#' \itemize{
#'   \item `term`: The explanatory variable (or intercept) in question. Only
#'     supplied if the input had been previously passed to [`fit()`][fit.infer()].
#'   \item `lower_ci`, `upper_ci`: The lower and upper bounds of the confidence
#'     interval, respectively.
#' }
#'
#' @details
#' A null hypothesis is not required to compute a confidence interval. However,
#' including [hypothesize()] in a pipeline leading to `get_confidence_interval()`
#' will not break anything. This can be useful when computing a confidence
#' interval using the same distribution used to compute a p-value.
#'
#' Theoretical confidence intervals (i.e. calculated by supplying the output
#' of [assume()] to the `x` argument) require that the point estimate lies on
#' the scale of the data. The distribution defined in [assume()] will be
#' recentered and rescaled to align with the point estimate, as can be shown
#' in the output of [visualize()] when paired with [shade_confidence_interval()].
#' Confidence intervals are implemented for the following distributions and
#' point estimates:
#'
#' \itemize{
#'   \item `distribution = "t"`: `point_estimate` should be the output of
#'   [calculate()] with `stat = "mean"` or `stat = "diff in means"`
#'   \item `distribution = "z"`: `point_estimate` should be the output of
#'   [calculate()] with `stat = "prop"` or `stat = "diff in props"`
#' }
#'
#' @section Aliases:
#' `get_ci()` is an alias of `get_confidence_interval()`.
#' `conf_int()` is a deprecated alias of `get_confidence_interval()`.
#'
#' @examples
#'
#' boot_dist <- gss |>
#'   # We're interested in the number of hours worked per week
#'   specify(response = hours) |>
#'   # Generate bootstrap samples
#'   generate(reps = 1000, type = "bootstrap") |>
#'   # Calculate mean of each bootstrap sample
#'   calculate(stat = "mean")
#'
#' boot_dist |>
#'   # Calculate the confidence interval around the point estimate
#'   get_confidence_interval(
#'     # At the 95% confidence level; percentile method
#'     level = 0.95
#'   )
#'
#' # for type = "se" or type = "bias-corrected" we need a point estimate
#' sample_mean <- gss |>
#'   specify(response = hours) |>
#'   calculate(stat = "mean")
#'
#' boot_dist |>
#'   get_confidence_interval(
#'     point_estimate = sample_mean,
#'     # At the 95% confidence level
#'     level = 0.95,
#'     # Using the standard error method
#'     type = "se"
#'   )
#'
#' # using a theoretical distribution -----------------------------------
#'
#' # define a sampling distribution
#' sampling_dist <- gss |>
#'   specify(response = hours) |>
#'   assume("t")
#'
#' # get the confidence interval---note that the
#' # point estimate is required here
#' get_confidence_interval(
#'   sampling_dist,
#'   level = .95,
#'   point_estimate = sample_mean
#' )
#'
#' # using a model fitting workflow -----------------------
#'
#' # fit a linear model predicting number of hours worked per
#' # week using respondent age and degree status.
#' observed_fit <- gss |>
#'   specify(hours ~ age + college) |>
#'   fit()
#'
#' observed_fit
#'
#' # fit 100 models to resamples of the gss dataset, where the response
#' # `hours` is permuted in each. note that this code is the same as
#' # the above except for the addition of the `generate` step.
#' null_fits <- gss |>
#'   specify(hours ~ age + college) |>
#'   hypothesize(null = "independence") |>
#'   generate(reps = 100, type = "permute") |>
#'   fit()
#'
#' null_fits
#'
#' get_confidence_interval(
#'   null_fits,
#'   point_estimate = observed_fit,
#'   level = .95
#' )
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @name get_confidence_interval
#' @family auxillary functions
#' @export
get_confidence_interval <- function(
  x,
  level = 0.95,
  type = NULL,
  point_estimate = NULL
) {
  # Inform if no `level` was explicitly supplied
  if (!("level" %in% rlang::call_args_names(match.call())) && !suppress_infer_messages()) {
    cli_inform("Using `level = {level}` to compute confidence interval.")
  }

  # default interval type: theory-based distributions (from assume()) only
  # support "se"; simulation-based distributions default to "percentile"
  if (is.null(type)) {
    if (inherits(x, "infer_dist")) {
      type <- "se"
    } else {
      type <- "percentile"
    }
  }

  if (is_fitted(x)) {
    # model-fitting workflow: compute one interval per model term
    # check that x and point estimate reference the same variables
    check_mlr_x_and_obs_stat(
      x,
      point_estimate,
      "get_confidence_interval",
      "point_estimate"
    )

    # split up x and point estimate by term
    term_data <- x |>
      dplyr::ungroup() |>
      dplyr::group_by(term) |>
      dplyr::group_split() |>
      purrr::map(copy_attrs, x)

    term_estimates <- point_estimate |>
      dplyr::ungroup() |>
      dplyr::group_by(term) |>
      dplyr::group_split()

    # check arguments for each term
    purrr::map2_dfr(
      term_data,
      purrr::map(term_estimates, purrr::pluck, "estimate"),
      check_ci_args,
      level = level,
      type = type
    )

    # map over switch_ci and then add the term column back in
    purrr::map2_dfr(
      term_data,
      purrr::map(term_estimates, purrr::pluck, "estimate"),
      switch_ci,
      level = level,
      type = type
    ) |>
      dplyr::mutate(
        term = purrr::map_chr(term_estimates, purrr::pluck, "term"),
        .before = dplyr::everything()
      ) |>
      copy_attrs(x)
  } else {
    # single-statistic workflow
    check_ci_args(x, level, type, point_estimate)

    switch_ci(type, x, level, point_estimate)
  }
}

#' @rdname get_confidence_interval
#' @export
get_ci <- function(x, level = 0.95, type = NULL, point_estimate = NULL) {
  # alias for get_confidence_interval(); forwards all arguments unchanged
  get_confidence_interval(
    x,
    level = level,
    type = type,
    point_estimate = point_estimate
  )
}

# Dispatch to the confidence-interval implementation named by `type`
# ("percentile", "se", or "bias-corrected"); `type` is validated upstream
# by check_ci_args().
switch_ci <- function(type, x, level, point_estimate) {
  if (type == "percentile") {
    ci_percentile(x, level)
  } else if (type == "se") {
    ci_se(x, level, point_estimate)
  } else if (type == "bias-corrected") {
    ci_bias_corrected(x, level, point_estimate)
  }
}

# Drop NA estimates before computing an interval, warning with a count of
# how many were removed.
remove_missing_estimates <- function(estimates) {
  missing_mask <- is.na(estimates)
  na_estimates_n <- sum(missing_mask)

  if (na_estimates_n > 0) {
    cli_warn(
      "{na_estimates_n} estimates were missing and were removed when \\
               calculating the confidence interval."
    )
  }

  estimates[!missing_mask]
}

# Percentile-method CI: take the (1 - level) / 2 and (1 + level) / 2
# quantiles of the non-missing estimates.
ci_percentile <- function(x, level) {
  # the last column of `x` holds the stat or estimate values
  estimates <- remove_missing_estimates(x[[ncol(x)]])

  probs <- (1 + c(-level, level)) / 2
  bounds <- stats::quantile(estimates, probs = probs)

  make_ci_df(bounds)
}

# Standard-error-method CI: point estimate +/- multiplier * SE. The
# multiplier is a quantile of the theoretical distribution stored on `x`
# (theory-based) or of the standard normal (simulation-based).
ci_se <- function(x, level, point_estimate) {
  point_estimate_ <- check_obs_stat(point_estimate)

  args <- list()

  if (inherits(x, "infer_dist")) {
    # theory-based: use the SE attached to the point estimate upstream and
    # the quantile function matching the assumed distribution (qt needs df)
    se <- attr(point_estimate, "se")
    qfn <- paste0("q", attr(x, "distribution"))
    if (attr(x, "distribution") == "t") {
      args <- list(df = attr(x, "df"))
    }
  } else {
    # simulation-based: the SE is the standard deviation of the distribution
    # x[[ncol(x)]] pulls out the stat or estimate column
    estimates <- remove_missing_estimates(x[[ncol(x)]])
    se <- stats::sd(estimates)

    qfn <- "qnorm"
  }

  args <- c(args, list(p = (1 + level) / 2))

  # e.g. qnorm(p = .975) or qt(df = ..., p = .975)
  multiplier <- do.call(qfn, args)

  ci_vec <- point_estimate_ + c(-multiplier, multiplier) * se

  res <- make_ci_df(ci_vec)

  attr(res, "se") <- attr(point_estimate, "se")
  attr(res, "point_estimate") <- point_estimate_

  res
}

# Bias-corrected percentile CI: adjust the quantile probabilities for the
# bias of the bootstrap distribution relative to the observed point
# estimate before taking quantiles.
ci_bias_corrected <- function(x, level, point_estimate) {
  point_estimate <- check_obs_stat(point_estimate)

  # x[[ncol(x)]] pulls out the stat or estimate column
  estimates <- remove_missing_estimates(x[[ncol(x)]])

  # bias-correction factor: z-score of the proportion of estimates at or
  # below the point estimate
  p <- mean(estimates <= point_estimate)

  z0 <- stats::qnorm(p)
  # z_alpha_2 is z_(alpha/2)
  z_alpha_2 <- stats::qnorm((1 + c(-level, level)) / 2)
  new_probs <- stats::pnorm(2 * z0 + z_alpha_2)

  ci_vec <- stats::quantile(estimates, probs = new_probs)

  make_ci_df(ci_vec)
}

# Validate the arguments to get_confidence_interval(): the point estimate
# (when given), the distribution object, the confidence level, and the
# type/distribution compatibility rules for theory-based intervals.
check_ci_args <- function(x, level, type, point_estimate, call = caller_env()) {
  if (!is.null(point_estimate)) {
    # accept either a bare numeric or a data frame (the output of
    # calculate()/fit()) whose first entry is numeric
    if (!is.data.frame(point_estimate)) {
      check_type(point_estimate, is.numeric, call = call)
    } else {
      check_type(point_estimate, is.data.frame, call = call)
      check_type(point_estimate[[1]][[1]], is.numeric, call = call)
    }
  }
  check_is_distribution(x, "get_confidence_interval")
  check_type(level, is.numeric, call = call)

  if ((level <= 0) || (level >= 1)) {
    cli_abort(
      "The value of {.arg level} must be between 0 and 1, non-inclusive.",
      call = call
    )
  }

  # theory-based distributions only support the standard-error method
  if (inherits(x, "infer_dist") && !is.null(type) && type != "se") {
    cli_abort(
      'The only {.arg type} option for theory-based confidence intervals \\
       is `type = "se"`.',
      call = call
    )
  }

  if (!(type %in% c("percentile", "se", "bias-corrected"))) {
    cli_abort(
      'The options for `type` are "percentile", "se", or "bias-corrected".',
      call = call
    )
  }

  if ((type %in% c("se", "bias-corrected")) && is.null(point_estimate)) {
    cli_abort(
      'A numeric value needs to be given for {.arg point_estimate} \\
       for `type` "se" or "bias-corrected".',
      call = call
    )
  }

  if (inherits(x, "infer_dist")) {
    # theoretical CIs require the full point estimate infer object as they
    # contain the necessary standard error
    if (!inherits(point_estimate, "infer")) {
      cli_abort(
        'For theoretical confidence intervals, the `point_estimate` argument \\
         must be an `infer` object. Have you made sure to supply the output of \\
         {.fun calculate} as the `point_estimate` argument?',
        call = call
      )
    }

    if (
      !attr(point_estimate, "stat") %in%
        c("mean", "prop", "diff in means", "diff in props")
    ) {
      cli_abort(
        'The only allowable statistics for theoretical confidence intervals \\
         are "mean", "prop", "diff in means", and "diff in props". See \\
         the "Details" section of \\
         {.help [{.fun get_confidence_interval}](infer::get_confidence_interval)} \\
         for more details.',
        call = call
      )
    }

    # t distributions pair with mean-based statistics; normal distributions
    # pair with proportion-based statistics
    if (
      (attr(x, "distribution") == "t" &&
        !attr(point_estimate, "stat") %in% c("mean", "diff in means")) ||
        (attr(x, "distribution") == "norm" &&
          !attr(point_estimate, "stat") %in% c("prop", "diff in props"))
    ) {
      # this message previously interpolated `attr(x, "dist_")`, an
      # attribute that exists nowhere in this file (and which does not
      # partial-match "distribution", since "dist_" is not a prefix of it),
      # so the distribution name rendered empty; "distribution" is the
      # attribute used everywhere else here
      cli_abort(
        'Confidence intervals using a `{attr(x, "distribution")}` distribution for \\
         `stat = {attr(point_estimate, "stat")}` are not implemented.',
        call = call
      )
    }
  }
}

# Wrap a length-two vector of interval endpoints in a one-row tibble with
# columns `lower_ci` and `upper_ci`.
make_ci_df <- function(ci_vec) {
  lower <- ci_vec[[1]]
  upper <- ci_vec[[2]]
  tibble::tibble(lower_ci = lower, upper_ci = upper)
}


================================================
FILE: R/get_p_value.R
================================================
#' Compute p-value
#'
#' @description
#'
#' Compute a p-value from a null distribution and observed statistic.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param x A null distribution. For simulation-based inference, a data frame
#'   containing a distribution of [calculate()]d statistics
#'   or [`fit()`][fit.infer()]ted coefficient estimates. This object should
#'   have been passed to [generate()] before being supplied or
#'   [calculate()] to [`fit()`][fit.infer()]. For theory-based inference,
#'   the output of [assume()].
#' @param obs_stat A data frame containing the observed statistic (in a
#'   [calculate()]-based workflow) or observed fit (in a
#'   [`fit()`][fit.infer()]-based workflow). This object is likely the output
#'   of [calculate()] or [`fit()`][fit.infer()] and need not
#'   to have been passed to [generate()].
#' @param direction A character string. Options are `"less"`, `"greater"`, or
#'   `"two-sided"`. Can also use `"left"`, `"right"`, `"both"`,
#'   `"two_sided"`, or `"two sided"`, `"two.sided"`.
#'
#' @return A [tibble][tibble::tibble] containing the following columns:
#'
#' \itemize{
#'   \item `term`: The explanatory variable (or intercept) in question. Only
#'     supplied if the input had been previously passed to [`fit()`][fit.infer()].
#'   \item `p_value`: A value in \[0, 1\] giving the probability that a
#'     statistic/coefficient as or more extreme than the observed
#'     statistic/coefficient would occur if the null hypothesis were true.
#' }
#'
#'
#' @section Aliases:
#' `get_pvalue()` is an alias of `get_p_value()`.
#' `p_value` is a deprecated alias of `get_p_value()`.
#'
#' @section Zero p-value:
#' Though a true p-value of 0 is impossible, `get_p_value()` may return 0 in
#' some cases. This is due to the simulation-based nature of the \{infer\}
#' package; the output of this function is an approximation based on
#' the number of `reps` chosen in the `generate()` step. When the observed
#' statistic is very unlikely given the null hypothesis, and only a small
#' number of `reps` have been generated to form a null distribution,
#' it is possible that the observed statistic will be more extreme than
#' every test statistic generated to form the null distribution, resulting
#' in an approximate p-value of 0. In this case, the true p-value is a small
#' value likely less than `3/reps` (based on a poisson approximation).
#'
#' In the case that a p-value of zero is reported, a warning message will be
#' raised to caution the user against reporting a p-value exactly equal to 0.
#'
#'
#' @examples
#'
#' # using a simulation-based null distribution ------------------------------
#'
#' # find the point estimate---mean number of hours worked per week
#' point_estimate <- gss |>
#'   specify(response = hours) |>
#'   calculate(stat = "mean")
#'
#' # starting with the gss dataset
#' gss |>
#'   # ...we're interested in the number of hours worked per week
#'   specify(response = hours) |>
#'   # hypothesizing that the mean is 40
#'   hypothesize(null = "point", mu = 40) |>
#'   # generating data points for a null distribution
#'   generate(reps = 1000, type = "bootstrap") |>
#'   # finding the null distribution
#'   calculate(stat = "mean") |>
#'   # calculate the p-value for the point estimate
#'   get_p_value(obs_stat = point_estimate, direction = "two-sided")
#'
#' # using a theoretical null distribution -----------------------------------
#'
#' # calculate the observed statistic
#' obs_stat <- gss |>
#'   specify(response = hours) |>
#'   hypothesize(null = "point", mu = 40) |>
#'   calculate(stat = "t")
#'
#' # define a null distribution
#' null_dist <- gss |>
#'   specify(response = hours) |>
#'   assume("t")
#'
#' # calculate a p-value
#' get_p_value(null_dist, obs_stat, direction = "both")
#'
#' # using a model fitting workflow -----------------------------------------
#'
#' # fit a linear model predicting number of hours worked per
#' # week using respondent age and degree status.
#' observed_fit <- gss |>
#'   specify(hours ~ age + college) |>
#'   fit()
#'
#' observed_fit
#'
#' # fit 100 models to resamples of the gss dataset, where the response
#' # `hours` is permuted in each. note that this code is the same as
#' # the above except for the addition of the `generate` step.
#' null_fits <- gss |>
#'   specify(hours ~ age + college) |>
#'   hypothesize(null = "independence") |>
#'   generate(reps = 100, type = "permute") |>
#'   fit()
#'
#' null_fits
#'
#' get_p_value(null_fits, obs_stat = observed_fit, direction = "two-sided")
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @name get_p_value
#' @export
get_p_value <- function(x, obs_stat, direction) {
  # S3 generic: dispatches on the class of `x` (a data frame of simulated
  # statistics hits the default method; `assume()` output hits the
  # `infer_dist` method).
  UseMethod("get_p_value", x)
}

#' @rdname get_p_value
#' @family auxillary functions
#' @method get_p_value default
#' @export
get_p_value.default <- function(x, obs_stat, direction) {
  # Simulation-based p-value: `x` must be a generated distribution of
  # statistics (or fitted coefficient estimates) and `obs_stat` the
  # corresponding observed statistic/fit.
  check_type(x, is.data.frame)
  # Use scalar `&&` (rather than vectorized `&`): a hypothesized but
  # non-generated `x` would need theoretical p-values, which require the
  # `assume()`-based workflow instead.
  if (!is_generated(x) && is_hypothesized(x)) {
    cli_abort(c(
      "Theoretical p-values are not yet supported. ",
      i = "`x` should be the result of calling {.fun generate}."
    ))
  }
  check_for_nan(x, "get_p_value")
  check_direction(direction)

  if (is_fitted(x)) {
    # check that x and obs stat reference the same variables
    check_mlr_x_and_obs_stat(
      x,
      obs_stat,
      "get_p_value",
      "obs_stat"
    )

    # split up x and obs_stat by term
    term_data <- x |>
      dplyr::ungroup() |>
      dplyr::group_by(term) |>
      dplyr::group_split() |>
      purrr::map(copy_attrs, x)

    term_obs_stats <- obs_stat |>
      dplyr::ungroup() |>
      dplyr::group_by(term) |>
      dplyr::group_split()

    # calculate the p value for each term and then add the term column back in
    purrr::map2_dfr(
      term_data,
      purrr::map(term_obs_stats, purrr::pluck, "estimate"),
      simulation_based_p_value,
      direction = direction
    ) |>
      dplyr::mutate(
        term = purrr::map_chr(term_obs_stats, purrr::pluck, "term"),
        .before = dplyr::everything()
      )
  } else {
    simulation_based_p_value(x = x, obs_stat = obs_stat, direction = direction)
  }
}

#' @rdname get_p_value
#' @export
get_pvalue <- get_p_value # alias: behaves identically to get_p_value()

#' @rdname get_p_value
#' @method get_p_value infer_dist
#' @export
get_p_value.infer_dist <- function(x, obs_stat, direction) {
  # Theory-based p-value: evaluate the tail area of the assumed
  # distribution at the observed statistic.
  # warn if x and obs_stat carry conflicting null hypotheses
  check_hypotheses_align(x, obs_stat)

  # e.g. "t" -> "pt", "norm" -> "pnorm"
  cdf_name <- paste0("p", attr(x, "distribution"))

  # normalize the user-supplied direction to "left", "right", or "both"
  side <- norm_direction(direction)

  # two-sided p-values start from the lower tail and are doubled below
  use_lower <- !identical(side, "right")

  cdf_args <- c(
    list(q = as.numeric(obs_stat), lower.tail = use_lower),
    process_df(attr(x, "df"))
  )
  p_val <- do.call(cdf_name, cdf_args)

  if (side == "both") {
    p_val <- 2 * min(p_val, 1 - p_val)
  }

  tibble::tibble(p_value = p_val)
}

# Compute a simulation-based p-value by comparing the observed statistic
# to the generated distribution in the last column of `x`.
simulation_based_p_value <- function(
  x,
  obs_stat,
  direction,
  call = caller_env()
) {
  check_x_vs_obs_stat(x, obs_stat, call = call)
  obs_stat <- check_obs_stat(obs_stat)

  # the last column of `x` holds the stat or estimate values
  stats_col <- x[[ncol(x)]]

  # any direction other than left/right variants is treated as two-sided
  pval <- switch(
    direction,
    less = ,
    left = left_p_value(stats_col, obs_stat),
    greater = ,
    right = right_p_value(stats_col, obs_stat),
    two_sided_p_value(stats_col, obs_stat)
  )

  # a p-value of exactly zero is an artifact of finite `reps`
  if (abs(pval) < 1e-16) {
    cli_warn(c(
      "Please be cautious in reporting a p-value of 0. This result is an \\
       approximation based on the number of `reps` chosen in the {.fun generate} step.",
      i = "See {.help [{.fun get_p_value}](infer::get_p_value)} for more information."
    ))
  }

  tibble::tibble(p_value = pval)
}

# Proportion of the null distribution at or below the observed statistic.
left_p_value <- function(vec, obs_stat) {
  sum(vec <= obs_stat) / length(vec)
}

# Proportion of the null distribution at or above the observed statistic.
right_p_value <- function(vec, obs_stat) {
  sum(vec >= obs_stat) / length(vec)
}

# Two-sided p-value: double the smaller of the two tail proportions,
# capped at 1.
two_sided_p_value <- function(vec, obs_stat) {
  smaller_tail <- min(mean(vec <= obs_stat), mean(vec >= obs_stat))
  min(2 * smaller_tail, 1)
}

# Warn when `x` and `obs_stat` both carry null hypotheses but with
# differing parameter values, since mixing them is untested.
check_hypotheses_align <- function(x, obs_stat) {
  both_hypothesized <- is_hypothesized(x) && is_hypothesized(obs_stat)

  if (
    both_hypothesized &&
      any(attr(x, "params") != attr(obs_stat, "params"))
  ) {
    cli_warn(
      "`x` and `obs_stat` were generated using different null hypotheses. \\
        This workflow is untested and results may not mean what you think \\
        they mean."
    )
  }
}

# Abort when `x` and `obs_stat` appear to have been supplied in reverse
# order: a generated `obs_stat` alongside a non-generated `x` is a strong
# sign the user swapped the arguments.
check_x_vs_obs_stat <- function(x, obs_stat, call = caller_env()) {
  args_swapped <- is_generated(obs_stat) && !is_generated(x)

  if (args_swapped) {
    cli_abort(
      c(
        "It seems like the `obs_stat` argument has been passed to `get_p_value()` \\
       as the first argument when `get_p_value()` expects `x`, a distribution \\
       of statistics or coefficient estimates, as the first argument. ",
        i = "Have you mistakenly switched the order of `obs_stat` and `x`?"
      ),
      call = call
    )
  }

  invisible(TRUE)
}

# which_distribution <- function(x, theory_type, obs_stat, direction){
#
#   param <- attr(x, "distr_param")
#   if(!is.null(attr(x, "distr_param2")))
#     param2 <- attr(x, "distr_param2")
#
#   if(theory_type == "Two sample t")
#     return(
#       pt(q = obs_stat,
#          df = param,
#          lower.tail = set_lower_tail(direction)
#         )
#     )
# }

#theory_t_pvalue <-

# set_lower_tail <- function(direction){
#   if(direction %in% c("greater", "right"))
#     lower_tail <- FALSE
#   else
#     lower_tail <- TRUE
#
#   lower_tail
# }


================================================
FILE: R/gss.R
================================================
#' Subset of data from the General Social Survey (GSS).
#'
#' The General Social Survey is a high-quality survey which gathers data on
#' American society and opinions, conducted since 1972. This data set is a
#' sample of 500 entries from the GSS, spanning years 1973-2018,
#' including demographic markers and some
#' economic variables. Note that this data is included for demonstration only,
#' and should not be assumed to provide accurate estimates relating to the GSS.
#' However, due to the high quality of the GSS, the unweighted data will
#' approximate the weighted data in some analyses.
#' @format A tibble with 500 rows and 11 variables:
#' \describe{
#'   \item{year}{year respondent was surveyed}
#'   \item{age}{age at time of survey, truncated at 89}
#'   \item{sex}{respondent's sex (self-identified)}
#'   \item{college}{whether or not respondent has a college degree, including
#'   junior/community college}
#'   \item{partyid}{political party affiliation}
#'   \item{hompop}{number of persons in household}
#'   \item{hours}{number of hours worked in week before survey, truncated at 89}
#'   \item{income}{total family income}
#'   \item{class}{subjective socioeconomic class identification}
#'   \item{finrela}{opinion of family income}
#'   \item{weight}{survey weight}
#' }
#' @source \url{https://gss.norc.org}
"gss"


================================================
FILE: R/hypothesize.R
================================================
#' Declare a null hypothesis
#'
#' @description
#'
#' Declare a null hypothesis about variables selected in [specify()].
#'
#' Learn more in `vignette("infer")`.
#'
#' @param x A data frame that can be coerced into a [tibble][tibble::tibble].
#' @param null The null hypothesis. Options include `"independence"`,
#'   `"point"`, and `"paired independence"`.
#' \itemize{
#'   \item `independence`: Should be used with both a `response` and `explanatory`
#'   variable. Indicates that the values of the specified `response` variable
#'   are independent of the associated values in `explanatory`.
#'   \item `point`: Should be used with only a `response` variable. Indicates
#'   that a point estimate based on the values in `response` is associated
#'   with a parameter. Sometimes requires supplying one of `p`, `mu`, `med`, or
#'   `sigma`.
#'   \item `paired independence`: Should be used with only a `response` variable
#'   giving the pre-computed difference between paired observations. Indicates
#'   that the order of subtraction between paired values does not affect the
#'   resulting distribution.
#' }
#' @param p The true proportion of successes (a number between 0 and 1). To be used with point null hypotheses when the specified response
#' variable is categorical.
#' @param mu The true mean (any numerical value). To be used with point null
#' hypotheses when the specified response variable is continuous.
#' @param med The true median (any numerical value). To be used with point null
#' hypotheses when the specified response variable is continuous.
#' @param sigma The true standard deviation (any numerical value). To be used with
#' point null hypotheses.
#'
#' @return A tibble containing the response (and explanatory, if specified)
#'   variable data with parameter information stored as well.
#'
#' @examples
#' # hypothesize independence of two variables
#' gss |>
#'  specify(college ~ partyid, success = "degree") |>
#'  hypothesize(null = "independence")
#'
#' # hypothesize a mean number of hours worked per week of 40
#' gss |>
#'   specify(response = hours) |>
#'   hypothesize(null = "point", mu = 40)
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @importFrom purrr compact
#' @family core functions
#' @export
hypothesize <- function(
  x,
  null,
  p = NULL,
  mu = NULL,
  med = NULL,
  sigma = NULL
) {
  # Resolve and validate the null hypothesis; a missing `null` becomes NA
  # so that match_null_hypothesis() can raise its standard error.
  if (missing(null)) {
    null <- NA
  }
  null <- match_null_hypothesis(null)
  hypothesize_checks(x, null)

  attr(x, "null") <- null
  attr(x, "hypothesized") <- TRUE

  # Keep only the parameter arguments the user actually supplied.
  supplied_params <- compact(list(p = p, mu = mu, med = med, sigma = sigma))

  # Record parameters and choose the generation type implied by the null.
  if (null == "independence") {
    # warns if parameter values were supplied needlessly
    sanitize_hypothesis_params_independence(supplied_params)
    attr(x, "type") <- "permute"
  } else if (null == "point") {
    point_params <- sanitize_hypothesis_params_point(supplied_params, x)
    attr(x, "params") <- unlist(point_params)

    if (!is.null(point_params$p)) {
      attr(x, "type") <- "draw"
    } else {
      # Check one proportion test set up correctly
      if (is.factor(response_variable(x))) {
        cli_abort(
          'Testing one categorical variable requires `p` to be used as a \\
           parameter.'
        )
      }
      attr(x, "type") <- "bootstrap"
    }
  } else {
    # paired independence: warns if parameter values were supplied needlessly
    sanitize_hypothesis_params_paired_independence(supplied_params)
    attr(x, "type") <- "permute"
  }

  res <- append_infer_class(tibble::as_tibble(x))

  copy_attrs(to = res, from = x)
}

#' @rdname hypothesize
#' @export
hypothesise <- hypothesize # British-spelling alias for hypothesize()

# Validate inputs to `hypothesize()`: `x` must be a data frame, and the
# chosen null hypothesis must be compatible with the variables specified
# on `x`. Errors (attributed to `call`) on the first violation.
hypothesize_checks <- function(x, null, call = caller_env()) {
  if (!inherits(x, "data.frame")) {
    # Use cli `{.arg}` markup for the argument name, consistent with the
    # other messages in this package.
    cli_abort("{.arg x} must be a data.frame or tibble", call = call)
  }

  # An independence null needs both a response and an explanatory variable.
  if ((null == "independence") && !has_explanatory(x)) {
    cli_abort(
      'Please {.fun specify} an explanatory and a response variable when \\
       testing a null hypothesis of `"independence"`.',
      call = call
    )
  }

  # A paired independence null takes only a (pre-differenced) response.
  if (null == "paired independence" && has_explanatory(x)) {
    cli_abort(
      c(
        'Please {.fun specify} only a response variable when \\
           testing a null hypothesis of `"paired independence"`.',
        "i" = 'The supplied response variable should be the \\
                 pre-computed difference between paired observations.'
      ),
      call = call
    )
  }
}

# Resolve a (possibly abbreviated) null hypothesis name to its full form.
# Errors unless `null` is a single string partially matching one of the
# supported null hypothesis types.
match_null_hypothesis <- function(null, call = caller_env()) {
  valid_nulls <- c("point", "independence", "paired independence")

  if (length(null) != 1) {
    cli_abort(
      'You should specify exactly one type of null hypothesis.',
      call = call
    )
  }

  # pmatch() allows unambiguous abbreviations, e.g. "ind"
  matched <- pmatch(null, valid_nulls)

  if (is.na(matched)) {
    cli_abort(
      '`null` should be either "point", "independence", or "paired independence".',
      call = call
    )
  }

  valid_nulls[matched]
}

# Warn (and return NULL) if parameter values were supplied alongside an
# independence null, where they have no meaning.
sanitize_hypothesis_params_independence <- function(dots) {
  if (length(dots) != 0) {
    cli_warn(
      "Parameter values should not be specified when testing that two \\
       variables are independent."
    )
  }

  NULL
}

# Validate the parameter supplied for a point null: exactly one of `p`,
# `mu`, `med`, or `sigma` must be given. Proportions (`p`) get additional
# range/sum/success checks of their own.
sanitize_hypothesis_params_point <- function(dots, x, call = caller_env()) {
  if (length(dots) != 1) {
    cli_abort(
      "You must specify exactly one of `p`, `mu`, `med`, or `sigma`.",
      call = call
    )
  }

  if (!is.null(dots$p)) {
    dots$p <- sanitize_hypothesis_params_proportion(dots$p, x, call = call)
  }

  dots
}

# Validate hypothesized proportion(s) `p`: no missing values, all within
# [0, 1]. A single value requires a `success` level on `x` and is
# expanded into a named (success, failure) pair; multiple values must sum
# to one within tolerance.
sanitize_hypothesis_params_proportion <- function(p, x, call = caller_env()) {
  # tolerance for the sum-to-one check; looser without long doubles
  eps <- if (capabilities("long.double")) sqrt(.Machine$double.eps) else 0.01

  if (anyNA(p)) {
    cli_abort(
      '`p` should not contain missing values.',
      call = call
    )
  }

  if (any(p < 0 | p > 1)) {
    cli_abort(
      '`p` should only contain values between zero and one.',
      call = call
    )
  }

  if (length(p) != 1) {
    # multiple proportions: must form a probability distribution
    if (sum(p) < 1 - eps | sum(p) > 1 + eps) {
      cli_abort(
        "Make sure the hypothesized values for the `p` parameters sum to 1. \\
          Please try again.",
        call = call
      )
    }
    return(p)
  }

  # single proportion: expand into (success, failure) probabilities
  if (!has_attr(x, "success")) {
    cli_abort(
      "A point null regarding a proportion requires that `success` \\
        be indicated in `specify()`.",
      call = call
    )
  }

  p <- c(p, 1 - p)
  names(p) <- get_success_then_response_levels(x)

  p
}

# Warn (and return NULL) if parameter values were supplied alongside a
# paired independence null, where they have no meaning.
sanitize_hypothesis_params_paired_independence <- function(dots) {
  if (length(dots) != 0) {
    cli_warn(
      "Parameter values should not be specified when testing paired independence."
    )
  }

  NULL
}


================================================
FILE: R/infer.R
================================================
#' infer: a grammar for statistical inference
#'
#' The objective of this package is to perform statistical inference using a
#' grammar that illustrates the underlying concepts and a format that coheres
#' with the tidyverse.
#'
#' For an overview of how to use the core functionality, see `vignette("infer")`
#'
#'
#' @docType package
#' @name infer
"_PACKAGE"

#' @importFrom cli cli_abort cli_warn cli_inform qty no

## quiets concerns of R CMD check re: the .'s that appear in pipelines
## From Jenny Bryan's googlesheets package
# Registered at load time so R CMD check does not flag these
# non-standard-evaluation column/variable names as undefined globals.
if (getRversion() >= "2.15.1") {
  utils::globalVariables(
    c(
      "prop",
      "stat",
      "value",
      "x",
      "y",
      "..density..",
      "statistic",
      ".",
      "parameter",
      "p.value",
      "xmin",
      "x_min",
      "xmax",
      "x_max",
      "density",
      "denom",
      "diff_prop",
      "group_num",
      "n1",
      "n2",
      "num_suc",
      "p_hat",
      "total_suc",
      "explan",
      "probs",
      "conf.low",
      "conf.high",
      "prop_1",
      "prop_2",
      "data",
      "setNames",
      "resp",
      "capture.output",
      "stats",
      "estimate",
      "any_of",
      "model",
      "term",
      "where",
      "hypothesis"
    )
  )
}


================================================
FILE: R/observe.R
================================================
#' Calculate observed statistics
#'
#' @description
#'
#' This function is a wrapper that calls [specify()], [hypothesize()], and
#' [calculate()] consecutively that can be used to calculate observed
#' statistics from data. [hypothesize()] will only be called if a point
#' null hypothesis parameter is supplied.
#'
#' Learn more in `vignette("infer")`.
#'
#' @inheritParams specify
#' @inheritParams hypothesize
#' @inheritParams calculate
#'
#' @return A 1-column tibble containing the calculated statistic `stat`.
#'
#' @inheritSection calculate Arbitrary test statistics
#'
#' @examples
#' # calculating the observed mean number of hours worked per week
#' gss |>
#'   observe(hours ~ NULL, stat = "mean")
#'
#' # equivalently, calculating the same statistic with the core verbs
#' gss |>
#'   specify(response = hours) |>
#'   calculate(stat = "mean")
#'
#' # calculating a t statistic for hypothesized mu = 40 hours worked/week
#' gss |>
#'   observe(hours ~ NULL, stat = "t", null = "point", mu = 40)
#'
#' # equivalently, calculating the same statistic with the core verbs
#' gss |>
#'   specify(response = hours) |>
#'   hypothesize(null = "point", mu = 40) |>
#'   calculate(stat = "t")
#'
#' # similarly for a difference in means in age based on whether
#' # the respondent has a college degree
#' observe(
#'   gss,
#'   age ~ college,
#'   stat = "diff in means",
#'   order = c("degree", "no degree")
#' )
#'
#' # equivalently, calculating the same statistic with the core verbs
#' gss |>
#'   specify(age ~ college) |>
#'   calculate("diff in means", order = c("degree", "no degree"))
#'
#' # for a more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @family wrapper functions
#' @family functions for calculating observed statistics
#' @export
observe <- function(
  x,
  # specify arguments
  formula,
  response = NULL,
  explanatory = NULL,
  success = NULL,
  # hypothesize arguments
  null = NULL,
  p = NULL,
  mu = NULL,
  med = NULL,
  sigma = NULL,
  # calculate arguments
  stat = c(
    "mean",
    "median",
    "sum",
    "sd",
    "prop",
    "count",
    "diff in means",
    "diff in medians",
    "diff in props",
    "Chisq",
    "F",
    "slope",
    "correlation",
    "t",
    "z",
    "ratio of props",
    "odds ratio"
  ),
  order = NULL,
  ...
) {
  # Only route through hypothesize() when a point null parameter was
  # supplied (or needed to pass an informative message/warning);
  # otherwise pass the specified data straight through to calculate().
  point_params_given <-
    !all(vapply(list(p, mu, med, sigma), is.null, logical(1)))

  hypothesize_fn <- if (point_params_given) {
    hypothesize
  } else {
    function(x, ...) x
  }

  # pass arguments on to core verbs
  specified <- specify(
    x = x,
    formula = formula,
    response = {{ response }},
    explanatory = {{ explanatory }},
    success = success
  )

  specified |>
    hypothesize_fn(
      null = if (has_explanatory(specified)) "independence" else "point",
      p = p,
      mu = mu,
      med = med,
      sigma = sigma
    ) |>
    calculate(
      stat = stat,
      order = order,
      ...
    )
}


================================================
FILE: R/pipe.R
================================================
#' Pipe
#'
#' Like \{dplyr\}, \{infer\} also uses the pipe (\code{|>}) function
#' from \code{magrittr} to turn function composition into a series of
#' iterative statements.
#'
#' @param lhs,rhs Inference functions and the initial data frame.
#'
#' @importFrom magrittr %>%
#' @name %>%
#' @rdname pipe
#' @export
NULL


================================================
FILE: R/print_methods.R
================================================
#' Print methods
#'
#' @param x An object of class `infer`, i.e. output from [specify()] or
#'   [hypothesize()], or of class `infer_layer`, i.e. output from
#'   [shade_p_value()] or [shade_confidence_interval()].
#' @param ... Arguments passed to methods.
#' @importFrom glue glue_collapse glue
#'
#' @rdname print.infer
#' @export
print.infer <- function(x, ...) {
  # Build up to three header lines — response, explanatory, null
  # hypothesis — from the infer attributes, then fall through to the
  # underlying tibble print method.
  attr_names <- names(attributes(x))
  header <- character(3)

  if ("response" %in% attr_names) {
    header[1] <- glue(
      'Response: {response_name(x)} ({attr(x, "response_type")})',
      .null = "NULL"
    )
    if ("explanatory" %in% attr_names) {
      header[2] <- glue(
        'Explanatory: {paste0(paste0(explanatory_name(x), " (",
        attr(x, "explanatory_type"), ")"), collapse = ", ")}',
        .null = "NULL"
      )
    }
  }

  if ("null" %in% attr_names) {
    header[3] <- glue('Null Hypothesis: {attr(x, "null")}', .null = "NULL")
  }

  # Drop empty slots and wrap the header to the console width.
  populated <- header[nzchar(header)]
  cat(glue::glue_collapse(
    populated,
    width = cli::console_width(),
    sep = "\n"
  ))
  cat("\n")

  NextMethod()
}

#' @rdname print.infer
#' @export
print.infer_layer <- function(x, ...) {
  # Write the layer's character representation to the console, then
  # return `x` invisibly, as S3 print methods conventionally do (the
  # previous version returned `cat()`'s NULL).
  cat(x)
  invisible(x)
}

#' @rdname print.infer
#' @export
print.infer_dist <- function(x, ...) {
  # Write the distribution's character representation to the console,
  # then return `x` invisibly, as S3 print methods conventionally do (the
  # previous version returned `cat()`'s NULL).
  cat(x)
  invisible(x)
}


================================================
FILE: R/rep_sample_n.R
================================================
#' Perform repeated sampling
#'
#' @description
#'
#' These functions extend the functionality of [dplyr::sample_n()] and
#' [dplyr::slice_sample()] by allowing for repeated sampling of data.
#' This operation is especially helpful while creating sampling
#' distributions—see the examples below!
#'
#' @param tbl,.data Data frame of population from which to sample.
#' @param size,n,prop `size` and `n` refer to the sample size of each sample.
#' The `size` argument to `rep_sample_n()` is required, while in
#' `rep_slice_sample()` sample size defaults to 1 if not specified. `prop`, an
#' argument to `rep_slice_sample()`, refers to the proportion of rows to sample
#' in each sample, and is rounded down in the case that `prop * nrow(.data)` is
#' not an integer. When using `rep_slice_sample()`, please only supply one of
#' `n` or `prop`.
#' @param replace Should samples be taken with replacement?
#' @param reps Number of samples to take.
#' @param prob,weight_by A vector of sampling weights for each of the rows in
#' `.data`—must have length equal to `nrow(.data)`. For `weight_by`, this
#' may also be an unquoted column name in `.data`.
#'
#' @details
#'
#' `rep_sample_n()` and `rep_slice_sample()` are designed to behave similar to
#' their dplyr counterparts. As such, they have at least the following
#' differences:
#' - In case `replace = FALSE` having `size` bigger than number of data rows in
#' `rep_sample_n()` will give an error. In `rep_slice_sample()` having such `n`
#' or `prop > 1` will give warning and output sample size will be set to number
#' of rows in data.
#'
#' Note that the [dplyr::sample_n()] function has been superseded by
#' [dplyr::slice_sample()].
#'
#' @return A tibble of size `reps * n` rows corresponding to `reps`
#'   samples of size `n` from `.data`, grouped by `replicate`.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#' library(tibble)
#'
#' # take 1000 samples of size n = 50, without replacement
#' slices <- gss |>
#'   rep_slice_sample(n = 50, reps = 1000)
#'
#' slices
#'
#' # compute the proportion of respondents with a college
#' # degree in each replicate
#' p_hats <- slices |>
#'   group_by(replicate) |>
#'   summarize(prop_college = mean(college == "degree"))
#'
#' # plot sampling distribution
#' ggplot(p_hats, aes(x = prop_college)) +
#'   geom_density() +
#'   labs(
#'     x = "p_hat", y = "Number of samples",
#'     title = "Sampling distribution of p_hat"
#'   )
#'
#' # sampling with probability weights. Note probabilities are automatically
#' # renormalized to sum to 1
#' df <- tibble(
#'   id = 1:5,
#'   letter = factor(c("a", "b", "c", "d", "e"))
#' )
#'
#' rep_slice_sample(df, n = 2, reps = 5, weight_by = c(.5, .4, .3, .2, .1))
#'
#' # alternatively, pass an unquoted column name in `.data` as `weight_by`
#' df <- df |> mutate(wts = c(.5, .4, .3, .2, .1))
#'
#' rep_slice_sample(df, n = 2, reps = 5, weight_by = wts)
#' @export
rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) {
  # Take `reps` repeated samples of `size` rows from `tbl`, returning a
  # tibble grouped by `replicate`. Argument types are validated up front
  # so failures are informative.
  check_type(tbl, is.data.frame)
  check_type(size, is_single_number, "single non-negative number", min_val = 0)
  check_type(replace, is_truefalse, "TRUE or FALSE")
  check_type(
    reps,
    is_single_number,
    "single number not less than 1",
    min_val = 1
  )
  # `prob`, when supplied, must provide one sampling weight per row.
  check_type(
    prob,
    ~ is.numeric(.) && (length(.) == nrow(tbl)),
    glue::glue("numeric vector with length `nrow(tbl)` = {nrow(tbl)}"),
    allow_null = TRUE
  )

  # In `dplyr::sample_n()` `size` can't be more than number of rows in data
  # (when `replace = FALSE`), so this errors rather than silently clamping.
  notify_extra_size(size, tbl, replace, notify_type = "sample_n")

  make_replicate_tbl(
    tbl = tbl,
    size = size,
    replace = replace,
    prob = prob,
    reps = reps
  )
}

#' @rdname rep_sample_n
#' @export
rep_slice_sample <- function(
  .data,
  n = NULL,
  prop = NULL,
  replace = FALSE,
  weight_by = NULL,
  reps = 1
) {
  # Take `reps` repeated samples of rows from `.data`, sized either by
  # `n` (row count) or `prop` (proportion of rows), returning a tibble
  # grouped by `replicate`. Validate argument types first.
  check_type(.data, is.data.frame)
  check_type(
    n,
    is_single_number,
    "single non-negative number",
    allow_null = TRUE,
    min_val = 0
  )
  check_type(
    prop,
    is_single_number,
    "single non-negative number",
    allow_null = TRUE,
    min_val = 0
  )
  check_type(replace, is_truefalse, "TRUE or FALSE")
  # `weight_by` may be a numeric vector or an unquoted column name. Try
  # evaluating it directly first; if that errors, treat it as a column of
  # `.data` (tidy evaluation) and extract the weights from that column.
  eval_weight_by <- try(rlang::eval_tidy(weight_by), silent = TRUE)
  if (inherits(eval_weight_by, "try-error")) {
    weight_by <- rlang::enquo(weight_by)
    check_cols(.data, weight_by, "permute", FALSE, "weight_by")
    weight_by <- .data[[rlang::as_name(weight_by)]]
  }
  check_type(
    weight_by,
    ~ is.numeric(.) && (length(.) == nrow(.data)),
    glue::glue(
      "a numeric vector with length `nrow(.data)` = {nrow(.data)} \\
                 or an unquoted column name"
    ),
    allow_null = TRUE
  )
  check_type(
    reps,
    is_single_number,
    "single number not less than 1",
    min_val = 1
  )

  # Compute sample size based on `n` and `prop`
  size <- make_slice_size(n = n, prop = prop, n_total = nrow(.data))

  # In `dplyr::slice_sample()` asked sample size is allowed to be bigger than
  # number of rows in data. In that case (at least currently) sample size is
  # silently replaced to be number of rows. Here we give a warning.
  notify_extra_size(size, .data, replace, notify_type = "slice_sample")

  make_replicate_tbl(
    tbl = .data,
    size = size,
    replace = replace,
    prob = weight_by,
    reps = reps
  )
}

# Build the `reps * size`-row replicate tibble shared by `rep_sample_n()`
# and `rep_slice_sample()`: sample row indices, slice them out of `tbl`,
# prepend a `replicate` column, and group by it.
make_replicate_tbl <- function(tbl, size, replace, prob, reps) {
  n_rows <- nrow(tbl)

  # Draw row indices for all replicates up front. Without replacement each
  # replicate must be sampled separately (so rows aren't reused within a
  # replicate); with replacement everything can be drawn at once and
  # chopped into per-replicate pieces.
  if (replace) {
    all_idx <- sample_int(n_rows, size * reps, replace = TRUE, prob = prob)
    idx_list <- vctrs::vec_chop(all_idx, sizes = rep(size, reps))
  } else {
    idx_list <- replicate(
      reps,
      sample_int(n_rows, size, replace = FALSE, prob = prob),
      simplify = FALSE
    )
  }

  # The realized sample size can differ from `size` (currently when `size`
  # exceeds the number of rows inside `rep_slice_sample()`).
  sample_size <- length(idx_list[[1]])

  out <- vctrs::vec_slice(tbl, unlist(idx_list))
  out <- dplyr::bind_cols(
    tibble::new_tibble(list(
      replicate = rep(seq_len(reps), each = sample_size)
    )),
    out
  )
  out <- group_by_replicate(out, reps = reps, n = sample_size)

  copy_attrs(out, tbl)
}

# Notify the user when the requested sample size exceeds the number of
# rows while sampling without replacement. `rep_sample_n()` treats this
# as an error (matching `dplyr::sample_n()`), while `rep_slice_sample()`
# only warns (matching `dplyr::slice_sample()`, which silently clamps).
notify_extra_size <- function(
  size,
  tbl,
  replace,
  notify_type,
  call = caller_env()
) {
  # Oversized requests are only a problem without replacement
  too_big <- !replace && (size > nrow(tbl))

  if (too_big) {
    msg <- glue::glue(
      "Asked sample size ({size}) is bigger than ",
      "number of rows in data ({nrow(tbl)}) while `replace` is FALSE"
    )

    if (identical(notify_type, "sample_n")) {
      cli_abort("{msg}. Use `replace = TRUE`.", call = call)
    } else if (identical(notify_type, "slice_sample")) {
      cli_warn("{msg}. Using number of rows as sample size.")
    }
  }

  TRUE
}

# Modified code from https://github.com/tidyverse/dplyr/blob/master/R/slice.R
# (at commit 0f29aa4)
#
# Draw `size` row indexes from `seq_len(n)`. Without replacement, `size`
# is silently clamped to `n`; a zero-size request yields `integer(0)`.
sample_int <- function(n, size, replace = FALSE, prob = NULL) {
  # If `replace` is `FALSE`, allow `size` to be bigger than `n` by
  # silently replacing it with `n`
  draw_size <- if (replace) size else min(size, n)

  if (draw_size == 0L) {
    return(integer(0))
  }

  sample.int(n, draw_size, prob = prob, replace = replace)
}

# Reconcile the `n` and `prop` arguments of `rep_slice_sample()` into a
# single sample size. Exactly one of `n` and `prop` may be supplied;
# when neither is, the sample size defaults to one row.
#
# @param n Requested number of rows, or `NULL`.
# @param prop Requested proportion of `n_total`, or `NULL`.
# @param n_total Number of rows in the data.
# @param call Call displayed in error messages.
make_slice_size <- function(n, prop, n_total, call = caller_env()) {
  n_given <- !is.null(n)
  prop_given <- !is.null(prop)

  if (n_given && prop_given) {
    cli_abort(
      "Please supply exactly one of the `n` or `prop` arguments.",
      call = call
    )
  }

  if (n_given) {
    n
  } else if (prop_given) {
    # Truncates toward zero, matching integer coercion of `n_total * prop`
    as.integer(n_total * prop)
  } else {
    # By default return size 1
    1L
  }
}


================================================
FILE: R/set_params.R
================================================
#' To determine which theoretical distribution to fit (if any)
#'
#' @param x A data frame that can be coerced into a [tibble][tibble::tibble].
#'
#' @noRd
set_params <- function(x) {
  # Reset so a stale value from an earlier specification can't leak through
  attr(x, "theory_type") <- NULL

  if (has_response(x)) {
    num_response_levels <- length(unique(response_variable(x)))

    check_factor_levels(
      response_variable(x),
      "response",
      response_name(x)
    )
  }

  # With multiple explanatory variables there is no single theoretical
  # distribution to fit, so return early
  if (is_mlr(x)) {
    return(x)
  }

  if (has_explanatory(x)) {
    num_explanatory_levels <- length(unique(explanatory_variable(x)))

    check_factor_levels(
      explanatory_variable(x),
      "explanatory",
      explanatory_name(x)
    )
  }

  # One variable
  if (
    has_response(x) &&
      !has_explanatory(x) &&
      has_attr(x, "response_type") &&
      !has_attr(x, "explanatory_type")
  ) {
    # One mean
    if (attr(x, "response_type") %in% c("integer", "numeric")) {
      attr(x, "theory_type") <- "One sample t"
      attr(x, "distr_param") <- stats::t.test(
        response_variable(x)
      )[["parameter"]]
      attr(x, "type") <- "bootstrap"
    } else if (
      # One prop
      (attr(x, "response_type") == "factor") && (num_response_levels == 2)
    ) {
      # No parameters since standard normal
      attr(x, "theory_type") <- "One sample prop z"
      # Changed to `"draw"` when `p` provided in `hypothesize()`
      attr(x, "type") <- "bootstrap"
    } else {
      attr(x, "theory_type") <- "Chi-square Goodness of Fit"
      attr(x, "distr_param") <- num_response_levels - 1
      attr(x, "type") <- "draw"
    }
  }

  # Two variables
  # Scalar `&&` throughout (previously a stray vectorized `&` slipped in
  # here); these are all length-one conditions and `&&` short-circuits as
  # intended, consistent with the one-variable branch above
  if (
    has_response(x) &&
      has_explanatory(x) &&
      has_attr(x, "response_type") &&
      has_attr(x, "explanatory_type")
  ) {
    attr(x, "type") <- "bootstrap"

    # Response is numeric, explanatory is categorical
    if (
      (attr(x, "response_type") %in% c("integer", "numeric")) &&
        (attr(x, "explanatory_type") == "factor")
    ) {
      # Two sample means (t distribution)
      if (num_explanatory_levels == 2) {
        attr(x, "theory_type") <- "Two sample t"
        # Keep track of Satterthwaite degrees of freedom since lost when
        # in aggregation w/ calculate()/generate()
        attr(x, "distr_param") <- stats::t.test(
          response_variable(x) ~ explanatory_variable(x)
        )[["parameter"]]
      } else {
        # >2 sample means (F distribution)
        attr(x, "theory_type") <- "ANOVA"
        # Get numerator and denominator degrees of freedom
        degrees <- stats::anova(stats::aov(
          response_variable(x) ~ explanatory_variable(x)
        ))$Df
        attr(x, "distr_param") <- degrees[1]
        attr(x, "distr_param2") <- degrees[2]
      }
    }

    # Response is categorical, explanatory is categorical
    if (
      (attr(x, "response_type") == "factor") &&
        (attr(x, "explanatory_type") == "factor")
    ) {
      attr(x, "type") <- "bootstrap"

      # Two sample proportions (z distribution)
      # Parameter(s) not needed since standard normal
      if (
        (num_response_levels == 2) &&
          (num_explanatory_levels == 2)
      ) {
        attr(x, "theory_type") <- "Two sample props z"
      } else {
        # >2 sample proportions (chi-square test of indep)
        attr(x, "theory_type") <- "Chi-square test of indep"
        attr(x, "distr_param") <- suppressWarnings(
          stats::chisq.test(
            table(response_variable(x), explanatory_variable(x))
          )$parameter
        )
      }
    }

    # Response is numeric, explanatory is numeric
    if (
      (attr(x, "response_type") %in% c("integer", "numeric")) &&
        (attr(x, "explanatory_type") %in% c("integer", "numeric"))
    ) {
      # Slope/correlation inference uses a t distribution with n - 2
      # degrees of freedom. (Unused `response_string`/`explanatory_string`
      # locals were removed here.)
      attr(x, "theory_type") <- "Slope/correlation with t"
      attr(x, "distr_param") <- nrow(x) - 2
    }
  }

  x
}

# Inform the user when a factor variable carries levels that never occur
# in the data (those levels get dropped downstream). Non-factors are
# left alone; messages can be muted via `suppress_infer_messages()`.
check_factor_levels <- function(x, type, name) {
  # Only factors can carry unused levels
  if (!is.factor(x)) {
    return(invisible(NULL))
  }

  unused <- setdiff(levels(x), unique(x))

  if (length(unused) > 0 && !suppress_infer_messages()) {
    cli_inform(
      "Dropping unused factor levels {list(unused)} from the \\
         supplied {type} variable '{name}'."
    )
  }
}


================================================
FILE: R/shade_confidence_interval.R
================================================
#' Add information about confidence interval
#'
#' @description
#'
#' `shade_confidence_interval()` plots a confidence interval region on top of
#' [visualize()] output. The output is a ggplot2 layer that can be added with
#' `+`. The function has a shorter alias, `shade_ci()`.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param endpoints The lower and upper bounds of the interval to be plotted.
#'   Likely, this will be the output of [get_confidence_interval()].
#'   For [calculate()]-based workflows, this will be a 2-element vector
#'   or a `1 x 2` data frame containing the lower and upper values to be plotted.
#'   For [`fit()`][fit.infer()]-based workflows, a `(p + 1) x 3` data frame
#'   with columns `term`, `lower_ci`, and `upper_ci`, giving the upper and
#'   lower bounds for each regression term. For use in visualizations of
#'   [assume()] output, this must be the output of [get_confidence_interval()].
#' @param color A character or hex string specifying the color of the
#'   end points as vertical lines on the plot.
#' @param fill A character or hex string specifying the color to shade the
#'   confidence interval. If `NULL` then no shading is actually done.
#' @param ... Other arguments passed along to ggplot2 functions.
#'
#' @return If added to an existing infer visualization, a ggplot2
#'   object displaying the supplied intervals on top of its corresponding
#'   distribution. Otherwise, an `infer_layer` list.
#'
#' @examples
#' # find the point estimate---mean number of hours worked per week
#' point_estimate <- gss |>
#'   specify(response = hours) |>
#'   calculate(stat = "mean")
#'
#' # ...and a bootstrap distribution
#' boot_dist <- gss |>
#'   # ...we're interested in the number of hours worked per week
#'   specify(response = hours) |>
#'   # generating data points
#'   generate(reps = 1000, type = "bootstrap") |>
#'   # finding the distribution from the generated data
#'   calculate(stat = "mean")
#'
#' # find a confidence interval around the point estimate
#' ci <- boot_dist |>
#'   get_confidence_interval(point_estimate = point_estimate,
#'                           # at the 95% confidence level
#'                           level = .95,
#'                           # using the standard error method
#'                           type = "se")
#'
#'
#' # and plot it!
#' boot_dist |>
#'   visualize() +
#'   shade_confidence_interval(ci)
#'
#' # or just plot the bounds
#' boot_dist |>
#'   visualize() +
#'   shade_confidence_interval(ci, fill = NULL)
#'
#' # you can shade confidence intervals on top of
#' # theoretical distributions, too---the theoretical
#' # distribution will be recentered and rescaled to
#' # align with the confidence interval
#' sampling_dist <- gss |>
#'   specify(response = hours) |>
#'   assume(distribution = "t")
#'
#' visualize(sampling_dist) +
#'   shade_confidence_interval(ci)
#'
#' \donttest{
#' # to visualize distributions of coefficients for multiple
#' # explanatory variables, use a `fit()`-based workflow
#'
#' # fit 1000 linear models with the `hours` variable permuted
#' null_fits <- gss |>
#'  specify(hours ~ age + college) |>
#'  hypothesize(null = "independence") |>
#'  generate(reps = 1000, type = "permute") |>
#'  fit()
#'
#' null_fits
#'
#' # fit a linear model to the observed data
#' obs_fit <- gss |>
#'   specify(hours ~ age + college) |>
#'   fit()
#'
#' obs_fit
#'
#' # get confidence intervals for each term
#' conf_ints <-
#'   get_confidence_interval(
#'     null_fits,
#'     point_estimate = obs_fit,
#'     level = .95
#'   )
#'
#' # visualize distributions of coefficients
#' # generated under the null
#' visualize(null_fits)
#'
#' # add a confidence interval shading layer to juxtapose
#' # the null fits with the observed fit for each term
#' visualize(null_fits) +
#'   shade_confidence_interval(conf_ints)
#' }
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @name shade_confidence_interval
NULL

#' @rdname shade_confidence_interval
#' @family visualization functions
#' @export
shade_confidence_interval <- function(
  endpoints,
  color = "mediumaquamarine",
  fill = "turquoise",
  ...
) {
  # The actual shading logic lives in `shade_confidence_interval_term()`,
  # which is only invoked via `+.gg` -- so mistakenly piped (rather than
  # added) input has to be caught here
  check_for_piped_visualize(endpoints, color, fill)

  # `NULL` endpoints are stored as `NA` and translated back by
  # `shade_confidence_interval_term()`
  endpoints_attr <- if (is.null(endpoints)) NA else endpoints

  # Return a classed object whose attributes are unpacked later by the
  # corresponding `ggplot_add` method
  structure(
    "A confidence interval shading layer.",
    class = "infer_layer",
    fn = "shade_confidence_interval",
    endpoints = endpoints_attr,
    color = color,
    fill = list(fill),
    dots = list(...)
  )
}

# Workhorse for `shade_confidence_interval()`: applied to a single plot
# (one term) from within a `ggplot_add` method.
#
# @param plot A ggplot object produced by `visualize()`.
# @param endpoints Interval bounds (or `NA`, standing in for `NULL`).
# @param color Color of the vertical lines at the interval bounds.
# @param fill Fill color for the shaded region, or `NULL` to skip shading.
# @param dots List of extra arguments passed on to the ggplot2 geoms.
# @param call Call displayed in error messages.
shade_confidence_interval_term <- function(
  plot,
  endpoints,
  color = "mediumaquamarine",
  fill = "turquoise",
  dots,
  call = rlang::call2("shade_confidence_interval")
) {
  # `shade_confidence_interval()` stores `NULL` endpoints as `NA`;
  # translate back here
  if (all(is.na(endpoints))) {
    endpoints <- NULL
  }

  # argument checking
  endpoints <- impute_endpoints(endpoints, plot, call = call)
  check_shade_confidence_interval_args(color, fill)

  # Nothing to draw without endpoints
  if (is.null(endpoints)) {
    return(plot)
  }

  res <- list()

  if (!is.null(fill)) {
    # Making extra step of precomputing arguments in order to have default value
    # of `alpha = 0.6` overwritable in `...`
    rect_args <- c_dedupl(
      # Not overwritable arguments
      list(
        data = data.frame(endpoints[1]),
        mapping = aes(
          xmin = endpoints[1],
          xmax = endpoints[2],
          ymin = 0,
          ymax = Inf
        ),
        fill = fill,
        inherit.aes = FALSE
      ),
      # Extra arguments
      dots,
      # Default arguments that might be replaced in `...`
      list(alpha = 0.6)
    )
    rect_layer <- do.call(ggplot2::geom_rect, rect_args)

    res <- c(res, list(rect_layer))
  }

  # Vertical segments marking the interval bounds (drawn even when `fill`
  # is NULL)
  segment_args <- c_dedupl(
    list(
      data = data.frame(x = endpoints),
      mapping = aes(x = x, xend = x, y = 0, yend = Inf),
      color = color,
      inherit.aes = FALSE
    ),
    dots,
    list(linewidth = 2)
  )
  segment_layer <- do.call(ggplot2::geom_segment, segment_args)

  res <- c(res, list(segment_layer))

  # For theoretical distributions (`assume()` output), recenter and
  # rescale the curve to align with the interval, and retitle the plot
  if (inherits(plot[["plot_env"]][["data"]], "infer_dist")) {
    plot <-
      redraw_theory_layer(
        plot,
        mean_shift = attr(endpoints, "point_estimate"),
        sd_shift = attr(endpoints, "se")
      ) +
      ggplot2::labs(
        title = "Rescaled Theoretical Distribution",
        x = "stat"
      )
  }

  plot + res
}

# `shade_ci()` is a shorter alias for `shade_confidence_interval()`.
#' @rdname shade_confidence_interval
#' @export
shade_ci <- shade_confidence_interval


================================================
FILE: R/shade_p_value.R
================================================
#' Shade histogram area beyond an observed statistic
#'
#' @description
#'
#' `shade_p_value()` plots a p-value region on top of
#' [visualize()] output. The output is a ggplot2 layer that can be added with
#' `+`. The function has a shorter alias, `shade_pvalue()`.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param obs_stat The observed statistic or estimate. For
#'   [calculate()]-based workflows, this will be a 1-element numeric vector or
#'   a `1 x 1` data frame containing the observed statistic.
#'   For [`fit()`][fit.infer()]-based workflows, a `(p + 1) x 2` data frame
#'   with columns `term` and `estimate` giving the observed estimate for
#'   each term.
#' @param direction A string specifying in which direction the shading should
#'   occur. Options are `"less"`, `"greater"`, or `"two-sided"`. Can
#'   also give `"left"`, `"right"`, `"both"`, `"two_sided"`, `"two sided"`,
#'   or `"two.sided"`. If `NULL`, the function will not shade any area.
#' @param color A character or hex string specifying the color of the observed
#'   statistic as a vertical line on the plot.
#' @param fill A character or hex string specifying the color to shade the
#'   p-value region. If `NULL`, the function will not shade any area.
#' @param ... Other arguments passed along to ggplot2 functions.
#'   For expert use only.
#'
#' @return If added to an existing infer visualization, a ggplot2
#'   object displaying the supplied statistic on top of its corresponding
#'   distribution. Otherwise, an `infer_layer` list.
#'
#'
#' @examples
#' # find the point estimate---mean number of hours worked per week
#' point_estimate <- gss |>
#'   specify(response = hours) |>
#'   hypothesize(null = "point", mu = 40) |>
#'   calculate(stat = "t")
#'
#' # ...and a null distribution
#' null_dist <- gss |>
#'   # ...we're interested in the number of hours worked per week
#'   specify(response = hours) |>
#'   # hypothesizing that the mean is 40
#'   hypothesize(null = "point", mu = 40) |>
#'   # generating data points for a null distribution
#'   generate(reps = 1000, type = "bootstrap") |>
#'   # estimating the null distribution
#'   calculate(stat = "t")
#'
#' # shade the p-value of the point estimate
#' null_dist |>
#'   visualize() +
#'   shade_p_value(obs_stat = point_estimate, direction = "two-sided")
#'
#' # you can shade confidence intervals on top of
#' # theoretical distributions, too!
#' null_dist_theory <- gss |>
#'   specify(response = hours) |>
#'   assume(distribution = "t")
#'
#' null_dist_theory |>
#'   visualize() +
#'   shade_p_value(obs_stat = point_estimate, direction = "two-sided")
#'
#' \donttest{
#' # to visualize distributions of coefficients for multiple
#' # explanatory variables, use a `fit()`-based workflow
#'
#' # fit 1000 linear models with the `hours` variable permuted
#' null_fits <- gss |>
#'  specify(hours ~ age + college) |>
#'  hypothesize(null = "independence") |>
#'  generate(reps = 1000, type = "permute") |>
#'  fit()
#'
#' null_fits
#'
#' # fit a linear model to the observed data
#' obs_fit <- gss |>
#'   specify(hours ~ age + college) |>
#'   fit()
#'
#' obs_fit
#'
#' # visualize distributions of coefficients
#' # generated under the null
#' visualize(null_fits)
#'
#' # add a p-value shading layer to juxtapose the null
#' # fits with the observed fit for each term
#' visualize(null_fits) +
#'   shade_p_value(obs_fit, direction = "both")
#'
#' # the direction argument will be applied
#' # to the plot for each term
#' visualize(null_fits) +
#'   shade_p_value(obs_fit, direction = "left")
#' }
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @name shade_p_value
NULL

#' @rdname shade_p_value
#' @family visualization functions
#' @export
shade_p_value <- function(
  obs_stat,
  direction,
  color = "red2",
  fill = "pink",
  ...
) {
  # The actual shading logic lives in `shade_p_value_term()`, which is
  # only invoked via `+.gg` -- so mistakenly piped (rather than added)
  # input has to be caught here
  check_for_piped_visualize(obs_stat, direction, color, fill)

  # `NULL` inputs are stored as `NA` and translated back by
  # `shade_p_value_term()`
  stat_attr <- if (is.null(obs_stat)) NA else obs_stat
  direction_attr <- if (is.null(direction)) NA else direction

  # Return a classed object whose attributes are unpacked later by the
  # corresponding `ggplot_add` method
  structure(
    "A p-value shading layer.",
    class = "infer_layer",
    fn = "shade_p_value",
    obs_stat = stat_attr,
    direction = direction_attr,
    color = color,
    fill = list(fill),
    dots = list(...)
  )
}

# `shade_pvalue()` is a shorter alias for `shade_p_value()`.
#' @rdname shade_p_value
#' @export
shade_pvalue <- shade_p_value

# Workhorse for `shade_p_value()`: applied to a single plot (one term)
# from within a `ggplot_add` method.
#
# @param plot A ggplot object produced by `visualize()`.
# @param obs_stat Observed statistic (or `NA`, standing in for `NULL`).
# @param direction Shading direction (or `NA`, standing in for `NULL`).
# @param color Color of the vertical line at `obs_stat`.
# @param fill Fill color for the shaded region, or `NULL` to skip shading.
# @param dots List of extra arguments passed on to the ggplot2 geoms.
# @param call Call displayed in error messages.
shade_p_value_term <- function(
  plot,
  obs_stat,
  direction,
  color = "red2",
  fill = "pink",
  dots,
  call = rlang::call2("shade_p_value")
) {
  # `shade_p_value()` stores `NULL` inputs as `NA`; translate back here
  if (all(is.na(obs_stat))) {
    obs_stat <- NULL
  }

  if (all(is.na(direction))) {
    direction <- NULL
  }

  # argument checking
  obs_stat <- check_obs_stat(obs_stat, plot, call = call)
  check_shade_p_value_args(obs_stat, direction, color, fill, call = call)

  # NOTE(review): `term` appears unused in this function -- confirm
  # whether `x_axis_label()` is called only for its value here
  term <- x_axis_label(plot)

  # Nothing to draw without an observed statistic
  if (is.null(obs_stat)) {
    return(plot)
  }

  res <- list()

  # Add shading
  if (!is.null(direction) && !is.null(fill)) {
    if (direction %in% c("less", "left", "greater", "right")) {
      tail_area <- one_tail_area(obs_stat, direction)

      res <- c(res, do.call(geom_tail_area, c(list(tail_area, fill), dots)))
    } else if (
      direction %in%
        c("two_sided", "both", "two-sided", "two sided", "two.sided")
    ) {
      tail_area <- two_tail_area(obs_stat, direction)

      res <- c(res, do.call(geom_tail_area, c(list(tail_area, fill), dots)))
    } else {
      # Unrecognized direction: warn, skip shading, but still draw the
      # vertical line below
      cli_warn(
        '`direction` should be one of `"less"`, `"left"`, `"greater"`, \\
         `"right"`, `"two-sided"`, `"both"`, `"two_sided"`, `"two sided"`, \\
         or `"two.sided"`.'
      )
    }
  }

  # Add vertical line at `obs_stat`
  # Making extra step of precomputing arguments in order to have default value
  # of `linewidth = 2` overwritable in `...`
  segment_args <- c_dedupl(
    # Not overwritable arguments
    list(
      # Address length-1 aesthetics warning by providing geom-specific data (#528)
      data = data.frame(obs_stat = obs_stat),
      # Here `aes()` is needed to force {ggplot2} to include segment in the plot
      mapping = aes(x = obs_stat, xend = obs_stat, y = 0, yend = Inf),
      color = color,
      inherit.aes = FALSE
    ),
    # Extra arguments
    dots,
    # Default arguments that might be replaced in `...`
    list(linewidth = 2)
  )
  segment_layer <- do.call(ggplot2::geom_segment, segment_args)

  res <- c(res, list(segment_layer))

  plot + res
}


# Validate `shade_p_value()` arguments. `obs_stat` and `direction` are
# optional (may be `NULL`); `color` and `fill` must be color strings.
# Each failed `check_type()` is expected to signal an error at `call`.
check_shade_p_value_args <- function(
  obs_stat,
  direction,
  color,
  fill,
  call = caller_env()
) {
  if (!is.null(obs_stat)) {
    check_type(obs_stat, is.numeric, call = call)
  }
  if (!is.null(direction)) {
    check_type(direction, is.character, call = call)
  }
  check_type(color, is_color_string, "color string", call = call)
  check_type(fill, is_color_string, "color string", call = call)

  TRUE
}

# Wrap a tail-area data source in a `geom_area()` layer with the given
# fill. Returned as a one-element list so it can be concatenated onto
# the growing list of layers.
geom_tail_area <- function(tail_data, fill, ...) {
  # Precompute arguments so the default `alpha = 0.6` stays overridable
  # through `...` while the core arguments remain fixed
  fixed_args <- list(
    data = tail_data,
    mapping = aes(x = x, y = y, group = dir),
    fill = fill,
    show.legend = FALSE,
    inherit.aes = FALSE
  )
  area_args <- c_dedupl(fixed_args, list(...), list(alpha = 0.6))

  list(do.call(ggplot2::geom_area, area_args))
}

# Build a function (to be used lazily as a {ggplot2} layer `data`)
# computing grid points covering both tails of a two-sided p-value
# region.
two_tail_area <- function(obs_stat, direction) {
  # Take advantage of {ggplot2} functionality to accept function as `data`.
  # This is needed to make possible existence of `shade_p_value()` in case of
  # `direction = "both"`, as it depends on actual `data` but adding it as
  # argument to `shade_p_value()` is very bad.
  # Also needed to warn about incorrect usage of right tail tests.
  function(data) {
    warn_right_tail_test(direction, short_theory_type(data))

    # The second boundary mirrors `obs_stat`: negated for theoretical
    # curves, otherwise reflected through the simulated distribution's
    # quantiles (see `mirror_obs_stat()`)
    if (get_viz_method(data) == "theoretical") {
      second_border <- -obs_stat
    } else {
      second_border <- mirror_obs_stat(data$stat, obs_stat)
    }

    # Reuse the one-tail machinery for each side; warnings were already
    # issued above, so suppress them here
    left_area <- one_tail_area(
      min(obs_stat, second_border),
      "left",
      do_warn = FALSE
    )(data)
    right_area <- one_tail_area(
      max(obs_stat, second_border),
      "right",
      do_warn = FALSE
    )(data)

    ret <- dplyr::bind_rows(left_area, right_area)

    # jitter one of the x coords that the right and left area have in common
    # so that their heights aren't summed
    common_x <- which.max(ret$x[ret$dir == "left"])

    ret$x[common_x] <- ret$x[common_x] - 1e-5 * ret$x[common_x]

    ret
  }
}

# Build a function (to be used lazily as a {ggplot2} layer `data`)
# computing grid points of the single tail area beyond `obs_stat`.
#
# @param obs_stat Observed statistic delimiting the tail.
# @param direction Direction string (normalized via `norm_direction()`).
# @param do_warn Whether to warn about shading left/two-sided areas for
#   right-tailed statistics.
one_tail_area <- function(obs_stat, direction, do_warn = TRUE) {
  # Take advantage of {ggplot2} functionality to accept function as `data`.
  function(data) {
    warn_right_tail_test(direction, short_theory_type(data), do_warn)

    norm_dir <- norm_direction(direction)
    viz_method <- get_viz_method(data)

    # Compute grid points for upper bound of shading area; histograms use
    # bar heights ("ymax" for counts, "density" when overlaid on theory)
    switch(
      viz_method,
      theoretical = theor_area(data, obs_stat, norm_dir),
      simulation = hist_area(data, obs_stat, norm_dir, yval = "ymax"),
      both = hist_area(data, obs_stat, norm_dir, yval = "density")
    )
  }
}

# Compute grid points outlining the area under a theoretical density
# curve beyond `obs_stat` in the given direction.
#
# @param data Plot data (used to rebuild the theoretical layer).
# @param obs_stat Observed statistic delimiting the shaded tail.
# @param direction Either "left" or "right" (already normalized).
# @param n_grid Number of grid points used to discretize the curve.
#
# @return A tibble with columns `x`, `y`, and `dir`.
theor_area <- function(data, obs_stat, direction, n_grid = 1001) {
  plot_data <- create_plot_data(data)

  # Build a throwaway plot so {ggplot2} computes the curve's (x, y)
  # coordinates for us
  g <- ggplot(plot_data) + theoretical_layer(data, "black", do_warn = FALSE)
  g_data <- ggplot2::ggplot_build(g)[["data"]][[1]]

  # Linear interpolation of the built curve; zero outside its support
  curve_fun <- stats::approxfun(
    x = g_data[["x"]],
    y = g_data[["y"]],
    yleft = 0,
    yright = 0
  )

  # Compute "x" grid of curve, area under which will be shaded.
  x_grid <- switch(
    # `direction` can be one of "left" or "right" at this point of execution
    direction,
    left = seq(from = min(g_data[["x"]]), to = obs_stat, length.out = n_grid),
    right = seq(from = obs_stat, to = max(g_data[["x"]]), length.out = n_grid)
  )

  tibble::tibble(x = x_grid, y = curve_fun(x_grid), dir = direction)
}

# Compute grid points outlining the histogram area beyond `obs_stat` in
# the given direction.
#
# @param data Plot data for the simulated distribution.
# @param obs_stat Observed statistic delimiting the shaded tail.
# @param direction Either "left" or "right" (already normalized).
# @param yval Name of the built-layer column holding bar heights
#   ("ymax" or "density"; see `one_tail_area()`).
#
# @return A tibble with columns `x`, `y`, and `dir`.
hist_area <- function(data, obs_stat, direction, yval) {
  # Build a throwaway plot so {ggplot2} computes the histogram bars
  g <- ggplot(data) + simulation_layer(data)
  g_data <- ggplot2::ggplot_build(g)[["data"]][[1]]

  # Compute knots for step function representing histogram bars and space
  # between them.
  # "x" coordinates are computed from `x_left` and `x_right`: "x" coordinates
  # of "shrunken" (to avoid duplicate points later) histogram bars.
  x_left <- (1 - 1e-5) * g_data[["xmin"]] + 1e-5 * g_data[["xmax"]]
  x_right <- 1e-5 * g_data[["xmin"]] + (1 - 1e-5) * g_data[["xmax"]]
  # `x` is created as `c(x_left[1], x_right[1], x_left[2], ...)`
  x <- c(t(cbind(x_left, x_right)))

  # "y" coordinates represent values of future `stepfun(..., right = FALSE)`
  # outputs between `x` knots. That is:
  # y[1] is value inside [-Inf, x_left[1]) (zero),
  # y[2] - value inside [x_left[1], x_right[1]) (height of first histogram bar),
  # y[3] - value inside [x_right[1], x_left[2]) (zero), and so on.
  y <- c(0, t(cbind(g_data[[yval]], 0)))

  # Output step function should evaluate to histogram bar heights on both
  # corresponding ends, i.e. `curve_fun(c(x_left[1], x_right[1]))` should return
  # vector of length two with heights of first histogram bar. `stepfun()` treats
  # input `x` as consecutive semi-open intervals. To achieve the effect of
  # closed intervals, the `pmax()` trick is used.
  curve_fun <- function(t) {
    pmax(
      stats::stepfun(x, y, right = FALSE)(t),
      stats::stepfun(x, y, right = TRUE)(t)
    )
  }

  # "True" left and right "x" coordinates of histogram bars are added to achieve
  # "almost vertical" lines with `geom_area()` usage. If we don't do this, then
  # area might be shaded under line segments connecting edges of consecutive
  # histogram bars.
  x_extra <- switch(
    direction,
    left = g_data[["xmax"]],
    right = g_data[["xmin"]]
  )
  x_extra <- sort(c(x, x_extra))
  x_grid <- switch(
    # `direction` can be one of "left" or "right" at this point of execution
    direction,
    left = c(x_extra[x_extra < obs_stat], obs_stat),
    right = c(obs_stat, x_extra[x_extra > obs_stat])
  )

  # if area will have area 0, return 0-length tibble to trigger
  # `ggplot:::empty()` edge case (#528)
  if (length(x_grid) == 1) {
    return(tibble::tibble(x = numeric(0), y = numeric(0), dir = character(0)))
  }

  tibble::tibble(x = x_grid, y = curve_fun(x_grid), dir = direction)
}

# Canonicalize the accepted direction aliases to one of "left", "right",
# or "both". Unrecognized input yields NULL (callers warn separately).
norm_direction <- function(direction) {
  if (direction %in% c("less", "left")) {
    "left"
  } else if (direction %in% c("greater", "right")) {
    "right"
  } else if (
    direction %in% c("two_sided", "two-sided", "two sided", "two.sided", "both")
  ) {
    "both"
  }
}

# Warn when a statistic that usually corresponds to right-tailed tests
# (F, Chi-Square) is shaded in any other direction. Always returns TRUE.
warn_right_tail_test <- function(direction, stat_name, do_warn = TRUE) {
  right_tail_stat <- stat_name %in% c("F", "Chi-Square")
  has_direction <- !is.null(direction)

  if (do_warn && has_direction && right_tail_stat) {
    if (!(direction %in% c("greater", "right"))) {
      cli_warn(
        "{stat_name} usually corresponds to right-tailed tests. \\
       Proceed with caution."
      )
    }
  }

  TRUE
}

# Reflect `observation` across the empirical distribution of `vector`:
# find its percentile via the ECDF, then return the value at the
# complementary percentile.
mirror_obs_stat <- function(vector, observation) {
  percentile <- stats::ecdf(vector)(observation)

  stats::quantile(vector, probs = 1 - percentile)
}


================================================
FILE: R/specify.R
================================================
#' Specify response and explanatory variables
#'
#' @description
#'
#' `specify()` is used to specify which columns in the supplied data frame are
#' the relevant response (and, if applicable, explanatory) variables. Note that
#' character variables are converted to `factor`s.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param x A data frame that can be coerced into a [tibble][tibble::tibble].
#' @param formula A formula with the response variable on the left and the
#'   explanatory on the right. Alternatively, a `response` and `explanatory`
#'   argument can be supplied.
#' @param response The variable name in `x` that will serve as the response.
#'   This is an alternative to using the `formula` argument.
#' @param explanatory The variable name in `x` that will serve as the
#'   explanatory variable. This is an alternative to using the formula argument.
#' @param success The level of `response` that will be considered a success, as
#'   a string. Needed for inference on one proportion, a difference in
#'   proportions, and corresponding z stats.
#'
#' @return A tibble containing the response (and explanatory, if specified)
#'   variable data.
#'
#' @examples
#' # specifying for a point estimate on one variable
#' gss |>
#'    specify(response = age)
#'
#' # specify a relationship between variables as a formula...
#' gss |>
#'   specify(age ~ partyid)
#'
#' # ...or with named arguments!
#' gss |>
#'   specify(response = age, explanatory = partyid)
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @importFrom rlang f_lhs f_rhs get_expr caller_env
#' @importFrom dplyr select any_of across
#' @importFrom methods hasArg
#' @family core functions
#' @export
specify <- function(
  x,
  formula,
  response = NULL,
  explanatory = NULL,
  success = NULL
) {
  check_type(x, is.data.frame)

  # Standardize variable types (per the docs above, character variables
  # are converted to factors; see `standardize_variable_types()`)
  x <- standardize_variable_types(x)

  # Capture `response`/`explanatory` as quosures so bare column names can
  # be resolved later without being evaluated here
  response <- enquo(response)
  explanatory <- enquo(explanatory)

  # Resolve `formula`/`response`/`explanatory` into attributes on `x`,
  # validate them, and set theoretical-distribution parameters
  x <- parse_variables(x, formula, response, explanatory)

  # Add attributes
  attr(x, "success") <- success
  attr(x, "generated") <- FALSE
  attr(x, "hypothesized") <- FALSE
  attr(x, "fitted") <- FALSE

  # Check the success argument (must run after `parse_variables()` since
  # it reads the response/explanatory attributes)
  check_success_arg(x, success)

  # Keep only the specified response/explanatory columns
  x <- x |>
    select(any_of(c(response_name(x), explanatory_name(x))))

  # Drop incomplete rows, warning about how many were removed
  is_complete <- stats::complete.cases(x)
  if (!all(is_complete)) {
    x <- dplyr::filter(x, is_complete)
    cli_warn("Removed {sum(!is_complete)} rows containing missing values.")
  }

  # Add "infer" class
  append_infer_class(x)
}

# Resolve the `formula` / `response` / `explanatory` inputs of
# `specify()` into the `response`, `explanatory`, and `formula`
# attributes of `x`, validate them, and set theoretical-distribution
# parameters via `set_params()`.
#
# @param x Data frame being `specify()`-ed.
# @param formula Optional formula (`response ~ explanatory`).
# @param response,explanatory Quosures captured by `specify()`.
# @param call Call displayed in error messages.
parse_variables <- function(
  x,
  formula,
  response,
  explanatory,
  call = caller_env()
) {
  if (methods::hasArg(formula)) {
    # Forcing the `formula` promise errors if the user passed a
    # nonexistent object (e.g. an unquoted column name) -- catch that
    # case and give a tailored message
    tryCatch(
      rlang::is_formula(formula),
      error = function(e) {
        cli_abort(
          c(
            "The argument you passed in for the formula does not exist.",
            i = "Were you trying to pass in an unquoted column name?",
            i = "Did you forget to name one or more arguments?"
          ),
          call = call
        )
      }
    )
    if (!rlang::is_formula(formula)) {
      cli_abort(
        c(
          "The first unnamed argument must be a formula.",
          i = "You passed in '{get_type(formula)}'.",
          x = "Did you forget to name one or more arguments?"
        ),
        call = call
      )
    }
  }

  # Start from the named-argument interface...
  attr(x, "response") <- get_expr(response)
  attr(x, "explanatory") <- get_expr(explanatory)
  attr(x, "formula") <- NULL

  # ...but a supplied formula takes precedence over named arguments
  if (methods::hasArg(formula)) {
    attr(x, "response") <- f_lhs(formula)
    attr(x, "explanatory") <- f_rhs(formula)
    attr(x, "formula") <- formula
  }

  # Check response and explanatory variables to be appropriate for later use
  if (!has_response(x)) {
    cli_abort(
      "Please supply a response variable that is not `NULL`.",
      call = call
    )
  }

  check_var_correct(x, "response", call = call)
  check_var_correct(x, "explanatory", call = call)

  # If there's an explanatory var, it must differ from the response
  check_vars_different(x, call = call)

  if (!has_attr(x, "response")) {
    attr(x, "response_type") <- NULL
  } else {
    attr(x, "response_type") <- class(response_variable(x))
  }

  if (!has_attr(x, "explanatory")) {
    attr(x, "explanatory_type") <- NULL
  } else {
    # One class per explanatory column (a formula may name several)
    attr(x, "explanatory_type") <-
      purrr::map_chr(as.data.frame(explanatory_variable(x)), class)
  }

  attr(x, "type_desc_response") <- determine_variable_type(x, "response")
  attr(x, "type_desc_explanatory") <- determine_variable_type(x, "explanatory")

  # Determine params for theoretical fit
  x <- set_params(x)

  x
}

# Validate the `success` argument supplied to `specify()`.
#
# When given, `success` must be a string naming a level of a two-level
# factor response; when the response is a two-level factor (with a
# two-level or absent explanatory), `success` is required.
#
# @param x Data frame with infer attributes already set.
# @param success Level of the response considered a success, or `NULL`.
# @param call Call displayed in error messages.
check_success_arg <- function(x, success, call = caller_env()) {
  response_col <- response_variable(x)

  if (!is.null(success)) {
    if (!is.character(success)) {
      cli_abort("`success` must be a string.", call = call)
    }
    if (!is.factor(response_col)) {
      cli_abort(
        "`success` should only be specified if the response is a categorical \\
         variable.",
        call = call
      )
    }
    if (!(success %in% levels(response_col))) {
      cli_abort(
        '{success} is not a valid level of {response_name(x)}.',
        call = call
      )
    }
    # Count only levels that actually occur in the data
    if (sum(table(response_col) > 0) > 2) {
      cli_abort(
        "`success` can only be used if the response has two levels. \\
         `filter()` can reduce a variable to two levels.",
        call = call
      )
    }
  }

  # A two-level factor response requires `success` so infer knows which
  # level to count -- unless the explanatory variable has more than two
  # levels
  if (
    (attr(x, "response_type") == "factor" &&
      is.null(success) &&
      length(levels(response_variable(x))) == 2) &&
      ((!has_attr(x, "explanatory_type") ||
        length(levels(explanatory_variable(x))) == 2))
  ) {
    cli_abort(
      'A level of the response variable `{response_name(x)}` needs to be \\
        specified for the `success` argument in `specify()`.',
      call = call
    )
  }
}

# Ensure the attribute `var_name` of `x` (if present) is a bare, symbolic
# expression referencing only columns that exist in the data.
check_var_correct <- function(x, var_name, call = caller_env()) {
  var <- attr(x, var_name)

  # Nothing to validate when the variable wasn't supplied
  if (is.null(var)) {
    return(TRUE)
  }

  # Must be a symbol or call, not e.g. a quoted string
  if (!rlang::is_symbolic(var)) {
    cli_abort(
      "The {var_name} should be a bare variable name (not a string in \\
       quotation marks).",
      call = call
    )
  }

  # Every referenced column must exist in the data
  if (!all(all.vars(var) %in% names(x))) {
    cli_abort(
      'The {var_name} variable `{var}` cannot be found in this dataframe.',
      call = call
    )
  }

  TRUE
}

# Error if the response and explanatory variables name the same column.
check_vars_different <- function(x, call = caller_env()) {
  both_present <- has_response(x) && has_explanatory(x)

  if (both_present && identical(response_name(x), explanatory_name(x))) {
    cli_abort(
      "The response and explanatory variables must be different from one \\
       another.",
      call = call
    )
  }

  TRUE
}


================================================
FILE: R/utils.R
================================================
# Miscellaneous Helpers -----------------------------------------------
# TRUE when the SUPPRESS_INFER_MESSAGES environment variable is set to
# exactly "true" (case-sensitive), allowing users to silence infer messages.
suppress_infer_messages <- function() {
  setting <- Sys.getenv("SUPPRESS_INFER_MESSAGES")
  identical(setting, "true")
}

# Prepend the "infer" class to `x` unless it is already the frontmost class.
append_infer_class <- function(x) {
  current_classes <- class(x)

  if (!identical(current_classes[1], "infer")) {
    class(x) <- c("infer", current_classes)
  }

  x
}

# Reorder the hypothesized parameter values so they align with the order in
# which the response's levels first appear in the data.
format_params <- function(x) {
  observed_levels <- as.character(unique(response_variable(x)))
  params <- attr(x, "params")
  params[match(observed_levels, get_par_levels(x))]
}

# Build the parameter suffix for the null-hypothesis printout: one parameter
# prints as `name = value`, two as the shorthand `p = .5`, and any other
# count as the full vector of level probabilities.
print_params <- function(x) {
  params <- attr(x, "params")
  n_params <- length(params)

  if (n_params == 1) {
    glue(": `{names(params)} = {unname(params)}`", .null = "NULL")
  } else if (n_params == 2) {
    glue(": `p = .5`", .null = "NULL")
  } else {
    glue("s: `p = c({put_params(x, params)})`", .null = "NULL")
  }
}

# Collapse parameter levels and values into a "level = value, ..." string.
put_params <- function(x, params) {
  paste(get_par_levels(x), params, sep = " = ", collapse = ", ")
}

# Recover level names from parameter names like "p.<level>" by stripping
# the single-character prefix and dot (the pattern is anchored, so `sub`
# suffices).
get_par_levels <- function(x) {
  param_names <- names(attr(x, "params"))
  sub("^.\\.", "", param_names)
}

# Copy infer's bookkeeping attributes from `from` onto `to`, returning the
# updated `to`. Attributes absent on `from` are set to NULL (i.e. removed).
copy_attrs <- function(
  to,
  from,
  attrs = c(
    "response",
    "success",
    "explanatory",
    "response_type",
    "explanatory_type",
    "distr_param",
    "distr_param2",
    "null",
    "params",
    "theory_type",
    "generated",
    "type",
    "hypothesized",
    "formula",
    "fitted",
    "type_desc_response",
    "type_desc_explanatory"
  )
) {
  for (attribute in attrs) {
    attr(to, attribute) <- attr(from, attribute)
  }

  to
}

# Concatenate inputs with `c(...)`, then drop later entries that repeat an
# earlier name; unnamed ("") entries are always kept.
c_dedupl <- function(...) {
  combined <- c(...)
  combined_names <- names(combined)

  if (is.null(combined_names)) {
    return(combined)
  }

  keep <- !duplicated(combined_names) | combined_names == ""
  combined[keep]
}

# Re-level the (single) explanatory variable so its levels follow `order`.
reorder_explanatory <- function(x, order) {
  ex_name <- explanatory_name(x)
  x[[ex_name]] <- factor(explanatory_variable(x), levels = order[1:2])
  x
}

# Coerce columns to the types infer works with. The three conversions touch
# disjoint column sets, so a single `mutate()` call is equivalent to
# applying them sequentially.
standardize_variable_types <- function(x) {
  tibble::as_tibble(x) |>
    dplyr::mutate(
      # character and ordered to factor
      dplyr::across(
        where(~ is.character(.x) || is.ordered(.x)),
        ~ factor(.x, ordered = FALSE)
      ),
      # logical to factor, with TRUE as the first level
      dplyr::across(
        where(~ is.logical(.x)),
        ~ factor(.x, levels = c("TRUE", "FALSE"))
      ),
      # integer to numeric
      dplyr::across(
        where(is.integer),
        as.numeric
      )
    )
}

# Performant grouping ----------------------------------------------------------
# Build a `replicate`-grouped data frame without the overhead of
# `dplyr::group_by()`. Rows of `tbl` are assumed to arrive as `reps`
# consecutive blocks of `n` rows, one block per replicate (see
# `make_replicate_groups()`).
group_by_replicate <- function(tbl, reps, n) {
  # `new_grouped_df()` trusts the supplied `groups` data frame, skipping the
  # validation and regrouping work `group_by()` would perform.
  dplyr::new_grouped_df(
    tbl,
    groups = make_replicate_groups(tbl, reps = reps, n = n)
  )
}


# Construct the `groups` data frame expected by `dplyr::new_grouped_df()`:
# one row per replicate, with `.rows` holding the integer row indices that
# belong to that replicate.
# NOTE(review): `tbl` is currently unused here; group sizes come entirely
# from `reps` and `n`.
make_replicate_groups <- function(tbl, reps, n) {
  res <-
    tibble::new_tibble(list(
      replicate = 1:reps,
      # chop 1..(n * reps) into `reps` consecutive runs of `n` indices each
      .rows = vctrs::as_list_of(
        vctrs::vec_chop(seq_len(n * reps), sizes = rep(n, reps)),
        .ptype = integer()
      )
    ))

  # dplyr's grouped-df contract requires a `.drop` attribute
  attr(res, ".drop") <- TRUE

  res
}

# Getters, setters, and indicators ------------------------------------------
# Return the quoted explanatory expression, or NULL if none was specified.
# `exact = TRUE` guards against `attr()`'s partial matching: "explanatory"
# is a prefix of the "explanatory_type" attribute, which could otherwise be
# returned when "explanatory" itself is absent. This matches the exact
# lookup already used by `has_attr()`.
explanatory_expr <- function(x) {
  attr(x, "explanatory", exact = TRUE)
}

# All column names referenced by the explanatory expression (possibly
# several, for multiple-regression specifications).
explanatory_name <- function(x) {
  ex_expr <- explanatory_expr(x)
  all.vars(ex_expr)
}

# if there is more than one explanatory variable, return a data frame.
# if there's one, return a vector. otherwise, return NULL.
explanatory_variable <- function(x) {
  if (is.null(explanatory_expr(x))) {
    return(NULL)
  }

  ex_names <- explanatory_name(x)

  if (length(ex_names) > 1) {
    x[ex_names]
  } else {
    x[[ex_names]]
  }
}

# Return the quoted response expression. `exact = TRUE` prevents `attr()`'s
# partial matching from falling back to the "response_type" attribute (for
# which "response" is a prefix) when "response" itself is absent; this
# matches the exact lookup already used by `has_attr()`.
response_expr <- function(x) {
  attr(x, "response", exact = TRUE)
}

# The response column name as a string.
response_name <- function(x) {
  resp_expr <- response_expr(x)
  as.character(resp_expr)
}

# Extract the response column from the data as a vector.
response_variable <- function(x) {
  resp_name <- response_name(x)
  x[[resp_name]]
}

# Retrieve the theoretical-distribution label attached to `x` (e.g.
# "Two sample t", as matched in `check_order()`); NULL when unset.
theory_type <- function(x) {
  attr(x, "theory_type")
}

# Unique response values, in order of first appearance, as character.
get_response_levels <- function(x) {
  unique_vals <- unique(response_variable(x))
  as.character(unique_vals)
}

# Response levels ordered so that the declared `success` level (if any)
# comes first, followed by the remaining levels.
get_success_then_response_levels <- function(x) {
  success_level <- attr(x, "success")
  other_levels <- setdiff(get_response_levels(x), success_level)
  c(success_level, other_levels)
}

# TRUE when the "generated" flag has been set on `x` (i.e. resamples or
# permutations have been produced).
is_generated <- function(x) {
  generated <- attr(x, "generated")
  isTRUE(generated)
}

# TRUE when a null hypothesis has been declared on `x`.
is_hypothesized <- function(x) {
  hypothesized <- attr(x, "hypothesized")
  isTRUE(hypothesized)
}

# TRUE when `x` carries the "fitted" flag, i.e. is the output of `fit()`.
is_fitted <- function(x) {
  fitted <- attr(x, "fitted")
  isTRUE(fitted)
}

# TRUE when the specification has multiple explanatory variables
# (multiple linear regression).
is_mlr <- function(x) {
  n_explanatory <- length(explanatory_name(x))
  n_explanatory > 1
}

# TRUE when attribute `at` is present on `x` (exact name match only; no
# partial matching).
has_attr <- function(x, at) {
  attribute_val <- attr(x, at, exact = TRUE)
  !is.null(attribute_val)
}

# TRUE when an explanatory variable has been specified on `x`.
has_explanatory <- function(x) {
  has_attr(x, "explanatory")
}

# TRUE when a response variable has been specified on `x`.
has_response <- function(x) {
  has_attr(x, "response")
}

# TRUE when `x` is a single string naming a color grDevices recognizes.
is_color_string <- function(x) {
  if (!rlang::is_string(x)) {
    return(FALSE)
  }

  # `col2rgb()` errors on invalid color names; treat that as "not a color"
  tryCatch(
    is.matrix(grDevices::col2rgb(x)),
    error = function(e) FALSE
  )
}

# TRUE when `x` is a single finite number inside [min_val, max_val]; the
# `include_*` flags toggle whether each endpoint is allowed (closed vs.
# open comparison).
is_single_number <- function(
  x,
  min_val = -Inf,
  max_val = Inf,
  include_min_val = TRUE,
  include_max_val = TRUE
) {
  if (!is.numeric(x) || length(x) != 1 || !is.finite(x)) {
    return(FALSE)
  }

  above_min <- if (include_min_val) x >= min_val else x > min_val
  below_max <- if (include_max_val) x <= max_val else x < max_val

  above_min && below_max
}

# TRUE only for a bare logical scalar TRUE or FALSE — not NA, not a vector,
# and not a named logical (hence `identical()` rather than `isTRUE()`).
is_truefalse <- function(x) {
  for (candidate in list(TRUE, FALSE)) {
    if (identical(x, candidate)) {
      return(TRUE)
    }
  }
  FALSE
}

# Helpers for test statistics --------------------------------------

# Simplify and standardize checks by grouping statistics based on variable types
# num = numeric, bin = binary (dichotomous), mult = multinomial
stat_types <- tibble::tribble(
  ~resp,  ~exp,   ~stats,
  "num",  "",     c("mean", "median", "sum", "sd", "t"),
  "num",  "num",  c("slope", "correlation"),
  "num",  "bin",  c("diff in means", "diff in medians", "t", "ratio of means"),
  "num",  "mult", c("F"),
  "bin",  "",     c("prop", "count", "z"),
  "bin",  "bin",  c("diff in props", "z", "ratio of props", "odds ratio", "Chisq"),
  "bin",  "mult", c("Chisq"),
  "mult", "bin",  c("Chisq"),
  "mult", "",     c("Chisq"),
  "mult", "mult", c("Chisq"),
)

# Human-readable descriptions for each variable type code.
stat_type_desc <- tibble::tribble(
  ~type,  ~description,
  "num",  "numeric",
  "bin",  "dichotomous categorical",
  "mult", "multinomial categorical"
)

# Look up the human-readable description for a variable type code.
get_stat_type_desc <- function(stat_type) {
  matches <- stat_type_desc$type == stat_type
  stat_type_desc$description[matches]
}

# Article-prefixed descriptions of each statistic, used in messages.
stat_desc <- tibble::tribble(
  ~stat,             ~description,
  "mean",            "A mean",
  "median",          "A median",
  "sum",             "A sum",
  "sd",              "A standard deviation",
  "prop",            "A proportion",
  "count",           "A count",
  "diff in means",   "A difference in means",
  "diff in medians", "A difference in medians",
  "diff in props",   "A difference in proportions",
  "Chisq",           "A chi-square statistic",
  "F",               "An F statistic",
  "slope",           "A slope",
  "correlation",     "A correlation",
  "t",               "A t statistic",
  "z",               "A z statistic",
  "ratio of props",  "A ratio of proportions",
  "ratio of means",  "A ratio of means",
  "odds ratio",      "An odds ratio"
)

# The null hypotheses under which each statistic can be calculated; some
# statistics appear under several nulls.
stat_hypotheses <- tibble::tribble(
  ~stat,             ~hypothesis,
  "mean",            "point",
  "median",          "point",
  "sum",             "point",
  "sd",              "point",
  "prop",            "point",
  "count",           "point",
  "mean",            "paired independence",
  "median",          "paired independence",
  "sum",             "paired independence",
  "sd",              "paired independence",
  "diff in means",   "independence",
  "diff in medians", "independence",
  "diff in props",   "independence",
  "Chisq",           "independence",
  "Chisq",           "point",
  "F",               "independence",
  "slope",           "independence",
  "correlation",     "independence",
  "t",               "independence",
  "t",               "point",
  "z",               "independence",
  "z",               "point",
  "ratio of props",  "independence",
  "ratio of means",  "independence",
  "odds ratio",      "independence"
)

# Look up the description for a given statistic name.
get_stat_desc <- function(stat) {
  matching <- stat_desc$stat == stat
  stat_desc$description[matching]
}

# Values of `stat` argument of `calculate()`
implemented_stats <- c(
  "mean", "median", "sum", "sd",
  "prop", "count",
  "diff in means", "diff in medians", "diff in props",
  "Chisq", "F", "slope", "correlation", "t", "z",
  "ratio of props", "ratio of means", "odds ratio"
)

implemented_stats_aliases <- tibble::tribble(
  ~alias,  ~target,
  # Allow case insensitive stat names
  "f",     "F",
  "chisq", "Chisq"
)

# Statistics with no theoretical distribution implemented — everything in
# `implemented_stats` except the four theorized ones.
untheorized_stats <- setdiff(implemented_stats, c("Chisq", "F", "t", "z"))

# Given a statistic and theory type, assume a reasonable null: equal
# probabilities across all response levels, named "p.<level>".
p_null <- function(x) {
  lvls <- levels(response_variable(x))
  equal_prob <- 1 / length(lvls)
  setNames(rep(equal_prob, length(lvls)), paste0("p.", lvls))
}

# The "null_fn" column is a function(x) whose output gives attr(x, "params")
theorized_nulls <- tibble::tribble(
  ~stat,   ~null_fn,
  "Chisq", p_null,
  "t",     function(x) setNames(0, "mu"),
  "z",     p_null
)

# Classify the requested variable as "" (absent), "num" (numeric class),
# "bin" (exactly two unique values), or "mult" (three or more).
determine_variable_type <- function(x, variable) {
  var <- switch(
    variable,
    response = response_variable(x),
    explanatory = explanatory_variable(x)
  )

  if (is.null(var)) {
    return("")
  }

  if (inherits(var, "numeric")) {
    return("num")
  }

  if (length(unique(var)) == 2) {
    return("bin")
  }

  "mult"
}

# Argument checking --------------------------------------------------------

# Validate (and possibly derive) the `order` argument for difference- or
# ratio-based statistics. Returns the `order` to use: the user's value when
# supplied and valid, or the sorted unique levels of the explanatory
# variable (with a warning) when `order` is NULL. For statistics that don't
# involve an order, a supplied `order` is warned about and ignored.
check_order <- function(
  x,
  order,
  in_calculate = TRUE,
  stat,
  call = caller_env()
) {
  # If there doesn't need to be an order argument, warn if there is one,
  # and otherwise, skip checks
  if (
    !(theory_type(x) %in%
      c("Two sample props z", "Two sample t") ||
      is.null(stat) ||
      stat %in%
        c(
          "diff in means",
          "diff in medians",
          "diff in props",
          "ratio of props",
          "odds ratio"
        ))
  ) {
    if (!is.null(order)) {
      cli_warn(
        "Statistic is not based on a difference or ratio; the `order` argument \\
          will be ignored. Check {.help [{.fun calculate}](infer::calculate)} \\
          for details."
      )
    } else {
      return(order)
    }
  }

  explanatory_variable <- explanatory_variable(x)
  unique_ex <- sort(unique(explanatory_variable))

  if (is.null(order) & in_calculate) {
    # Default to subtracting/dividing the first (alphabetically) level by the
    # second, unless the explanatory variable is a factor (in which case order
    # is preserved); raise a warning if this was done implicitly.
    order <- as.character(unique_ex)
    cli_warn(
      "The statistic is based on a difference or ratio; by default, for \\
       difference-based statistics, the explanatory variable is subtracted \\
       in the order \"{unique_ex[1]}\" - \"{unique_ex[2]}\", or divided in \\
       the order \"{unique_ex[1]}\" / \"{unique_ex[2]}\" for ratio-based \\
       statistics. To specify this order yourself, supply `order = \\
       c(\"{unique_ex[1]}\", \"{unique_ex[2]}\")` to the calculate() function."
    )
  } else if (is.null(order)) {
    # Same default outside of `calculate()`, with slightly different wording
    # (no reference to the calculate() function).
    order <- as.character(unique_ex)
    cli_warn(
      "The statistic is based on a difference or ratio; by default, for \\
       difference-based statistics, the explanatory variable is subtracted \\
       in the order \"{unique_ex[1]}\" - \"{unique_ex[2]}\", or divided in \\
       the order \"{unique_ex[1]}\" / \"{unique_ex[2]}\" for ratio-based \\
       statistics. To specify this order yourself, supply `order = \\
       c(\"{unique_ex[1]}\", \"{unique_ex[2]}\")`."
    )
  } else {
    # Validate a user-supplied `order`: exactly two non-NA entries, each of
    # which must be a level of the explanatory variable.
    if (xor(is.na(order[1]), is.na(order[2]))) {
      cli_abort(
        "Only one level specified in `order`. Both levels need to be specified.",
        call = call
      )
    }
    if (length(order) > 2) {
      cli_abort(
        "`order` is expecting only two entries.",
        call = call
      )
    }
    if (order[1] %in% unique_ex == FALSE) {
      cli_abort(
        "{order[1]} is not a level of the explanatory variable.",
        call = call
      )
    }
    if (order[2] %in% unique_ex == FALSE) {
      cli_abort(
        "{order[2]} is not a level of the explanatory variable.",
        call = call
      )
    }
  }
  # return the order as given (unless the argument was invalid or NULL)
  order
}

# Verify that point-null parameters set in `hypothesize()` match the
# statistic being calculated: `stat = "mean"` requires `mu`, `stat =
# "median"` requires `med`, and a parameter paired with a different stat
# raises an error. Paired-independence nulls are exempt.
check_point_params <- function(x, stat, call = caller_env()) {
  param_names <- attr(attr(x, "params"), "names")
  # `hyp_text` is interpolated into the error messages below via glue
  hyp_text <- 'to be set in `hypothesize()`.'
  if (
    is_hypothesized(x) && !identical(attr(x, "null"), "paired independence")
  ) {
    if (stat %in% c("mean", "median", "sd", "prop")) {
      # mean <-> mu must be paired in both directions
      if ((stat == "mean") && !("mu" %in% param_names)) {
        cli_abort('`stat == "mean"` requires `"mu"` {hyp_text}', call = call)
      }
      if (!(stat == "mean") && ("mu" %in% param_names)) {
        cli_abort(
          '`"mu"` does not correspond to `stat = "{stat}"`.',
          call = call
        )
      }
      # median <-> med must be paired in both directions
      if ((stat == "median") && !("med" %in% param_names)) {
        cli_abort('`stat == "median"` requires `"med"` {hyp_text}', call = call)
      }
      if (!(stat == "median") && ("med" %in% param_names)) {
        cli_abort(
          '`"med"` does not correspond to `stat = "{stat}"`.',
          call = call
        )
      }
    }
  }
}

# This function checks for NaNs in the output of `calculate` and raises
# a message/warning/error depending on the context in which it was called.
# Returns `x` unchanged when there is nothing to flag, `x` with NaN rows
# dropped in the "visualize" context, and aborts when all statistics are
# NaN or when the context is "get_p_value".
check_for_nan <- function(x, context) {
  # Theoretical distributions carry no `stat` column; nothing to check
  if (inherits(x, "infer_dist")) {
    return(x)
  }

  stat_is_nan <- is.nan(x[["stat"]])
  num_nans <- sum(stat_is_nan)
  # If there are no NaNs, continue on as normal :-)
  if (num_nans == 0) {
    return(x)
  }

  calc_ref <- c(
    i = "See {.help [{.fun calculate}](infer::calculate)} for more details."
  )
  # If all of the data is NaN, raise an error
  if (num_nans == nrow(x)) {
    cli_abort(
      c("All calculated statistics were `NaN`.", calc_ref),
      call = NULL
    )
  }

  # Singular/plural phrasing for the messages below
  stats_were <- if (num_nans == 1) {
    "statistic was"
  } else {
    "statistics were"
  }
  num_nans_msg <- glue::glue("{num_nans} calculated {stats_were} `NaN`")

  if (context == "visualize") {
    # Raise a warning and plot the data with NaNs removed
    cli_warn(
      c(
        "{num_nans_msg}. `NaN`s have been omitted from visualization.",
        calc_ref
      )
    )
    return(x[!stat_is_nan, ])
  } else if (context == "get_p_value") {
    # Raise an error
    cli_abort(
      c(
        "{num_nans_msg}. Simulation-based p-values are not well-defined for \\
       null distributions with non-finite values.",
        calc_ref
      ),
      call = NULL
    )
  }

  # Fix: previously any other context fell off the `if`/`else if` above and
  # returned NULL invisibly, silently discarding `x` for callers that use
  # the return value. Return the data unchanged instead.
  x
}

# Validate the `direction` argument used for p-value computation/shading:
# it must be one of the accepted spellings, otherwise abort.
check_direction <- function(
  direction = c(
    "less",
    "greater",
    "two_sided",
    "left",
    "right",
    "both",
    "two-sided",
    "two sided",
    "two.sided"
  ),
  call = caller_env()
) {
  check_type(direction, is.character, call = call)

  valid_directions <- c(
    "less",
    "greater",
    "two_sided",
    "left",
    "right",
    "both",
    "two-sided",
    "two sided",
    "two.sided"
  )

  if (!direction %in% valid_directions) {
    cli_abort(
      'The provided value for `direction` is not appropriate. Possible values \\
      are "less", "greater", "two-sided", "left", "right", "both", \
      "two_sided", "two sided", or "two.sided".',
      call = call
    )
  }
}

# Normalize an observed-statistic argument to a single number. Accepts a
# numeric scalar, a data frame (using the first cell, with a warning if it
# is not 1x1), or the output of `fit()` — in which case the `estimate` for
# the term matching the plot's x-axis label is extracted.
check_obs_stat <- function(obs_stat, plot = NULL, call = caller_env()) {
  if (!is.null(obs_stat)) {
    if ("data.frame" %in% class(obs_stat)) {
      if (is_fitted(obs_stat)) {
        # A fitted object holds one estimate per term; pick the one that the
        # current panel (x-axis label) refers to.
        x_lab <- x_axis_label(plot)

        obs_stat <-
          obs_stat |>
          dplyr::filter(term == x_lab) |>
          dplyr::pull(estimate)

        return(obs_stat)
      }

      check_type(obs_stat, is.data.frame, call = call)
      if ((nrow(obs_stat) != 1) || (ncol(obs_stat) != 1)) {
        cli_warn(
          "The first row and first column value of the given `obs_stat` will \\
           be used."
        )
      }

      # [[1]] is used in case `stat` is not specified as name of 1x1
      obs_stat <- obs_stat[[1]][[1]]
      check_type(obs_stat, is.numeric, call = call)
    } else {
      check_type(obs_stat, is.numeric, call = call)
    }
  }

  obs_stat
}

# Sanity checks for the multiple-regression (`fit()`) workflow: `obs_stat`
# must itself be a fitted object, `x` must have been generated, and the two
# must share the same model terms and response variable. Aborts on the
# first violation; returns TRUE invisibly otherwise.
check_mlr_x_and_obs_stat <- function(
  x,
  obs_stat,
  fn,
  arg,
  call = caller_env()
) {
  if (!is_fitted(obs_stat)) {
    cli_abort(
      c(
        "The `{arg}` argument should be the output of `fit()`.",
        i = "See the documentation with `?{fn}`."
      ),
      call = call
    )
  }

  if (!is_generated(x)) {
    cli_abort(
      "The `x` argument needs to be passed to `generate()` before `fit()`.",
      call = call
    )
  }

  # Terms must match in both directions (no extras on either side)
  if (
    any(!unique(x$term) %in% unique(obs_stat$term)) ||
      any(!unique(obs_stat$term) %in% unique(x$term))
  ) {
    cli_abort(
      "The explanatory variables used to generate the distribution of \\
       null fits are not the same used to fit the observed data.",
      call = call
    )
  }

  if (response_name(x) != response_name(obs_stat)) {
    cli_abort(
      "The response variable of the null fits ({response_name(x)}) is not \\
       the same as that of the observed fit ({response_name(obs_stat)}).",
      call = call
    )
  }

  invisible(TRUE)
}

#' Check object type
#'
#' Throw an error in case object is not of desired type.
#'
#' @param x An object to check.
#' @param predicate A function to perform check or a formula (as input for
#'   `rlang::as_function()`). A good idea is to use function named `is.*()` or
#'   `is_*()` with possible `<package>::` prefix.
#' @param type_name A string for desired type name. If `NULL`, type is taken
#'   from parsing original name of supplied `predicate`: all alphanumeric with
#'   '_' and '.' characters (until the name end) after the first appearance of
#'   either `is.` or `is_`. In case of a doubt supply `type_name` explicitly.
#' @param x_name String to be used as variable name instead of supplied one
#'   (default).
#' @param allow_null If `TRUE` then error isn't thrown if `x` is `NULL`, no
#'   matter what `predicate(x)` returns.
#' @param ... Arguments to be passed to `predicate`.
#'
#' @examples
#' \donttest{
#' x <- 1
#' check_type(x, is.numeric)
#' check_type(x, is.logical)
#' check_type(x, rlang::is_string, "character of length 1")
#' check_type(
#'   x,
#'   ~ is.character(.) && (length(.) == 1),
#'   "character of length 1"
#' )
Download .txt
gitextract__66l93tt/

├── .Rbuildignore
├── .gitattributes
├── .github/
│   ├── .gitignore
│   ├── CODE_OF_CONDUCT.md
│   └── workflows/
│       ├── R-CMD-check.yaml
│       ├── check-hard.yaml
│       ├── lock.yaml
│       ├── pkgdown.yaml
│       ├── pr-commands.yaml
│       └── test-coverage.yaml
├── .gitignore
├── .vscode/
│   ├── extensions.json
│   └── settings.json
├── CONTRIBUTING.md
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R/
│   ├── assume.R
│   ├── calculate.R
│   ├── deprecated.R
│   ├── fit.R
│   ├── generate.R
│   ├── get_confidence_interval.R
│   ├── get_p_value.R
│   ├── gss.R
│   ├── hypothesize.R
│   ├── infer.R
│   ├── observe.R
│   ├── pipe.R
│   ├── print_methods.R
│   ├── rep_sample_n.R
│   ├── set_params.R
│   ├── shade_confidence_interval.R
│   ├── shade_p_value.R
│   ├── specify.R
│   ├── utils.R
│   ├── visualize.R
│   └── wrappers.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── air.toml
├── codecov.yml
├── data/
│   └── gss.rda
├── data-raw/
│   └── save_gss.R
├── figs/
│   ├── paper/
│   │   ├── apa.csl
│   │   ├── columns.tex
│   │   ├── paper.Rmd
│   │   ├── paper.bib
│   │   ├── paper.log
│   │   └── paper.md
│   └── rethinking-inference.key
├── infer.Rproj
├── inst/
│   └── CITATION
├── man/
│   ├── assume.Rd
│   ├── calculate.Rd
│   ├── chisq_stat.Rd
│   ├── chisq_test.Rd
│   ├── deprecated.Rd
│   ├── fit.infer.Rd
│   ├── generate.Rd
│   ├── get_confidence_interval.Rd
│   ├── get_p_value.Rd
│   ├── gss.Rd
│   ├── hypothesize.Rd
│   ├── infer.Rd
│   ├── observe.Rd
│   ├── pipe.Rd
│   ├── print.infer.Rd
│   ├── prop_test.Rd
│   ├── reexports.Rd
│   ├── rep_sample_n.Rd
│   ├── shade_confidence_interval.Rd
│   ├── shade_p_value.Rd
│   ├── specify.Rd
│   ├── t_stat.Rd
│   ├── t_test.Rd
│   └── visualize.Rd
├── man-roxygen/
│   └── seeds.Rmd
├── tests/
│   ├── testthat/
│   │   ├── _snaps/
│   │   │   ├── aliases.md
│   │   │   ├── assume.md
│   │   │   ├── calculate.md
│   │   │   ├── fit.md
│   │   │   ├── generate.md
│   │   │   ├── get_confidence_interval.md
│   │   │   ├── get_p_value.md
│   │   │   ├── hypothesize.md
│   │   │   ├── observe.md
│   │   │   ├── print.md
│   │   │   ├── rep_sample_n.md
│   │   │   ├── shade_confidence_interval.md
│   │   │   ├── shade_p_value.md
│   │   │   ├── specify.md
│   │   │   ├── utils.md
│   │   │   ├── visualize.md
│   │   │   └── wrappers.md
│   │   ├── helper-data.R
│   │   ├── setup.R
│   │   ├── test-aliases.R
│   │   ├── test-assume.R
│   │   ├── test-calculate.R
│   │   ├── test-fit.R
│   │   ├── test-generate.R
│   │   ├── test-get_confidence_interval.R
│   │   ├── test-get_p_value.R
│   │   ├── test-hypothesize.R
│   │   ├── test-observe.R
│   │   ├── test-print.R
│   │   ├── test-rep_sample_n.R
│   │   ├── test-shade_confidence_interval.R
│   │   ├── test-shade_p_value.R
│   │   ├── test-specify.R
│   │   ├── test-utils.R
│   │   ├── test-visualize.R
│   │   └── test-wrappers.R
│   └── testthat.R
└── vignettes/
    ├── anova.Rmd
    ├── chi_squared.Rmd
    ├── infer.Rmd
    ├── infer_cache/
    │   └── html/
    │       ├── __packages
    │       ├── calculate-point_94c073b633c3cf7bef3252dcad544ee2.RData
    │       ├── calculate-point_94c073b633c3cf7bef3252dcad544ee2.rdb
    │       ├── calculate-point_94c073b633c3cf7bef3252dcad544ee2.rdx
    │       ├── generate-permute_21b25928d642a97a30057306d51f1b23.RData
    │       ├── generate-permute_21b25928d642a97a30057306d51f1b23.rdb
    │       ├── generate-permute_21b25928d642a97a30057306d51f1b23.rdx
    │       ├── generate-point_d562524427be20dbb4736ca1ea29b04b.RData
    │       ├── generate-point_d562524427be20dbb4736ca1ea29b04b.rdb
    │       ├── generate-point_d562524427be20dbb4736ca1ea29b04b.rdx
    │       ├── hypothesize-40-hr-week_c8e33c404efa90c2ca0b2eacad95b06c.RData
    │       ├── hypothesize-40-hr-week_c8e33c404efa90c2ca0b2eacad95b06c.rdb
    │       ├── hypothesize-40-hr-week_c8e33c404efa90c2ca0b2eacad95b06c.rdx
    │       ├── hypothesize-independence_fe1c79b9f1dc0df488828fdd34c8145f.RData
    │       ├── hypothesize-independence_fe1c79b9f1dc0df488828fdd34c8145f.rdb
    │       ├── hypothesize-independence_fe1c79b9f1dc0df488828fdd34c8145f.rdx
    │       ├── specify-diff-in-means_e4103c4c3e3daedd5c1429b7a1bc8727.RData
    │       ├── specify-diff-in-means_e4103c4c3e3daedd5c1429b7a1bc8727.rdb
    │       ├── specify-diff-in-means_e4103c4c3e3daedd5c1429b7a1bc8727.rdx
    │       ├── specify-example_3ea3cfa390233b127dc25b05b0354bcf.RData
    │       ├── specify-example_3ea3cfa390233b127dc25b05b0354bcf.rdb
    │       ├── specify-example_3ea3cfa390233b127dc25b05b0354bcf.rdx
    │       ├── specify-one_149be66261b0606b7ddb80efd10fa81d.RData
    │       ├── specify-one_149be66261b0606b7ddb80efd10fa81d.rdb
    │       ├── specify-one_149be66261b0606b7ddb80efd10fa81d.rdx
    │       ├── specify-success_e8eb15e9f621ccf60cb6527a6bccdb4b.RData
    │       ├── specify-success_e8eb15e9f621ccf60cb6527a6bccdb4b.rdb
    │       ├── specify-success_e8eb15e9f621ccf60cb6527a6bccdb4b.rdx
    │       ├── specify-two_20085531c110a936ee691162f225333b.RData
    │       ├── specify-two_20085531c110a936ee691162f225333b.rdb
    │       └── specify-two_20085531c110a936ee691162f225333b.rdx
    ├── observed_stat_examples.Rmd
    ├── paired.Rmd
    └── t_test.Rmd
Condensed preview — 155 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (834K chars).
[
  {
    "path": ".Rbuildignore",
    "chars": 408,
    "preview": "^CRAN-RELEASE$\n^.*\\.Rproj$\n^\\.Rproj\\.user$\n^README\\.Rmd$\n^figs$\n^profiles*\n^examples*\n^codecov\\.yml$\n^docs*\n^CONDUCT\\.md"
  },
  {
    "path": ".gitattributes",
    "chars": 51,
    "preview": "* text=auto\ndata/* binary\nsrc/* text=lf\nR/* text=lf"
  },
  {
    "path": ".github/.gitignore",
    "chars": 7,
    "preview": "*.html\n"
  },
  {
    "path": ".github/CODE_OF_CONDUCT.md",
    "chars": 5244,
    "preview": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participa"
  },
  {
    "path": ".github/workflows/R-CMD-check.yaml",
    "chars": 1712,
    "preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
  },
  {
    "path": ".github/workflows/check-hard.yaml",
    "chars": 1575,
    "preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
  },
  {
    "path": ".github/workflows/lock.yaml",
    "chars": 1041,
    "preview": "name: 'Lock Threads'\n\non:\n  schedule:\n    - cron: '0 0 * * *'\n\njobs:\n  lock:\n    runs-on: ubuntu-latest\n    steps:\n     "
  },
  {
    "path": ".github/workflows/pkgdown.yaml",
    "chars": 1300,
    "preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
  },
  {
    "path": ".github/workflows/pr-commands.yaml",
    "chars": 2501,
    "preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
  },
  {
    "path": ".github/workflows/test-coverage.yaml",
    "chars": 1813,
    "preview": "# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples\n# Need help debugging build failures? Start at"
  },
  {
    "path": ".gitignore",
    "chars": 82,
    "preview": ".Rproj.user\n.Rhistory\n.RData\n.Ruserdata\n.DS_Store\n.httr-oauth\ndoc\nMeta\ndocs\n*.psd\n"
  },
  {
    "path": ".vscode/extensions.json",
    "chars": 62,
    "preview": "{\n    \"recommendations\": [\n        \"Posit.air-vscode\"\n    ]\n}\n"
  },
  {
    "path": ".vscode/settings.json",
    "chars": 114,
    "preview": "{\n    \"[r]\": {\n        \"editor.formatOnSave\": true,\n        \"editor.defaultFormatter\": \"Posit.air-vscode\"\n    }\n}\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 1573,
    "preview": "# Contributing\n\nContributions to the `infer` whether in the form of bug fixes, issue reports, new\ncode or documentation "
  },
  {
    "path": "DESCRIPTION",
    "chars": 2562,
    "preview": "Type: Package\nPackage: infer\nTitle: Tidy Statistical Inference\nVersion: 1.1.0.9000\nAuthors@R: c(\n    person(\"Andrew\", \"B"
  },
  {
    "path": "LICENSE",
    "chars": 43,
    "preview": "YEAR: 2025\nCOPYRIGHT HOLDER: infer authors\n"
  },
  {
    "path": "LICENSE.md",
    "chars": 1072,
    "preview": "# MIT License\n\nCopyright (c) 2025 infer authors\n\nPermission is hereby granted, free of charge, to any person obtaining a"
  },
  {
    "path": "NAMESPACE",
    "chars": 2600,
    "preview": "# Generated by roxygen2: do not edit by hand\n\nS3method(calc_impl,\"function\")\nS3method(calc_impl,Chisq)\nS3method(calc_imp"
  },
  {
    "path": "NEWS.md",
    "chars": 27426,
    "preview": "# infer (development version)\n\n# infer 1.1.0\n\n* Fixed bug where adding `shade_confidence_interval(NULL)` or `shade_p_val"
  },
  {
    "path": "R/assume.R",
    "chars": 12239,
    "preview": "#' Define a theoretical distribution\n#'\n#' @description\n#'\n#' This function allows the user to define a null distributio"
  },
  {
    "path": "R/calculate.R",
    "chars": 20700,
    "preview": "#' Calculate summary statistics\n#'\n#' @description\n#'\n#' Given the output of [specify()] and/or [hypothesize()], this fu"
  },
  {
    "path": "R/deprecated.R",
    "chars": 913,
    "preview": "#' Deprecated functions and objects\n#'\n#' These functions and objects should no longer be used. They will be removed\n#' "
  },
  {
    "path": "R/fit.R",
    "chars": 7552,
    "preview": "#' @importFrom generics fit\n#' @details\n#' Read more about infer's [fit][fit.infer()] function [here][fit.infer()] or\n#'"
  },
  {
    "path": "R/generate.R",
    "chars": 10415,
    "preview": "#' Generate resamples, permutations, or simulations\n#'\n#' @description\n#'\n#' Generation creates a simulated distribution"
  },
  {
    "path": "R/get_confidence_interval.R",
    "chars": 12159,
    "preview": "#' Compute confidence interval\n#'\n#' @description\n#'\n#' Compute a confidence interval around a summary statistic. Both\n#"
  },
  {
    "path": "R/get_p_value.R",
    "chars": 9768,
    "preview": "#' Compute p-value\n#'\n#' @description\n#'\n#' Compute a p-value from a null distribution and observed statistic.\n#'\n#' Lea"
  },
  {
    "path": "R/gss.R",
    "chars": 1345,
    "preview": "#' Subset of data from the General Social Survey (GSS).\n#'\n#' The General Social Survey is a high-quality survey which g"
  },
  {
    "path": "R/hypothesize.R",
    "chars": 6673,
    "preview": "#' Declare a null hypothesis\n#'\n#' @description\n#'\n#' Declare a null hypothesis about variables selected in [specify()]."
  },
  {
    "path": "R/infer.R",
    "chars": 1250,
    "preview": "#' infer: a grammar for statistical inference\n#'\n#' The objective of this package is to perform statistical inference us"
  },
  {
    "path": "R/observe.R",
    "chars": 3076,
    "preview": "#' Calculate observed statistics\n#'\n#' @description\n#'\n#' This function is a wrapper that calls [specify()], [hypothesiz"
  },
  {
    "path": "R/pipe.R",
    "chars": 320,
    "preview": "#' Pipe\n#'\n#' Like \\{dplyr\\}, \\{infer\\} also uses the pipe (\\code{|>}) function\n#' from \\code{magrittr} to turn function"
  },
  {
    "path": "R/print_methods.R",
    "chars": 1218,
    "preview": "#' Print methods\n#'\n#' @param x An object of class `infer`, i.e. output from [specify()] or\n#'   [hypothesize()], or of "
  },
  {
    "path": "R/rep_sample_n.R",
    "chars": 7633,
    "preview": "#' Perform repeated sampling\n#'\n#' @description\n#'\n#' These functions extend the functionality of [dplyr::sample_n()] an"
  },
  {
    "path": "R/set_params.R",
    "chars": 4245,
    "preview": "#' To determine which theoretical distribution to fit (if any)\n#'\n#' @param x A data frame that can be coerced into a [t"
  },
  {
    "path": "R/shade_confidence_interval.R",
    "chars": 6653,
    "preview": "#' Add information about confidence interval\n#'\n#' @description\n#'\n#' `shade_confidence_interval()` plots a confidence i"
  },
  {
    "path": "R/shade_p_value.R",
    "chars": 13076,
    "preview": "#' Shade histogram area beyond an observed statistic\n#'\n#' @description\n#'\n#' `shade_p_value()` plots a p-value region o"
  },
  {
    "path": "R/specify.R",
    "chars": 6821,
    "preview": "#' Specify response and explanatory variables\n#'\n#' @description\n#'\n#' `specify()` is used to specify which columns in t"
  },
  {
    "path": "R/utils.R",
    "chars": 19404,
    "preview": "# Miscellaneous Helpers -----------------------------------------------\nsuppress_infer_messages <- function() {\n  identi"
  },
  {
    "path": "R/visualize.R",
    "chars": 22106,
    "preview": "#' @importFrom ggplot2 ggplot_add\n#' @export\nggplot2::ggplot_add\n\n#' Visualize statistical inference\n#'\n#' @description\n"
  },
  {
    "path": "R/wrappers.R",
    "chars": 18978,
    "preview": "# Wrapper functions\n# Different shortcuts to doing traditional hypothesis tests & confidence\n# intervals in R as well as"
  },
  {
    "path": "README.Rmd",
    "chars": 6670,
    "preview": "---\noutput: github_document\n---\n\n# infer R Package <img src=\"man/figures/logo.png\" alt=\"A hexagonal logo. A green silhou"
  },
  {
    "path": "README.md",
    "chars": 7223,
    "preview": "\n# infer R Package <img src=\"man/figures/logo.png\" alt=\"A hexagonal logo. A green silhouette of a fir tree sits atop bla"
  },
  {
    "path": "_pkgdown.yml",
    "chars": 1133,
    "preview": "url: https://infer.tidymodels.org\n\ntemplate:\n  package: tidytemplate\n  bootstrap: 5\n  bslib:\n    danger: \"#CA225E\"\n    p"
  },
  {
    "path": "air.toml",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "codecov.yml",
    "chars": 232,
    "preview": "comment: false\n\ncoverage:\n  status:\n    project:\n      default:\n        target: auto\n        threshold: 1%\n        infor"
  },
  {
    "path": "data-raw/save_gss.R",
    "chars": 2318,
    "preview": "library(dplyr)\nlibrary(forcats)\nlibrary(srvyr)\nlibrary(ggplot2)\n\n# pull gss data\ntemp <- tempfile()\ndownload.file(\"https"
  },
  {
    "path": "figs/paper/apa.csl",
    "chars": 14926,
    "preview": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<style xmlns=\"http://purl.org/net/xbiblio/csl\" class=\"in-text\" version=\"1.0\" demo"
  },
  {
    "path": "figs/paper/columns.tex",
    "chars": 385,
    "preview": "\\newenvironment{cols}[1][]{}{}\n\n\\newenvironment{col}[1]{\\begin{minipage}{#1}\\ignorespaces}{%\n\\end{minipage}\n\\ifhmode\\uns"
  },
  {
    "path": "figs/paper/paper.Rmd",
    "chars": 7934,
    "preview": "---\ntitle: 'infer: An R package for tidyverse-friendly statistical inference'\ntags:\n  - data science\n  - tidyverse\n  - i"
  },
  {
    "path": "figs/paper/paper.bib",
    "chars": 5523,
    "preview": "@book{ismay2019statistical,\n  title={Statistical Inference via Data Science: A ModernDive into {R} and the Tidyverse},\n "
  },
  {
    "path": "figs/paper/paper.log",
    "chars": 63285,
    "preview": "This is XeTeX, Version 3.14159265-2.6-0.99999 (TeX Live 2018) (preloaded format=xelatex 2018.4.16)  15 SEP 2021 07:48\nen"
  },
  {
    "path": "figs/paper/paper.md",
    "chars": 7934,
    "preview": "---\ntitle: 'infer: An R package for tidyverse-friendly statistical inference'\ntags:\n  - data science\n  - tidyverse\n  - i"
  },
  {
    "path": "infer.Rproj",
    "chars": 355,
    "preview": "Version: 1.0\n\nRestoreWorkspace: Default\nSaveWorkspace: Default\nAlwaysSaveHistory: Default\n\nEnableCodeIndexing: Yes\nUseSp"
  },
  {
    "path": "inst/CITATION",
    "chars": 585,
    "preview": "bibentry(\n  \"Article\",\n  title     = \"{infer}: An {R} package for tidyverse-friendly statistical inference\",\n  author   "
  },
  {
    "path": "man/assume.Rd",
    "chars": 6020,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/assume.R\n\\name{assume}\n\\alias{assume}\n\\tit"
  },
  {
    "path": "man/calculate.Rd",
    "chars": 7767,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/calculate.R\n\\name{calculate}\n\\alias{calcul"
  },
  {
    "path": "man/chisq_stat.Rd",
    "chars": 2137,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/wrappers.R\n\\name{chisq_stat}\n\\alias{chisq_"
  },
  {
    "path": "man/chisq_test.Rd",
    "chars": 1748,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/wrappers.R\n\\name{chisq_test}\n\\alias{chisq_"
  },
  {
    "path": "man/deprecated.Rd",
    "chars": 906,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/deprecated.R\n\\name{deprecated}\n\\alias{depr"
  },
  {
    "path": "man/fit.infer.Rd",
    "chars": 6857,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/fit.R\n\\name{fit.infer}\n\\alias{fit.infer}\n\\"
  },
  {
    "path": "man/generate.Rd",
    "chars": 5103,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/generate.R\n\\name{generate}\n\\alias{generate"
  },
  {
    "path": "man/get_confidence_interval.Rd",
    "chars": 5874,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/get_confidence_interval.R\n\\name{get_confid"
  },
  {
    "path": "man/get_p_value.Rd",
    "chars": 5110,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/get_p_value.R\n\\name{get_p_value}\n\\alias{ge"
  },
  {
    "path": "man/gss.Rd",
    "chars": 1422,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/gss.R\n\\docType{data}\n\\name{gss}\n\\alias{gss"
  },
  {
    "path": "man/hypothesize.Rd",
    "chars": 2659,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/hypothesize.R\n\\name{hypothesize}\n\\alias{hy"
  },
  {
    "path": "man/infer.Rd",
    "chars": 2032,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/infer.R\n\\docType{package}\n\\name{infer}\n\\al"
  },
  {
    "path": "man/observe.Rd",
    "chars": 7320,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/observe.R\n\\name{observe}\n\\alias{observe}\n\\"
  },
  {
    "path": "man/pipe.Rd",
    "chars": 375,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/pipe.R\n\\name{\\%>\\%}\n\\alias{\\%>\\%}\n\\title{P"
  },
  {
    "path": "man/print.infer.Rd",
    "chars": 714,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/print_methods.R\n\\name{print.infer}\n\\alias{"
  },
  {
    "path": "man/prop_test.Rd",
    "chars": 4209,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/wrappers.R\n\\name{prop_test}\n\\alias{prop_te"
  },
  {
    "path": "man/reexports.Rd",
    "chars": 630,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/fit.R, R/visualize.R\n\\docType{import}\n\\nam"
  },
  {
    "path": "man/rep_sample_n.Rd",
    "chars": 3339,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/rep_sample_n.R\n\\name{rep_sample_n}\n\\alias{"
  },
  {
    "path": "man/shade_confidence_interval.Rd",
    "chars": 4335,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/shade_confidence_interval.R\n\\name{shade_co"
  },
  {
    "path": "man/shade_p_value.Rd",
    "chars": 3975,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/shade_p_value.R\n\\name{shade_p_value}\n\\alia"
  },
  {
    "path": "man/specify.Rd",
    "chars": 1878,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/specify.R\n\\name{specify}\n\\alias{specify}\n\\"
  },
  {
    "path": "man/t_stat.Rd",
    "chars": 2591,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/wrappers.R\n\\name{t_stat}\n\\alias{t_stat}\n\\t"
  },
  {
    "path": "man/t_test.Rd",
    "chars": 2370,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/wrappers.R\n\\name{t_test}\n\\alias{t_test}\n\\t"
  },
  {
    "path": "man/visualize.Rd",
    "chars": 6712,
    "preview": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/visualize.R\n\\name{visualize}\n\\alias{visual"
  },
  {
    "path": "man-roxygen/seeds.Rmd",
    "chars": 1189,
    "preview": "# Reproducibility\n\nWhen using the infer package for research, or in other cases when exact reproducibility is a priority"
  },
  {
    "path": "tests/testthat/_snaps/aliases.md",
    "chars": 467,
    "preview": "# old aliases produce informative error\n\n    Code\n      res <- p_value(gss_calc, obs_stat = -0.2, direction = \"right\")\n "
  },
  {
    "path": "tests/testthat/_snaps/assume.md",
    "chars": 3429,
    "preview": "# assume errors with bad arguments\n\n    Code\n      assume(hypothesize(specify(gss, age ~ college), null = \"independence\""
  },
  {
    "path": "tests/testthat/_snaps/calculate.md",
    "chars": 18327,
    "preview": "# x is a tibble\n\n    Code\n      calculate(vec, stat = \"mean\")\n    Condition\n      Error in `calculate()`:\n      ! `x` mu"
  },
  {
    "path": "tests/testthat/_snaps/fit.md",
    "chars": 590,
    "preview": "# fit.infer messages informatively on excessive null\n\n    Code\n      res_ <- fit(hypothesize(specify(gss, hours ~ age + "
  },
  {
    "path": "tests/testthat/_snaps/generate.md",
    "chars": 12424,
    "preview": "# cohesion with type argument\n\n    Code\n      res_ <- generate(hyp_prop, type = \"bootstrap\")\n    Condition\n      Warning"
  },
  {
    "path": "tests/testthat/_snaps/get_confidence_interval.md",
    "chars": 5649,
    "preview": "# get_confidence_interval messages with no explicit `level`\n\n    Code\n      res_ <- get_confidence_interval(test_df)\n   "
  },
  {
    "path": "tests/testthat/_snaps/get_p_value.md",
    "chars": 4062,
    "preview": "# direction is appropriate\n\n    Code\n      get_p_value(test_df, obs_stat = 0.5, direction = \"righ\")\n    Condition\n      "
  },
  {
    "path": "tests/testthat/_snaps/hypothesize.md",
    "chars": 5962,
    "preview": "# hypothesize() throws an error when null is not point or independence\n\n    Code\n      hypothesize(specify(mtcars_df, re"
  },
  {
    "path": "tests/testthat/_snaps/observe.md",
    "chars": 450,
    "preview": "# observe() output is the same as the old wrappers\n\n    Code\n      res_wrap <- chisq_stat(gss_tbl, college ~ partyid)\n  "
  },
  {
    "path": "tests/testthat/_snaps/print.md",
    "chars": 897,
    "preview": "# print method fits linewidth with many predictors (#543)\n\n    Code\n      specify(mtcars, mpg ~ cyl + disp + hp + drat +"
  },
  {
    "path": "tests/testthat/_snaps/rep_sample_n.md",
    "chars": 5605,
    "preview": "# `rep_sample_n` checks input\n\n    Code\n      rep_sample_n(\"a\", size = 1)\n    Condition\n      Error in `rep_sample_n()`:"
  },
  {
    "path": "tests/testthat/_snaps/shade_confidence_interval.md",
    "chars": 2289,
    "preview": "# shade_confidence_interval throws errors and warnings\n\n    Code\n      res_ <- gss_viz_sim + shade_confidence_interval(c"
  },
  {
    "path": "tests/testthat/_snaps/shade_p_value.md",
    "chars": 2622,
    "preview": "# shade_p_value throws errors\n\n    Code\n      gss_viz_sim + shade_p_value(\"a\", \"right\")\n    Condition\n      Error in `sh"
  },
  {
    "path": "tests/testthat/_snaps/specify.md",
    "chars": 4762,
    "preview": "# data argument\n\n    Code\n      specify(blah ~ cyl)\n    Condition\n      Error in `specify()`:\n      ! `x` must be 'data."
  },
  {
    "path": "tests/testthat/_snaps/utils.md",
    "chars": 1139,
    "preview": "# check_type works\n\n    Code\n      check_type(x_var, is.character)\n    Condition\n      Error:\n      ! `x_var` must be 'c"
  },
  {
    "path": "tests/testthat/_snaps/visualize.md",
    "chars": 11279,
    "preview": "# visualize warns with bad arguments\n\n    Code\n      res_ <- visualize(calculate(generate(hypothesize(specify(gss_tbl, a"
  },
  {
    "path": "tests/testthat/_snaps/wrappers.md",
    "chars": 7740,
    "preview": "# t_test works\n\n    Code\n      res_ <- t_test(gss_tbl, hours ~ sex)\n    Condition\n      Warning:\n      The statistic is "
  },
  {
    "path": "tests/testthat/helper-data.R",
    "chars": 1595,
    "preview": "set.seed(4242)\n\nexpect_doppelganger <- function(title, fig, ...) {\n  testthat::skip_if_not_installed(\"vdiffr\")\n  vdiffr:"
  },
  {
    "path": "tests/testthat/setup.R",
    "chars": 85,
    "preview": "withr::local_envvar(SUPPRESS_INFER_MESSAGES = \"true\", .local_envir = teardown_env())\n"
  },
  {
    "path": "tests/testthat/test-aliases.R",
    "chars": 479,
    "preview": "test_that(\"aliases work\", {\n  expect_equal(\n    gss_calc |>\n      get_pvalue(obs_stat = -0.2, direction = \"right\") |>\n  "
  },
  {
    "path": "tests/testthat/test-assume.R",
    "chars": 6882,
    "preview": "test_that(\"distribution description works as expected\", {\n  # extract the \"first element\" to convert to character\n  assu"
  },
  {
    "path": "tests/testthat/test-calculate.R",
    "chars": 26368,
    "preview": "# calculate arguments\ntest_that(\"x is a tibble\", {\n  vec <- 1:10\n  expect_snapshot(error = TRUE, calculate(vec, stat = \""
  },
  {
    "path": "tests/testthat/test-fit.R",
    "chars": 3232,
    "preview": "x1 <- gss[1:100, ] |> specify(response = hours)\nx2 <- gss[1:100, ] |> specify(hours ~ NULL)\nx3 <- gss[1:100, ] |> specif"
  },
  {
    "path": "tests/testthat/test-generate.R",
    "chars": 17805,
    "preview": "hyp_prop <- mtcars_df |>\n  specify(response = am, success = \"1\") |>\n  hypothesize(null = \"point\", p = .5)\n\nhyp_diff_in_p"
  },
  {
    "path": "tests/testthat/test-get_confidence_interval.R",
    "chars": 12351,
    "preview": "set.seed(2018)\ntest_df <- gss_calc[1:20, ]\ntest_df$stat <- c(\n  -5,\n  -4,\n  -4,\n  -4,\n  -1,\n  -0.5,\n  rep(0, 6),\n  1,\n  "
  },
  {
    "path": "tests/testthat/test-get_p_value.R",
    "chars": 11114,
    "preview": "set.seed(2018)\ntest_df <- gss_calc[1:20, ]\ntest_df$stat <- sample(c(\n  -5,\n  -4,\n  -4,\n  -4,\n  -1,\n  -0.5,\n  rep(0, 6),\n"
  },
  {
    "path": "tests/testthat/test-hypothesize.R",
    "chars": 7155,
    "preview": "one_mean <- mtcars_df |>\n  specify(response = mpg) |> # formula alt: mpg ~ NULL\n  hypothesize(null = \"point\", mu = 25)\n\n"
  },
  {
    "path": "tests/testthat/test-observe.R",
    "chars": 4010,
    "preview": "test_that(\"observe() output is equal to core verbs\", {\n  expect_equal(\n    gss |>\n      observe(hours ~ NULL, stat = \"me"
  },
  {
    "path": "tests/testthat/test-print.R",
    "chars": 343,
    "preview": "test_that(\"print works\", {\n  expect_output(print(\n    gss_tbl |>\n      specify(age ~ hours) |>\n      hypothesize(null = "
  },
  {
    "path": "tests/testthat/test-rep_sample_n.R",
    "chars": 8301,
    "preview": "n_population <- 5\npopulation <- tibble::tibble(\n  ball_id = 1:n_population,\n  color = factor(c(rep(\"red\", 3), rep(\"white"
  },
  {
    "path": "tests/testthat/test-shade_confidence_interval.R",
    "chars": 2594,
    "preview": "# shade_confidence_interval -----------------------------------------------\ntest_that(\"shade_confidence_interval works\","
  },
  {
    "path": "tests/testthat/test-shade_p_value.R",
    "chars": 5932,
    "preview": "# shade_p_value -----------------------------------------------------------\ntest_that(\"shade_p_value works\", {\n  skip_if"
  },
  {
    "path": "tests/testthat/test-specify.R",
    "chars": 4084,
    "preview": "one_nonshift_mean <- mtcars_df |> specify(response = mpg)\n\none_nonshift_prop <- mtcars_df |> specify(response = am, succ"
  },
  {
    "path": "tests/testthat/test-utils.R",
    "chars": 4635,
    "preview": "test_that(\"append_infer_class works\", {\n  expect_equal(\n    class(append_infer_class(structure(\"a\", class = \"b\"))),\n    "
  },
  {
    "path": "tests/testthat/test-visualize.R",
    "chars": 24689,
    "preview": "library(dplyr)\n\nset.seed(42)\n\nhours_resamp <- gss_tbl |>\n  specify(hours ~ NULL) |>\n  hypothesize(null = \"point\", med = "
  },
  {
    "path": "tests/testthat/test-wrappers.R",
    "chars": 15854,
    "preview": "test_that(\"t_test works\", {\n  # Two Sample\n  expect_snapshot(res_ <- gss_tbl |> t_test(hours ~ sex))\n\n  expect_snapshot("
  },
  {
    "path": "tests/testthat.R",
    "chars": 370,
    "preview": "# This file is part of the standard setup for testthat.\n# It is recommended that you do not modify it.\n#\n# Where should "
  },
  {
    "path": "vignettes/anova.Rmd",
    "chars": 6741,
    "preview": "---\ntitle: \"Tidy ANOVA (Analysis of Variance) with infer\"\ndescription: \"Conducting ANOVA (Analysis of Variance) on tidy "
  },
  {
    "path": "vignettes/chi_squared.Rmd",
    "chars": 11557,
    "preview": "---\ntitle: \"Tidy Chi-Squared Tests with infer\"\ndescription: \"Conducting Chi-Squared tests on tidy data with infer.\"\noutp"
  },
  {
    "path": "vignettes/infer.Rmd",
    "chars": 17731,
    "preview": "---\ntitle: \"Getting to Know infer\"\ndescription: \"An introduction to the infer R package.\"\noutput: \n  rmarkdown::html_vig"
  },
  {
    "path": "vignettes/infer_cache/html/__packages",
    "chars": 43,
    "preview": "base\nusethis\ndevtools\ndplyr\ntestthat\ninfer\n"
  },
  {
    "path": "vignettes/infer_cache/html/calculate-point_94c073b633c3cf7bef3252dcad544ee2.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/generate-permute_21b25928d642a97a30057306d51f1b23.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/generate-point_d562524427be20dbb4736ca1ea29b04b.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/hypothesize-40-hr-week_c8e33c404efa90c2ca0b2eacad95b06c.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/hypothesize-independence_fe1c79b9f1dc0df488828fdd34c8145f.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/specify-diff-in-means_e4103c4c3e3daedd5c1429b7a1bc8727.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/specify-example_3ea3cfa390233b127dc25b05b0354bcf.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/specify-one_149be66261b0606b7ddb80efd10fa81d.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/specify-success_e8eb15e9f621ccf60cb6527a6bccdb4b.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/infer_cache/html/specify-two_20085531c110a936ee691162f225333b.rdb",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vignettes/observed_stat_examples.Rmd",
    "chars": 42473,
    "preview": "---\ntitle: \"Full infer Pipeline Examples\"\ndescription: \"A near-exhaustive demonstration of the functionality in infer.\"\n"
  },
  {
    "path": "vignettes/paired.Rmd",
    "chars": 7244,
    "preview": "---\ntitle: \"Tidy inference for paired data\"\ndescription: \"Conducting tests for paired independence on tidy data with inf"
  },
  {
    "path": "vignettes/t_test.Rmd",
    "chars": 12201,
    "preview": "---\ntitle: \"Tidy t-Tests with infer\"\ndescription: \"Conducting t-Tests on tidy data with infer.\"\noutput: rmarkdown::html_"
  }
]

// ... and 22 more files (download for full content)

About this extraction

This page contains the full source code of the tidymodels/infer GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 155 files (768.6 KB, approximately 222.0k tokens). Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file to get everything.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!