Repository: hadley/bigvis
Branch: master
Commit: 9cce240578a2
Files: 103
Total size: 162.1 KB

Directory structure:
gitextract_t_3h8d36/

├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── NAMESPACE
├── R/
│   ├── RcppExports.R
│   ├── adjust.r
│   ├── autoplot.r
│   ├── bigvis.r
│   ├── bin.r
│   ├── breaks.r
│   ├── challenge.r
│   ├── condense.r
│   ├── condensed.r
│   ├── dgrid.r
│   ├── h.r
│   ├── id.r
│   ├── movies.r
│   ├── mt.r
│   ├── origin.r
│   ├── peel.r
│   ├── ranged.r
│   ├── rebin.r
│   ├── rmse.r
│   ├── smooth.r
│   ├── standardise.r
│   ├── utils.r
│   ├── weighted-stats.r
│   └── width.r
├── README.md
├── bench/
│   ├── bin-structure.cpp
│   ├── bin.cpp
│   ├── count.cpp
│   ├── group-tempvar.cpp
│   ├── kernel.cpp
│   ├── mean.cpp
│   ├── median.cpp
│   └── smooth-1d.cpp
├── bigvis.Rproj
├── data/
│   └── movies.rdata
├── inst/
│   ├── include/
│   │   └── bigvis.h
│   └── tests/
│       ├── test-binned-vectors.r
│       ├── test-breaks.r
│       ├── test-condense.r
│       ├── test-frange.r
│       ├── test-group-1d.r
│       ├── test-group-2d.r
│       ├── test-origin.r
│       ├── test-ranged.r
│       ├── test-smooth.r
│       ├── test-stat.r
│       ├── test-summary-moments.r
│       └── test-weighted-stats.r
├── man/
│   ├── autoplot.condensed.Rd
│   ├── best_h.Rd
│   ├── bigvis.Rd
│   ├── bin.Rd
│   ├── breaks.Rd
│   ├── condense.Rd
│   ├── condensed.Rd
│   ├── dchallenge.Rd
│   ├── dgrid.Rd
│   ├── find_origin.Rd
│   ├── find_width.Rd
│   ├── frange.Rd
│   ├── h_grid.Rd
│   ├── is.ranged.Rd
│   ├── movies.Rd
│   ├── mt.Rd
│   ├── peel.Rd
│   ├── ranged.Rd
│   ├── rmse_cvs.Rd
│   ├── round_any.condensed.Rd
│   ├── smooth.Rd
│   ├── standardise.Rd
│   ├── transform.condensed.Rd
│   ├── weighted.IQR.Rd
│   ├── weighted.ecdf.Rd
│   ├── weighted.median.Rd
│   ├── weighted.quantile.Rd
│   └── weighted.var.Rd
├── notes.md
└── src/
    ├── .gitignore
    ├── BigVis.cpp
    ├── BinnedVector.cpp
    ├── BinnedVectors.cpp
    ├── Makevars
    ├── RcppExports.cpp
    ├── Summary2d.cpp
    ├── Summary2d.h
    ├── condense-gen.r
    ├── condense.cpp
    ├── double-diff-sum.cpp
    ├── frange.cpp
    ├── group-hex.h
    ├── group.cpp
    ├── group.h
    ├── lowerBound.cpp
    ├── smooth-nd.cpp
    ├── stats.cpp
    ├── stats.h
    ├── summary.cpp
    └── summary.h

================================================
FILE CONTENTS
================================================

================================================
FILE: .Rbuildignore
================================================
bench
notes.md
^.*\.Rproj$
^\.Rproj\.user$
^\.travis\.yml$
^src/condense-gen\.r$


================================================
FILE: .gitignore
================================================
.Rproj.user
.Rhistory
.RData
src/*.o
src/*.so
src/*.dll


================================================
FILE: .travis.yml
================================================
# Sample .travis.yml for R projects

language: r
warnings_are_errors: true
sudo: required

r_github_packages:
  - jimhester/covr
after_success:
  - Rscript -e 'covr::codecov()'


================================================
FILE: DESCRIPTION
================================================
Package: bigvis
Version: 0.1.0.9000
Title: Tools for visualisation of big data sets
Description: Tools for visualising large datasets.
Authors@R: c(
    person("Hadley", "Wickham", role = c("aut", "cre"), , "hadley@rstudio.com"),
    person("Yue", "Hue", role = "aut"),
    person("R Core team", role = "ctb", comment = "guess_bandwidth adapted from stats::bw.SJ")
    )
Depends:
    Rcpp
Imports:
    methods
Suggests:
    plyr,
    ggplot2,
    scales
LazyData: true
LinkingTo: 
    Rcpp, 
    BH
License: GPL (>= 2)
Collate:
    'standardise.r'
    'movies.r'
    'RcppExports.R'
    'adjust.r'
    'ranged.r'
    'bigvis.r'
    'rebin.r'
    'autoplot.r'
    'origin.r'
    'utils.r'
    'breaks.r'
    'weighted-stats.r'
    'condense.r'
    'condensed.r'
    'bin.r'
    'smooth.r'
    'challenge.r'
    'peel.r'
    'id.r'
    'rmse.r'
    'width.r'
    'h.r'
    'mt.r'
    'dgrid.r'


================================================
FILE: NAMESPACE
================================================
# Generated by roxygen2 (4.1.1): do not edit by hand

S3method("[",dgrid)
S3method("[<-",ranged)
S3method(Math,condensed)
S3method(Ops,condensed)
S3method(Ops,ranged)
S3method(as.condensed,condensed)
S3method(as.condensed,data.frame)
S3method(as.data.frame,dgrid)
S3method(as.data.frame,ranged)
S3method(as.integer,dgrid)
S3method(max,dgrid)
S3method(max,ranged)
S3method(min,dgrid)
S3method(min,ranged)
S3method(print,ranged)
S3method(range,dgrid)
S3method(range,ranged)
S3method(str,ranged)
S3method(transform,condensed)
export(as.condensed)
export(autoplot.condensed)
export(best_h)
export(bin)
export(breaks)
export(condense)
export(dchallenge)
export(dgrid)
export(find_origin)
export(find_width)
export(frange)
export(h_grid)
export(inv_mt)
export(is.condensed)
export(is.dgrid)
export(is.ranged)
export(mt)
export(mt_trans)
export(peel)
export(ranged)
export(rchallenge)
export(rebin)
export(rmse_cv)
export(rmse_cvs)
export(round_any.condensed)
export(smooth)
export(standardise)
export(weighted.IQR)
export(weighted.ecdf)
export(weighted.median)
export(weighted.quantile)
export(weighted.sd)
export(weighted.var)
exportMethods(as.integer)
exportMethods(show)
importFrom(Rcpp,compileAttributes)
importFrom(Rcpp,cpp_object_initializer)
importFrom(methods,new)
useDynLib(bigvis)


================================================
FILE: R/RcppExports.R
================================================
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

condense_count <- function(x, z, weight, drop = FALSE) {
    .Call('bigvis_condense_count', PACKAGE = 'bigvis', x, z, weight, drop)
}

condense_sum <- function(x, z, weight, drop = FALSE) {
    .Call('bigvis_condense_sum', PACKAGE = 'bigvis', x, z, weight, drop)
}

condense_mean <- function(x, z, weight, drop = FALSE) {
    .Call('bigvis_condense_mean', PACKAGE = 'bigvis', x, z, weight, drop)
}

condense_sd <- function(x, z, weight, drop = FALSE) {
    .Call('bigvis_condense_sd', PACKAGE = 'bigvis', x, z, weight, drop)
}

condense_median <- function(x, z, weight, drop = FALSE) {
    .Call('bigvis_condense_median', PACKAGE = 'bigvis', x, z, weight, drop)
}

double_diff_sum <- function(bin, count) {
    .Call('bigvis_double_diff_sum', PACKAGE = 'bigvis', bin, count)
}

#' Efficient implementation of range.
#'
#' This is an efficient C++ implementation of range for numeric vectors:
#' it avoids S3 dispatch, and computes both min and max in a single pass
#' through the input.
#'
#' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}}
#' object), it will be used instead of computing the range from scratch.
#' 
#' @param x a numeric vector, or a \code{\link{ranged}} object
#' @param finite If \code{TRUE} ignores missing values and infinities. Note
#'   that if the vector is empty, or only contains missing values, 
#'   \code{frange} will return \code{c(Inf, -Inf)} because those are the
#'   identity values for \code{\link{min}} and \code{\link{max}} respectively.
#' @export
#' @examples
#' x <- runif(1e6)
#' system.time(range(x))
#' system.time(frange(x))
#'
#' rx <- ranged(x)
#' system.time(frange(rx))
frange <- function(x, finite = TRUE) {
    .Call('bigvis_frange', PACKAGE = 'bigvis', x, finite)
}

group_fixed <- function(x, width, origin = 0) {
    .Call('bigvis_group_fixed', PACKAGE = 'bigvis', x, width, origin)
}

group_rect <- function(x, y, x_width, y_width, x_origin, y_origin) {
    .Call('bigvis_group_rect', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin)
}

group_hex <- function(x, y, x_width, y_width, x_origin, y_origin, x_max) {
    .Call('bigvis_group_hex', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin, x_max)
}

lowerBound <- function(x, breaks) {
    .Call('bigvis_lowerBound', PACKAGE = 'bigvis', x, breaks)
}

smooth_nd_1 <- function(grid_in, z_in, w_in_, grid_out, var, h, type = "mean") {
    .Call('bigvis_smooth_nd_1', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, var, h, type)
}

smooth_nd <- function(grid_in, z_in, w_in_, grid_out, h) {
    .Call('bigvis_smooth_nd', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, h)
}

bisquare <- function(u, b) {
    .Call('bigvis_bisquare', PACKAGE = 'bigvis', u, b)
}

regress <- function(x, y, w) {
    .Call('bigvis_regress', PACKAGE = 'bigvis', x, y, w)
}

medianC <- function(x) {
    .Call('bigvis_median', PACKAGE = 'bigvis', x)
}

regress_robust <- function(x, y, w, iterations = 3L) {
    .Call('bigvis_regress_robust', PACKAGE = 'bigvis', x, y, w, iterations)
}

compute_moments <- function(x) {
    .Call('bigvis_compute_moments', PACKAGE = 'bigvis', x)
}

compute_sum <- function(x) {
    .Call('bigvis_compute_sum', PACKAGE = 'bigvis', x)
}

compute_median <- function(x) {
    .Call('bigvis_compute_median', PACKAGE = 'bigvis', x)
}


================================================
FILE: R/adjust.r
================================================
# Protect against floating point areas by slightly adjusting breaks.
# Adapted from graphics::hist.default.
adjust_breaks <- function(breaks, open = "right") {
  open <- match.arg(open, c("left", "right"))

  breaks <- sort(breaks)
  diddle <- 1e-07 * median(diff(breaks))
  if (open == "left") {
    fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1))
  } else {
    fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle)
  }
  breaks + fuzz
}


================================================
FILE: R/autoplot.r
================================================
#' Autoplot condensed summaries.
#'
#' @param x a condensed summary
#' @param var which summary variable to display
#' @param ... other arguments passed on to individual methods
#' @method autoplot condensed
#' @export autoplot.condensed
#' @examples
#' if (require("ggplot2")) {
#'
#' # 1d summaries -----------------------------
#' x <- rchallenge(1e4)
#' z <- x + rt(length(x), df = 2)
#' xsum <- condense(bin(x, 0.1))
#' zsum <- condense(bin(x, 0.1), z = z)
#'
#' autoplot(xsum)
#' autoplot(peel(xsum))
#'
#' autoplot(zsum)
#' autoplot(peel(zsum, keep = 1))
#' autoplot(peel(zsum))
#'
#' # 2d summaries -----------------------------
#' y <- runif(length(x))
#' xysum <- condense(bin(x, 0.1), bin(y, 0.1))
#' xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z)
#'
#' autoplot(xysum)
#' autoplot(peel(xysum))
#' autoplot(xyzsum)
#' autoplot(peel(xyzsum))
#' }
autoplot.condensed <- function(x, var = last(summary_vars(x)), ...) {
  stopifnot(is.condensed(x))
  stopifnot(is.character(var), length(var) == 1)
  summaries <- c(
    .count = "total",
    .sum = "total",
    .mean = "summary",
    .sd = "summary",
    .median = "summary"
  )
  if (!(var %in% names(summaries))) {
    stop("Unknown varible", call. = FALSE)
  }
  d <- gcol(x)
  if (d > 2) {
    stop("No autoplot methods available for more than two d")
  }

  f <- paste0("plot_", summaries[var], "_", d)
  find_fun(f)(x, var = var, ...)
}


plot_total_1 <- function(x, var = ".count", show_na = TRUE, log = "") {
  xvar <- names(x)[[1]]

  plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) +
    ggplot2::geom_line(na.rm = TRUE)

  if (show_na) {
    plot <- plot + na_layer(x, var)
  }

  if (logv(log, "y")) {
    plot <- plot + ggplot2::scale_y_continuous(trans = "log1p")
  }
  if (logv(log, "x")) {
    plot <- plot + ggplot2::scale_x_log10()
  }

  plot
}

plot_total_2 <- function(x, var = ".count", show_na = TRUE, log = "") {
  x <- peel(x, keep = 1)
  xvar <- names(x)[[1]]
  yvar <- names(x)[[2]]
  miss <- is.na(x[[1]]) + 2 * is.na(x[[2]])

  fill_trans <- if (logv(log, "z")) "log1p" else "identity"

  plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) +
    ggplot2::geom_raster(ggplot2::aes_string(fill = var)) +
    ggplot2::scale_fill_gradient(low = "grey90", high = "black", trans = fill_trans) +
    ggplot2::expand_limits(fill = 0)

  if (show_na) {
  }

  plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10()
  plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10()

  plot
}

plot_summary_1 <- function(x, var = ".mean", show_na = TRUE,
                                    show_n = x %contains% ".count", log = NULL) {
  xvar <- names(x)[[1]]

  plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) +
    ggplot2::geom_line(na.rm = TRUE) +
    ggplot2::scale_size_area()

  if (show_n) {
    plot <- plot +
      ggplot2::geom_point(ggplot2::aes_string(color = ".count"), na.rm = TRUE) +
      ggplot2::scale_colour_gradient(trans = "log10")
  }

  if (show_na) {
    plot <- plot + na_layer(x, var)
  }

  plot
}

plot_summary_2 <- function(x, var = ".mean", show_na = TRUE, log = "") {
  x <- peel(x, keep = 1)
  xvar <- names(x)[[1]]
  yvar <- names(x)[[2]]

  miss <- is.na(x[[1]]) + 2 * is.na(x[[2]])

  plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) +
    ggplot2::geom_tile(ggplot2::aes_string(fill = var)) +
    ggplot2::scale_fill_gradient2()

  if (show_na) {
  }

  plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10()
  plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10()

  plot
}

na_layer <- function(x, var) {
  val <- x[[var]][is.na(x[[1]])]
  if (length(val) == 0 || is.na(val) || val == 0) return()

  xloc <- miss_poss(x[[1]])
  ggplot2::annotate("text", x = xloc, y = val, colour = "red", label = "NA",
    size = 3)
}

logv <- function(log, var) var %in% strsplit(log, "")[[1]]

miss_poss <- function(x) {
  rng <- frange(x)
  rng[1] - (rng[2] - rng[1]) * 0.05
}


================================================
FILE: R/bigvis.r
================================================
#' The big vis package.
#'
#' @useDynLib bigvis
#' @docType package
#' @name bigvis
NULL

if (!exists("BigVis")) {
  BigVis <- Rcpp::Module("BigVis")
}


#' @param x,object,... Generic args
#' @rdname bigvis
#' @export
setMethod("show", "Rcpp_BinnedVector", function(object) {
  cat("Binned [", object$size(), "]. ",
    "Width: ", object$width(), " Origin: ", object$origin(), "\n", sep = "")
})

#' @rdname bigvis
#' @export
setMethod("as.integer", "Rcpp_BinnedVector", function(x, ...) {
  vapply(seq_len(x$size()), x$bin_i, integer(1))
})


# Silence R CMD check note
#' @importFrom methods new
#' @importFrom Rcpp compileAttributes cpp_object_initializer
NULL


================================================
FILE: R/bin.r
================================================

#' Create a binned variable.
#'
#' @details
#' This function produces an R reference class that wraps around a C++ function.
#' Generally, you should just treat this as an opaque object with reference
#' semantics, and you shouldn't call the methods on it - pass it to
#' \code{\link{condense}} and friends.
#'
#' @param x numeric or integer vector
#' @param width bin width. If not specified, about 10,000 bins will be chosen
#'   using the algorithim in \code{\link{find_width}}.
#' @param origin origin. If not specified, guessed by \code{\link{find_origin}}.
#' @param name name of original variable. This will be guessed from the input to
#'   \code{group} if not supplied. Used in the output of
#'   \code{\link{condense}} etc.
#' @export
#' @examples
#' x <- runif(1e6)
#' bin(x)
#' bin(x, 0.01)
#' bin(x, 0.01, origin = 0.5)
bin <- function(x, width = find_width(x), origin = find_origin(x, width),
                name = NULL) {
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(width), length(width) == 1, width > 0)
  stopifnot(is.numeric(origin), length(origin) == 1)

  if (is.null(name)) {
    name <- deparse(substitute(x))
  }
  stopifnot(is.character(name), length(name) == 1)

  if (!is.ranged(x)) {
    attr(x, "range") <- frange(x)
    class(x) <- "ranged"
  }
  if (origin > min(x)) {
    warning("Origin larger than min(x): some values will be truncated",
      call. = FALSE)
  }

  BigVis$BinnedVector$new(x, name, width, origin)
}


is.binned <- function(x) {
  is(x, "Rcpp_BinnedVector")
}

bins <- function(...) {
  BigVis$BinnedVectors$new(list(...))
}


================================================
FILE: R/breaks.r
================================================
#' Compute breaks given origin and width.
#'
#' Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer
#' multiple of binwidth, then we need one more break. This function only returns
#' the left-side of the breaks.
#'
#' The first break is special, because it always contains missing values.
#'
#' @param x numeric vector
#' @param origin bin origin
#' @param binwidth bin width
#' @export
#' @keywords internal
#' @examples
#' breaks(10, origin = 0, binwidth = 1)
#' breaks(9.9, origin = 0, binwidth = 1)
#'
#' breaks(1:10, origin = 0, binwidth = 2)
breaks <- function(x, binwidth, origin = min(x)) {
  if (!is.binned(x)) {
    x <- bin(x, binwidth, origin)
  }

  # -1 for NA bin, -1 since R is 1 indexed
  nbins <- x$nbins() - 2
  c(NA, x$origin() + seq.int(1, nbins) * x$width())
}


================================================
FILE: R/challenge.r
================================================
#' Density and random number generation functions for a challenging
#' distribution.
#'
#' This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom
#' centered at 15 and scaled by 2, and a gamma distribution with shape 2
#' and rate 1/3. (The t-distribution is windsorised at 0, but this
#' has negligible effect.) This distribution is challenging because it
#' mixes heavy tailed and asymmetric distributions.
#'
#' @param x values to evaluate pdf at
#' @param n number of random samples to generate
#' @export
#' @examples
#' plot(dchallenge, xlim = c(-5, 60), n = 500)
#'
#' x <- rchallenge(1e4)
#' hist(x, breaks = 1000)
#' xsum <- condense(bin(x, 0.1))
#' plot(xsum$x, xsum$.count, type = "l")
#' xsmu <- smooth(xsum, 0.3)
#' plot(xsmu$x, xsmu$.count, type = "l")
#' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30))
dchallenge <- function(x) {
  # Windorised t-distribution
  scale <- function(x) (x - 30) / 2
  spike <- ifelse(x < 0, 0, dt(scale(x), df = 2)) +
    pt(scale(0), df = 2) * (x == 0)

  slope <- dgamma(x, 2, 1/3)

  (spike + 2 * slope) / 3
}

# plot(pchallenge, xlim = c(-5, 60), n = 500)
pchallenge <- function(x) {
  #  H(y) =
  #  = int_0^y h(x) dx
  #  = int_0^y 1/3 f(x) + 2/3 g(x) dx
  #  = 1/3 int_0^y f(x) dx + 2/3 int_0^y g(x) dx =
  #  = 1/3 F(y) + 2/3 G(y)

  # h(x) = g((x - 30) / 2)
  # H(y) = int_0^y g((x - 30) / 2) dx
  # complete transformation

  scale <- function(x) (x - 30) / 2
  spike <- ifelse(x < 0, 0, pt(scale(x), df = 2))

  slope <- pgamma(x, 2, 1/3)

  (spike + 2 * slope) / 3
}

qchallenge <- function(x) {
  # approximate pchallenge with 1000 points, and linearise
  # use to implement fast option to rchallenge that does inverse pdf
  # transformation + runif()
}


#' @rdname dchallenge
#' @export
rchallenge <- function(n) {
  nt <- rbinom(1, n, 1 / 3)
  ngamma <- n - nt

  spike <- 2 * rt(nt, df = 2) + 15
  spike[spike < 0] <- 0

  slope <- rgamma(ngamma, 2, 1/3)

  c(spike, slope)
}


================================================
FILE: R/condense.r
================================================
#' Efficient binned summaries.
#'
#' @param ... group objects created by \code{\link{bin}}
#' @param z a numeric vector to summary for each group. Optional for some
#'   summary statistics.
#' @param summary the summary statistic to use. Currently must be one of
#'   count, sum, mean, median or sd. If \code{NULL}, defaults to mean if
#'   y is present, count if not.
#' @param w a vector of weights. Not currently supported by all summary
#'   functions.
#' @param drop if \code{TRUE} only locations with data will be returned.  This
#'   is more efficient if the data is very sparse (<1\% of cells filled), and
#'   is slightly less efficient. Defaults to \code{TRUE} if you are condensing
#'   over two or more dimensions, \code{FALSE} for 1d.
#' @export
#' @examples
#' x <- runif(1e5)
#' gx <- bin(x, 0.1)
#' condense(gx)
condense <- function(..., z = NULL, summary = NULL, w = NULL, drop = NULL) {
  gs <- list(...)
  if (length(gs) == 1 && is.list(gs[[1]])) gs <- gs[[1]]

  is_binned <- vapply(gs, is.binned, logical(1))
  if (!all(is_binned)) {
    stop("All objects passed to ... must be binned.", call. = FALSE)
  }

  drop <- drop %||% (length(gs) > 1)

  if (is.null(summary)) {
    summary <- if (is.null(z)) "count" else "mean"
    message("Summarising with ", summary)
  }

  # C++ code can deal with NULL inputs more efficiently than R code
  z <- z %||% numeric()
  w <- w %||% numeric()

  # Check lengths consistent
  n <- gs[[1]]$size()
  stopifnot(length(z) == 0 || length(z) == n)
  stopifnot(length(w) == 0 || length(w) == n)

  f <- find_fun(paste("condense", summary, sep = "_"))
  out <- f(gs, z, w, drop = drop)

  condensed(gs, out[[1]], out[[2]])
}


================================================
FILE: R/condensed.r
================================================
#' Condensed: an S3 class for condensed summaries.
#'
#' This object managed the properties of condensed (summarised) data frames.
#'
#' @section S3 methods:
#'
#' Mathematical functions with methods for \code{binsum} object will modify
#' the x column of the data frame and \code{\link{rebin}} the data, calculating
#' updated summary statistics.
#'
#' Currently methods are provided for the \code{Math} group generic,
#' logical comparison and arithmetic operators, and
#' \code{\link[plyr]{round_any}}.
#'
#' @param groups list of \code{\link{bin}}ed objects
#' @param grouped,summary output from C++ condense function
#' @keywords internal
#' @examples
#' if (require("ggplot2")) {
#'
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 1 / 10))
#'
#' # Basic math operations just modify the first column
#' autoplot(xsum)
#' autoplot(xsum * 10)
#' autoplot(xsum - 30)
#' autoplot(abs(xsum - 30))
#'
#' # Similarly, logical operations work on the first col
#' autoplot(xsum[xsum > 10, ])
#'}
condensed <- function(groups, grouped, summary) {
  grouped <- as.data.frame(grouped)
  summary <- as.data.frame(summary)

  for (i in seq_along(groups)) {
    grouped[[i]] <- dgrid(grouped[[i]],
      groups[[i]]$width(), groups[[i]]$origin(), groups[[i]]$nbins())
  }

  names(summary) <- paste0(".", names(summary))

  df <- data.frame(grouped, summary)
  class(df) <- c("condensed", class(df))
  df
}

#' @export
#' @rdname condensed
#' @param x object to test or coerce
is.condensed <- function(x) inherits(x, "condensed")

#' @export
#' @rdname condensed
as.condensed <- function(x) UseMethod("as.condensed")
#' @export
as.condensed.condensed <- function(x) x
#' @export
as.condensed.data.frame <- function(x) {
  structure(x, class = c("condensed", class(x)))
}

summary_vars <- function(x) {
  stopifnot(is.condensed(x))
  nm <- names(x)
  names(x)[grepl("^\\.", names(x))]
}

group_vars <- function(x) {
  setdiff(names(x), summary_vars(x))
}

gcol <- function(x) length(group_vars(x))


#' @export
Math.condensed <- function(x, ...) {
  generic <- match.fun(.Generic)
  x[[1]] <- generic(x[[1]], ...)
  rebin(x)
}

#' @export
Ops.condensed <- function(e1, e2) {
  logical_ops <- c("==", "!=", "<", "<=", ">=", ">")
  math_ops <- c("+", "-", "*", "/", "^", "%%", "%/%")

  generic <- match.fun(.Generic)
  if (.Generic %in% logical_ops)  {
    l <- generic(e1[[1]], e2)
    l[1] <- TRUE # always preserve missings
    l & !is.na(l)
  } else if (.Generic %in% math_ops) {
    e1[[1]] <- generic(e1[[1]], e2)
    rebin(e1)
  } else {
    stop(.Generic, " not supported for condensed objects", call. = FALSE)
  }
}

#' Round any method for condensed objects
#'
#' @inheritParams plyr::round_any
#' @export
#' @keywords internal
round_any.condensed <- function(x, accuracy, f = round) {
  gvars <- group_vars(x)
  x[gvars] <- lapply(x[gvars], plyr::round_any, accuracy = accuracy, f = f)
  rebin(x)
}


================================================
FILE: R/dgrid.r
================================================
#' dgrid: an S3 class for data grids
#'
#' @param x a numeric vector to test or coerce.
#' @param width bin width
#' @param origin bin origins
#' @param nbins number of bins
#' @export
#' @examples
#' g <- dgrid(0:10 + 0.5, width = 1)
#' range(g)
#' as.integer(g)
dgrid <- function(x, width, origin = 0, nbins = NULL) {
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(width), length(width) == 1, width > 0)
  stopifnot(is.numeric(origin), length(origin) == 1)

  if (is.null(nbins)) {
    nbins <- floor((max(x) - origin) / width)
  }

  structure(x, class = c("dgrid", "numeric"),
    width = width, origin = origin, nbins = nbins)
}

#' @export
#' @rdname dgrid
is.dgrid <- function(x) inherits(x, "dgrid")

#' @export
"[.dgrid" <- function(x, ...) {
  dgrid(NextMethod(), width = attr(x, "width"),
    origin = attr(x, "origin"), nbins = attr(x, "nbins"))
}

#' @export
min.dgrid <- function(x, ...) attr(x, "origin")
#' @export
max.dgrid <- function(x, ...) {
  min(x) + attr(x, "nbins") * attr(x, "width")
}
#' @export
range.dgrid <- function(x, ...) c(min(x), max(x))

#' @export
as.integer.dgrid <- function(x, ...) {
  as.integer((unclass(x) - attr(x, "origin")) / attr(x, "width") + 1L)
}

#' @export
as.data.frame.dgrid <- function(x, ...) {
  n <- length(x)
  list <- list(x)
  class(list) <- "data.frame"
  attr(list, "row.names") <- c(NA_integer_, -n)
  list
}


================================================
FILE: R/h.r
================================================
#' Find "best" smoothing parameter using leave-one-out cross validation.
#'
#' Minimises the leave-one-out estimate of root mean-squared error to find
#' find the "optimal" bandwidth for smoothing.
#'
#' L-BFGS-B optimisation is used to constrain the bandwidths to be greater
#' than the binwidths: if the bandwidth is smaller than the binwidth it's
#' impossible to compute the rmse because no smoothing occurs. The tolerance
#' is set relatively high for numerical optimisation since the precise choice
#' of bandwidth makes little difference visually, and we're unlikely to have
#' sufficient data to make a statistically significant choice anyway.
#'
#' @param x condensed summary to smooth
#' @param h_init initial values of bandwidths to start search out. If not
#'  specified defaults to 5 times the binwidth of each variable.
#' @param ... other arguments (like \code{var}) passed on to
#'  \code{\link{rmse_cv}}
#' @param tol numerical tolerance, defaults to 1\%.
#' @param control additional control parameters passed on to \code{\link{optim}}
#'   The most useful argument is probably trace, which makes it possible to
#'   follow the progress of the optimisation.
#' @family bandwidth estimation functions
#' @return a single numeric value representing the bandwidth that minimises
#'   the leave-one-out estimate of rmse. Vector has attributes
#'   \code{evaluations} giving the number of times the objective function
#'   was evaluated. If the optimisation does not converge, or smoothing is not
#'   needed (i.e. the estimate is on the lower bounds), a warning is thrown.
#' @export
#' @examples
#' \donttest{
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 1 / 10))
#' h <- best_h(xsum, control = list(trace = 3, REPORT = 1))
#'
#' if (require("ggplot2")) {
#' autoplot(xsum)
#' autoplot(smooth(xsum, h))
#' }
#' }
best_h <- function(x, h_init = NULL, ..., tol = 1e-2, control = list()) {
  stopifnot(is.condensed(x))

  gvars <- group_vars(x)
  widths <- vapply(x[gvars], attr, "width", FUN.VALUE = numeric(1))
  h_init <- h_init %||% widths * 5
  stopifnot(is.numeric(h_init), length(h_init) == length(gvars))

  stopifnot(is.list(control))
  control <- modifyList(list(factr = tol / .Machine$double.eps), control)

  # Optimise
  rmse <- function(h) {
    rmse_cv(x, h, ...)
  }
  res <- optim(h_init, rmse, method = "L-BFGS-B", lower = widths * 1.01,
    control = control)
  h <- unname(res$par)

  # Feedback
  if (res$convergence != 0) {
    warning("Failed to converge: ", res$message, call. = FALSE)
  } else if (rel_dist(h, widths) < 1e-3) {
    warning("h close to lower bound: smoothing not needed", call. = FALSE)
  }
  structure(h, evaluations = res$counts[1], conv = res$convergence)
}

rel_dist <- function(x, y) {
  mean(abs(x - y) / abs(x + y))
}

#' Generate grid of plausible bandwidths for condensed summary.
#'
#' By default, the bandwidths start at the bin width, and then continue
#' up 50 (\code{n}) steps until 20 (\code{max}) times the bin width.
#'
#' @param x a condensed summary
#' @param n number of bandwidths to generate (in each dimension)
#' @param max maximum bandwidth to generate, as multiple of binwidth.
#' @family bandwidth estimation functions
#' @export
#' @examples
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 1 / 10))
#' h_grid(xsum)
#'
#' y <- runif(1e4)
#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
#' h_grid(xysum, n = 10)
h_grid <- function(x, n = 50, max = 20) {
  stopifnot(is.condensed(x))
  stopifnot(is.numeric(n), length(n) == 1, n > 0)
  stopifnot(is.numeric(max), length(max) == 1, max > 0)

  gs <- x[group_vars(x)]
  widths <- vapply(gs, attr, "width", FUN.VALUE = numeric(1))

  hs <- lapply(widths, function(w) w * seq(2, max, length = n))
  expand.grid(hs, KEEP.OUT.ATTRS = FALSE)
}


================================================
FILE: R/id.r
================================================
# Copied and pasted from plyr to avoid dependency

id <- function(.variables, drop = FALSE) {
  # Drop all zero length inputs
  lengths <- vapply(.variables, length, integer(1))
  .variables <- .variables[lengths != 0]

  if (length(.variables) == 0) {
    n <- nrow(.variables) %||% 0L
    return(structure(seq_len(n), n = n))
  }

  # Special case for single variable
  if (length(.variables) == 1) {
    return(id_var(.variables[[1]], drop = drop))
  }

  # Calculate individual ids
  ids <- rev(lapply(.variables, id_var, drop = drop))
  p <- length(ids)

  # Calculate dimensions
  ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1),
    USE.NAMES = FALSE)
  n <- prod(ndistinct)
  if (n > 2 ^ 31) {
    # Too big for integers, have to use strings, which will be much slower :(

    char_id <- do.call("paste", c(ids, sep = "\r"))
    res <- match(char_id, unique(char_id))
  } else {
    combs <- c(1, cumprod(ndistinct[-p]))

    mat <- do.call("cbind", ids)
    res <- c((mat - 1L) %*% combs + 1L)
  }
  attr(res, "n") <- n


  if (drop) {
    id_var(res, drop = TRUE)
  } else {
    structure(as.integer(res), n = attr(res, "n"))
  }
}

id_var <- function(x, drop = FALSE) {
  if (length(x) == 0) return(structure(integer(), n = 0L))
  if (!is.null(attr(x, "n")) && !drop) return(x)

  if (is.factor(x) && !drop) {
    id <- as.integer(addNA(x, ifany = TRUE))
    n <- length(levels(x))
  } else {
    levels <- sort(unique(x), na.last = TRUE)
    id <- match(x, levels)
    n <- max(id)
  }
  structure(id, n = n)
}


================================================
FILE: R/movies.r
================================================
#' Movie information and user ratings from IMDB.com.
#'
#' The internet movie database, \url{http://imdb.com/}, is a website devoted
#' to collecting movie data supplied by studios and fans.  It claims to be the
#' biggest movie database on the web and is run by amazon.  More about
#' information imdb.com can be found online, 
#' \url{http://imdb.com/help/show_leaf?about}, including information about 
#' the data collection process,
#' \url{http://imdb.com/help/show_leaf?infosource}.
#' 
#' Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user.  The data set contains the following fields:
#' 
#' \itemize{
#'   \item title.  Title of the movie.
#'   \item year.  Year of release.
#'   \item budget.  Total budget (if known) in US dollars
#'   \item length.  Length in minutes.
#'   \item rating.  Average IMDB user rating.
#'   \item votes.  Number of IMDB users who rated this movie.
#'   \item mpaa.  MPAA rating.
#'   \item action, animation, comedy, drama, documentary, romance, short:
#'     \code{TRUE} if movie belongs to that genre.
#' }
#' 
#' @docType data
#' @usage data(movies)
#' @name movies
#' @format A data frame with 130,456 rows and 14 variables
#' @references \url{http://had.co.nz/data/movies/}
NULL


================================================
FILE: R/mt.r
================================================
#' Modulus transformation (and its inverse).
#'
#' A generalisation of the box-cox transformation that works for
#' values with both positive and negative values.
#'
#' This is useful for compressing the tails of long-tailed distributions,
#' often encountered with very large datasets.
#'
#' @param x values to transform
#' @param lambda degree of transformation
#' @export
#' @references J. John and N. Draper. "An alternative family of
#'  transformations." Applied Statistics, pages 190-197, 1980.
#'  \url{http://www.jstor.org/stable/2986305}
#' @examples
#' x <- seq(-10, 10, length = 100)
#' plot(x, mt(x, 0), type = "l")
#' plot(x, mt(x, 0.25), type = "l")
#' plot(x, mt(x, 0.5), type = "l")
#' plot(x, mt(x, 1), type = "l")
#' plot(x, mt(x, 2), type = "l")
#' plot(x, mt(x, -1), type = "l")
#' plot(x, mt(x, -2), type = "l")
mt <- function(x, lambda) {
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(lambda), length(lambda) == 1)

  if (lambda == 0) {
    sign(x) * log(abs(x) + 1)
  } else {
    sign(x) * ((abs(x) + 1) ^ lambda - 1) / lambda
   }
}

#' @rdname mt
#' @export
inv_mt <- function(x, lambda) {
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(lambda), length(lambda) == 1)

  if (lambda == 0) {
    sign(x) * (exp(abs(x)) - 1)
  } else {
    sign(x) * ((abs(x) * lambda + 1) ^ (1 / lambda) - 1)
   }
}

#' @rdname mt
#' @export
mt_trans <- function(lambda) {
  scales::trans_new("modulo",
    function(x) mt(x, lambda),
    function(x) inv_mt(x, lambda)
  )
}


================================================
FILE: R/origin.r
================================================
#' Find the origin.
#'
#' @details
#' This algorithm implements simple heuristics for determining the origin of
#' a histogram when only the binwidth is specified. It:
#'
#' \itemize{
#'    \item rounds to zero, if relatively close
#'    \item subtracts 0.5 offset, if an x is integer
#'    \item ensures the origin is a multiple of the binwidth
#' }
#' @param x numeric or integer vector
#' @param binwidth binwidth
#' @export
#' @keywords internal
#' @family reasonable defaults
#' @examples
#' find_origin(1:10, 1)
#' find_origin(1:10, 2)
#' find_origin(c(1, 1e6), 1)
find_origin <- function(x, binwidth) {
  rng <- frange(x, finite = TRUE)
  if (!all(is.finite(rng))) stop("No valid values in x", call. = FALSE)

  offset <- is.integer(x) * 0.5

  if (close_to_zero(rng[1], rng)) {
    0 - offset
  } else {
    floor_any(rng[1], binwidth) - offset
  }
}

close_to_zero <- function(x, rng) {
  (abs(x) / abs(rng[2] - rng[1])) < 1e-3
}

floor_any <- function(x, accuracy) {
  floor(x / accuracy) * accuracy
}


================================================
FILE: R/peel.r
================================================
#' Peel off low density regions of the data.
#'
#' Keeps specified proportion of data by removing the lowest density regions,
#' either anywhere on the plot, or for 2d, just around the edges.
#'
#' This is useful for visualisation, as an easy way of focussing on the regions
#' where the majority of the data lies.
#'
#' @param x condensed summary
#' @param keep (approximate) proportion of data to keep. If \code{1}, will
#'   remove all cells with counts.  All missing values will be preserved.
#' @param central if \code{TRUE} peels off regions of lowest density only from
#'   the outside of the data. In 2d this works by progressively peeling off
#'   convex hull of the data: the current algorithm is quite slow.
#'   If \code{FALSE}, just removes the lowest density regions wherever they are
#'   found. Regions with 0 density are removed regardless of location.
#'   Defaults to TRUE if there are two or fewer grouping variables is less.
#' @export
#' @examples
#' x <- rt(1e5, df = 2)
#' y <- rt(1e5, df = 2)
#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10))
#' plot(xysum$x, xysum$y)
#'
#' plot(peel(xysum, 0.95, central = TRUE)[1:2])
#' plot(peel(xysum, 0.90, central = TRUE)[1:2])
#' plot(peel(xysum, 0.50, central = TRUE)[1:2])
peel <- function(x, keep = 0.99, central = NULL) {
  stopifnot(is.condensed(x))
  stopifnot(is.numeric(keep), length(keep) == 1, keep > 0, keep <= 1)
  central <- central %||% (gcol(x) <= 2)
  stopifnot(is.logical(central), length(central) == 1)

  if (is.null(x$.count)) {
    stop("Can only peel objects with .count variable", call. = FALSE)
  }

  x <- x[x$.count > 0, , drop = FALSE]
  if (keep == 1) return(x)

  complete <- complete.cases(x[group_vars(x)])
  x_complete <- x[complete, , drop = FALSE]

  if (central) {
    peeled <- peel_outer(x_complete, keep)
  } else {
    peeled <- peel_anywhere(x_complete, keep)
  }

  rbind(peeled, x[!complete, , drop = FALSE])
}

peel_anywhere <- function(x, keep) {
  ord <- order(x$.count, decreasing = TRUE)
  prop <- cumsum(x$.count[ord]) / sum(x$.count)

  ind <- which(prop >= keep)[1]
  x[ord[seq_len(ind)], , drop = FALSE]
}

peel_outer <- function(x, keep) {
  d <- gcol(x)
  if (d > 2) {
    stop("Outer peeling only works with 1d or 2d data", call. = FALSE)
  }

  n <- sum(x$.count)
  x <- x[order(x$.count, decreasing = TRUE), ]
  prop <- cumsum(x$.count) / n

  # Peel off smallest values on chull
  candidate <- which(prop >= keep)
  on_hull <- intersect(candidate, chull(x[1:d]))
  left <- sum(x$.count[-on_hull]) / n

  while(left >= keep && length(on_hull) > 0) {
    x <- x[-on_hull, , drop = FALSE]
    prop <- prop[-on_hull]
    candidate <- which(prop >= keep)
    on_hull <- intersect(candidate, chull(x[1:d]))
    left <- sum(x$.count[-on_hull]) / n
  }

  x
}


================================================
FILE: R/ranged.r
================================================
#' A S3 class for caching the range of a vector
#'
#' This class is designed for dealing with large vectors, where the cost of
#' recomputing the range multiple times is prohibitive. It provides methods
#' for \code{\link{print}} and \code{\link{str}} that display only the range,
#' not the contents.
#'
#' @section Performance:
#' For best performance, you may want to run copy and paste the contents of
#' this function into your function, to avoid making any copies of \code{x}.
#' This is probably only necessary if you're dealing with extremely large
#' vectors, > 100 million obs.
#'
#' @param x a numeric vector
#' @param range the range of the vector (excluding missing values), if known.
#'   If unknown, it will be computed with \code{\link{frange}}, a fast C++
#'   implementation of \code{\link{range}}.
#' @export
#' @examples
#' x <- runif(1e6)
#' y <- ranged(x)
#' range(y)
#' y
#' str(y)
#'
#' # Modifications to the class currently destroy the cache
#' y[1] <- 10
#' max(y)
#' class(y)
#' z <- y + 10
#' max(z)
#' class(z)
ranged <- function(x, range = frange(x, finite = TRUE)) {
  stopifnot(is.numeric(x))

  # Reset range attribute so that lazy evaluation of range
  # always recomputes from scratch
  attr(x, "range") <- NULL

  attr(x, "range") <- range
  class(x) <- "ranged"
  x
}

#' Test if an object is of class ranged.
#'
#' @export
#' @param x object to test
#' @keywords internal
is.ranged <- function(x) inherits(x, "ranged")

#' @export
min.ranged <- function(x, ...) attr(x, "range")[1]
#' @export
max.ranged <- function(x, ...) attr(x, "range")[2]
#' @export
range.ranged <- function(x, ...) attr(x, "range")

#' @export
print.ranged <- function(x, ...) {
  rng <- attr(x, "range")
  # attr(x, "range") <- NULL
  # attr(x, "class") <- NULL
  # print.default(x)
  cat("Ranged 1:", length(x), " [", format(rng[1]), ", ", format(rng[2]), "]\n",
    sep = "")
}

#' @export
str.ranged <- function(object, ...) {
  rng <- attr(object, "range")
  cat(" Ranged [1:", length(object), "] ", format(rng[1]), "--", format(rng[2]),
    "\n", sep = "")
}

#' @export
Ops.ranged <- function(e1, e2) {
  attr(e1, "range") <- NULL
  class(e1) <- NULL

  NextMethod(e1, e2)
}

#' @export
"[<-.ranged" <- function(x, ..., value) {
  attr(x, "range") <- NULL
  attr(x, "class") <- NULL
  NextMethod(x, ..., value = value)
}

#' @export
as.data.frame.ranged <- function(x, ...) {
  n <- length(x)
  x <- list(x)
  class(x) <- "data.frame"
  attr(x, "row.names") <- c(NA_integer_, -n)

  x
}


================================================
FILE: R/rebin.r
================================================
#' Transform condensed objects, collapsing unique bins.
#'
#' @details
#' You don't need to use \code{rebin} if you use transform: it will
#' automatically rebin for you.  You will need to use it if you manually
#' transform any grouping variables.
#'
#' @param data,`_data` a condensed summary
#' @param ... named arguments evaluated in the context of the data
#' @usage \\method{transform}{condensed}(`_data`, ...)
#' @keywords internal
#' @examples
#' x <- runif(1e4, -1, 1)
#' xsum <- condense(bin(x, 1 / 50))
#'
#' # Transforming by hand: must use rebin
#' xsum$x <- abs(xsum$x)
#' rebin(xsum)
#' if (require("ggplot2")) {
#'   autoplot(xsum) + geom_point()
#'   autoplot(rebin(xsum)) + geom_point()
#' }
#'
#' #' Transforming with transform
#' y <- x ^ 2 + runif(length(x), -0.1, 0.1)
#' xysum <- condense(bin(x, 1 / 50), z = y)
#' xysum <- transform(xysum, x = abs(x))
#' if (require("ggplot2")) {
#'   autoplot(xysum)
#' }
#' @export
#' @method transform condensed
transform.condensed <- function(`_data`, ...) {
  df <- transform.data.frame(`_data`, ...)
  rebin(as.condensed(df))
}

#' @export
#' @rdname transform.condensed
rebin <- function(data) {
  stopifnot(is.condensed(data))

  old_g <- data[group_vars(data)]
  old_g[] <- lapply(old_g, zapsmall, digits = 3)
  ids <- id(old_g, drop = TRUE)
  if (!anyDuplicated(ids)) return(data)

  old_s <- data[summary_vars(data)]
  new_s <- lapply(names(old_s), function(var) rebin_var(old_s, ids, var))
  names(new_s) <- names(old_s)

  uids <- !duplicated(ids)
  new_g <- old_g[uids, , drop = FALSE]
  ord <- order(ids[uids], na.last = FALSE)

  as.condensed(data.frame(new_g[ord, , drop = FALSE], new_s))
}

rebin_var <- function(df, ids, var) {
  stopifnot(is.data.frame(df))
  stopifnot(is.integer(ids), length(ids) == nrow(df))
  stopifnot(is.character(var), length(var) == 1, var %in% names(rebinners))

  rows <- split(seq_len(nrow(df)), ids)
  f <- rebinners[[var]]

  vapply(rows, function(i) f(df[i, , drop = FALSE]), numeric(1),
    USE.NAMES = FALSE)
}

rebinners <- list(
  .median = function(df) mean(df$.median, na.rm = TRUE),
  .sum = function(df) sum(df$.sum, na.rm = TRUE),
  .count = function(df) sum(df$.count, na.rm = TRUE),
  .mean = function(df) {
    if (is.null(df$.count)) {
      mean(df$.mean, na.rm = TRUE)
    } else {
      weighted.mean(df$.mean, df$.count)
    }
  }
)


================================================
FILE: R/rmse.r
================================================
#' Estimate smoothing RMSE using leave-one-out cross-validation.
#'
#' \code{rmse_cv} computes the leave-one-out RMSE for a single vector of
#' bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths,
#' stored as a data frame.
#'
#' @param x condensed summary table
#' @param h,hs for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv}
#'    a data frame of bandwidths, as generated by \code{\link{h_grid}}.
#' @param var variable to smooth
#' @param ... other variables passed on to \code{\link{smooth}}
#' @family bandwidth estimation functions
#' @export
#' @examples
#' \donttest{
#' set.seed(1014)
#' # 1d -----------------------------
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 1 / 10))
#' cvs <- rmse_cvs(xsum)
#'
#' if (require("ggplot2")) {
#' autoplot(xsum)
#' qplot(x, err, data = cvs, geom = "line")
#' xsmu <- smooth(xsum, 1.3)
#' autoplot(xsmu)
#' autoplot(peel(xsmu))
#' }
#'
#' # 2d -----------------------------
#' y <- runif(1e4)
#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
#' cvs <- rmse_cvs(xysum, h_grid(xysum, 10))
#' if (require("ggplot2")) {
#' qplot(x, y, data = cvs, size = err)
#' }
#' }
rmse_cvs <- function(x, hs = h_grid(x), ...) {
  rmse_1 <- function(i) {
    rmse_cv(x, as.numeric(hs[i, ]), ...)
  }
  err <- vapply(seq_len(nrow(hs)), rmse_1, numeric(1))
  data.frame(hs, err)
}

#' @rdname rmse_cvs
#' @export
rmse_cv <- function(x, h, var = summary_vars(x)[1], ...) {
  # can't smooth missing values, so drop.
  x <- x[complete.cases(x), , drop = FALSE]
  gvars <- group_vars(x)

  pred_error <- function(i) {
    out <- as.matrix(x[i, gvars, drop = FALSE])
    smu <- smooth(x[-i, , drop = FALSE], grid = out, h = h, var = var, ...)
    smu[[var]] - x[[var]][i]
  }
  err <- vapply(seq_len(nrow(x)), pred_error, numeric(1))
  sqrt(mean(err ^ 2, na.rm = TRUE))
}


================================================
FILE: R/smooth.r
================================================
#' Smooth a condensed data frame.
#'
#' @param x a condensed summary
#' @param h numeric vector of bandwidths, one for each grouping variable in
#'   \code{x}
#' @param var variable to smooth
#' @param grid a data frame with the grouping colums as x.  In order for the
#'   factored version of \code{smooth_nd} to work, this grid must be a superset
#'   of \code{x}.
#' @param type type of smoothing to use.  Current options are \code{"mean"},
#'   a kernel weighted mean; \code{"regression"}, a kernel weighted local
#'   regression; and \code{"robust_regression"}, robust kernel weighted local
#'   regression in the style of \code{\link{loess}}.  Unique prefixes are also
#'   acceptable.
#' @param factor if \code{TRUE} compute the n-dimensional smooth by a sequence
#'   of 1d smoothes. For \code{type = "mean"} the results are always the same
#    as \code{FALSE}; for \code{type = "regress"} they will be equal if the
#'   grid values are uncorrelated (e.g. the grid is complete at every location);
#'   and is very approximate for \code{type = "robust"}.
#' @export
#' @examples
#' x <- runif(1e5)
#' xsum <- condense(bin(x, 1 / 100))
#' xsmu1 <- smooth(xsum, 5 / 100)
#' xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE)
#'
#' # More challenging distribution
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 0.1))
#' xsmu <- smooth(xsum, 1)
#'
#' plot(xsum$x, xsum$.count, type = "l")
#' lines(xsmu$x, xsmu$.count, col = "red")
#'
#' xsmu2 <- smooth(xsum, 1, type = "regress")
#' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50))
#' lines(xsmu2$x, xsmu2$.count, col = "red")
#' # Note difference in tails
smooth <- function(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean",
                   factor = TRUE) {
  stopifnot(is.condensed(x))
  stopifnot(is.numeric(h), all(h > 0))
  type <- match.arg(type, c("mean", "regression", "robust_regression"))

  if (type != "mean" && !factor) {
    stop("Only factored approximations available for types other than mean",
      call. = FALSE)
  }

  grid_in <- as.matrix(x[group_vars(x)])
  grid_out <- grid %||% grid_in
  stopifnot(is.matrix(grid_out), is.numeric(grid_out),
    ncol(grid_out) == ncol(grid_in), nrow(grid_out) > 0)

  z <- x[[var]]
  w <- if (var != ".count" && x %contains% ".count") x$.count else numeric()

  if (factor) {
    for(i in 1:ncol(grid_in)) {
      # smooth_nd_1 is a C++ function, so var is 0 indexed
      z <- smooth_nd_1(grid_in, z, w, grid_out, var = i - 1, h = h[i],
        type = type)
    }
  } else {
    z <- smooth_nd(grid_in, z, w, grid_out, h)
  }

  out <- data.frame(grid_out)
  out[[var]] <- z
  structure(out, class = c("condensed", class(out)))
}

complete_grid <- function(df) {
  stopifnot(is.data.frame(df))

  breaks <- function(width, origin, nbins) {
    origin + width * seq.int(nbins)
  }

  cols <- lapply(df, function(x) do.call(breaks, attributes(x)))
  expand.grid(cols, KEEP.OUT.ATTRS = FALSE)
}


================================================
FILE: R/standardise.r
================================================
#' Standardise a summary to sum to one.
#'
#' @param x a condensed summary. Must have \code{.count} variable.
#' @param margin margins to standardise along.  If \code{NULL}, the default,
#'  standardises the whole array.
#' @export
#' @examples
#' b1 <- condense(bin(movies$year, 1))
#' d1 <- smooth(b1, 2, type = "reg")
#'
#' if (require("ggplot2")) {
#'
#' autoplot(b1)
#' autoplot(d1)
#'
#' # Note change in x-axis limits
#' autoplot(standardise(d1))
#' }
#'
#' # Can also standardise a dimension at a time
#' b2 <- with(movies, condense(bin(year, 2), bin(length, 10)))
#' b2 <- peel(b2, central = TRUE)
#'
#' if (require("ggplot2")) {
#'
#' autoplot(b2)
#' autoplot(standardise(b2))    # note legend
#' autoplot(standardise(b2, "year"))   # each row sums to 1
#' autoplot(standardise(b2, "length")) # each col sums to 1
#'
#' base <- ggplot(b2, aes(length, .count)) +
#'   geom_line(aes(group = year, colour = year))
#' base
#' base %+% standardise(b2)  # Just affects y axis labels
#' base %+% standardise(b2, "year") # Makes year comparable
#' base %+% standardise(b2, "length") # Meaningless for this display
#'
#' }
standardise <- function(x, margin = integer()) {
  stopifnot(is.condensed(x), !is.null(x$.count))

  if (length(margin) == 0) {
    x$.count <- prop(x$.count)
  } else {
    x$.count <- ave(x$.count, id(x[margin]), FUN = prop)
    x$.count[is.na(x$.count)] <- 0
  }

  x
}

prop <- function(x) x / sum(x, na.rm = TRUE)


================================================
FILE: R/utils.r
================================================
"%||%" <- function(x, y) if (is.null(x)) y else x

last <- function(x) x[length(x)]

"%contains%" <- function(df, var) {
  var %in% names(df)
}

find_fun <- function(name, env = globalenv()) {
  if (is.function(name)) return(name)

  ns_env <- asNamespace("bigvis")
  if (exists(name, ns_env, mode = "function")) {
    return(get(name, ns_env))
  }

  if (exists(name, env, mode = "function")) {
    return(get(name, env))
  }

  stop("Could not find function ", name, call. = FALSE)
}


================================================
FILE: R/weighted-stats.r
================================================
#' Compute a weighted variance or standard deviation of a vector.
#'
#' @details
#' Note that unlike the base R \code{\link{var}} function, these functions only
#' work with individual vectors not matrices or data frames.
#'
#' @family weighted statistics
#' @seealso \code{\link[stats]{weighted.mean}}
#' @param x numeric vector of observations
#' @param w integer vector of weights, representing the number of
#'  time each \code{x} was observed
#' @param na.rm if \code{TRUE}, missing values in both \code{w} and \code{x}
#'   will be removed prior computation. Otherwise if there are missing values
#'   the result will always be missing.
#' @export
#' @examples
#' x <- c(1:5)
#' w <- rpois(5, 5) + 1
#' y <- x[rep(seq_along(x), w)]
#' weighted.var(x, w)
#' var(y)
#'
#' stopifnot(all.equal(weighted.var(x, w), var(y)))
weighted.var <- function(x, w = NULL, na.rm = FALSE) {
  if (na.rm) {
    na <- is.na(x) | is.na(w)
    x <- x[!na]
    w <- w[!na]
  }

  sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1)
}

#' @export
#' @rdname weighted.var
weighted.sd <- function(x, w, na.rm = TRUE) sqrt(weighted.var(x, w, na.rm = TRUE))

#' A weighted ecdf function.
#'
#' An extension of the base \code{\link[stats]{ecdf}} function which works
#' with weighted data.
#'
#' @section S3 methods:
#' The \code{ecdf} class has methods for \code{\link{plot}},
#' \code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}.
#' \code{\link{quantile}} does not currently correctly compute values for
#' weighted ecdfs.
#'
#' @inheritParams weighted.var
#' @family weighted statistics
#' @seealso \code{\link[stats]{weighted.mean}}
#' @export
#' @examples
#' x <- runif(200)
#' w <- rpois(200, 5) + 1
#'
#' e <- weighted.ecdf(x, w)
#' plot(e)
#' summary(e)
#'
#' y <- x[rep(seq_along(x), w)]
#' plot(ecdf(y))
weighted.ecdf <- function(x, w) {
  stopifnot(length(x) == length(w))
  stopifnot(anyDuplicated(x) == 0)

  ord <- order(x)
  x <- x[ord]
  w <- w[ord]

  n <- sum(w)
  wts <- cumsum(w / n)

  f <- approxfun(x, wts, method = "constant", yleft = 0, yright = 1, f = 0)
  class(f) <- c("wecdf", "ecdf", "stepfun", class(f))
  attr(f, "call") <- sys.call()
  environment(f)$nobs <- n
  f
}

#' Compute quantiles of weighted data.
#'
#' @details
#' Currently only implements the type 7 algorithm, as described in
#' \code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core.
#'
#' @inheritParams weighted.var
#' @param probs numeric vector of probabilities between 0 and 1
#' @param na.rm If \code{TRUE} will automatically remove missing values
#'   in \code{x} or \code{w}.
#' @family weighted statistics
#' @export
#' @examples
#' x <- runif(200)
#' w <- rpois(200, 5) + 1
#' weighted.quantile(x, w)
weighted.quantile <- function (x, w, probs = seq(0, 1, 0.25), na.rm = FALSE) {
  stopifnot(length(x) == length(w))
  na <- is.na(x) | is.na(w)
  if (any(na)) {
    if (na.rm) {
      x <- x[!na]
      w <- w[!na]
    } else {
      stop("Missing values not allowed when na.rm is FALSE", call. = FALSE)
    }
  }

  # Ensure x and w in ascending order of x
  ord <- order(x)
  x <- x[ord]
  w <- w[ord]

  # Find closest x just below and above index
  n <- sum(w)
  index <- 1 + (n - 1) * probs
  j <- floor(index)

  wts <- cumsum(w)
  lo <- x[lowerBound(j, wts)]     # X_j
  hi <- x[lowerBound(j + 1, wts)]

  g <- index - j
  ifelse(lo == hi, lo, (1 - g) * lo + g * hi)
}
# Q[i](p) = (1 - g) x[j] + g x[j+1]
# j = floor(np + m)
# g = np + m - j
#
# For type 7:
#   m = 1 - p =>
#   j = floor(1 + (n - 1) * p)
#   g = (np + 1 - p) - floor(1 + (n - 1) * p)

#' Compute the median of weighted data.
#'
#' @details This is a simple wrapper around \code{\link{weighted.quantile}}
#' @inheritParams weighted.quantile
#' @export
#' @examples
#' x <- runif(200)
#' w <- rpois(200, 5) + 1
#'
#' median(x)
#' weighted.median(x, w)
weighted.median <- function(x, w, na.rm = FALSE) {
  weighted.quantile(x, w, probs = 0.5, na.rm = na.rm)
}

#' Compute the interquartile range of weighted data.
#'
#' @details This is a simple wrapper around \code{\link{weighted.quantile}}
#' @inheritParams weighted.quantile
#' @export
#' @examples
#' x <- sort(runif(200))
#' w <- rpois(200, seq(1, 10, length = 200)) + 1
#'
#' IQR(x)
#' weighted.IQR(x, w)
weighted.IQR <- function(x, w, na.rm = FALSE) {
  diff(weighted.quantile(x, w, probs = c(0.25, 0.75), na.rm = na.rm))
}


================================================
FILE: R/width.r
================================================
#' Compute a reasonable default binwidth.
#'
#' @param x a numeric vector. If a numeric vector of length one is supplied,
#'   it's assumed that
#' @param nbins desired number of bins (approximate)
#' @export
#' @keywords internal
#' @family reasonable defaults
#' @examples
#' find_width(c(0, 5))
#' find_width(c(0, 5.023432))
#' find_width(c(0, 5.9))
find_width <- function(x, nbins = 1e4) {
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(nbins), length(nbins) == 1, nbins > 0)

  x <- diff(frange(x))
  size <- x / nbins

  # divide into order of magnitude and multiplier
  om <- 10 ^ ceiling(log10(size))
  mult <- size / om

  # ensure number per unit is multiple of 1, 2, 3, 4, or 5
  per_unit <- 1 / mult
  rounders <- c(1, 2, 3, 4, 5)
  poss <- round(per_unit / rounders) * rounders
  poss <- poss[poss != 0]
  width <- om / poss[which.min(abs(poss - per_unit))]

  structure(width, n = ceiling(x / width), per_unit = 1 / width)
}


================================================
FILE: README.md
================================================
# bigvis

[![Travis-CI Build Status](https://travis-ci.org/hadley/bigvis.svg?branch=master)](https://travis-ci.org/hadley/bigvis)
[![Coverage Status](https://img.shields.io/codecov/c/github/hadley/bigvis/master.svg)](https://codecov.io/github/hadley/bigvis?branch=master)

The bigvis package provides tools for exploratory data analysis of __large datasets__ (10-100 million obs). The aim is to have most operations take less than 5 seconds on commodity hardware, even for 100,000,000 data points.

Since bigvis is not currently available on CRAN, the easiest way to try it out is to:

```R
# install.packages("devtools")
devtools::install_github("hadley/bigvis")
```

## Workflow

The bigvis package is structured around the following workflow:

* `bin()` and `condense()` to get a compact summary of the data

* if the estimates are rough, you might want to `smooth()`. See `best_h()` and `rmse_cvs()` to figure out a good starting bandwidth

* if you're working with counts, you might want to `standardise()`

* visualise the results with `autoplot()` (you'll need to load `ggplot2` to use this)

## Weighted statistics

Bigvis also provides a number of standard statistics efficiently implemented on weighted/binned data: `weighted.median`, `weighted.IQR`, `weighted.var`, `weighted.sd`, `weighted.ecdf` and `weighted.quantile`. 

## Acknowledgements

This package wouldn't be possible without:

* the fantastic [Rcpp](http://dirk.eddelbuettel.com/code/rcpp.html) package, which makes it amazingly easy to integrate R and C++

* JJ Allaire and Carlos Scheidegger who have indefatigably answered my many C++ questions

* the generous support of Revolution Analytics who supported the early development.

* Yue Hu, who implemented a proof of concepts that showed that it might be possible to work with this much data in R.


================================================
FILE: bench/bin-structure.cpp
================================================
// How does the data structure implementing the bin affect performance.

#include <Rcpp.h>
using namespace Rcpp;

class Grouper {
    const Fast<NumericVector> x_;
    double width_;
    double origin_;
  public:
    Grouper (const NumericVector& x, double width, double origin = 0)
       : x_(x), width_(width), origin_(origin) {
    }

    int bin(int i) const {
      if (ISNAN(x_[i])) return 0;
      
      return (x_[i] - origin_) / width_ + 1;
    }

    int size() const {
      return x_.size();
    }
};

// [[Rcpp::export]]
std::vector<int> count_vector(const NumericVector& x, double width, double origin = 0) {
  Grouper grouper = Grouper(x, width, origin);
  std::vector<int> count;

  int n = grouper.size();
  for(int i = 0; i < n; ++i) {
    int bin = grouper.bin(i);
    if (bin >= count.size()) {
      count.resize(bin + 1);
    }

    ++count[bin];
  }

  return count;
}

// [[Rcpp::export]]
List count_map(const NumericVector& x, double width, double origin = 0) {
  Grouper grouper = Grouper(x, width, origin);
  std::map<int, int> count;

  int n = grouper.size();
  for(int i = 0; i < n; ++i) {
    int bin = grouper.bin(i);
    ++count[bin];
  }

  IntegerVector out_x(count.size()), out_y(count.size());
  std::map<int, int>::const_iterator count_it = count.begin(), 
    count_end = count.begin();
  for (int i = 0; count_it != count_end; ++count_it, ++i) {
    out_x[i] = count_it->first;
    out_y[i] = count_it->second;
  }
  return List::create(_["x"] = out_x, _["count"] = out_y);
}

// [[Rcpp::export]]
List count_umap(const NumericVector& x, double width, double origin = 0) {
  Grouper grouper = Grouper(x, width, origin);
  std::tr1::unordered_map<int, int> count;

  int n = grouper.size();
  for(int i = 0; i < n; ++i) {
    int bin = grouper.bin(i);

    ++count[bin];
  }

  IntegerVector out_x(count.size()), out_y(count.size());
  std::tr1::unordered_map<int, int>::iterator count_it = count.begin(), 
    count_end = count.end();
  for (int i = 0; count_it != count_end; ++count_it, ++i) {
    out_x[i] = count_it->first;
    out_y[i] = count_it->second;
  }
  return List::create(_["x"] = out_x, _["count"] = out_y);
}


template <class T>
inline void hash_combine(std::size_t & seed, const T & v) {
  std::tr1::hash<T> hasher;
  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

namespace std {
  namespace tr1 {  
    template<typename S, typename T> struct hash<pair<S, T> > {
      inline size_t operator()(const pair<S, T> & v) const {
        size_t seed = 0;
        ::hash_combine(seed, v.first);
        ::hash_combine(seed, v.second);
        return seed;
      }
    };
  }
}

// [[Rcpp::export]]
List count_umap2(const NumericVector& x, double width, double origin = 0) {
  Grouper grouper = Grouper(x, width, origin);
  std::tr1::unordered_map<std::pair<int, int>, int> count;

  int n = grouper.size();
  for(int i = 0; i < n; ++i) {
    int bin = grouper.bin(i);

    ++count[std::make_pair(bin, bin)];
  }

  IntegerVector out_x(count.size()), out_y(count.size());
  std::tr1::unordered_map<std::pair<int, int>, int>::iterator count_it = count.begin(), 
    count_end = count.end();
  for (int i = 0; count_it != count_end; ++count_it, ++i) {
    out_x[i] = count_it->first.first;
    out_y[i] = count_it->second;
  }
  return List::create(_["x"] = out_x, _["count"] = out_y);
}

// [[Rcpp::export]]
List count_umap2_man(const NumericVector& x, double width, double origin = 0) {
  Grouper grouper = Grouper(x, width, origin);
  std::tr1::unordered_map<int, int> count;

  int n = grouper.size();
  for(int i = 0; i < n; ++i) {
    int bin = grouper.bin(i);
    bin = bin * 100 + bin;
    ++count[bin];
  }

  IntegerVector out_x(count.size()), out_y(count.size());
  std::tr1::unordered_map<int, int>::iterator count_it = count.begin(), 
    count_end = count.end();
  for (int i = 0; count_it != count_end; ++count_it, ++i) {
    out_x[i] = count_it->first;
    out_y[i] = count_it->second;
  }
  return List::create(_["x"] = out_x, _["count"] = out_y);
}


/*** R 
  options(digits = 3)
  library(microbenchmark)
  x <- runif(1e5)

  # As expected, for small contiguous inputs, vector is fastest, followed by
  # unordered maps (about half as fast), with maps in a distant last place.
  microbenchmark(
    count_vector(x, 1 / 1000),
    count_map(x, 1 / 1000),
    count_umap(x, 1 / 1000)
  )

  y <- c(x, x)
  y1 <- c(x, x + 10)
  y2 <- c(x, x + 100)
  y3 <- c(x, x + 1000)
  y4 <- c(x, x + 1000)

  # While using std::vector is somewhat faster, the asymptotic behaviour is
  # much worse - count_umap is basically constant, regardless of the number
  # of bins
  microbenchmark(
    count_vector(y, 1 / 1000),
    count_vector(y1, 1 / 1000),
    count_vector(y2, 1 / 1000),
    count_vector(y3, 1 / 1000),
    count_vector(y4, 1 / 1000),
    count_umap(y, 1 / 1000),
    count_umap(y1, 1 / 1000),
    count_umap(y2, 1 / 1000),
    count_umap(y3, 1 / 1000),
    count_umap(y4, 1 / 1000),
    times = 10
  )

  # Using umap with a pair is about twice as slow as with an int: this probably
  # implies that I should do the hashing myself.
  microbenchmark(
    count_umap(x, 1 / 1000),
    count_umap2(x, 1 / 1000),
    count_umap2_man(x, 1 / 1000)
  )

*/

================================================
FILE: bench/bin.cpp
================================================
#include <Rcpp.h>
#include <iostream>
#include <algorithm>
using namespace Rcpp;

//' @param breaks must be ordered and span the complete range of x. 
// [[Rcpp::export]]
IntegerVector bin(NumericVector x, NumericVector breaks) {
  // Put missing values in the last position
  int n = breaks.size();
  IntegerVector out(n + 1);

  for(NumericVector::iterator it = x.begin(); it != x.end(); it++) {
    double val = *it;
    if (ISNAN(val)) {
      out[n]++;
    } else {
      NumericVector::iterator bin_it = 
        std::upper_bound(breaks.begin(), breaks.end(), val);

      int bin = std::distance(breaks.begin(), bin_it);
      out[bin]++;
    }
  }

  return out;
}

// [[Rcpp::export]]
IntegerVector bin2(NumericVector x, NumericVector breaks) {
  // Put missing values in the last position
  int n = breaks.size(), bin;
  IntegerVector out(n + 1);

  NumericVector::iterator x_it = x.begin(), x_end, bin_it,
    breaks_it = breaks.begin(), breaks_end = breaks.end();

  for(; x_it != x.end(); ++x_it) {
    double val = *x_it;
    if (ISNAN(val)) {
      ++out[n];
    } else {
      bin_it = std::upper_bound(breaks_it, breaks_end, val);
      bin = std::distance(breaks_it, bin_it);
      ++out[bin];
    }
  }

  return out;
}

// [[Rcpp::export]]
std::vector<int> bin3(NumericVector x, double width, double origin = 0) {
  int bin, nmissing = 0;
  std::vector<int> out;

  NumericVector::iterator x_it = x.begin(), x_end;
  for(; x_it != x.end(); ++x_it) {
    double val = *x_it;
    if (ISNAN(val)) {
      ++nmissing;
    } else {
      bin = (val - origin) / width;
      if (bin < 0) continue;
    
      // Make sure there's enough space
      if (bin >= out.size()) {
        out.resize(bin + 1);
      }
      ++out[bin];
    }
  }

  // Put missing values in the last position
  out.push_back(nmissing);
  return out;
}

// Create class to encapsulate binning operations ------------------------------


class BinFixed {
    double width_;
    double origin_;
  public:
    BinFixed (double width, double origin = 0) {
      width_ = width;
      origin_ = origin;
    }
    int inline operator() (double val) const { 
      return (val - origin_) / width_;
    }
};
class BinBreaks {
    NumericVector breaks_;
    NumericVector::iterator breaks_it_, breaks_end_;

  public:
    BinBreaks (NumericVector& breaks) {
      breaks_ = breaks;
      breaks_it_ = breaks.begin();
      breaks_end_ = breaks.end();
    }
    int inline operator() (double val) const { 
      NumericVector::iterator 
        bin_it = std::upper_bound(breaks_it_, breaks_end_, val);

      return std::distance(breaks_it_, bin_it);
    }
};


template<typename Binner>
std::vector<int> bin_bin(NumericVector x, Binner binner) {
  int bin, nmissing = 0;
  std::vector<int> out;

  NumericVector::iterator x_it = x.begin(), x_end;
  for(; x_it != x.end(); ++x_it) {
    double val = *x_it;
    if (ISNAN(val)) {
      ++nmissing;
    } else {
      bin = binner(val);
      if (bin < 0) continue;
    
      // Make sure there's enough space
      if (bin >= out.size()) {
        out.resize(bin + 1);
      }
      ++out[bin];
    }
  }

  // Put missing values in the last position
  out.push_back(nmissing);
  return out;
}

// [[Rcpp::export]]
std::vector<int> bin_bin_fixed(NumericVector x, double width, double origin = 0) {
  return bin_bin(x, BinFixed(width, origin));
}

// [[Rcpp::export]]
std::vector<int> bin_bin_breaks(NumericVector x, NumericVector breaks) {
  return bin_bin(x, BinBreaks(breaks));
}

// Try using a Fast<NumericVector> ------------------------------
// Considerable speed improvement for simple binning function
template<typename Binner>
std::vector<int> fbin_bin(NumericVector x, Binner binner) {
  int bin, nmissing = 0;
  std::vector<int> out;

  Fast<NumericVector> fx(x);
  int n = x.size();

  for(int i = 0; i < n; ++i) {
    double val = fx[i];
    if (ISNAN(val)) {
      ++nmissing;
    } else {
      bin = binner(val);
      if (bin < 0) continue;
    
      // Make sure there's enough space
      if (bin >= out.size()) {
        out.resize(bin + 1);
      }
      ++out[bin];
    }
  }

  // Put missing values in the last position
  out.push_back(nmissing);
  return out;
}

// [[Rcpp::export]]
std::vector<int> fbin_bin_fixed(NumericVector x, double width, double origin = 0) {
  return fbin_bin(x, BinFixed(width, origin));
}

// [[Rcpp::export]]
std::vector<int> fbin_bin_breaks(NumericVector x, NumericVector breaks) {
  return fbin_bin(x, BinBreaks(breaks));
}

/*** R 
options(digits = 3)
library(microbenchmark)
x <- runif(1e5)
breaks <- seq(0, 1, length = 100)

# Breaks
microbenchmark(
  hist(x, breaks, plot = F),
  bin(x, breaks),
  bin2(x, breaks),
  bin_bin_breaks(x, breaks),
  fbin_bin_breaks(x, breaks)
)

# Fixed bins
microbenchmark(
  bin3(x, 1/100, 0),
  bin_bin_fixed(x, 1/100, 0),
  fbin_bin_fixed(x, 1/100, 0)
)

x6 <- runif(1e6)
x7 <- runif(1e7)
x8 <- runif(1e8)

microbenchmark(
  bin_bin_fixed(x6, 1/100, 0),
  fbin_bin_fixed(x6, 1/100, 0),
  bin_bin_fixed(x7, 1/100, 0),
  fbin_bin_fixed(x7, 1/100, 0),
  bin_bin_fixed(x8, 1/100, 0),
  fbin_bin_fixed(x8, 1/100, 0),
  times = 10)

*/

================================================
FILE: bench/count.cpp
================================================
// Experiment with making the binner more generic, so that the binner
// class also stores the variable being binned over - this is important
// for separating the grouping from the numeric operation.

#include <Rcpp.h>
#include <iostream>
#include <algorithm>
using namespace Rcpp;

template<typename Binner>
std::vector<int> count_x(const NumericVector& x, Binner binner) {
  std::vector<int> out;

  int n = x.size();

  for(int i = 0; i < n; ++i) {
    int bin = binner(x[i]);
    if (bin < 0) continue;
  
    // Make sure there's enough space
    if (bin >= out.size()) {
      out.resize(bin + 1);
    }
    ++out[bin];
  }

  return out;
}

template<typename Binner>
std::vector<int> count(Binner binner) {
  std::vector<int> out;

  int n = binner.size();
  for(int i = 0; i < n; ++i) {
    int bin = binner.bin(i);
    if (bin < 0) continue;
  
    // Make sure there's enough space
    if (bin >= out.size()) {
      out.resize(bin + 1);
    }
    ++out[bin];
  }

  return out;
}


class BinFixed {
    double width_;
    double origin_;
  public:
    BinFixed (double width, double origin = 0) {
      width_ = width;
      origin_ = origin;
    }

    int inline operator() (double val) const { 
      if (ISNAN(val)) return 0;

      return (val - origin_) / width_ + 1;
    }
};

class BinFixed2 {
    const NumericVector& x_;
    double width_;
    double origin_;
  public:
    BinFixed2 (const NumericVector& x, double width, double origin = 0)
       : x_(x), width_(width), origin_(origin) {
    }

    int bin(int i) const { 
      if (ISNAN(x_[i])) return 0;
      return (x_[i] - origin_) / width_ + 1;
    }

    int size() const {
      return x_.size();
    }
};


// [[Rcpp::export]]
std::vector<int> count_x2(NumericVector x, double width, double origin = 0) {
  return count_x(x, BinFixed(width, origin));
}

// [[Rcpp::export]]
std::vector<int> count2(NumericVector x, double width, double origin = 0) {
  return count(BinFixed2(x, width, origin));
}


/*** R 
options(digits = 3)
library(microbenchmark)
x <- runif(1e5)

# Breaks
microbenchmark(
  count_x2(x, 1/100),  
  count2(x, 1/100)
)

*/

================================================
FILE: bench/group-tempvar.cpp
================================================
// In a function like 
// 
// unsigned int bin(unsigned int i) const {
//   if (ISNAN(x_[i])) return 0;
//   if (x_[i] < origin_) return 0;
// 
//   return (x_[i] - origin_) / width_ + 1;
// }
// 
// should I create my own temporary double val = x_[i] ?
// 
// It looks like it saves ~0.2 ns per invocation, so probably not worth it for
// performance reasons.

#include <Rcpp.h>
using namespace Rcpp;

class Group1 {
    const Fast<NumericVector> x_;
    double width_;
    double origin_;
  public:
    Group1 (const NumericVector& x, double width, double origin = 0)
       : x_(x), width_(width), origin_(origin) {
    }

    unsigned int bin(unsigned int i) const {
      if (ISNAN(x_[i])) return 0;
      if (x_[i] < origin_) return 0;
      
      return (x_[i] - origin_) / width_ + 1;
    }

    int size() const {
      return x_.size();
    }
};

class Group2 {
    const Fast<NumericVector> x_;
    double width_;
    double origin_;
  public:
    Group2 (const NumericVector& x, double width, double origin = 0)
       : x_(x), width_(width), origin_(origin) {
    }

    unsigned int bin(unsigned int i) const {
      double val = x_[i];
      if (ISNAN(val)) return 0;
      if (val < origin_) return 0;
      
      return (val - origin_) / width_ + 1;
    }

    int size() const {
      return x_.size();
    }
};

template<typename Group>
IntegerVector group_out(const Group& group) {
  int n = group.size();
  IntegerVector out(n);
  for(int i = 0; i < n; ++i) {
    out[i] = group.bin(i);
  }

  return out;
}

// [[Rcpp::export]]
IntegerVector group1(const NumericVector& x, double width, double origin = 0) {
  return group_out(Group1(x, width, origin));
}

// [[Rcpp::export]]
IntegerVector group2(const NumericVector& x, double width, double origin = 0) {
  return group_out(Group2(x, width, origin));
}


/*** R
x <- runif(1e6)
library(microbenchmark)
stopifnot(all.equal(group1(x, 1/1000), group2(x, 1/1000)))

(m <- microbenchmark(
  group1(x, 1/1000),
  group2(x, 1/1000)
))
diff(summary(m)$median) / length(x) * 1e9 / 1e3
*/

================================================
FILE: bench/kernel.cpp
================================================
// Differences in kernel performance

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector normal_kernel(NumericVector x) {
  int n = x.size();
  NumericVector out(n);

  for (int i = 0; i < n; ++i) {
    out[i] = R::dnorm(x[i], 0, 1, 0);
  }

  return out;
}

// [[Rcpp::export]]
double tricube2(double x) {
  x = fabs(x);
  if (x > 1) return 0;

  return pow(1 - pow(x, 3), 3);
}

// [[Rcpp::export]]
double tricube(double x) {
  x = fabs(x);
  if (x > 1) return 0;

  double y = 1 - x * x * x;
  return y * y * y;
}

// [[Rcpp::export]]
NumericVector tricube_kernel(NumericVector x) {
  int n = x.size();
  NumericVector out(n);

  for (int i = 0; i < n; ++i) {
    out[i] = tricube(x[i]);
  }

  return out;
}

// [[Rcpp::export]]
NumericVector copy(NumericVector x) {
  int n = x.size();
  NumericVector out(n);

  for (int i = 0; i < n; ++i) {
    out[i] = x[i];
  }

  return out;
}

/*** R
options(digits = 3)
library(microbenchmark)

x <- runif(1e4)

mean(sapply(x, tricube) - sapply(x, tricube2))

microbenchmark(
  copy(x),
  tricube_kernel(x),
  normal_kernel(x)
)

*/

================================================
FILE: bench/mean.cpp
================================================
// Instead of counting, compute a more complicated statistic: a weighted mean

#include <Rcpp.h>
#include <iostream>
#include <algorithm>
using namespace Rcpp;

class BinFixed {
    const Fast<NumericVector> x_;
    double width_;
    double origin_;
  public:
    BinFixed (const NumericVector& x, double width, double origin = 0)
       : x_(x), width_(width), origin_(origin) {
    }

    int bin(int i) const { 
      if (ISNAN(x_[i])) return 0;
      return (x_[i] - origin_) / width_ + 1;
    }

    int size() const {
      return x_.size();
    }
};

template<typename Binner>
NumericVector group_mean(NumericVector& y, NumericVector& weight, Binner binner) {
  std::vector<double> count;
  std::vector<double> sum;

  int n = binner.size();
  for(int i = 0; i < n; ++i) {
    int bin = binner.bin(i);
    if (bin < 0) continue;
  
    // Make sure there's enough space
    if (bin >= sum.size()) {
      sum.resize(bin + 1);
      count.resize(bin + 1);
    }

    count[bin] += weight[i];
    sum[bin] += y[i];
  }

  int m = count.size();
  NumericVector res(m);
  for (int i = 0; i < m; ++i) {
    res[i] = sum[i] / count[i];
  }
  return res;
}


class StatMean {
    double count;
    double sum;

  public:
    StatMean () : count(0), sum(0) {
    }
    void push(double x, double weight) {
      count += weight;
      sum += x;
    }

    double compute() {
      return sum / count;
    }
};

template<typename Binner>
NumericVector group_mean2(NumericVector& y, NumericVector& weight, Binner binner) {
  std::vector<StatMean> stat;

  int n = binner.size();
  for(int i = 0; i < n; ++i) {
    int bin = binner.bin(i);
    if (bin < 0) continue;
  
    if (bin >= stat.size()) {
      stat.resize(bin + 1);
    }

    stat[bin].push(y[i], weight[i]);
  }

  int m = stat.size();
  NumericVector res(m);
  for (int i = 0; i < m; ++i) {
    res[i] = stat[i].compute();
  }
  return res;
}


// [[Rcpp::export]]
NumericVector group_mean_(NumericVector x, NumericVector y, NumericVector weight, 
                       double width, double origin = 0) {
  return group_mean(y, weight, BinFixed(x, width, origin));
}
// [[Rcpp::export]]
NumericVector group_mean2_(NumericVector x, NumericVector y, NumericVector weight, 
                       double width, double origin = 0) {
  return group_mean2(y, weight, BinFixed(x, width, origin));
}


/*** R 
options(digits = 3)
library(microbenchmark)
x <- runif(1e6)
y <- runif(1e6)
weight <- rep(1, 1e6)

# Breaks
microbenchmark(
  group_mean_(x, y, weight, width = 1/100),
  group_mean2_(x, y, weight, width = 1/100)
)

*/

================================================
FILE: bench/median.cpp
================================================
// Instead of counting, compute a more complicated statistic: a median

#include <Rcpp.h>
#include <iostream>
#include <algorithm>
using namespace Rcpp;

class BinFixed {
    const Fast<NumericVector> x_;
    double width_;
    double origin_;
  public:
    BinFixed (const NumericVector& x, double width, double origin = 0)
       : x_(x), width_(width), origin_(origin) {
    }

    int bin(int i) const {
      if (ISNAN(x_[i])) return 0;
      return (x_[i] - origin_) / width_ + 1;
    }

    int size() const {
      return x_.size();
    }
};
class StatMedian {
    std::vector<double> ys;

  public:
    void push(double x) {
      ys.push_back(x);
    }

    // Adapted from http://stackoverflow.com/questions/1719070/
    double compute() {
      if (ys.empty()) return NAN;

      int size = ys.size();
      std::vector<double>::iterator upper = ys.begin() + (int) (size / 2);
      std::nth_element(ys.begin(), upper, ys.end());

      if (size % 2 == 1) {
        return *upper;
      } else {
        std::vector<double>::iterator lower = upper - 1;
        std::nth_element(ys.begin(), lower, upper);
        return (*upper + *lower) / 2.0;
      }

    }
};

template<typename Binner>
NumericVector group_median(NumericVector& y, Binner binner) {
  std::vector<StatMedian> stat;

  int n = binner.size();
  for(int i = 0; i < n; ++i) {
    int bin = binner.bin(i);
    if (bin < 0) continue;

    if (bin >= stat.size()) {
      stat.resize(bin + 1);
    }

    stat[bin].push(y[i]);
  }

  int m = stat.size();
  NumericVector res(m);
  for (int i = 0; i < m; ++i) {
    res[i] = stat[i].compute();
  }
  return res;
}


// [[Rcpp::export]]
NumericVector group_median_(NumericVector x, NumericVector y,
                            double width, double origin = 0) {
  return group_median(y, BinFixed(x, width, origin));
}


/*** R
options(digits = 3)
library(microbenchmark)
x <- runif(1e5)
y <- runif(1e5)

group_median_tapply <- function(x, y, width, origin = 0) {
  bins <- trunc((x - origin) / width)
  c(NaN, unname(tapply(y, bins, median)))
}
med1 <- group_median_tapply(x, y, width = 1/1000)
med2 <- group_median_(x, y, width = 1/1000)
stopifnot(all.equal(med1, med2))

# Breaks
microbenchmark(
# group_median_tapply(x, y, width = 1/1000),
  group_median_(x, y, width = 1/1000)
)

*/


================================================
FILE: bench/smooth-1d.cpp
================================================
// Explore opportunities for making smooth_1d faster
// 
// Bounding to a given range is really important, and memoisation helps offset
// the cost of making a call back to R, but the biggest win is using a pure
// C/C++ kernel function.
// 
// 
#include <Rcpp.h>
using namespace Rcpp;

// Base implementation
// [[Rcpp::export]]
NumericVector smooth_1d(const NumericVector& x, const NumericVector& z, 
                        const NumericVector& x_out, const Function& kernel) {

  int n_in = x.size(), n_out = x_out.size();
  NumericVector z_out(n_out);

  for (int i = 0; i < n_out; i++) {
    for (int j = 0; j < n_in; j++) {
      double dist = x[j] - x_out[i];
      double k = as<NumericVector>(kernel(dist))[0];
      z_out[i] += z[j] * k;
    }
  }

  return z_out;
}

// Memoise distance calculations
// [[Rcpp::export]]
NumericVector smooth_1d_memo(const NumericVector& x, const NumericVector& z, 
                        const NumericVector& x_out, const Function& kernel) {
  int n_in = x.size(), n_out = x_out.size();
  NumericVector z_out(n_out);

  std::unordered_map<double, double> k_memo;

  for (int i = 0; i < n_out; i++) {
    for (int j = 0; j < n_in; j++) {
      double dist = x[j] - x_out[i];

      std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);
      double k;
      if (it == k_memo.end()) {
        k = as<NumericVector>(kernel(dist))[0];
        k_memo[dist] = k;
      } else {
        k = it->second; 
      }

      z_out[i] += z[j] * k;
    }
  }

  return z_out;
}

// Use range of kernel
// [[Rcpp::export]]
NumericVector smooth_1d_range(const NumericVector& x, const NumericVector& z, 
                        const NumericVector& x_out, const Function& kernel,
                        double kmin, double kmax) {

  int n_in = x.size(), n_out = x_out.size();
  NumericVector z_out(n_out);

  for (int i = 0; i < n_out; i++) {
    for (int j = 0; j < n_in; j++) {
      double dist = x[j] - x_out[i];
      if (dist < kmin || dist > kmax) continue;

      double k = as<NumericVector>(kernel(dist))[0];
      z_out[i] += z[j] * k;
    }
  }

  return z_out;
}


// Memoise and use range
// [[Rcpp::export]]
NumericVector smooth_1d_memo_range(const NumericVector& x, const NumericVector& z, 
                        const NumericVector& x_out, const Function& kernel,
                        double kmin, double kmax) {
  int n_in = x.size(), n_out = x_out.size();
  NumericVector z_out(n_out);

  std::unordered_map<double, double> k_memo;

  for (int i = 0; i < n_out; i++) {
    for (int j = 0; j < n_in; j++) {
      double dist = x[j] - x_out[i];
      if (dist < kmin || dist > kmax) continue;

      std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);
      double k;
      if (it == k_memo.end()) {
        k = as<NumericVector>(kernel(dist))[0];
        k_memo[dist] = k;
      } else {
        k = it->second; 
      }

      z_out[i] += z[j] * k;
    }
  }

  return z_out;
}

// Memoise and use range
// [[Rcpp::export]]
NumericVector smooth_1d_memo_range_map(const NumericVector& x, const NumericVector& z, 
                        const NumericVector& x_out, const Function& kernel,
                        double kmin, double kmax) {
  int n_in = x.size(), n_out = x_out.size();
  NumericVector z_out(n_out);

  std::map<double, double> k_memo;

  for (int i = 0; i < n_out; i++) {
    for (int j = 0; j < n_in; j++) {
      double dist = x[j] - x_out[i];
      if (dist < kmin || dist > kmax) continue;

      std::map<double, double>::const_iterator it = k_memo.find(dist);
      double k;
      if (it == k_memo.end()) {
        k = as<NumericVector>(kernel(dist))[0];
        k_memo[dist] = k;
      } else {
        k = it->second; 
      }

      z_out[i] += z[j] * k;
    }
  }

  return z_out;
}

// Memoise, use range & use C++ function for kernel
// [[Rcpp::export]]
NumericVector smooth_1d_memo_range_kcpp(const NumericVector& x, const NumericVector& z, 
                        const NumericVector& x_out, double kmin, double kmax) {
  int n_in = x.size(), n_out = x_out.size();
  NumericVector z_out(n_out);

  std::unordered_map<double, double> k_memo;

  for (int i = 0; i < n_out; i++) {
    for (int j = 0; j < n_in; j++) {
      double dist = x[j] - x_out[i];
      if (dist < kmin || dist > kmax) continue;

      std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);
      double k;
      if (it == k_memo.end()) {
        k = R::dnorm(dist, 0.0, 0.1, 0);
        k_memo[dist] = k;
      } else {
        k = it->second; 
      }

      z_out[i] += z[j] * k;
    }
  }

  return z_out;
}

// Use cpp kernel function without memoisation
// [[Rcpp::export]]
NumericVector smooth_1d_range_kcpp(const NumericVector& x, const NumericVector& z, 
                        const NumericVector& x_out, double kmin, double kmax) {

  int n_in = x.size(), n_out = x_out.size();
  NumericVector z_out(n_out);

  for (int i = 0; i < n_out; i++) {
    for (int j = 0; j < n_in; j++) {
      double dist = x[j] - x_out[i];
      if (dist < kmin || dist > kmax) continue;

      double k = R::dnorm(dist, 0.0, 0.1, 0);
      z_out[i] += z[j] * k;
    }
  }

  return z_out;
}


/*** R
  options(digits = 2)
  x <- 1:10
  z <- rep(c(1, 2), length = length(x))
  k <- kernel("norm", sd = 0.1)
  krng <- range(k)
  grid <- seq(0, 11, length = 100)
  
  stopifnot(all.equal(
    smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]),
    smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2])
  ))

  library(microbenchmark)
  microbenchmark(
    base = smooth_1d(x, z, grid, k),
    memo = smooth_1d_memo(x, z, grid, k),
    range = smooth_1d_range(x, z, grid, k, krng[1], krng[2]),
    "range + kcpp" = smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2]),
    "range + memo" = smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]),
    "range + memo + kcpp" = smooth_1d_memo_range_kcpp(x, z, grid, krng[1], krng[2]),
    "range + memo + map" = smooth_1d_memo_range_map(x, z, grid, k, krng[1], krng[2])
  )
  
  # More realistic sample sizes
  x <- 1:3e3
  z <- rep(c(1, 2), length = length(x))
  
  grid3 <- seq(0, 11, length = 3e3)
  grid4 <- seq(0, 11, length = 3e4)

  microbenchmark(
    grid3_c = smooth_1d_range_kcpp(x, z, grid3, krng[1], krng[2]),
    grid3_r = smooth_1d_memo_range_map(x, z, grid3, k, krng[1], krng[2]),
    grid4_c = smooth_1d_range_kcpp(x, z, grid4, krng[1], krng[2]),
    grid4_r = smooth_1d_memo_range_map(x, z, grid4, k, krng[1], krng[2]),
    times = 10)

*/

================================================
FILE: bigvis.Rproj
================================================
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace


================================================
FILE: inst/include/bigvis.h
================================================
#include <Rcpp.h>
#include <boost/shared_ptr.hpp>

using namespace Rcpp;

// Wrapper for numeric vector that makes it easy figure to out which 
// bin each observation belongs to.
class BinnedVector {
    // This should probably be a const NumericVector&, but that doesn't work
    // with modules currently
    NumericVector x_;
    String name_;
    double width_;
    double origin_;
  public:
    BinnedVector(NumericVector x, String name, double width, double origin = 0)
       : x_(x), name_(name), width_(width), origin_(origin) {
    }

    int bin_i(int i) const {
      return bin(x_[i]);
    }

    int bin(double x) const {
      if (ISNAN(x) || x == INFINITY || x == -INFINITY) return 0;
      if (x < origin_) return 0;

      return (x - origin_) / width_ + 1;
    }

    double unbin(int bin) const {
      if (bin == 0) return(NA_REAL);
      return (bin - 1) * width_ + origin_ + width_ / 2;
    }

    int nbins() const;

    int size() const {
      return x_.size();
    }

    double origin() const {
      return origin_;
    } 

    double width() const {
      return width_;
    }

    String name() const {
      return name_;
    }

};

// This class is just boilerplate. There might be rcpp magic that does the right thing here
// but I don't know it.
class BinnedVectorReference {
    boost::shared_ptr<BinnedVector> ref;

    const BinnedVector *get() const {
        return ref.get();
    };
    BinnedVector *get() {
        return ref.get();
    };

public:
    BinnedVectorReference() {};

    BinnedVectorReference(const BinnedVectorReference &o):
        ref(o.ref) {};

    explicit BinnedVectorReference(BinnedVector *ptr) {
        // Watch out, this takes ownership of the pointer!
        ref = boost::shared_ptr<BinnedVector>(ptr);
    }

    BinnedVectorReference(NumericVector x, String name, double width, double origin = 0) {
        BinnedVector *vec = new BinnedVector(x, name, width, origin);
        ref = boost::shared_ptr<BinnedVector>(vec);
    }

    int bin_i(int i) const { return get()->bin_i(i); }
    int bin(double x) const { return get()->bin(x); }
    double unbin(int bin) const { return get()->unbin(bin); }
    int nbins() const { return get()->nbins(); }
    int size() const { return get()->size();}
    double origin() const { return get()->origin();}
    double width() const { return get()->width();}
    String name() const { return get()->name();}
};

// A data structure to store multiple binned vectors
class BinnedVectors {
    int size_;
    std::vector<BinnedVectorReference> groups_;

  public:
    std::vector<int> bins_;
    BinnedVectors () : groups_(0), bins_(0) {
    }

    BinnedVectors (List gs) : groups_(0), bins_(0) {
      int n = gs.size();
      for (int i = 0; i < n; ++i) {
        add_vector(as<BinnedVectorReference>(gs[i]));
      }
    }

    void add_vector(BinnedVectorReference g) {
      if (groups_.empty()) {
        bins_.push_back(1);
        size_ = g.size();
      } else {
        if (g.size() != size_) stop("Inconsistent sizes");
        bins_.push_back(bins_.back() * g.nbins());
      }
      groups_.push_back(g);

    }

    int bin_i(int i) const;
    int bin(std::vector<double> x) const;
    std::vector<double> unbin(int bin) const;
    
    int nbins() const {
      return bins_.back() * groups_.front().nbins();
    }

    int ngroups() const {
      return bins_.size();
    }

    int size() const {
      return size_;
    }

    String name(int j) const {
      return groups_[j].name();
    }

};


RCPP_EXPOSED_AS(BinnedVectorReference)
RCPP_EXPOSED_WRAP(BinnedVectorReference)
RCPP_EXPOSED_AS(BinnedVectors)
RCPP_EXPOSED_WRAP(BinnedVectors)


================================================
FILE: inst/tests/test-binned-vectors.r
================================================
context("Binned vectors")

if (require("plyr")) {
  test_that("bins agree with plyr::id", {
    grid <- expand.grid(x = c(NA, seq(0, 0.5, by = 0.1)), y = c(NA, seq(0, 0.7, by = 0.1)))
    x <- grid$x
    y <- grid$y

    gx <- bin(x, 0.1)
    gy <- bin(y, 0.1)

    bv <- bins(gx, gy)
    bigvis <- sapply(seq_along(x) - 1, bv$bin_i)

    bin_x <- sapply(seq_along(x) - 1, gx$bin_i)
    bin_y <- sapply(seq_along(x) - 1, gy$bin_i)
    plyr <- as.vector(id(list(bin_x, bin_y)))

    expect_equal(bigvis + 1, plyr)
  })
}

test_that("square nbins correct", {
  g <- bin(1:10, 1)
  expect_equal(bins(g)$nbins(), 11)
  expect_equal(bins(g, g)$nbins(), 11 ^ 2)
  expect_equal(bins(g, g, g)$nbins(), 11 ^ 3)
})

test_that("rectangular nbins correct", {
  g11 <- bin(1:10, 1)
  g2 <- bin(rep(1, 10), 1)

  expect_equal(bins(g2, g11)$nbins(), 22)
  expect_equal(bins(g11, g2)$nbins(), 22)
})

test_that("diagonal nbins correct", {
  x <- runif(1e3)
  y <- x + runif(1e3, -0.2, 0.2)
  z <- rnorm(1e3, x)

  gx <- bin(x, 0.1)
  gy <- bin(y, 0.1)

  expect_equal(gx$nbins(), 11)
  expect_equal(gy$nbins(), 15)

  bvs <- bins(gx, gy)
  expect_equal(bvs$nbins(), 165)

  bins <- vapply(seq_along(x) - 1, bvs$bin_i, integer(1))
  expect_true(all(bins <= 165))
})

test_that("bin and unbin are symmetric", {
  g <- bin(-10:10, 1)
  bvs <- bins(g, g)

  grid <- expand.grid(x = -10:10, y = -10:10)
  bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y))
  unbin <- t(vapply(bins, bvs$unbin, numeric(2)))
  colnames(unbin) <- c("x", "y")

  expect_equal(unbin, as.matrix(grid))
})

test_that("bin and unbin are symmetric with diff binning", {
  x <- c(-1, 5)
  y <- c(0.1, 1)

  bx <- bin(x, 1)
  by <- bin(y, 0.1)
  bvs <- bins(bx, by)

  grid <- expand.grid(
    x = breaks(bx)[-1] + 1 / 2,
    y = breaks(by)[-1] + 0.1 / 2)

  bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y))
  unbin <- t(vapply(bins, bvs$unbin, numeric(2)))
  colnames(unbin) <- c("x", "y")

  expect_equal(unbin, as.matrix(grid))
})


================================================
FILE: inst/tests/test-breaks.r
================================================
context("Breaks")

last <- function(x) x[length(x)]

test_that("breaks includes max value, only if on border", {
  expect_equal(last(breaks(10, origin = 0, binwidth = 1)), 10)
  expect_equal(last(breaks(9.99, origin = 0, binwidth = 1)), 9)
})

test_that("breaks includes max value even when origin != 0", {
  expect_equal(last(breaks(10.5, origin = 0.5, binwidth = 1)), 10.5)
  expect_equal(last(breaks(10.49, origin = 0.5, binwidth = 1)), 9.5)
})


================================================
FILE: inst/tests/test-condense.r
================================================
context("Condense")

test_that("condense counts small vectors accurately", {
  x <- c(NA, 0:10)
  s1 <- condense(bin(x, 1, -0.5), summary = "count")
  # Pathological origin: need to add extra bin on end, because they're
  # right open, left closed
  s2 <- condense(bin(x, 1, 0), summary = "count")

  expect_equivalent(s1$x, c(NA, 0:10))
  expect_equivalent(s2$x, c(NA, 0:10 + 0.5))

  expect_equal(s1$.count, rep(1, length(x)))
  expect_equal(s2$.count, rep(1, length(x)))
})

test_that("weights modify counts", {
  x <- c(NA, 0:10)
  w <- rep(2, length(x))
  s <- condense(bin(x, 1), w = w, summary = "count")

  expect_equivalent(s$x, c(NA, 0:10))
  expect_equal(s$.count, rep(2, length(x)))
})

test_that("z affects sums, but not counts", {
  x <- c(NA, 0:10)
  z <- 0:11
  s <- condense(bin(x, 1), z = z, summary = "sum")

  expect_equal(s$.count, rep(1, length(x)))
  expect_equal(s$.sum, z)
})

test_that("drop = FALSE and drop = TRUE results agree", {
  x <- runif(1e3)
  y <- x + runif(1e3, -0.2, 0.2)
  z <- rnorm(1e3, x)

  gx <- bin(x, 0.1)
  gy <- bin(y, 0.1)

  count1 <- condense(gx, gy, summary = "count", drop = TRUE)
  expect_equal(sum(count1$.count == 0), 0)

  count2 <- condense(gx, gy, summary = "count", drop = FALSE)
  expect_equivalent(count1, count2[count2$.count != 0, ])
})

# 2d tests ---------------------------------------------------------------------

test_that("grid counted accurately", {
  # expand.grid orders in opposite way to bigvis
  grid <- expand.grid(y = c(NA, 1:2), x = c(NA, 1:2))
  s <- condense(bin(grid$x, 1), bin(grid$y, 1))

  expect_equal(s$.count, rep(1, nrow(grid)))
  expect_equivalent(s$grid.x, grid$x)
  expect_equivalent(s$grid.y, grid$y)
})

test_that("diagonal counted correctly", {
  df <- data.frame(x = c(NA, 1:2), y = c(NA, 1:2))
  s <- condense(bin(df$x, 1), bin(df$y, 1))

  expect_equal(nrow(s), nrow(df))
  expect_equal(s$df.x, s$df.y)
})

test_that("random data doesn't crash", {
  x <- runif(1e3, 8, 4963)
  y <- runif(1e3, 1e-2, 1e3)

  gx <- bin(x, 10)
  gy <- bin(y, 10)

  condense(gx, gy)
})


================================================
FILE: inst/tests/test-frange.r
================================================
context("frange")

test_that("frange agrees with range", {
  x <- rnorm(1e4)
  expect_equal(frange(x), range(x))
})

test_that("frange uses cache if present", {
  x <- rnorm(1e4)
  attr(x, "range") <- c(0, 10)
  expect_equal(frange(x), c(0, 10))
})

test_that("frange ignores NA and infinities by default", {
  x <- c(1, NA, Inf, -Inf)
  expect_equal(frange(x), c(1, 1))
})


================================================
FILE: inst/tests/test-group-1d.r
================================================
context("Grouping: 1d")

group <- function(x, width, origin = NULL) {
  g <- bin(x, width, origin)
  vapply(seq_along(x) - 1, g$bin_i, integer(1))
}

test_that("NAs belong to group 0", {
  x <- NA_real_
  expect_equal(group(x, 1, 0), 0L)
})

test_that("Inf and -Inf belong to group 0", {
  x <- c(-Inf, Inf)
  expect_equal(group(x, 1, 0), c(0, 0))
})

test_that("Out of range values belong to group 0", {
  expect_equal(group(-10, 1, 0), 0)
})

test_that("Positive integers unchanged if origin is 1", {
  expect_equal(group(1:10, 1, 1), 1:10)
})


================================================
FILE: inst/tests/test-group-2d.r
================================================
context("Grouping: 2d")

test_that("Two NAs gets bin 0", {
  expect_equal(group_rect(NA, NA, 1, 1, 0, 0), 0)
})

test_that("Sequential locations get sequential groups", {
  grid <- expand.grid(x = c(NA, 1:2), y = c(NA, 1:2))
  expect_equal(group_rect(grid$x, grid$y, 1, 1, 0.5, 0.5), 0:8)
})


================================================
FILE: inst/tests/test-origin.r
================================================
context("Origin")

test_that("origins close to zero rounded to zero" ,{
  expect_equal(find_origin(c(0.01, 1000)), 0)
  expect_equal(find_origin(c(10, 1e6)), 0)
})

test_that("origins rounded down by binwidth", {
  expect_equal(find_origin(c(1, 10), 1), 1)
  expect_equal(find_origin(c(1, 10), 2), 0)

  expect_equal(find_origin(c(5, 10), 2), 4)
  expect_equal(find_origin(c(5, 10), 5), 5)
})

test_that("integers have origin offset by 0.5", {
  expect_equal(find_origin(c(1L, 10L), 1), 0.5)

  expect_equal(find_origin(c(5L, 10L), 2), 3.5)
  expect_equal(find_origin(c(5L, 10L), 5), 4.5)
})


================================================
FILE: inst/tests/test-ranged.r
================================================
context("Ranged")

test_that("range attribute lost when modified", {
  x <- ranged(10:1)
  expect_equal(max(x), 10)


  x[1] <- 1
  expect_equal(max(x), 9)
  expect_equal(attr(x, "range"), NULL)
})


================================================
FILE: inst/tests/test-smooth.r
================================================
context("Smooth")

tricube <- function(x) {
  x <- abs(x)
  ifelse(x > 1, 0, (1 - x ^ 3) ^ 3)
}
# plot(tricube, xlim = c(-1.5, 1.5))

test_that("factorised smooth equal to manual smooth", {
  grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE))
  z <- rep(0, nrow(grid))
  z[c(5, 23, 84)] <- 1

  z_x <- smooth_nd_1(grid, z, numeric(), grid, 0, 3)
  z_y <- smooth_nd_1(grid, z, numeric(), grid, 1, 3)

  z_xy <- smooth_nd_1(grid, z_x, numeric(), grid, 1, 3)
  z_yx <- smooth_nd_1(grid, z_y, numeric(), grid, 0, 3)
  z2 <- smooth_nd(grid, z, numeric(), grid, c(3, 3))

  expect_equal(z_xy, z2)
  expect_equal(z_yx, z2)
})

# library(ggplot2)
# qplot(grid[, 1], grid[, 2], fill = z, geom = "raster")
# qplot(grid[, 1], grid[, 2], fill = z_xy, geom = "raster")
# qplot(grid[, 1], grid[, 2], fill = z_yx, geom = "raster")
# qplot(grid[, 1], grid[, 2], fill = z2, geom = "raster")

test_that("factorised smooth equal to manual smooth", {
  grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE))
  z <- rep(0, nrow(grid))
  z[c(5, 23, 84)] <- 1

  grid <- as.data.frame(grid)
  grid$.count <- z
  class(grid) <- c("condensed", class(grid))

  z1 <- smooth(grid, c(3, 3), ".count", factor = FALSE)
  z2 <- smooth(grid, c(3, 3), ".count", factor = TRUE)

  expect_equal(z1, z2)
})


================================================
FILE: inst/tests/test-stat.r
================================================
context("Stats")

test_that("linear regression recovers slope & intercept if no errors", {
  x <- 1:10
  w <- rep(1, 10)

  expect_equal(regress(x, x * 2, w), c(0, 2))
  expect_equal(regress(x, x * -2, w), c(0, -2))
  expect_equal(regress(x, x * -2 + 5, w), c(5, -2))
  expect_equal(regress(x, x * -2 + -5, w), c(-5, -2))
})

simpleLm <- function(x, y, w) {
  unname(coef(lm(y ~ x, weights = w)))
}

test_that("linear regression matches lm", {
  x <- 1:10
  y <- 10 + x * 2 + rnorm(10)
  w <- rep(1, 10)

  expect_equal(regress(x, y, w), simpleLm(x, y, w))
})

test_that("linear regression matches lm with weights", {
  x <- 1:10
  y <- 10 + x * 2 + rnorm(10)
  w <- runif(10)

  expect_equal(regress(x, y, w), simpleLm(x, y, w))
})

test_that("robust regression effectively removes outlier", {
  x <- 1:10
  y <- 10 + x * 2 + c(rep(0, 9), 10)
  w <- rep(1, 10)

  expect_equal(regress_robust(x, y, w, 10), c(10, 2))
})


================================================
FILE: inst/tests/test-summary-moments.r
================================================
context("Summary: moments")

count2 <- function(x) compute_moments(x)[1]
mean2 <- function(x) compute_moments(x)[2]
sd2 <- function(x) compute_moments(x)[3]

test_that("count agrees with length", {
  expect_equal(count2(1:10), 10)
  expect_equal(count2(5), 1)
  expect_equal(count2(numeric()), 0)
})

test_that("mean agree with base::mean", {
  expect_equal(mean2(1:10), mean(1:10))

  x <- runif(1e6)
  expect_equal(mean2(x), mean(x))
})

test_that("missing values are ignored", {
  x <- c(NA, 5, 5)
  expect_equal(count2(x), 2)
  expect_equal(mean2(x), 5)
})

test_that("standard deviation agrees with sd", {
  expect_equal(sd2(1:10), sd(1:10))

  x <- runif(1e6)
  expect_equal(sd2(x), sd(x))
})

test_that("summary statistics of zero length input are NaN", {
  expect_equal(compute_moments(numeric()), c(0, NaN, NaN))
})


================================================
FILE: inst/tests/test-weighted-stats.r
================================================
context("Weighted statistics")

test_that("weighted.var agrees with var when weights = 1", {
  samples <- replicate(20, runif(100), simplify = FALSE)

  var <- sapply(samples, var)
  wvar <- sapply(samples, weighted.var, w = rep(1, 100))

  expect_equal(wvar, var)
})

test_that("weighted.var agrees with var on repeated vector", {
  samples <- replicate(20, runif(100), simplify = FALSE)
  w <- rep(1:2, 50)
  samples_ex <- lapply(samples, rep, times = w)

  var <- sapply(samples_ex, var)
  wvar <- sapply(samples, weighted.var, w = w)

  expect_equal(wvar, var)
})

test_that("weighed.quantile agrees with quantile on repeated vector", {
  samples <- replicate(20, runif(100), simplify = FALSE)
  w <- rep(1:2, 50)
  samples_ex <- lapply(samples, rep, times = w)

  quant <- sapply(samples_ex, quantile, probs = 0.325, names = FALSE)
  wquant <- sapply(samples, weighted.quantile, w = w, probs = 0.325)

  expect_equal(quant, wquant)
})


================================================
FILE: man/autoplot.condensed.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/autoplot.r
\name{autoplot.condensed}
\alias{autoplot.condensed}
\title{Autoplot condensed summaries.}
\usage{
\method{autoplot}{condensed}(x, var = last(summary_vars(x)), ...)
}
\arguments{
\item{x}{a condensed summary}

\item{var}{which summary variable to display}

\item{...}{other arguments passed on to individual methods}
}
\description{
Autoplot condensed summaries.
}
\examples{
if (require("ggplot2")) {

# 1d summaries -----------------------------
x <- rchallenge(1e4)
z <- x + rt(length(x), df = 2)
xsum <- condense(bin(x, 0.1))
zsum <- condense(bin(x, 0.1), z = z)

autoplot(xsum)
autoplot(peel(xsum))

autoplot(zsum)
autoplot(peel(zsum, keep = 1))
autoplot(peel(zsum))

# 2d summaries -----------------------------
y <- runif(length(x))
xysum <- condense(bin(x, 0.1), bin(y, 0.1))
xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z)

autoplot(xysum)
autoplot(peel(xysum))
autoplot(xyzsum)
autoplot(peel(xyzsum))
}
}


================================================
FILE: man/best_h.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/h.r
\name{best_h}
\alias{best_h}
\title{Find "best" smoothing parameter using leave-one-out cross validation.}
\usage{
best_h(x, h_init = NULL, ..., tol = 0.01, control = list())
}
\arguments{
\item{x}{condensed summary to smooth}

\item{h_init}{initial values of bandwidths to start search out. If not
specified defaults to 5 times the binwidth of each variable.}

\item{...}{other arguments (like \code{var}) passed on to
\code{\link{rmse_cv}}}

\item{tol}{numerical tolerance, defaults to 1\%.}

\item{control}{additional control parameters passed on to \code{\link{optim}}
The most useful argument is probably trace, which makes it possible to
follow the progress of the optimisation.}
}
\value{
a single numeric value representing the bandwidth that minimises
  the leave-one-out estimate of rmse. Vector has attributes
  \code{evaluations} giving the number of times the objective function
  was evaluated. If the optimisation does not converge, or smoothing is not
  needed (i.e. the estimate is on the lower bounds), a warning is thrown.
}
\description{
Minimises the leave-one-out estimate of root mean-squared error to find
find the "optimal" bandwidth for smoothing.
}
\details{
L-BFGS-B optimisation is used to constrain the bandwidths to be greater
than the binwidths: if the bandwidth is smaller than the binwidth it's
impossible to compute the rmse because no smoothing occurs. The tolerance
is set relatively high for numerical optimisation since the precise choice
of bandwidth makes little difference visually, and we're unlikely to have
sufficient data to make a statistically significant choice anyway.
}
\examples{
\donttest{
x <- rchallenge(1e4)
xsum <- condense(bin(x, 1 / 10))
h <- best_h(xsum, control = list(trace = 3, REPORT = 1))

if (require("ggplot2")) {
autoplot(xsum)
autoplot(smooth(xsum, h))
}
}
}
\seealso{
Other bandwidth estimation functions: \code{\link{h_grid}};
  \code{\link{rmse_cv}}, \code{\link{rmse_cvs}}
}


================================================
FILE: man/bigvis.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/bigvis.r
\docType{package}
\name{bigvis}
\alias{as.integer,Rcpp_BinnedVector-method}
\alias{bigvis}
\alias{bigvis-package}
\alias{show,Rcpp_BinnedVector-method}
\title{The big vis package.}
\usage{
\S4method{show}{Rcpp_BinnedVector}(object)

\S4method{as.integer}{Rcpp_BinnedVector}(x, ...)
}
\arguments{
\item{x,object,...}{Generic args}
}
\description{
The big vis package.
}


================================================
FILE: man/bin.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/bin.r
\name{bin}
\alias{bin}
\title{Create a binned variable.}
\usage{
bin(x, width = find_width(x), origin = find_origin(x, width), name = NULL)
}
\arguments{
\item{x}{numeric or integer vector}

\item{width}{bin width. If not specified, about 10,000 bins will be chosen
using the algorithim in \code{\link{find_width}}.}

\item{origin}{origin. If not specified, guessed by \code{\link{find_origin}}.}

\item{name}{name of original variable. This will be guessed from the input to
\code{group} if not supplied. Used in the output of
\code{\link{condense}} etc.}
}
\description{
Create a binned variable.
}
\details{
This function produces an R reference class that wraps around a C++ function.
Generally, you should just treat this as an opaque object with reference
semantics, and you shouldn't call the methods on it - pass it to
\code{\link{condense}} and friends.
}
\examples{
x <- runif(1e6)
bin(x)
bin(x, 0.01)
bin(x, 0.01, origin = 0.5)
}


================================================
FILE: man/breaks.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/breaks.r
\name{breaks}
\alias{breaks}
\title{Compute breaks given origin and width.}
\usage{
breaks(x, binwidth, origin = min(x))
}
\arguments{
\item{x}{numeric vector}

\item{binwidth}{bin width}

\item{origin}{bin origin}
}
\description{
Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer
multiple of binwidth, then we need one more break. This function only returns
the left-side of the breaks.
}
\details{
The first break is special, because it always contains missing values.
}
\examples{
breaks(10, origin = 0, binwidth = 1)
breaks(9.9, origin = 0, binwidth = 1)

breaks(1:10, origin = 0, binwidth = 2)
}
\keyword{internal}


================================================
FILE: man/condense.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/condense.r
\name{condense}
\alias{condense}
\title{Efficient binned summaries.}
\usage{
condense(..., z = NULL, summary = NULL, w = NULL, drop = NULL)
}
\arguments{
\item{...}{group objects created by \code{\link{bin}}}

\item{z}{a numeric vector to summary for each group. Optional for some
summary statistics.}

\item{summary}{the summary statistic to use. Currently must be one of
count, sum, mean, median or sd. If \code{NULL}, defaults to mean if
y is present, count if not.}

\item{w}{a vector of weights. Not currently supported by all summary
functions.}

\item{drop}{if \code{TRUE} only locations with data will be returned.  This
is more efficient if the data is very sparse (<1\% of cells filled), and
is slightly less efficient. Defaults to \code{TRUE} if you are condensing
over two or more dimensions, \code{FALSE} for 1d.}
}
\description{
Efficient binned summaries.
}
\examples{
x <- runif(1e5)
gx <- bin(x, 0.1)
condense(gx)
}


================================================
FILE: man/condensed.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/condensed.r
\name{condensed}
\alias{as.condensed}
\alias{condensed}
\alias{is.condensed}
\title{Condensed: an S3 class for condensed summaries.}
\usage{
condensed(groups, grouped, summary)

is.condensed(x)

as.condensed(x)
}
\arguments{
\item{groups}{list of \code{\link{bin}}ed objects}

\item{grouped,summary}{output from C++ condense function}

\item{x}{object to test or coerce}
}
\description{
This object managed the properties of condensed (summarised) data frames.
}
\section{S3 methods}{


Mathematical functions with methods for \code{binsum} object will modify
the x column of the data frame and \code{\link{rebin}} the data, calculating
updated summary statistics.

Currently methods are provided for the \code{Math} group generic,
logical comparison and arithmetic operators, and
\code{\link[plyr]{round_any}}.
}
\examples{
if (require("ggplot2")) {

x <- rchallenge(1e4)
xsum <- condense(bin(x, 1 / 10))

# Basic math operations just modify the first column
autoplot(xsum)
autoplot(xsum * 10)
autoplot(xsum - 30)
autoplot(abs(xsum - 30))

# Similarly, logical operations work on the first col
autoplot(xsum[xsum > 10, ])
}
}
\keyword{internal}


================================================
FILE: man/dchallenge.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/challenge.r
\name{dchallenge}
\alias{dchallenge}
\alias{rchallenge}
\title{Density and random number generation functions for a challenging
distribution.}
\usage{
dchallenge(x)

rchallenge(n)
}
\arguments{
\item{x}{values to evaluate pdf at}

\item{n}{number of random samples to generate}
}
\description{
This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom
centered at 15 and scaled by 2, and a gamma distribution with shape 2
and rate 1/3. (The t-distribution is windsorised at 0, but this
has negligible effect.) This distribution is challenging because it
mixes heavy tailed and asymmetric distributions.
}
\examples{
plot(dchallenge, xlim = c(-5, 60), n = 500)

x <- rchallenge(1e4)
hist(x, breaks = 1000)
xsum <- condense(bin(x, 0.1))
plot(xsum$x, xsum$.count, type = "l")
xsmu <- smooth(xsum, 0.3)
plot(xsmu$x, xsmu$.count, type = "l")
plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30))
}


================================================
FILE: man/dgrid.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/dgrid.r
\name{dgrid}
\alias{dgrid}
\alias{is.dgrid}
\title{dgrid: an S3 class for data grids}
\usage{
dgrid(x, width, origin = 0, nbins = NULL)

is.dgrid(x)
}
\arguments{
\item{x}{a numeric vector to test or coerce.}

\item{width}{bin width}

\item{origin}{bin origins}

\item{nbins}{number of bins}
}
\description{
dgrid: an S3 class for data grids
}
\examples{
g <- dgrid(0:10 + 0.5, width = 1)
range(g)
as.integer(g)
}


================================================
FILE: man/find_origin.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/origin.r
\name{find_origin}
\alias{find_origin}
\title{Find the origin.}
\usage{
find_origin(x, binwidth)
}
\arguments{
\item{x}{numeric or integer vector}

\item{binwidth}{binwidth}
}
\description{
Find the origin.
}
\details{
This algorithm implements simple heuristics for determining the origin of
a histogram when only the binwidth is specified. It:

\itemize{
   \item rounds to zero, if relatively close
   \item subtracts 0.5 offset, if an x is integer
   \item ensures the origin is a multiple of the binwidth
}
}
\examples{
find_origin(1:10, 1)
find_origin(1:10, 2)
find_origin(c(1, 1e6), 1)
}
\seealso{
Other reasonable defaults: \code{\link{find_width}}
}
\keyword{internal}


================================================
FILE: man/find_width.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/width.r
\name{find_width}
\alias{find_width}
\title{Compute a reasonable default binwidth.}
\usage{
find_width(x, nbins = 10000)
}
\arguments{
\item{x}{a numeric vector. If a numeric vector of length one is supplied,
it's assumed that}

\item{nbins}{desired number of bins (approximate)}
}
\description{
Compute a reasonable default binwidth.
}
\examples{
find_width(c(0, 5))
find_width(c(0, 5.023432))
find_width(c(0, 5.9))
}
\seealso{
Other reasonable defaults: \code{\link{find_origin}}
}
\keyword{internal}


================================================
FILE: man/frange.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{frange}
\alias{frange}
\title{Efficient implementation of range.}
\usage{
frange(x, finite = TRUE)
}
\arguments{
\item{x}{a numeric vector, or a \code{\link{ranged}} object}

\item{finite}{If \code{TRUE} ignores missing values and infinities. Note
that if the vector is empty, or only contains missing values,
\code{frange} will return \code{c(Inf, -Inf)} because those are the
identity values for \code{\link{min}} and \code{\link{max}} respectively.}
}
\description{
This is an efficient C++ implementation of range for numeric vectors:
it avoids S3 dispatch, and computes both min and max in a single pass
through the input.
}
\details{
If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}}
object), it will be used instead of computing the range from scratch.
}
\examples{
x <- runif(1e6)
system.time(range(x))
system.time(frange(x))

rx <- ranged(x)
system.time(frange(rx))
}


================================================
FILE: man/h_grid.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/h.r
\name{h_grid}
\alias{h_grid}
\title{Generate grid of plausible bandwidths for condensed summary.}
\usage{
h_grid(x, n = 50, max = 20)
}
\arguments{
\item{x}{a condensed summary}

\item{n}{number of bandwidths to generate (in each dimension)}

\item{max}{maximum bandwidth to generate, as multiple of binwidth.}
}
\description{
By default, the bandwidths start at the bin width, and then continue
up 50 (\code{n}) steps until 20 (\code{max}) times the bin width.
}
\examples{
x <- rchallenge(1e4)
xsum <- condense(bin(x, 1 / 10))
h_grid(xsum)

y <- runif(1e4)
xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
h_grid(xysum, n = 10)
}
\seealso{
Other bandwidth estimation functions: \code{\link{best_h}};
  \code{\link{rmse_cv}}, \code{\link{rmse_cvs}}
}


================================================
FILE: man/is.ranged.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/ranged.r
\name{is.ranged}
\alias{is.ranged}
\title{Test if an object is of class ranged.}
\usage{
is.ranged(x)
}
\arguments{
\item{x}{object to test}
}
\description{
Test if an object is of class ranged.
}
\keyword{internal}


================================================
FILE: man/movies.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/movies.r
\docType{data}
\name{movies}
\alias{movies}
\title{Movie information and user ratings from IMDB.com.}
\format{A data frame with 130,456 rows and 14 variables}
\usage{
data(movies)
}
\description{
The internet movie database, \url{http://imdb.com/}, is a website devoted
to collecting movie data supplied by studios and fans.  It claims to be the
biggest movie database on the web and is run by amazon.  More about
information imdb.com can be found online,
\url{http://imdb.com/help/show_leaf?about}, including information about
the data collection process,
\url{http://imdb.com/help/show_leaf?infosource}.
}
\details{
Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user.  The data set contains the following fields:

\itemize{
  \item title.  Title of the movie.
  \item year.  Year of release.
  \item budget.  Total budget (if known) in US dollars
  \item length.  Length in minutes.
  \item rating.  Average IMDB user rating.
  \item votes.  Number of IMDB users who rated this movie.
  \item mpaa.  MPAA rating.
  \item action, animation, comedy, drama, documentary, romance, short:
    \code{TRUE} if movie belongs to that genre.
}
}
\references{
\url{http://had.co.nz/data/movies/}
}


================================================
FILE: man/mt.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/mt.r
\name{mt}
\alias{inv_mt}
\alias{mt}
\alias{mt_trans}
\title{Modulus transformation (and its inverse).}
\usage{
mt(x, lambda)

inv_mt(x, lambda)

mt_trans(lambda)
}
\arguments{
\item{x}{values to transform}

\item{lambda}{degree of transformation}
}
\description{
A generalisation of the box-cox transformation that works for
values with both positive and negative values.
}
\details{
This is useful for compressing the tails of long-tailed distributions,
often encountered with very large datasets.
}
\examples{
x <- seq(-10, 10, length = 100)
plot(x, mt(x, 0), type = "l")
plot(x, mt(x, 0.25), type = "l")
plot(x, mt(x, 0.5), type = "l")
plot(x, mt(x, 1), type = "l")
plot(x, mt(x, 2), type = "l")
plot(x, mt(x, -1), type = "l")
plot(x, mt(x, -2), type = "l")
}
\references{
J. John and N. Draper. "An alternative family of
 transformations." Applied Statistics, pages 190-197, 1980.
 \url{http://www.jstor.org/stable/2986305}
}


================================================
FILE: man/peel.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/peel.r
\name{peel}
\alias{peel}
\title{Peel off low density regions of the data.}
\usage{
peel(x, keep = 0.99, central = NULL)
}
\arguments{
\item{x}{condensed summary}

\item{keep}{(approximate) proportion of data to keep. If \code{1}, will
remove all cells with counts.  All missing values will be preserved.}

\item{central}{if \code{TRUE} peels off regions of lowest density only from
the outside of the data. In 2d this works by progressively peeling off
convex hull of the data: the current algorithm is quite slow.
If \code{FALSE}, just removes the lowest density regions wherever they are
found. Regions with 0 density are removed regardless of location.
Defaults to TRUE if there are two or fewer grouping variables is less.}
}
\description{
Keeps specified proportion of data by removing the lowest density regions,
either anywhere on the plot, or for 2d, just around the edges.
}
\details{
This is useful for visualisation, as an easy way of focussing on the regions
where the majority of the data lies.
}
\examples{
x <- rt(1e5, df = 2)
y <- rt(1e5, df = 2)
xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10))
plot(xysum$x, xysum$y)

plot(peel(xysum, 0.95, central = TRUE)[1:2])
plot(peel(xysum, 0.90, central = TRUE)[1:2])
plot(peel(xysum, 0.50, central = TRUE)[1:2])
}


================================================
FILE: man/ranged.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/ranged.r
\name{ranged}
\alias{ranged}
\title{A S3 class for caching the range of a vector}
\usage{
ranged(x, range = frange(x, finite = TRUE))
}
\arguments{
\item{x}{a numeric vector}

\item{range}{the range of the vector (excluding missing values), if known.
If unknown, it will be computed with \code{\link{frange}}, a fast C++
implementation of \code{\link{range}}.}
}
\description{
This class is designed for dealing with large vectors, where the cost of
recomputing the range multiple times is prohibitive. It provides methods
for \code{\link{print}} and \code{\link{str}} that display only the range,
not the contents.
}
\section{Performance}{

For best performance, you may want to run copy and paste the contents of
this function into your function, to avoid making any copies of \code{x}.
This is probably only necessary if you're dealing with extremely large
vectors, > 100 million obs.
}
\examples{
x <- runif(1e6)
y <- ranged(x)
range(y)
y
str(y)

# Modifications to the class currently destroy the cache
y[1] <- 10
max(y)
class(y)
z <- y + 10
max(z)
class(z)
}


================================================
FILE: man/rmse_cvs.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/rmse.r
\name{rmse_cvs}
\alias{rmse_cv}
\alias{rmse_cvs}
\title{Estimate smoothing RMSE using leave-one-out cross-validation.}
\usage{
rmse_cvs(x, hs = h_grid(x), ...)

rmse_cv(x, h, var = summary_vars(x)[1], ...)
}
\arguments{
\item{x}{condensed summary table}

\item{...}{other variables passed on to \code{\link{smooth}}}

\item{h,hs}{for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv}
a data frame of bandwidths, as generated by \code{\link{h_grid}}.}

\item{var}{variable to smooth}
}
\description{
\code{rmse_cv} computes the leave-one-out RMSE for a single vector of
bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths,
stored as a data frame.
}
\examples{
\donttest{
set.seed(1014)
# 1d -----------------------------
x <- rchallenge(1e4)
xsum <- condense(bin(x, 1 / 10))
cvs <- rmse_cvs(xsum)

if (require("ggplot2")) {
autoplot(xsum)
qplot(x, err, data = cvs, geom = "line")
xsmu <- smooth(xsum, 1.3)
autoplot(xsmu)
autoplot(peel(xsmu))
}

# 2d -----------------------------
y <- runif(1e4)
xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
cvs <- rmse_cvs(xysum, h_grid(xysum, 10))
if (require("ggplot2")) {
qplot(x, y, data = cvs, size = err)
}
}
}
\seealso{
Other bandwidth estimation functions: \code{\link{best_h}};
  \code{\link{h_grid}}
}


================================================
FILE: man/round_any.condensed.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/condensed.r
\name{round_any.condensed}
\alias{round_any.condensed}
\title{Round any method for condensed objects}
\usage{
round_any.condensed(x, accuracy, f = round)
}
\arguments{
\item{x}{numeric or date-time (POSIXct) vector to round}

\item{accuracy}{number to round to; for POSIXct objects, a number of seconds}

\item{f}{rounding function: \code{\link{floor}}, \code{\link{ceiling}} or
\code{\link{round}}}
}
\description{
Round any method for condensed objects
}
\keyword{internal}


================================================
FILE: man/smooth.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/smooth.r
\name{smooth}
\alias{smooth}
\title{Smooth a condensed data frame.}
\usage{
smooth(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean",
  factor = TRUE)
}
\arguments{
\item{x}{a condensed summary}

\item{h}{numeric vector of bandwidths, one for each grouping variable in
\code{x}}

\item{var}{variable to smooth}

\item{grid}{a data frame with the grouping colums as x.  In order for the
factored version of \code{smooth_nd} to work, this grid must be a superset
of \code{x}.}

\item{type}{type of smoothing to use.  Current options are \code{"mean"},
a kernel weighted mean; \code{"regression"}, a kernel weighted local
regression; and \code{"robust_regression"}, robust kernel weighted local
regression in the style of \code{\link{loess}}.  Unique prefixes are also
acceptable.}

\item{factor}{if \code{TRUE} compute the n-dimensional smooth by a sequence
of 1d smoothes. For \code{type = "mean"} the results are always the same
grid values are uncorrelated (e.g. the grid is complete at every location);
and is very approximate for \code{type = "robust"}.}
}
\description{
Smooth a condensed data frame.
}
\examples{
x <- runif(1e5)
xsum <- condense(bin(x, 1 / 100))
xsmu1 <- smooth(xsum, 5 / 100)
xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE)

# More challenging distribution
x <- rchallenge(1e4)
xsum <- condense(bin(x, 0.1))
xsmu <- smooth(xsum, 1)

plot(xsum$x, xsum$.count, type = "l")
lines(xsmu$x, xsmu$.count, col = "red")

xsmu2 <- smooth(xsum, 1, type = "regress")
plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50))
lines(xsmu2$x, xsmu2$.count, col = "red")
# Note difference in tails
}


================================================
FILE: man/standardise.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/standardise.r
\name{standardise}
\alias{standardise}
\title{Standardise a summary to sum to one.}
\usage{
standardise(x, margin = integer())
}
\arguments{
\item{x}{a condensed summary. Must have \code{.count} variable.}

\item{margin}{margins to standardise along.  If \code{NULL}, the default,
standardises the whole array.}
}
\description{
Standardise a summary to sum to one.
}
\examples{
b1 <- condense(bin(movies$year, 1))
d1 <- smooth(b1, 2, type = "reg")

if (require("ggplot2")) {

autoplot(b1)
autoplot(d1)

# Note change in x-axis limits
autoplot(standardise(d1))
}

# Can also standardise a dimension at a time
b2 <- with(movies, condense(bin(year, 2), bin(length, 10)))
b2 <- peel(b2, central = TRUE)

if (require("ggplot2")) {

autoplot(b2)
autoplot(standardise(b2))    # note legend
autoplot(standardise(b2, "year"))   # each row sums to 1
autoplot(standardise(b2, "length")) # each col sums to 1

base <- ggplot(b2, aes(length, .count)) +
  geom_line(aes(group = year, colour = year))
base
base \%+\% standardise(b2)  # Just affects y axis labels
base \%+\% standardise(b2, "year") # Makes year comparable
base \%+\% standardise(b2, "length") # Meaningless for this display

}
}


================================================
FILE: man/transform.condensed.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/rebin.r
\name{transform.condensed}
\alias{rebin}
\alias{transform.condensed}
\title{Transform condensed objects, collapsing unique bins.}
\usage{
\\method{transform}{condensed}(`_data`, ...)

rebin(data)
}
\arguments{
\item{...}{named arguments evaluated in the context of the data}

\item{data,`_data`}{a condensed summary}
}
\description{
Transform condensed objects, collapsing unique bins.
}
\details{
You don't need to use \code{rebin} if you use transform: it will
automatically rebin for you.  You will need to use it if you manually
transform any grouping variables.
}
\examples{
x <- runif(1e4, -1, 1)
xsum <- condense(bin(x, 1 / 50))

# Transforming by hand: must use rebin
xsum$x <- abs(xsum$x)
rebin(xsum)
if (require("ggplot2")) {
  autoplot(xsum) + geom_point()
  autoplot(rebin(xsum)) + geom_point()
}

#' Transforming with transform
y <- x ^ 2 + runif(length(x), -0.1, 0.1)
xysum <- condense(bin(x, 1 / 50), z = y)
xysum <- transform(xysum, x = abs(x))
if (require("ggplot2")) {
  autoplot(xysum)
}
}
\keyword{internal}


================================================
FILE: man/weighted.IQR.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/weighted-stats.r
\name{weighted.IQR}
\alias{weighted.IQR}
\title{Compute the interquartile range of weighted data.}
\usage{
weighted.IQR(x, w, na.rm = FALSE)
}
\arguments{
\item{x}{numeric vector of observations}

\item{w}{integer vector of weights, representing the number of
time each \code{x} was observed}

\item{na.rm}{If \code{TRUE} will automatically remove missing values
in \code{x} or \code{w}.}
}
\description{
Compute the interquartile range of weighted data.
}
\details{
This is a simple wrapper around \code{\link{weighted.quantile}}
}
\examples{
x <- sort(runif(200))
w <- rpois(200, seq(1, 10, length = 200)) + 1

IQR(x)
weighted.IQR(x, w)
}


================================================
FILE: man/weighted.ecdf.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/weighted-stats.r
\name{weighted.ecdf}
\alias{weighted.ecdf}
\title{A weighted ecdf function.}
\usage{
weighted.ecdf(x, w)
}
\arguments{
\item{x}{numeric vector of observations}

\item{w}{integer vector of weights, representing the number of
time each \code{x} was observed}
}
\description{
An extension of the base \code{\link[stats]{ecdf}} function which works
with weighted data.
}
\section{S3 methods}{

The \code{ecdf} class has methods for \code{\link{plot}},
\code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}.
\code{\link{quantile}} does not currently correctly compute values for
weighted ecdfs.
}
\examples{
x <- runif(200)
w <- rpois(200, 5) + 1

e <- weighted.ecdf(x, w)
plot(e)
summary(e)

y <- x[rep(seq_along(x), w)]
plot(ecdf(y))
}
\seealso{
\code{\link[stats]{weighted.mean}}

Other weighted statistics: \code{\link{weighted.quantile}};
  \code{\link{weighted.sd}}, \code{\link{weighted.var}}
}


================================================
FILE: man/weighted.median.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/weighted-stats.r
\name{weighted.median}
\alias{weighted.median}
\title{Compute the median of weighted data.}
\usage{
weighted.median(x, w, na.rm = FALSE)
}
\arguments{
\item{x}{numeric vector of observations}

\item{w}{integer vector of weights, representing the number of
time each \code{x} was observed}

\item{na.rm}{If \code{TRUE} will automatically remove missing values
in \code{x} or \code{w}.}
}
\description{
Compute the median of weighted data.
}
\details{
This is a simple wrapper around \code{\link{weighted.quantile}}
}
\examples{
x <- runif(200)
w <- rpois(200, 5) + 1

median(x)
weighted.median(x, w)
}


================================================
FILE: man/weighted.quantile.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/weighted-stats.r
\name{weighted.quantile}
\alias{weighted.quantile}
\title{Compute quantiles of weighted data.}
\usage{
weighted.quantile(x, w, probs = seq(0, 1, 0.25), na.rm = FALSE)
}
\arguments{
\item{x}{numeric vector of observations}

\item{w}{integer vector of weights, representing the number of
time each \code{x} was observed}

\item{probs}{numeric vector of probabilities between 0 and 1}

\item{na.rm}{If \code{TRUE} will automatically remove missing values
in \code{x} or \code{w}.}
}
\description{
Compute quantiles of weighted data.
}
\details{
Currently only implements the type 7 algorithm, as described in
\code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core.
}
\examples{
x <- runif(200)
w <- rpois(200, 5) + 1
weighted.quantile(x, w)
}
\seealso{
Other weighted statistics: \code{\link{weighted.ecdf}};
  \code{\link{weighted.sd}}, \code{\link{weighted.var}}
}


================================================
FILE: man/weighted.var.Rd
================================================
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/weighted-stats.r
\name{weighted.var}
\alias{weighted.sd}
\alias{weighted.var}
\title{Compute a weighted variance or standard deviation of a vector.}
\usage{
weighted.var(x, w = NULL, na.rm = FALSE)

weighted.sd(x, w, na.rm = TRUE)
}
\arguments{
\item{x}{numeric vector of observations}

\item{w}{integer vector of weights, representing the number of
time each \code{x} was observed}

\item{na.rm}{if \code{TRUE}, missing values in both \code{w} and \code{x}
will be removed prior computation. Otherwise if there are missing values
the result will always be missing.}
}
\description{
Compute a weighted variance or standard deviation of a vector.
}
\details{
Note that unlike the base R \code{\link{var}} function, these functions only
work with individual vectors not matrices or data frames.
}
\examples{
x <- c(1:5)
w <- rpois(5, 5) + 1
y <- x[rep(seq_along(x), w)]
weighted.var(x, w)
var(y)

stopifnot(all.equal(weighted.var(x, w), var(y)))
}
\seealso{
\code{\link[stats]{weighted.mean}}

Other weighted statistics: \code{\link{weighted.ecdf}};
  \code{\link{weighted.quantile}}
}


================================================
FILE: notes.md
================================================
# Group

* 1d, nd

Future work: linear binning

# Summarise

* 1d

  * count, sum
  * count, mean, sd
  * median

* 2d
  * mean
  * regression
  * robust regression

* nd 
  * mean
  * regression (with eigen or armadillo)
  * robust regression (with eigen or armadillo)

Future work:

* skew?, kurt?
* boxplot
* weighted quantiles (C++ version of R code)
* compute standard errors / bootstrap standard errors?

* infrastructure for passing multiple z
  * 2d: cor, lm
  

# Smooth

Kernel smoothing plus binned summary leads to many common statistics: density =~ bin + smooth, loess =~ mean + smooth, rqss =~ quantile + smooth

* weights
* smoothing type
  * constant
  * linear
  * robust linear (lowess)
  * (linear poisson?)
* leave-one-out cross-validation
* optimisations
  * convert to integer grid & use pre-computed grid of kernel values
  * hash in smooth_nd_1 and compute more efficiently along 1d
* deal with missing values

* smooth needs to create complete grid when factor = TRUE

Think about input data structure: sparse grid, represented as a coordinate list. Binned grid class = integer vector + width/origin/nbins (0 = NA). Most transformations break the grid, in which case all you case preserve is min, max and number of bins. All smoothing methods adapted to work in terms of these integers. Need to extract out bin/unbin into own class (initialised with std::vector of bin sizes)

Possible that more performance is available by switching to a sparse tensor library.

# Visualise

* Product plots
* Standard errors + cut offs

* Peel: implement nd version using depth

================================================
FILE: src/.gitignore
================================================
*.o
*.so

================================================
FILE: src/BigVis.cpp
================================================
#include <bigvis.h>
using namespace Rcpp;

RCPP_MODULE(BigVis) {
  class_<BinnedVectorReference>("BinnedVector")
    .constructor<NumericVector, String, double, double>()
    .const_method("bin_i", &BinnedVectorReference::bin_i)
    .const_method("bin", &BinnedVectorReference::bin)
    .const_method("unbin", &BinnedVectorReference::unbin)
    .const_method("nbins", &BinnedVectorReference::nbins)
    .const_method("size", &BinnedVectorReference::size)
    .const_method("origin", &BinnedVectorReference::origin)
    .const_method("width", &BinnedVectorReference::width)
    .const_method("name", &BinnedVectorReference::name)
  ;
  class_<BinnedVectors>("BinnedVectors")
    .constructor<List>()
    .method("add_vector", &BinnedVectors::add_vector)
    .field("bins", &BinnedVectors::bins_)
    .const_method("bin_i", &BinnedVectors::bin_i)
    .const_method("bin", &BinnedVectors::bin)
    .const_method("unbin", &BinnedVectors::unbin)
    .const_method("nbins", &BinnedVectors::nbins)
  ;
}


================================================
FILE: src/BinnedVector.cpp
================================================
#include <bigvis.h>
using namespace Rcpp;

NumericVector frange(const NumericVector& x, const bool finite = true);

int BinnedVector::nbins() const {
  double max = frange(x_)[1];
  return bin(max) + 1; 
  // +1 bin for missing values
}


================================================
FILE: src/BinnedVectors.cpp
================================================
#include <bigvis.h>
using namespace Rcpp;

int BinnedVectors::bin_i(int i) const {
  int bin = 0;
  int ngroups = groups_.size();

  for (int j = 0; j < ngroups; ++j) {
    bin += groups_[j].bin_i(i) * bins_[(ngroups - 1) - j];
  }

  return bin;
}

int BinnedVectors::bin(std::vector<double> x) const {
  int ngroups = groups_.size();
  if (x.size() != ngroups) stop("x must be same length as groups");
  int bin = 0;

  for (int j = 0; j < ngroups; ++j) {
    int bin_j = groups_[j].bin(x[j]);
    bin += bin_j * bins_[(ngroups - 1) - j];
    // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n";
  }

  return bin;
}

std::vector<double> BinnedVectors::unbin(int bin) const {
  int ngroups = groups_.size();
  std::vector<double> bins(ngroups);

  // if ngroups = 3, then: 
  // bin = groups[0].bin(x[0]) * bins[2] (biggest) +
  //       groups[1].bin(x[1]) * bins[1] +
  //       groups[2].bin(x[2]) * bins[0] (smallest)
  // peel off largest first
  //   bin_j = bin %/% bin[2]
  //   groups[0].unbin(bin_j)
  // and that goes in last output position

  for (int i = 0, j = ngroups - 1; i < ngroups - 1; ++i, --j) {
    int bin_j = bin % bins_[j];
    // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n";
    bins[j] = groups_[j].unbin(bin_j);

    bin = (bin - bin_j) / bins_[j];
  }
  // Rcout << "group: " << 0 << " bin: " << bin << " bin_j: " << bin << "\n";
  // Special case for last group because x %% 1 = 0
  bins[0] = groups_[0].unbin(bin);

  return bins;
}


================================================
FILE: src/Makevars
================================================
PKG_CPPFLAGS=-I../inst/include


================================================
FILE: src/RcppExports.cpp
================================================
// This file was generated by Rcpp::compileAttributes
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include "../inst/include/bigvis.h"
#include <Rcpp.h>

using namespace Rcpp;

// condense_count
List condense_count(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
RcppExport SEXP bigvis_condense_count(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
    __result = Rcpp::wrap(condense_count(x, z, weight, drop));
    return __result;
END_RCPP
}
// condense_sum
List condense_sum(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
RcppExport SEXP bigvis_condense_sum(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
    __result = Rcpp::wrap(condense_sum(x, z, weight, drop));
    return __result;
END_RCPP
}
// condense_mean
List condense_mean(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
RcppExport SEXP bigvis_condense_mean(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
    __result = Rcpp::wrap(condense_mean(x, z, weight, drop));
    return __result;
END_RCPP
}
// condense_sd
List condense_sd(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
RcppExport SEXP bigvis_condense_sd(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
    __result = Rcpp::wrap(condense_sd(x, z, weight, drop));
    return __result;
END_RCPP
}
// condense_median
List condense_median(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
RcppExport SEXP bigvis_condense_median(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
    __result = Rcpp::wrap(condense_median(x, z, weight, drop));
    return __result;
END_RCPP
}
// double_diff_sum
std::vector<int> double_diff_sum(IntegerVector bin, IntegerVector count);
RcppExport SEXP bigvis_double_diff_sum(SEXP binSEXP, SEXP countSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< IntegerVector >::type bin(binSEXP);
    Rcpp::traits::input_parameter< IntegerVector >::type count(countSEXP);
    __result = Rcpp::wrap(double_diff_sum(bin, count));
    return __result;
END_RCPP
}
// frange
NumericVector frange(const NumericVector& x, const bool finite);
RcppExport SEXP bigvis_frange(SEXP xSEXP, SEXP finiteSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const bool >::type finite(finiteSEXP);
    __result = Rcpp::wrap(frange(x, finite));
    return __result;
END_RCPP
}
// group_fixed
IntegerVector group_fixed(const NumericVector& x, double width, double origin);
RcppExport SEXP bigvis_group_fixed(SEXP xSEXP, SEXP widthSEXP, SEXP originSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    Rcpp::traits::input_parameter< double >::type width(widthSEXP);
    Rcpp::traits::input_parameter< double >::type origin(originSEXP);
    __result = Rcpp::wrap(group_fixed(x, width, origin));
    return __result;
END_RCPP
}
// group_rect
IntegerVector group_rect(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin);
RcppExport SEXP bigvis_group_rect(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP);
    Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP);
    Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP);
    Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP);
    Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP);
    __result = Rcpp::wrap(group_rect(x, y, x_width, y_width, x_origin, y_origin));
    return __result;
END_RCPP
}
// group_hex
IntegerVector group_hex(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin, double x_max);
RcppExport SEXP bigvis_group_hex(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP, SEXP x_maxSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP);
    Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP);
    Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP);
    Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP);
    Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP);
    Rcpp::traits::input_parameter< double >::type x_max(x_maxSEXP);
    __result = Rcpp::wrap(group_hex(x, y, x_width, y_width, x_origin, y_origin, x_max));
    return __result;
END_RCPP
}
// lowerBound
IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks);
RcppExport SEXP bigvis_lowerBound(SEXP xSEXP, SEXP breaksSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type breaks(breaksSEXP);
    __result = Rcpp::wrap(lowerBound(x, breaks));
    return __result;
END_RCPP
}
// smooth_nd_1
NumericVector smooth_nd_1(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const int var, const double h, const std::string type);
RcppExport SEXP bigvis_smooth_nd_1(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP varSEXP, SEXP hSEXP, SEXP typeSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP);
    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP);
    Rcpp::traits::input_parameter< const int >::type var(varSEXP);
    Rcpp::traits::input_parameter< const double >::type h(hSEXP);
    Rcpp::traits::input_parameter< const std::string >::type type(typeSEXP);
    __result = Rcpp::wrap(smooth_nd_1(grid_in, z_in, w_in_, grid_out, var, h, type));
    return __result;
END_RCPP
}
// smooth_nd
NumericVector smooth_nd(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const NumericVector h);
RcppExport SEXP bigvis_smooth_nd(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP hSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP);
    Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP);
    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP);
    Rcpp::traits::input_parameter< const NumericVector >::type h(hSEXP);
    __result = Rcpp::wrap(smooth_nd(grid_in, z_in, w_in_, grid_out, h));
    return __result;
END_RCPP
}
// bisquare
double bisquare(double u, double b);
RcppExport SEXP bigvis_bisquare(SEXP uSEXP, SEXP bSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< double >::type u(uSEXP);
    Rcpp::traits::input_parameter< double >::type b(bSEXP);
    __result = Rcpp::wrap(bisquare(u, b));
    return __result;
END_RCPP
}
// regress
NumericVector regress(const std::vector<double>& x, const std::vector<double>& y, const std::vector<double>& w);
RcppExport SEXP bigvis_regress(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const std::vector<double>& >::type y(ySEXP);
    Rcpp::traits::input_parameter< const std::vector<double>& >::type w(wSEXP);
    __result = Rcpp::wrap(regress(x, y, w));
    return __result;
END_RCPP
}
// median
double median(const std::vector<double>& x);
RcppExport SEXP bigvis_median(SEXP xSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);
    __result = Rcpp::wrap(median(x));
    return __result;
END_RCPP
}
// regress_robust
NumericVector regress_robust(const std::vector<double>& x, const std::vector<double>& y, const std::vector<double>& w, int iterations);
RcppExport SEXP bigvis_regress_robust(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP, SEXP iterationsSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const std::vector<double>& >::type y(ySEXP);
    Rcpp::traits::input_parameter< const std::vector<double>& >::type w(wSEXP);
    Rcpp::traits::input_parameter< int >::type iterations(iterationsSEXP);
    __result = Rcpp::wrap(regress_robust(x, y, w, iterations));
    return __result;
END_RCPP
}
// compute_moments
NumericVector compute_moments(const NumericVector& x);
RcppExport SEXP bigvis_compute_moments(SEXP xSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    __result = Rcpp::wrap(compute_moments(x));
    return __result;
END_RCPP
}
// compute_sum
NumericVector compute_sum(const NumericVector& x);
RcppExport SEXP bigvis_compute_sum(SEXP xSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    __result = Rcpp::wrap(compute_sum(x));
    return __result;
END_RCPP
}
// compute_median
NumericVector compute_median(const NumericVector& x);
RcppExport SEXP bigvis_compute_median(SEXP xSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
    __result = Rcpp::wrap(compute_median(x));
    return __result;
END_RCPP
}


================================================
FILE: src/Summary2d.cpp
================================================
#include <Rcpp.h>
#include "Summary2d.h"
using namespace Rcpp;


================================================
FILE: src/Summary2d.h
================================================
#include <Rcpp.h>
#include "stats.h"
using namespace Rcpp;

class Summary2d {
  public:
    virtual void push(double x, double z, double w) =0;
    virtual double compute() =0;
    virtual ~Summary2d() {}
};

class Summary2dMean: public Summary2d {
    double w_, z_;

  public:
    Summary2dMean() : w_(0), z_(0) {}

    void push(double x, double z, double w) {
      // Rcout << "  x: " << x << " z: " << z << " w: " << w << "\n";
      w_ += w;
      z_ += z * w;
    }

    double compute() {
      // Rcout << "Result: " << z_ / w_ << "\n";
      return z_ / w_;
    }
};

class Summary2dRegression: public Summary2d {
    std::vector<double> x_, z_, w_;

  public:
    Summary2dRegression() {}

    void push(double x, double z, double w) {
      x_.push_back(x);
      z_.push_back(z);
      w_.push_back(w);
    }

    double compute() {
      return simpleLinearRegression(x_, z_, w_).alpha;
    }
};

class Summary2dRobustRegression: public Summary2d {
    int iterations_;
    std::vector<double> x_, z_, w_;

  public:
    Summary2dRobustRegression() : iterations_(3) {}
    Summary2dRobustRegression(int iterations) : iterations_(iterations) {}

    void push(double x, double z, double w) {
      x_.push_back(x);
      z_.push_back(z);
      w_.push_back(w);
    }

    double compute() {
      return simpleRobustRegression(x_, z_, w_, iterations_).alpha;
    }
};


================================================
FILE: src/condense-gen.r
================================================
library(whisker)

# Generate template specialisations for groupwise - these are the functions
# that are called from R.

summaries <- c(
  count = "Sum(0)",
  sum = "Sum(1)",
  mean = "Moments(1)",
  sd = "Moments(2)",
  median = "Median()"
)

template <- "
// [[Rcpp::export]]
List condense_{{name}}(const List& x, const NumericVector& z,
                       const NumericVector& weight, bool drop = false) {
  if (drop) {
    return sparse_condense(BinnedVectors(x), z, weight, Summary{{summary}});
  } else {
    return condense(BinnedVectors(x), z, weight, Summary{{summary}});
  }
}
"

cpp_fun <- function(summary) {
  whisker.render(template, list(
    name = tolower(summary),
    summary = summaries[[summary]]
  ))
}


groupwise <- readLines("condense.cpp")
split <- which(grepl("// -{40,}", groupwise))[1]
original <- groupwise[1:split]

writeLines(original, "condense.cpp")

cat("// Autogenerated by condense-gen.r\n", file = "condense.cpp", append = TRUE)
funs <- unlist(lapply(names(summaries), cpp_fun))
cat(funs, file = "condense.cpp", append = TRUE, sep = "")


================================================
FILE: src/condense.cpp
================================================
#include <Rcpp.h>
#include <bigvis.h>
#include "group.h"
#include "summary.h"

template<typename Stat>
List condense(const BinnedVectors& group, const NumericVector& z, 
                        const NumericVector& weight, const Stat& stat) {
  int n_obs = group.size();
  int n_bins = group.nbins();

  const NumericVector& weight_ = (weight.size() > 0) ? weight : 
    rep(NumericVector::create(1), n_obs);
  const NumericVector& z_ = (z.size() > 0) ? z : 
    rep(NumericVector::create(1), n_obs);

  // Push values into stats
  std::vector<Stat> stats(n_bins, stat);
  for(int i = 0; i < n_obs; ++i) {
    int bin = group.bin_i(i);
    // Rcout << "i: " << i << " bin: " << bin << "\n"; 
    stats.at(bin).push(z_[i], weight_[i]);      
  }

  // Compute values from stats and determine bins
  int n_stats = stat.size();
  int n_groups = group.ngroups();
  NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups);

  for (int i = 0; i < n_bins; ++i) {
    for (int j = 0; j < n_stats; ++j) {
      out(i, j) = stats[i].compute(j);
    }

    std::vector<double> bins = group.unbin(i);
    for (int j = 0; j < n_groups; ++j) {
      bin(i, j) = bins[j];
    }
  }

  // Name 
  CharacterVector out_cols(n_stats), bin_cols(n_groups);
  for (int j = 0; j < n_stats; ++j) {
    out_cols[j] = stat.name(j);
  }
  for (int j = 0; j < n_groups; ++j) {
    bin_cols[j] = group.name(j);
  }
  out.attr("dimnames") = List::create(CharacterVector::create(), out_cols);
  bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols);

  return List::create(bin, out);
}

template<typename Stat>
List sparse_condense(const BinnedVectors& group, const NumericVector& z, 
                        const NumericVector& weight, const Stat& stat) {
  int n_obs = group.size();

  const NumericVector& weight_ = (weight.size() > 0) ? weight : 
    rep(NumericVector::create(1), n_obs);
  const NumericVector& z_ = (z.size() > 0) ? z : 
    rep(NumericVector::create(1), n_obs);

  // Push values into stats
  typename std::map<int, Stat> stats;
  for(int i = 0; i < n_obs; ++i) {
    int bin = group.bin_i(i);
    
    typename std::map<int, Stat>::iterator loc = stats.find(bin);
    if (loc == stats.end()) {
      Stat new_stat(stat);
      new_stat.push(z_[i], weight_[i]);
      stats.insert(std::pair<int, Stat>(bin, new_stat));
    } else {
      (loc->second).push(z_[i], weight_[i]);
    }
  }

  // Compute values from stats and determine bins
  int n_bins = stats.size();
  int n_stats = stat.size();
  int n_groups = group.ngroups();
  NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups);

  typename std::map<int, Stat>::iterator stats_it = stats.begin(),
    stats_end = stats.end();


  for (int i = 0; stats_it != stats_end; ++stats_it, ++i) {
    for (int j = 0; j < n_stats; ++j) {
      out(i, j) = (stats_it->second).compute(j);
    }

    std::vector<double> bins = group.unbin(stats_it->first);
    for (int j = 0; j < n_groups; ++j) {
      bin(i, j) = bins[j];
    }
  }

  // Name 
  CharacterVector out_cols(n_stats), bin_cols(n_groups);
  for (int j = 0; j < n_stats; ++j) {
    out_cols[j] = stat.name(j);
  }
  for (int j = 0; j < n_groups; ++j) {
    bin_cols[j] = group.name(j);
  }
  out.attr("dimnames") = List::create(CharacterVector::create(), out_cols);
  bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols);

  return List::create(bin, out);
}

// -----------------------------------------------------------------------------
// Autogenerated by condense-gen.r

// [[Rcpp::export]]
List condense_count(const List& x, const NumericVector& z,
                       const NumericVector& weight, bool drop = false) {
  if (drop) {
    return sparse_condense(BinnedVectors(x), z, weight, SummarySum(0));
  } else {
    return condense(BinnedVectors(x), z, weight, SummarySum(0));
  }
}

// [[Rcpp::export]]
List condense_sum(const List& x, const NumericVector& z,
                       const NumericVector& weight, bool drop = false) {
  if (drop) {
    return sparse_condense(BinnedVectors(x), z, weight, SummarySum(1));
  } else {
    return condense(BinnedVectors(x), z, weight, SummarySum(1));
  }
}

// [[Rcpp::export]]
List condense_mean(const List& x, const NumericVector& z,
                       const NumericVector& weight, bool drop = false) {
  if (drop) {
    return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(1));
  } else {
    return condense(BinnedVectors(x), z, weight, SummaryMoments(1));
  }
}

// [[Rcpp::export]]
List condense_sd(const List& x, const NumericVector& z,
                       const NumericVector& weight, bool drop = false) {
  if (drop) {
    return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(2));
  } else {
    return condense(BinnedVectors(x), z, weight, SummaryMoments(2));
  }
}

// [[Rcpp::export]]
List condense_median(const List& x, const NumericVector& z,
                       const NumericVector& weight, bool drop = false) {
  if (drop) {
    return sparse_condense(BinnedVectors(x), z, weight, SummaryMedian());
  } else {
    return condense(BinnedVectors(x), z, weight, SummaryMedian());
  }
}


================================================
FILE: src/double-diff-sum.cpp
================================================
#include <Rcpp.h>
#include <iostream>
#include <algorithm>
using namespace Rcpp;

// Efficiently compute \sum \sum abs(x_i - x_j) for binned data
// 
// It's effectively equivalent to this R code on the ungrouped observations
// bin <- trunc(x / bw)
// diffs <- abs(outer(bin, bin, "-"))
// tabulate(diffs + 1)
// 
// [[Rcpp::export]]
std::vector<int> double_diff_sum(IntegerVector bin, IntegerVector count) {
  int n = bin.size();
  std::vector<int> out;

  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      int pos = abs(bin[i] - bin[j]);

      if (pos + 1 > out.size()) {
        out.resize(pos + 1);
      }
      out[pos] += count[i] * count[j];
    }
  }

  return out;
}


================================================
FILE: src/frange.cpp
================================================
#include <Rcpp.h>
using namespace Rcpp;

//' Efficient implementation of range.
//'
//' This is an efficient C++ implementation of range for numeric vectors:
//' it avoids S3 dispatch, and computes both min and max in a single pass
//' through the input.
//'
//' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}}
//' object), it will be used instead of computing the range from scratch.
//' 
//' @param x a numeric vector, or a \code{\link{ranged}} object
//' @param finite If \code{TRUE} ignores missing values and infinities. Note
//'   that if the vector is empty, or only contains missing values, 
//'   \code{frange} will return \code{c(Inf, -Inf)} because those are the
//'   identity values for \code{\link{min}} and \code{\link{max}} respectively.
//' @export
//' @examples
//' x <- runif(1e6)
//' system.time(range(x))
//' system.time(frange(x))
//'
//' rx <- ranged(x)
//' system.time(frange(rx))
// [[Rcpp::export]]
NumericVector frange(const NumericVector& x, const bool finite = true) {
  RObject cache = x.attr("range");
  if (cache.sexp_type() == REALSXP) return as<NumericVector>(cache);

  NumericVector out(2);
  out[0] = INFINITY;
  out[1] = -INFINITY;

  int n = x.length();
  for(int i = 0; i < n; ++i) {
    if (!finite && R_IsNA(x[i])) {
      out[0] = NA_REAL;
      out[1] = NA_REAL;
      return out;
    }

    // If finite, skip infinite values
    if (finite && (x[i] == INFINITY || x[i] == -INFINITY)) continue;

    if (x[i] < out[0]) out[0] = x[i];
    if (x[i] > out[1]) out[1] = x[i];
  }

  return out;
}


================================================
FILE: src/group-hex.h
================================================
/*
 * Translated from
 * https://github.com/d3/d3-plugins/blob/master/hexbin/hexbin.js
 *
 * Copyright (C) 2013 Hadley Wickham
 * Copyright (C) 2012 Mike Bostock (mbostock at gmail dot com)
 */
class GroupHex {
    const NumericVector x_;
    const NumericVector y_;
    double x_width_;
    double x_origin_;
    double y_width_;
    double y_origin_;
    double x_bins;

  public:
    GroupHex (const NumericVector& x, const NumericVector& y, 
                double x_width, double y_width, 
                double x_origin, double y_origin, 
                double x_max)
       : x_(x), y_(y), x_width_(x_width), x_origin_(x_origin), 
          y_width_(y_width), y_origin_(y_origin) {
      if (x.size() != y.size()) stop("x & y are not the same size");
      x_bins = x_max / x_width_ + 1;
    }

    int bin_i(int i) const {
      double py = ISNAN(y_[i]) ? 0 : (y_[i] - y_origin_) / y_width_ + 1;
      int pj = py;
      double py1 = py - pj;
      
      double px = ISNAN(x_[i]) ? 0 : (x_[i] - x_origin_) / x_width_ + 1 - 
        (pj % 2 ? 0.5 : 0);
      int pi = px;

       if (fabs(py1) * 3 > 1) {
        double px1 = px - pi,
               pi2 = pi + (px < pi ? -1 : 1) / 2,
               pj2 = pj + (py < pj ? -1 : 1),
               px2 = px - pi2,
               py2 = py - pj2;
        if (px1 * px1 + py1 * py1 > px2 * px2 + py2 * py2) {
          pi = pi2 + (pj % 2 ? 1 : -1) / 2;
          pj = pj2;
        }
      }

      return pj * x_bins + pj;
    }

    int size() const {
      return x_.size();
    }
};


================================================
FILE: src/group.cpp
================================================
#include <Rcpp.h>
#include "group.h"
#include "group-hex.h"
using namespace Rcpp;

template<typename Group>
IntegerVector group_out(const Group& group) {
  int n = group.size();
  IntegerVector out(n);
  for(int i = 0; i < n; ++i) {
    out[i] = group.bin_i(i);
  }

  return out;
}

RCPP_MODULE(Group) {
  class_<GroupFixed>("GroupFixed")
  .constructor<NumericVector, double, double>()
  .const_method("bin_i", &GroupFixed::bin_i)
  .const_method("bin", &GroupFixed::bin)
  .const_method("unbin", &GroupFixed::unbin)

  .const_method("size", &GroupFixed::size)
  .const_method("nbins", &GroupFixed::nbins)

  .const_method("origin", &GroupFixed::origin)
  .const_method("width", &GroupFixed::width)
  ;
}
RCPP_EXPOSED_AS(GroupFixed)
RCPP_EXPOSED_WRAP(GroupFixed)


// [[Rcpp::export]]
IntegerVector group_fixed(const NumericVector& x, double width, double origin = 0) {
  return group_out(GroupFixed(x, width, origin));
}

// [[Rcpp::export]]
IntegerVector group_rect(const NumericVector& x, const NumericVector& y, 
                         double x_width, double y_width,
                         double x_origin, double y_origin) {
  return group_out(Group2d<GroupFixed>(
    GroupFixed(x, x_width, x_origin), 
    GroupFixed(y, y_width, y_origin)));
}


// [[Rcpp::export]]
IntegerVector group_hex(const NumericVector& x, const NumericVector& y, 
                         double x_width, double y_width,
                         double x_origin, double y_origin,
                         double x_max) {
  return group_out(GroupHex(x, y, x_width, y_width, x_origin, y_origin, x_max));
}


================================================
FILE: src/group.h
================================================
#include <Rcpp.h>
using namespace Rcpp;

NumericVector frange(const NumericVector& x, const bool finite = true);

class GroupFixed {
    const NumericVector x_;
    double width_;
    double origin_;
  public:
    GroupFixed (NumericVector x, double width, double origin = 0)
       : x_(x), width_(width), origin_(origin) {
    }

    int bin_i(int i) const {
      if (ISNAN(x_[i]) || x_[i] == INFINITY || x_[i] == -INFINITY) return 0;
      if (x_[i] < origin_) return 0;
      
      return bin(x_[i]);
    }

    int bin(double x) const {
      return (x - origin_) / width_ + 1;
    }

    double unbin(int bin) const {
      if (bin == 0) return(NAN);
      return (bin - 1) * width_ + origin_;
    }

    double origin() const {
      return origin_;
    }
    double width() const {
      return width_;
    }


    int size() const {
      return x_.size();
    }

    int nbins() const {
      double max = frange(x_)(1);
      double dest = floor((max - origin_) / width_) * width_ + origin_;

      // + 1 for missing values
      // + 1 if highest value is on right-open boundary
      return (dest - origin_) / width_ + 1 + ((max >= dest) ? 1 : 0);
    }

};

template<typename Group>
class Group2d {
    const Group& x_;
    const Group& y_;
    int x_bins_;
    int y_bins_;

  public:
    Group2d (const Group& x, const Group& y) : x_(x), y_(y) {
      if (x_.size() != y_.size()) {
        stop("x and y are not equal sizes");
      }
      x_bins_ = x_.nbins();
      y_bins_ = y_.nbins();

      // Rcout << "x_bins: " << x_bins_ << " y_bins: " << y_bins_ << "\n";
    }

    int bin_i(int i) const {
      int x_bin = x_.bin_i(i), y_bin = y_.bin_i(i);
      int bin = y_bin * x_bins_ + x_bin;
      // Rcout << i << ": (" << x_bin << "," << y_bin << ") -> " << bin << "\n";
      return bin;
    }

    int size() const {
      return x_.size();
    }

    int nbins() const {
      return x_bins_ * y_bins_;
    }
};


template<typename Group>
class GroupNd {
    const std::vector<Group> groups_;
    const int ngroups_;

    int size_;
    std::vector<int> bins_;

  public:
    GroupNd (const std::vector<Group> groups) 
        : groups_(groups), ngroups_(groups.size()) {
      if (groups.size() == 0) {
        stop("Empty groups vector passed to GroupCompound");
      }

      size_ = groups[0].size();

      bins_[0] = 1;
      for (int i = 0; i < ngroups_ - 1; ++i) {
        if (groups_[i].size() != size_) stop("Groups not equal sizes");

        bins_[i + 1] = bins_[i] * groups_[i].nbins(); 
      }
    }

    int bin_i(int i) const {
      int bin = 0;

      for (int j = 0; j < ngroups_; ++j) {
        bin += groups_[j].bin(i) * bins_[j];
      }

      return bin;
    }

    // int nbins() const {
    //   return bins_[ngroups_ - 1];
    // }

    int ngroups() const {
      return groups_.size();
    }

    int size() const {
      return size_;
    }

    std::vector<double> unbin(int bin) const {
      std::vector<double> bins(ngroups_);

      for (int j = 0; j < ngroups_; ++j) {
        int bin_j = bin % bins_[j];
        bins[j] = groups_[j].unbin(bin_j);

        bin = bin - bin * bins_[j];
      }

      return bins;
    }

};


================================================
FILE: src/lowerBound.cpp
================================================
#include <algorithm>
#include <Rcpp.h>
using namespace Rcpp;

// Quick and dirty implementation of lowerBound, the complement to R's
// findInterval
// [[Rcpp::export]]
IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks) {
  int n = x.size();
  IntegerVector out(n);

  for (int i = 0; i < n; i++) {
    NumericVector::const_iterator it =
      std::lower_bound(breaks.begin(), breaks.end(), x[i]);
    if (it == breaks.end()) --it;
    out[i] = it - breaks.begin() + 1;
  }
  return out;
}


================================================
FILE: src/smooth-nd.cpp
================================================
#include <algorithm>
#include <Rcpp.h>
#include "group.h"
#include "Summary2d.h"
#include <boost/shared_ptr.hpp>
using namespace Rcpp;

boost::shared_ptr<Summary2d> createSummary(std::string type) {
  if (type == "mean") {
    return boost::shared_ptr<Summary2d>(new Summary2dMean());
  } else if (type == "regression") {
    return boost::shared_ptr<Summary2d>(new Summary2dRegression());
  } else if (type == "robust_regression") {
    return boost::shared_ptr<Summary2d>(new Summary2dRobustRegression());
  } else {
    stop("Unknown type");
    // Quiet warning
    return boost::shared_ptr<Summary2d>(new Summary2dMean());
  }
}

double tricube(double x) {
  if (NumericVector::is_na(x)) return 0;
  x = fabs(x);
  if (x > 1) return 0;

  double y = 1 - x * x * x;
  return y * y * y;
}

bool both_na(double x, double y) {
  return (NumericVector::is_na(x) && NumericVector::is_na(y));
}

// [[Rcpp::export]]
NumericVector smooth_nd_1(const NumericMatrix& grid_in, 
                          const NumericVector& z_in, 
                          const NumericVector& w_in_,
                          const NumericMatrix& grid_out, 
                          const int var, const double h,
                          const std::string type = "mean") {

  if (var < 0) stop("var < 0");
  if (var >= grid_in.ncol()) stop("var too large");
  if (h <= 0) stop("h <= 0");
  if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes");

  int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol();
  NumericVector w_in = (w_in_.size() > 0) ? w_in_ : 
    rep(NumericVector::create(1), n_in);
  NumericVector z_out(n_out), w_out(n_out);

  // Will be much more efficient to slice up by input dimension:
  // and most efficient way of doing that will be to bin with / bw
  // My data structure: sparse grids
  // 
  // And once we're smoothing in one direction, with guaranteed e2venly spaced
  // grid can avoid many kernel evaluations and can also compute more
  // efficient running sum

  for(int j = 0; j < n_out; ++j) {
    boost::shared_ptr<Summary2d> summary = createSummary(type);
    for (int i = 0; i < n_in; ++i) {
      // Check that all variables (apart from var) are equal
      bool equiv = true;
      for (int k = 0; k < d; ++k) {
        if (k == var) continue;

        double in = grid_in(i, k), out = grid_out(j, k);
        if (in != out && !both_na(in, out)) {
          equiv = false;
          break;
        }
      };
      if (!equiv) continue;

      double in = grid_in(i, var), out = grid_out(j, var);
      double dist = both_na(in, out) ? 0 : in - out;
      double w = tricube(dist / h) * w_in[i];
      if (w == 0) continue;

      summary->push(dist, z_in[i], w);
    }
    z_out[j] = summary->compute();
  }

  return z_out;
}

// [[Rcpp::export]]
NumericVector smooth_nd(const NumericMatrix& grid_in, 
                        const NumericVector& z_in, 
                        const NumericVector& w_in_,
                        const NumericMatrix& grid_out, 
                        const NumericVector h) {

  if (grid_in.nrow() != z_in.size()) stop("Incompatible input lengths");
  if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes");
  if (h.size() != grid_in.ncol()) stop("Incorrect h length");

  int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol();
  NumericVector w_in = (w_in_.size() > 0) ? w_in_ : 
    rep(NumericVector::create(1), n_in);
  NumericVector z_out(n_out), w_out(n_out);

  for (int i = 0; i < n_in; ++i) {
    for(int j = 0; j < n_out; ++j) {
      double w = 1;
      for (int k = 0; k < d; ++k) {
        double dist = (grid_in(i, k) - grid_out(j, k)) / h[k];
        w *= tricube(dist);
      }
      w *= w_in[i];

      w_out[j] += w;
      z_out[j] += z_in[i] * w;
    }
  }

  for(int j = 0; j < n_out; ++j) {
    z_out[j] /= w_out[j];
  }

  return z_out;
}


================================================
FILE: src/stats.cpp
================================================
#include <Rcpp.h>
using namespace Rcpp;

struct Regression {
  double alpha, beta;
};

// [[Rcpp::export]]
double bisquare(double u, double b) {
  u = fabs(u);
  return (u < b) ? pow(1 - pow(u / b, 2), 2) : 0;
}

Regression simpleLinearRegression(const std::vector<double>& x, 
                                  const std::vector<double>& y,
                                  const std::vector<double>& w) {
  int n = x.size();

  double x_wsum = 0, y_wsum = 0, w_sum = 0;
  for (int i = 0; i < n; ++i) {
    x_wsum += x[i] * w[i];
    y_wsum += y[i] * w[i];
    w_sum += w[i];
  };
  double x_mean = x_wsum / w_sum, y_mean = y_wsum / w_sum;

  double var_xy = 0, var_x = 0;
  for (int i = 0; i < n; ++i) {
    var_xy += w[i] * (x[i] - x_mean) * (y[i] - y_mean);
    var_x += w[i] * pow((x[i] - x_mean), 2);
  }

  Regression results;
  results.beta = (var_xy / var_x);
  results.alpha = y_mean - results.beta * x_mean;
  return results;
}

// [[Rcpp::export]]
NumericVector regress(const std::vector<double>& x, 
                      const std::vector<double>& y,
                      const std::vector<double>& w) {
  Regression regression = simpleLinearRegression(x, y, w);
  return NumericVector::create(regression.alpha, regression.beta);
}

double median(std::vector<double>* x) {
  if (x->empty()) return NAN;

  int size = x->size();
  std::vector<double>::iterator upper = x->begin() + (int) (size / 2);
  std::nth_element(x->begin(), upper, x->end());

  if (size % 2 == 1) {
    return *upper;
  } else {
    std::vector<double>::iterator lower = upper - 1;
    std::nth_element(x->begin(), lower, upper);
    return (*upper + *lower) / 2.0;
  }  
}

// [[Rcpp::export("medianC")]]
double median(const std::vector<double>& x) {
  std::vector<double> x_(x);
  return median(&x_);
}


Regression simpleRobustRegression(const std::vector<double>& x, 
                       const std::vector<double>& y,
                       const std::vector<double>& w,
                       int iterations = 3) {
  int n = x.size();
  Regression prev = simpleLinearRegression(x, y, w);

  for (int k = 0; k < iterations; ++k) {
    std::vector<double> resid(n);
    for (int i = 0; i < n; ++i) {
      resid[i] = fabs(y[i] - (prev.alpha + prev.beta * x[i]));
    }

    std::vector<double> w_(w);
    double b = 6 * median(resid);
    if (b < 1e-20) break;
    for (int i = 0; i < n; ++i) {
      w_[i] *= bisquare(resid[i], b);
    }

    prev = simpleLinearRegression(x, y, w_);
  }

  return prev;
}

// [[Rcpp::export]]
NumericVector regress_robust(const std::vector<double>& x, 
                             const std::vector<double>& y,
                             const std::vector<double>& w,
                             int iterations = 3) {
  Regression regression = simpleRobustRegression(x, y, w, iterations);
  return NumericVector::create(regression.alpha, regression.beta);
}


================================================
FILE: src/stats.h
================================================
struct Regression {
  double alpha, beta;
};

double bisquare(double u, double b);

Regression simpleLinearRegression(const std::vector<double>& x, 
                                  const std::vector<double>& y,
                                  const std::vector<double>& w);

Regression simpleRobustRegression(const std::vector<double>& x, 
                       const std::vector<double>& y,
                       const std::vector<double>& w,
                       int iterations = 3);

double median(const std::vector<double>& x);
double median(std::vector<double>* x);


================================================
FILE: src/summary.cpp
================================================
#include <Rcpp.h>
#include "summary.h"
using namespace Rcpp;

template<typename Summary>
NumericVector summary_compute(const NumericVector& x, Summary summary) {
  int n = x.size();
  for(int i = 0; i < n; ++i) {
    summary.push(x[i], 1);
  }

  int m = summary.size();
  NumericVector out(m);
  for(int i = 0; i < m; ++i) {
    out[i] = summary.compute(i);
  }

  return out;
}

// [[Rcpp::export]]
NumericVector compute_moments(const NumericVector& x) {
  return summary_compute(x, SummaryMoments(2));
}

// [[Rcpp::export]]
NumericVector compute_sum(const NumericVector& x) {
  return summary_compute(x, SummarySum(1));
}

// [[Rcpp::export]]
NumericVector compute_median(const NumericVector& x) {
  return summary_compute(x, SummaryMedian());
}


================================================
FILE: src/summary.h
================================================
#include <Rcpp.h>
#include "stats.h"
using namespace Rcpp;

class SummaryMoments {
    int i_;
    double weight;
    double mean;
    double m2;

  public:
    SummaryMoments (int i) : i_(i), weight(0), mean(0), m2(0) {
      if (i > 2) stop("Invalid moment");
    }

    // Algorithm adapted from 
    // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm
    void push(double y, double w) {
      if (NumericVector::is_na(y)) return;

      // counts and weights
      weight += w;

      // mean
      if (i_ < 1) return;
      double delta = y - mean;
      mean += delta * w / weight;

      // variance
      if (i_ < 2) return;      
      m2 += delta * delta * w * (1 - w / weight);

      return;
    }

    const int size() const {
      return i_ + 1;
    }

    double compute(int i) const {
      switch (i) {
        case 0: return weight;
        case 1: return (weight == 0) ? NAN : mean;
        case 2: return (weight == 0) ? NAN : pow(m2 / (weight - 1), 0.5);
        default: 
          stop("Invalid output requested");
          return NAN;
      }
    }

    std::string name(int i) const {
      switch (i) {
        case 0: return "count";
        case 1: return "mean";
        case 2: return "sd";
        default: 
          stop("Invalid output requested");
          return "";
      }
    }
};

class SummarySum {
    int i_;
    int weight;
    double sum;

  public:
    SummarySum (int i) : i_(i), weight(0), sum(0) {
      if (i > 1 || i < 0) stop("Invalid moment");
    }

    void push(double y, double w) {
      if (NumericVector::is_na(y)) return;

      weight += w;
      if (i_ < 1) return;

      sum += y * w;
    }

    const int size() const {
      return i_ + 1;
    }

    double compute(int i) const  {
      switch (i) {
        case 0: return weight;
        case 1: return sum;
        default: 
          stop("Invalid output requested");
          return NAN;
      }
    }

    std::string name(int i) const {
      switch (i) {
        case 0: return "count";
        case 1: return "sum";
        default: 
          stop("Invalid output requested");
          return "";
      }
    }

};

class SummaryMedian {
    std::vector<double> ys;

  public:
    void push(double y, double w) {
      if (NumericVector::is_na(y)) return;

      ys.push_back(y);
    }

    int size() const {
      return 1;
    }

    // Adapted from http://stackoverflow.com/questions/1719070/
    double compute(int i) {
      return median(&ys);
    }

    std::string name(int i) const {
      return "median";
    }
};