[
  {
    "path": ".Rbuildignore",
    "content": "bench\nnotes.md\n^.*\\.Rproj$\n^\\.Rproj\\.user$\n^\\.travis\\.yml$\n^src/condense-gen\\.r$\n"
  },
  {
    "path": ".gitignore",
    "content": ".Rproj.user\n.Rhistory\n.RData\nsrc/*.o\nsrc/*.so\nsrc/*.dll\n"
  },
  {
    "path": ".travis.yml",
    "content": "# Sample .travis.yml for R projects\n\nlanguage: r\nwarnings_are_errors: true\nsudo: required\n\nr_github_packages:\n  - jimhester/covr\nafter_success:\n  - Rscript -e 'covr::codecov()'\n"
  },
  {
    "path": "DESCRIPTION",
    "content": "Package: bigvis\nVersion: 0.1.0.9000\nTitle: Tools for visualisation of big data sets\nDescription: Tools for visualising large datasets.\nAuthors@R: c(\n    person(\"Hadley\", \"Wickham\", role = c(\"aut\", \"cre\"), , \"hadley@rstudio.com\"),\n    person(\"Yue\", \"Hue\", role = \"aut\"),\n    person(\"R Core team\", role = \"ctb\", comment = \"guess_bandwidth adapted from stats::bw.SJ\")\n    )\nDepends:\n    Rcpp\nImports:\n    methods\nSuggests:\n    plyr,\n    ggplot2,\n    scales\nLazyData: true\nLinkingTo: \n    Rcpp, \n    BH\nLicense: GPL (>= 2)\nCollate:\n    'standardise.r'\n    'movies.r'\n    'RcppExports.R'\n    'adjust.r'\n    'ranged.r'\n    'bigvis.r'\n    'rebin.r'\n    'autoplot.r'\n    'origin.r'\n    'utils.r'\n    'breaks.r'\n    'weighted-stats.r'\n    'condense.r'\n    'condensed.r'\n    'bin.r'\n    'smooth.r'\n    'challenge.r'\n    'peel.r'\n    'id.r'\n    'rmse.r'\n    'width.r'\n    'h.r'\n    'mt.r'\n    'dgrid.r'\n"
  },
  {
    "path": "NAMESPACE",
    "content": "# Generated by roxygen2 (4.1.1): do not edit by hand\n\nS3method(\"[\",dgrid)\nS3method(\"[<-\",ranged)\nS3method(Math,condensed)\nS3method(Ops,condensed)\nS3method(Ops,ranged)\nS3method(as.condensed,condensed)\nS3method(as.condensed,data.frame)\nS3method(as.data.frame,dgrid)\nS3method(as.data.frame,ranged)\nS3method(as.integer,dgrid)\nS3method(max,dgrid)\nS3method(max,ranged)\nS3method(min,dgrid)\nS3method(min,ranged)\nS3method(print,ranged)\nS3method(range,dgrid)\nS3method(range,ranged)\nS3method(str,ranged)\nS3method(transform,condensed)\nexport(as.condensed)\nexport(autoplot.condensed)\nexport(best_h)\nexport(bin)\nexport(breaks)\nexport(condense)\nexport(dchallenge)\nexport(dgrid)\nexport(find_origin)\nexport(find_width)\nexport(frange)\nexport(h_grid)\nexport(inv_mt)\nexport(is.condensed)\nexport(is.dgrid)\nexport(is.ranged)\nexport(mt)\nexport(mt_trans)\nexport(peel)\nexport(ranged)\nexport(rchallenge)\nexport(rebin)\nexport(rmse_cv)\nexport(rmse_cvs)\nexport(round_any.condensed)\nexport(smooth)\nexport(standardise)\nexport(weighted.IQR)\nexport(weighted.ecdf)\nexport(weighted.median)\nexport(weighted.quantile)\nexport(weighted.sd)\nexport(weighted.var)\nexportMethods(as.integer)\nexportMethods(show)\nimportFrom(Rcpp,compileAttributes)\nimportFrom(Rcpp,cpp_object_initializer)\nimportFrom(methods,new)\nuseDynLib(bigvis)\n"
  },
  {
    "path": "R/RcppExports.R",
    "content": "# This file was generated by Rcpp::compileAttributes\n# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393\n\ncondense_count <- function(x, z, weight, drop = FALSE) {\n    .Call('bigvis_condense_count', PACKAGE = 'bigvis', x, z, weight, drop)\n}\n\ncondense_sum <- function(x, z, weight, drop = FALSE) {\n    .Call('bigvis_condense_sum', PACKAGE = 'bigvis', x, z, weight, drop)\n}\n\ncondense_mean <- function(x, z, weight, drop = FALSE) {\n    .Call('bigvis_condense_mean', PACKAGE = 'bigvis', x, z, weight, drop)\n}\n\ncondense_sd <- function(x, z, weight, drop = FALSE) {\n    .Call('bigvis_condense_sd', PACKAGE = 'bigvis', x, z, weight, drop)\n}\n\ncondense_median <- function(x, z, weight, drop = FALSE) {\n    .Call('bigvis_condense_median', PACKAGE = 'bigvis', x, z, weight, drop)\n}\n\ndouble_diff_sum <- function(bin, count) {\n    .Call('bigvis_double_diff_sum', PACKAGE = 'bigvis', bin, count)\n}\n\n#' Efficient implementation of range.\n#'\n#' This is an efficient C++ implementation of range for numeric vectors:\n#' it avoids S3 dispatch, and computes both min and max in a single pass\n#' through the input.\n#'\n#' If \\code{x} has a \\code{range} attribute (e.g. it's a \\code{\\link{ranged}}\n#' object), it will be used instead of computing the range from scratch.\n#' \n#' @param x a numeric vector, or a \\code{\\link{ranged}} object\n#' @param finite If \\code{TRUE} ignores missing values and infinities. Note\n#'   that if the vector is empty, or only contains missing values, \n#'   \\code{frange} will return \\code{c(Inf, -Inf)} because those are the\n#'   identity values for \\code{\\link{min}} and \\code{\\link{max}} respectively.\n#' @export\n#' @examples\n#' x <- runif(1e6)\n#' system.time(range(x))\n#' system.time(frange(x))\n#'\n#' rx <- ranged(x)\n#' system.time(frange(rx))\nfrange <- function(x, finite = TRUE) {\n    .Call('bigvis_frange', PACKAGE = 'bigvis', x, finite)\n}\n\ngroup_fixed <- function(x, width, origin = 0) {\n    .Call('bigvis_group_fixed', PACKAGE = 'bigvis', x, width, origin)\n}\n\ngroup_rect <- function(x, y, x_width, y_width, x_origin, y_origin) {\n    .Call('bigvis_group_rect', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin)\n}\n\ngroup_hex <- function(x, y, x_width, y_width, x_origin, y_origin, x_max) {\n    .Call('bigvis_group_hex', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin, x_max)\n}\n\nlowerBound <- function(x, breaks) {\n    .Call('bigvis_lowerBound', PACKAGE = 'bigvis', x, breaks)\n}\n\nsmooth_nd_1 <- function(grid_in, z_in, w_in_, grid_out, var, h, type = \"mean\") {\n    .Call('bigvis_smooth_nd_1', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, var, h, type)\n}\n\nsmooth_nd <- function(grid_in, z_in, w_in_, grid_out, h) {\n    .Call('bigvis_smooth_nd', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, h)\n}\n\nbisquare <- function(u, b) {\n    .Call('bigvis_bisquare', PACKAGE = 'bigvis', u, b)\n}\n\nregress <- function(x, y, w) {\n    .Call('bigvis_regress', PACKAGE = 'bigvis', x, y, w)\n}\n\nmedianC <- function(x) {\n    .Call('bigvis_median', PACKAGE = 'bigvis', x)\n}\n\nregress_robust <- function(x, y, w, iterations = 3L) {\n    .Call('bigvis_regress_robust', PACKAGE = 'bigvis', x, y, w, iterations)\n}\n\ncompute_moments <- function(x) {\n    .Call('bigvis_compute_moments', PACKAGE = 'bigvis', x)\n}\n\ncompute_sum <- function(x) {\n    .Call('bigvis_compute_sum', PACKAGE = 'bigvis', x)\n}\n\ncompute_median <- function(x) {\n    .Call('bigvis_compute_median', PACKAGE = 'bigvis', x)\n}\n\n"
  },
  {
    "path": "R/adjust.r",
    "content": "# Protect against floating point areas by slightly adjusting breaks.\n# Adapted from graphics::hist.default.\nadjust_breaks <- function(breaks, open = \"right\") {\n  open <- match.arg(open, c(\"left\", \"right\"))\n\n  breaks <- sort(breaks)\n  diddle <- 1e-07 * median(diff(breaks))\n  if (open == \"left\") {\n    fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1))\n  } else {\n    fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle)\n  }\n  breaks + fuzz\n}\n"
  },
  {
    "path": "R/autoplot.r",
    "content": "#' Autoplot condensed summaries.\n#'\n#' @param x a condensed summary\n#' @param var which summary variable to display\n#' @param ... other arguments passed on to individual methods\n#' @method autoplot condensed\n#' @export autoplot.condensed\n#' @examples\n#' if (require(\"ggplot2\")) {\n#'\n#' # 1d summaries -----------------------------\n#' x <- rchallenge(1e4)\n#' z <- x + rt(length(x), df = 2)\n#' xsum <- condense(bin(x, 0.1))\n#' zsum <- condense(bin(x, 0.1), z = z)\n#'\n#' autoplot(xsum)\n#' autoplot(peel(xsum))\n#'\n#' autoplot(zsum)\n#' autoplot(peel(zsum, keep = 1))\n#' autoplot(peel(zsum))\n#'\n#' # 2d summaries -----------------------------\n#' y <- runif(length(x))\n#' xysum <- condense(bin(x, 0.1), bin(y, 0.1))\n#' xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z)\n#'\n#' autoplot(xysum)\n#' autoplot(peel(xysum))\n#' autoplot(xyzsum)\n#' autoplot(peel(xyzsum))\n#' }\nautoplot.condensed <- function(x, var = last(summary_vars(x)), ...) {\n  stopifnot(is.condensed(x))\n  stopifnot(is.character(var), length(var) == 1)\n  summaries <- c(\n    .count = \"total\",\n    .sum = \"total\",\n    .mean = \"summary\",\n    .sd = \"summary\",\n    .median = \"summary\"\n  )\n  if (!(var %in% names(summaries))) {\n    stop(\"Unknown varible\", call. = FALSE)\n  }\n  d <- gcol(x)\n  if (d > 2) {\n    stop(\"No autoplot methods available for more than two d\")\n  }\n\n  f <- paste0(\"plot_\", summaries[var], \"_\", d)\n  find_fun(f)(x, var = var, ...)\n}\n\n\nplot_total_1 <- function(x, var = \".count\", show_na = TRUE, log = \"\") {\n  xvar <- names(x)[[1]]\n\n  plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) +\n    ggplot2::geom_line(na.rm = TRUE)\n\n  if (show_na) {\n    plot <- plot + na_layer(x, var)\n  }\n\n  if (logv(log, \"y\")) {\n    plot <- plot + ggplot2::scale_y_continuous(trans = \"log1p\")\n  }\n  if (logv(log, \"x\")) {\n    plot <- plot + ggplot2::scale_x_log10()\n  }\n\n  plot\n}\n\nplot_total_2 <- function(x, var = \".count\", show_na = TRUE, log = \"\") {\n  x <- peel(x, keep = 1)\n  xvar <- names(x)[[1]]\n  yvar <- names(x)[[2]]\n  miss <- is.na(x[[1]]) + 2 * is.na(x[[2]])\n\n  fill_trans <- if (logv(log, \"z\")) \"log1p\" else \"identity\"\n\n  plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) +\n    ggplot2::geom_raster(ggplot2::aes_string(fill = var)) +\n    ggplot2::scale_fill_gradient(low = \"grey90\", high = \"black\", trans = fill_trans) +\n    ggplot2::expand_limits(fill = 0)\n\n  if (show_na) {\n  }\n\n  plot <- plot + if (logv(log, \"x\")) ggplot2::scale_x_log10()\n  plot <- plot + if (logv(log, \"y\")) ggplot2::scale_y_log10()\n\n  plot\n}\n\nplot_summary_1 <- function(x, var = \".mean\", show_na = TRUE,\n                                    show_n = x %contains% \".count\", log = NULL) {\n  xvar <- names(x)[[1]]\n\n  plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) +\n    ggplot2::geom_line(na.rm = TRUE) +\n    ggplot2::scale_size_area()\n\n  if (show_n) {\n    plot <- plot +\n      ggplot2::geom_point(ggplot2::aes_string(color = \".count\"), na.rm = TRUE) +\n      ggplot2::scale_colour_gradient(trans = \"log10\")\n  }\n\n  if (show_na) {\n    plot <- plot + na_layer(x, var)\n  }\n\n  plot\n}\n\nplot_summary_2 <- function(x, var = \".mean\", show_na = TRUE, log = \"\") {\n  x <- peel(x, keep = 1)\n  xvar <- names(x)[[1]]\n  yvar <- names(x)[[2]]\n\n  miss <- is.na(x[[1]]) + 2 * is.na(x[[2]])\n\n  plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) +\n    ggplot2::geom_tile(ggplot2::aes_string(fill = var)) +\n    ggplot2::scale_fill_gradient2()\n\n  if (show_na) {\n  }\n\n  plot <- plot + if (logv(log, \"x\")) ggplot2::scale_x_log10()\n  plot <- plot + if (logv(log, \"y\")) ggplot2::scale_y_log10()\n\n  plot\n}\n\nna_layer <- function(x, var) {\n  val <- x[[var]][is.na(x[[1]])]\n  if (length(val) == 0 || is.na(val) || val == 0) return()\n\n  xloc <- miss_poss(x[[1]])\n  ggplot2::annotate(\"text\", x = xloc, y = val, colour = \"red\", label = \"NA\",\n    size = 3)\n}\n\nlogv <- function(log, var) var %in% strsplit(log, \"\")[[1]]\n\nmiss_poss <- function(x) {\n  rng <- frange(x)\n  rng[1] - (rng[2] - rng[1]) * 0.05\n}\n"
  },
  {
    "path": "R/bigvis.r",
    "content": "#' The big vis package.\n#'\n#' @useDynLib bigvis\n#' @docType package\n#' @name bigvis\nNULL\n\nif (!exists(\"BigVis\")) {\n  BigVis <- Rcpp::Module(\"BigVis\")\n}\n\n\n#' @param x,object,... Generic args\n#' @rdname bigvis\n#' @export\nsetMethod(\"show\", \"Rcpp_BinnedVector\", function(object) {\n  cat(\"Binned [\", object$size(), \"]. \",\n    \"Width: \", object$width(), \" Origin: \", object$origin(), \"\\n\", sep = \"\")\n})\n\n#' @rdname bigvis\n#' @export\nsetMethod(\"as.integer\", \"Rcpp_BinnedVector\", function(x, ...) {\n  vapply(seq_len(x$size()), x$bin_i, integer(1))\n})\n\n\n\n# Silence R CMD check note\n#' @importFrom methods new\n#' @importFrom Rcpp compileAttributes cpp_object_initializer\nNULL\n\n"
  },
  {
    "path": "R/bin.r",
    "content": "\n#' Create a binned variable.\n#'\n#' @details\n#' This function produces an R reference class that wraps around a C++ function.\n#' Generally, you should just treat this as an opaque object with reference\n#' semantics, and you shouldn't call the methods on it - pass it to\n#' \\code{\\link{condense}} and friends.\n#'\n#' @param x numeric or integer vector\n#' @param width bin width. If not specified, about 10,000 bins will be chosen\n#'   using the algorithim in \\code{\\link{find_width}}.\n#' @param origin origin. If not specified, guessed by \\code{\\link{find_origin}}.\n#' @param name name of original variable. This will be guessed from the input to\n#'   \\code{group} if not supplied. Used in the output of\n#'   \\code{\\link{condense}} etc.\n#' @export\n#' @examples\n#' x <- runif(1e6)\n#' bin(x)\n#' bin(x, 0.01)\n#' bin(x, 0.01, origin = 0.5)\nbin <- function(x, width = find_width(x), origin = find_origin(x, width),\n                name = NULL) {\n  stopifnot(is.numeric(x))\n  stopifnot(is.numeric(width), length(width) == 1, width > 0)\n  stopifnot(is.numeric(origin), length(origin) == 1)\n\n  if (is.null(name)) {\n    name <- deparse(substitute(x))\n  }\n  stopifnot(is.character(name), length(name) == 1)\n\n  if (!is.ranged(x)) {\n    attr(x, \"range\") <- frange(x)\n    class(x) <- \"ranged\"\n  }\n  if (origin > min(x)) {\n    warning(\"Origin larger than min(x): some values will be truncated\",\n      call. = FALSE)\n  }\n\n  BigVis$BinnedVector$new(x, name, width, origin)\n}\n\n\nis.binned <- function(x) {\n  is(x, \"Rcpp_BinnedVector\")\n}\n\nbins <- function(...) {\n  BigVis$BinnedVectors$new(list(...))\n}\n\n"
  },
  {
    "path": "R/breaks.r",
    "content": "#' Compute breaks given origin and width.\n#'\n#' Breaks are right-open, left-closed [x, y), so if \\code{max(x)} is an integer\n#' multiple of binwidth, then we need one more break. This function only returns\n#' the left-side of the breaks.\n#'\n#' The first break is special, because it always contains missing values.\n#'\n#' @param x numeric vector\n#' @param origin bin origin\n#' @param binwidth bin width\n#' @export\n#' @keywords internal\n#' @examples\n#' breaks(10, origin = 0, binwidth = 1)\n#' breaks(9.9, origin = 0, binwidth = 1)\n#'\n#' breaks(1:10, origin = 0, binwidth = 2)\nbreaks <- function(x, binwidth, origin = min(x)) {\n  if (!is.binned(x)) {\n    x <- bin(x, binwidth, origin)\n  }\n\n  # -1 for NA bin, -1 since R is 1 indexed\n  nbins <- x$nbins() - 2\n  c(NA, x$origin() + seq.int(1, nbins) * x$width())\n}\n"
  },
  {
    "path": "R/challenge.r",
    "content": "#' Density and random number generation functions for a challenging\n#' distribution.\n#'\n#' This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom\n#' centered at 15 and scaled by 2, and a gamma distribution with shape 2\n#' and rate 1/3. (The t-distribution is windsorised at 0, but this\n#' has negligible effect.) This distribution is challenging because it\n#' mixes heavy tailed and asymmetric distributions.\n#'\n#' @param x values to evaluate pdf at\n#' @param n number of random samples to generate\n#' @export\n#' @examples\n#' plot(dchallenge, xlim = c(-5, 60), n = 500)\n#'\n#' x <- rchallenge(1e4)\n#' hist(x, breaks = 1000)\n#' xsum <- condense(bin(x, 0.1))\n#' plot(xsum$x, xsum$.count, type = \"l\")\n#' xsmu <- smooth(xsum, 0.3)\n#' plot(xsmu$x, xsmu$.count, type = \"l\")\n#' plot(xsmu$x, xsmu$.count, type = \"l\", xlim = c(0, 30))\ndchallenge <- function(x) {\n  # Windorised t-distribution\n  scale <- function(x) (x - 30) / 2\n  spike <- ifelse(x < 0, 0, dt(scale(x), df = 2)) +\n    pt(scale(0), df = 2) * (x == 0)\n\n  slope <- dgamma(x, 2, 1/3)\n\n  (spike + 2 * slope) / 3\n}\n\n# plot(pchallenge, xlim = c(-5, 60), n = 500)\npchallenge <- function(x) {\n  #  H(y) =\n  #  = int_0^y h(x) dx\n  #  = int_0^y 1/3 f(x) + 2/3 g(x) dx\n  #  = 1/3 int_0^y f(x) dx + 2/3 int_0^y g(x) dx =\n  #  = 1/3 F(y) + 2/3 G(y)\n\n  # h(x) = g((x - 30) / 2)\n  # H(y) = int_0^y g((x - 30) / 2) dx\n  # complete transformation\n\n  scale <- function(x) (x - 30) / 2\n  spike <- ifelse(x < 0, 0, pt(scale(x), df = 2))\n\n  slope <- pgamma(x, 2, 1/3)\n\n  (spike + 2 * slope) / 3\n}\n\nqchallenge <- function(x) {\n  # approximate pchallenge with 1000 points, and linearise\n  # use to implement fast option to rchallenge that does inverse pdf\n  # transformation + runif()\n}\n\n\n#' @rdname dchallenge\n#' @export\nrchallenge <- function(n) {\n  nt <- rbinom(1, n, 1 / 3)\n  ngamma <- n - nt\n\n  spike <- 2 * rt(nt, df = 2) + 15\n  spike[spike < 0] <- 0\n\n  slope <- rgamma(ngamma, 2, 1/3)\n\n  c(spike, slope)\n}\n\n"
  },
  {
    "path": "R/condense.r",
    "content": "#' Efficient binned summaries.\n#'\n#' @param ... group objects created by \\code{\\link{bin}}\n#' @param z a numeric vector to summary for each group. Optional for some\n#'   summary statistics.\n#' @param summary the summary statistic to use. Currently must be one of\n#'   count, sum, mean, median or sd. If \\code{NULL}, defaults to mean if\n#'   y is present, count if not.\n#' @param w a vector of weights. Not currently supported by all summary\n#'   functions.\n#' @param drop if \\code{TRUE} only locations with data will be returned.  This\n#'   is more efficient if the data is very sparse (<1\\% of cells filled), and\n#'   is slightly less efficient. Defaults to \\code{TRUE} if you are condensing\n#'   over two or more dimensions, \\code{FALSE} for 1d.\n#' @export\n#' @examples\n#' x <- runif(1e5)\n#' gx <- bin(x, 0.1)\n#' condense(gx)\ncondense <- function(..., z = NULL, summary = NULL, w = NULL, drop = NULL) {\n  gs <- list(...)\n  if (length(gs) == 1 && is.list(gs[[1]])) gs <- gs[[1]]\n\n  is_binned <- vapply(gs, is.binned, logical(1))\n  if (!all(is_binned)) {\n    stop(\"All objects passed to ... must be binned.\", call. = FALSE)\n  }\n\n  drop <- drop %||% (length(gs) > 1)\n\n  if (is.null(summary)) {\n    summary <- if (is.null(z)) \"count\" else \"mean\"\n    message(\"Summarising with \", summary)\n  }\n\n  # C++ code can deal with NULL inputs more efficiently than R code\n  z <- z %||% numeric()\n  w <- w %||% numeric()\n\n  # Check lengths consistent\n  n <- gs[[1]]$size()\n  stopifnot(length(z) == 0 || length(z) == n)\n  stopifnot(length(w) == 0 || length(w) == n)\n\n  f <- find_fun(paste(\"condense\", summary, sep = \"_\"))\n  out <- f(gs, z, w, drop = drop)\n\n  condensed(gs, out[[1]], out[[2]])\n}\n"
  },
  {
    "path": "R/condensed.r",
    "content": "#' Condensed: an S3 class for condensed summaries.\n#'\n#' This object managed the properties of condensed (summarised) data frames.\n#'\n#' @section S3 methods:\n#'\n#' Mathematical functions with methods for \\code{binsum} object will modify\n#' the x column of the data frame and \\code{\\link{rebin}} the data, calculating\n#' updated summary statistics.\n#'\n#' Currently methods are provided for the \\code{Math} group generic,\n#' logical comparison and arithmetic operators, and\n#' \\code{\\link[plyr]{round_any}}.\n#'\n#' @param groups list of \\code{\\link{bin}}ed objects\n#' @param grouped,summary output from C++ condense function\n#' @keywords internal\n#' @examples\n#' if (require(\"ggplot2\")) {\n#'\n#' x <- rchallenge(1e4)\n#' xsum <- condense(bin(x, 1 / 10))\n#'\n#' # Basic math operations just modify the first column\n#' autoplot(xsum)\n#' autoplot(xsum * 10)\n#' autoplot(xsum - 30)\n#' autoplot(abs(xsum - 30))\n#'\n#' # Similarly, logical operations work on the first col\n#' autoplot(xsum[xsum > 10, ])\n#'}\ncondensed <- function(groups, grouped, summary) {\n  grouped <- as.data.frame(grouped)\n  summary <- as.data.frame(summary)\n\n  for (i in seq_along(groups)) {\n    grouped[[i]] <- dgrid(grouped[[i]],\n      groups[[i]]$width(), groups[[i]]$origin(), groups[[i]]$nbins())\n  }\n\n  names(summary) <- paste0(\".\", names(summary))\n\n  df <- data.frame(grouped, summary)\n  class(df) <- c(\"condensed\", class(df))\n  df\n}\n\n#' @export\n#' @rdname condensed\n#' @param x object to test or coerce\nis.condensed <- function(x) inherits(x, \"condensed\")\n\n#' @export\n#' @rdname condensed\nas.condensed <- function(x) UseMethod(\"as.condensed\")\n#' @export\nas.condensed.condensed <- function(x) x\n#' @export\nas.condensed.data.frame <- function(x) {\n  structure(x, class = c(\"condensed\", class(x)))\n}\n\nsummary_vars <- function(x) {\n  stopifnot(is.condensed(x))\n  nm <- names(x)\n  names(x)[grepl(\"^\\\\.\", names(x))]\n}\n\ngroup_vars <- function(x) {\n  setdiff(names(x), summary_vars(x))\n}\n\ngcol <- function(x) length(group_vars(x))\n\n\n#' @export\nMath.condensed <- function(x, ...) {\n  generic <- match.fun(.Generic)\n  x[[1]] <- generic(x[[1]], ...)\n  rebin(x)\n}\n\n#' @export\nOps.condensed <- function(e1, e2) {\n  logical_ops <- c(\"==\", \"!=\", \"<\", \"<=\", \">=\", \">\")\n  math_ops <- c(\"+\", \"-\", \"*\", \"/\", \"^\", \"%%\", \"%/%\")\n\n  generic <- match.fun(.Generic)\n  if (.Generic %in% logical_ops)  {\n    l <- generic(e1[[1]], e2)\n    l[1] <- TRUE # always preserve missings\n    l & !is.na(l)\n  } else if (.Generic %in% math_ops) {\n    e1[[1]] <- generic(e1[[1]], e2)\n    rebin(e1)\n  } else {\n    stop(.Generic, \" not supported for condensed objects\", call. = FALSE)\n  }\n}\n\n#' Round any method for condensed objects\n#'\n#' @inheritParams plyr::round_any\n#' @export\n#' @keywords internal\nround_any.condensed <- function(x, accuracy, f = round) {\n  gvars <- group_vars(x)\n  x[gvars] <- lapply(x[gvars], plyr::round_any, accuracy = accuracy, f = f)\n  rebin(x)\n}\n"
  },
  {
    "path": "R/dgrid.r",
    "content": "#' dgrid: an S3 class for data grids\n#'\n#' @param x a numeric vector to test or coerce.\n#' @param width bin width\n#' @param origin bin origins\n#' @param nbins number of bins\n#' @export\n#' @examples\n#' g <- dgrid(0:10 + 0.5, width = 1)\n#' range(g)\n#' as.integer(g)\ndgrid <- function(x, width, origin = 0, nbins = NULL) {\n  stopifnot(is.numeric(x))\n  stopifnot(is.numeric(width), length(width) == 1, width > 0)\n  stopifnot(is.numeric(origin), length(origin) == 1)\n\n  if (is.null(nbins)) {\n    nbins <- floor((max(x) - origin) / width)\n  }\n\n  structure(x, class = c(\"dgrid\", \"numeric\"),\n    width = width, origin = origin, nbins = nbins)\n}\n\n#' @export\n#' @rdname dgrid\nis.dgrid <- function(x) inherits(x, \"dgrid\")\n\n#' @export\n\"[.dgrid\" <- function(x, ...) {\n  dgrid(NextMethod(), width = attr(x, \"width\"),\n    origin = attr(x, \"origin\"), nbins = attr(x, \"nbins\"))\n}\n\n#' @export\nmin.dgrid <- function(x, ...) attr(x, \"origin\")\n#' @export\nmax.dgrid <- function(x, ...) {\n  min(x) + attr(x, \"nbins\") * attr(x, \"width\")\n}\n#' @export\nrange.dgrid <- function(x, ...) c(min(x), max(x))\n\n#' @export\nas.integer.dgrid <- function(x, ...) {\n  as.integer((unclass(x) - attr(x, \"origin\")) / attr(x, \"width\") + 1L)\n}\n\n#' @export\nas.data.frame.dgrid <- function(x, ...) {\n  n <- length(x)\n  list <- list(x)\n  class(list) <- \"data.frame\"\n  attr(list, \"row.names\") <- c(NA_integer_, -n)\n  list\n}\n"
  },
  {
    "path": "R/h.r",
    "content": "#' Find \"best\" smoothing parameter using leave-one-out cross validation.\n#'\n#' Minimises the leave-one-out estimate of root mean-squared error to find\n#' find the \"optimal\" bandwidth for smoothing.\n#'\n#' L-BFGS-B optimisation is used to constrain the bandwidths to be greater\n#' than the binwidths: if the bandwidth is smaller than the binwidth it's\n#' impossible to compute the rmse because no smoothing occurs. The tolerance\n#' is set relatively high for numerical optimisation since the precise choice\n#' of bandwidth makes little difference visually, and we're unlikely to have\n#' sufficient data to make a statistically significant choice anyway.\n#'\n#' @param x condensed summary to smooth\n#' @param h_init initial values of bandwidths to start search out. If not\n#'  specified defaults to 5 times the binwidth of each variable.\n#' @param ... other arguments (like \\code{var}) passed on to\n#'  \\code{\\link{rmse_cv}}\n#' @param tol numerical tolerance, defaults to 1\\%.\n#' @param control additional control parameters passed on to \\code{\\link{optim}}\n#'   The most useful argument is probably trace, which makes it possible to\n#'   follow the progress of the optimisation.\n#' @family bandwidth estimation functions\n#' @return a single numeric value representing the bandwidth that minimises\n#'   the leave-one-out estimate of rmse. Vector has attributes\n#'   \\code{evaluations} giving the number of times the objective function\n#'   was evaluated. If the optimisation does not converge, or smoothing is not\n#'   needed (i.e. the estimate is on the lower bounds), a warning is thrown.\n#' @export\n#' @examples\n#' \\donttest{\n#' x <- rchallenge(1e4)\n#' xsum <- condense(bin(x, 1 / 10))\n#' h <- best_h(xsum, control = list(trace = 3, REPORT = 1))\n#'\n#' if (require(\"ggplot2\")) {\n#' autoplot(xsum)\n#' autoplot(smooth(xsum, h))\n#' }\n#' }\nbest_h <- function(x, h_init = NULL, ..., tol = 1e-2, control = list()) {\n  stopifnot(is.condensed(x))\n\n  gvars <- group_vars(x)\n  widths <- vapply(x[gvars], attr, \"width\", FUN.VALUE = numeric(1))\n  h_init <- h_init %||% widths * 5\n  stopifnot(is.numeric(h_init), length(h_init) == length(gvars))\n\n  stopifnot(is.list(control))\n  control <- modifyList(list(factr = tol / .Machine$double.eps), control)\n\n  # Optimise\n  rmse <- function(h) {\n    rmse_cv(x, h, ...)\n  }\n  res <- optim(h_init, rmse, method = \"L-BFGS-B\", lower = widths * 1.01,\n    control = control)\n  h <- unname(res$par)\n\n  # Feedback\n  if (res$convergence != 0) {\n    warning(\"Failed to converge: \", res$message, call. = FALSE)\n  } else if (rel_dist(h, widths) < 1e-3) {\n    warning(\"h close to lower bound: smoothing not needed\", call. = FALSE)\n  }\n  structure(h, evaluations = res$counts[1], conv = res$convergence)\n}\n\nrel_dist <- function(x, y) {\n  mean(abs(x - y) / abs(x + y))\n}\n\n#' Generate grid of plausible bandwidths for condensed summary.\n#'\n#' By default, the bandwidths start at the bin width, and then continue\n#' up 50 (\\code{n}) steps until 20 (\\code{max}) times the bin width.\n#'\n#' @param x a condensed summary\n#' @param n number of bandwidths to generate (in each dimension)\n#' @param max maximum bandwidth to generate, as multiple of binwidth.\n#' @family bandwidth estimation functions\n#' @export\n#' @examples\n#' x <- rchallenge(1e4)\n#' xsum <- condense(bin(x, 1 / 10))\n#' h_grid(xsum)\n#'\n#' y <- runif(1e4)\n#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))\n#' h_grid(xysum, n = 10)\nh_grid <- function(x, n = 50, max = 20) {\n  stopifnot(is.condensed(x))\n  stopifnot(is.numeric(n), length(n) == 1, n > 0)\n  stopifnot(is.numeric(max), length(max) == 1, max > 0)\n\n  gs <- x[group_vars(x)]\n  widths <- vapply(gs, attr, \"width\", FUN.VALUE = numeric(1))\n\n  hs <- lapply(widths, function(w) w * seq(2, max, length = n))\n  expand.grid(hs, KEEP.OUT.ATTRS = FALSE)\n}\n"
  },
  {
    "path": "R/id.r",
    "content": "# Copied and pasted from plyr to avoid dependency\n\nid <- function(.variables, drop = FALSE) {\n  # Drop all zero length inputs\n  lengths <- vapply(.variables, length, integer(1))\n  .variables <- .variables[lengths != 0]\n\n  if (length(.variables) == 0) {\n    n <- nrow(.variables) %||% 0L\n    return(structure(seq_len(n), n = n))\n  }\n\n  # Special case for single variable\n  if (length(.variables) == 1) {\n    return(id_var(.variables[[1]], drop = drop))\n  }\n\n  # Calculate individual ids\n  ids <- rev(lapply(.variables, id_var, drop = drop))\n  p <- length(ids)\n\n  # Calculate dimensions\n  ndistinct <- vapply(ids, attr, \"n\", FUN.VALUE = numeric(1),\n    USE.NAMES = FALSE)\n  n <- prod(ndistinct)\n  if (n > 2 ^ 31) {\n    # Too big for integers, have to use strings, which will be much slower :(\n\n    char_id <- do.call(\"paste\", c(ids, sep = \"\\r\"))\n    res <- match(char_id, unique(char_id))\n  } else {\n    combs <- c(1, cumprod(ndistinct[-p]))\n\n    mat <- do.call(\"cbind\", ids)\n    res <- c((mat - 1L) %*% combs + 1L)\n  }\n  attr(res, \"n\") <- n\n\n\n  if (drop) {\n    id_var(res, drop = TRUE)\n  } else {\n    structure(as.integer(res), n = attr(res, \"n\"))\n  }\n}\n\nid_var <- function(x, drop = FALSE) {\n  if (length(x) == 0) return(structure(integer(), n = 0L))\n  if (!is.null(attr(x, \"n\")) && !drop) return(x)\n\n  if (is.factor(x) && !drop) {\n    id <- as.integer(addNA(x, ifany = TRUE))\n    n <- length(levels(x))\n  } else {\n    levels <- sort(unique(x), na.last = TRUE)\n    id <- match(x, levels)\n    n <- max(id)\n  }\n  structure(id, n = n)\n}\n"
  },
  {
    "path": "R/movies.r",
    "content": "#' Movie information and user ratings from IMDB.com.\n#'\n#' The internet movie database, \\url{http://imdb.com/}, is a website devoted\n#' to collecting movie data supplied by studios and fans.  It claims to be the\n#' biggest movie database on the web and is run by amazon.  More about\n#' information imdb.com can be found online, \n#' \\url{http://imdb.com/help/show_leaf?about}, including information about \n#' the data collection process,\n#' \\url{http://imdb.com/help/show_leaf?infosource}.\n#' \n#' Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user.  The data set contains the following fields:\n#' \n#' \\itemize{\n#'   \\item title.  Title of the movie.\n#'   \\item year.  Year of release.\n#'   \\item budget.  Total budget (if known) in US dollars\n#'   \\item length.  Length in minutes.\n#'   \\item rating.  Average IMDB user rating.\n#'   \\item votes.  Number of IMDB users who rated this movie.\n#'   \\item mpaa.  MPAA rating.\n#'   \\item action, animation, comedy, drama, documentary, romance, short:\n#'     \\code{TRUE} if movie belongs to that genre.\n#' }\n#' \n#' @docType data\n#' @usage data(movies)\n#' @name movies\n#' @format A data frame with 130,456 rows and 14 variables\n#' @references \\url{http://had.co.nz/data/movies/}\nNULL\n"
  },
  {
    "path": "R/mt.r",
    "content": "#' Modulus transformation (and its inverse).\n#'\n#' A generalisation of the box-cox transformation that works for\n#' values with both positive and negative values.\n#'\n#' This is useful for compressing the tails of long-tailed distributions,\n#' often encountered with very large datasets.\n#'\n#' @param x values to transform\n#' @param lambda degree of transformation\n#' @export\n#' @references J. John and N. Draper. \"An alternative family of\n#'  transformations.\" Applied Statistics, pages 190-197, 1980.\n#'  \\url{http://www.jstor.org/stable/2986305}\n#' @examples\n#' x <- seq(-10, 10, length = 100)\n#' plot(x, mt(x, 0), type = \"l\")\n#' plot(x, mt(x, 0.25), type = \"l\")\n#' plot(x, mt(x, 0.5), type = \"l\")\n#' plot(x, mt(x, 1), type = \"l\")\n#' plot(x, mt(x, 2), type = \"l\")\n#' plot(x, mt(x, -1), type = \"l\")\n#' plot(x, mt(x, -2), type = \"l\")\nmt <- function(x, lambda) {\n  stopifnot(is.numeric(x))\n  stopifnot(is.numeric(lambda), length(lambda) == 1)\n\n  if (lambda == 0) {\n    sign(x) * log(abs(x) + 1)\n  } else {\n    sign(x) * ((abs(x) + 1) ^ lambda - 1) / lambda\n   }\n}\n\n#' @rdname mt\n#' @export\ninv_mt <- function(x, lambda) {\n  stopifnot(is.numeric(x))\n  stopifnot(is.numeric(lambda), length(lambda) == 1)\n\n  if (lambda == 0) {\n    sign(x) * (exp(abs(x)) - 1)\n  } else {\n    sign(x) * ((abs(x) * lambda + 1) ^ (1 / lambda) - 1)\n   }\n}\n\n#' @rdname mt\n#' @export\nmt_trans <- function(lambda) {\n  scales::trans_new(\"modulo\",\n    function(x) mt(x, lambda),\n    function(x) inv_mt(x, lambda)\n  )\n}\n"
  },
  {
    "path": "R/origin.r",
    "content": "#' Find the origin.\n#'\n#' @details\n#' This algorithm implements simple heuristics for determining the origin of\n#' a histogram when only the binwidth is specified. It:\n#'\n#' \\itemize{\n#'    \\item rounds to zero, if relatively close\n#'    \\item subtracts 0.5 offset, if an x is integer\n#'    \\item ensures the origin is a multiple of the binwidth\n#' }\n#' @param x numeric or integer vector\n#' @param binwidth binwidth\n#' @export\n#' @keywords internal\n#' @family reasonable defaults\n#' @examples\n#' find_origin(1:10, 1)\n#' find_origin(1:10, 2)\n#' find_origin(c(1, 1e6), 1)\nfind_origin <- function(x, binwidth) {\n  rng <- frange(x, finite = TRUE)\n  if (!all(is.finite(rng))) stop(\"No valid values in x\", call. = FALSE)\n\n  offset <- is.integer(x) * 0.5\n\n  if (close_to_zero(rng[1], rng)) {\n    0 - offset\n  } else {\n    floor_any(rng[1], binwidth) - offset\n  }\n}\n\nclose_to_zero <- function(x, rng) {\n  (abs(x) / abs(rng[2] - rng[1])) < 1e-3\n}\n\nfloor_any <- function(x, accuracy) {\n  floor(x / accuracy) * accuracy\n}\n"
  },
  {
    "path": "R/peel.r",
    "content": "#' Peel off low density regions of the data.\n#'\n#' Keeps specified proportion of data by removing the lowest density regions,\n#' either anywhere on the plot, or for 2d, just around the edges.\n#'\n#' This is useful for visualisation, as an easy way of focussing on the regions\n#' where the majority of the data lies.\n#'\n#' @param x condensed summary\n#' @param keep (approximate) proportion of data to keep. If \\code{1}, will\n#'   remove all cells with counts.  All missing values will be preserved.\n#' @param central if \\code{TRUE} peels off regions of lowest density only from\n#'   the outside of the data. In 2d this works by progressively peeling off\n#'   convex hull of the data: the current algorithm is quite slow.\n#'   If \\code{FALSE}, just removes the lowest density regions wherever they are\n#'   found. Regions with 0 density are removed regardless of location.\n#'   Defaults to TRUE if there are two or fewer grouping variables is less.\n#' @export\n#' @examples\n#' x <- rt(1e5, df = 2)\n#' y <- rt(1e5, df = 2)\n#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10))\n#' plot(xysum$x, xysum$y)\n#'\n#' plot(peel(xysum, 0.95, central = TRUE)[1:2])\n#' plot(peel(xysum, 0.90, central = TRUE)[1:2])\n#' plot(peel(xysum, 0.50, central = TRUE)[1:2])\npeel <- function(x, keep = 0.99, central = NULL) {\n  stopifnot(is.condensed(x))\n  stopifnot(is.numeric(keep), length(keep) == 1, keep > 0, keep <= 1)\n  central <- central %||% (gcol(x) <= 2)\n  stopifnot(is.logical(central), length(central) == 1)\n\n  if (is.null(x$.count)) {\n    stop(\"Can only peel objects with .count variable\", call. = FALSE)\n  }\n\n  x <- x[x$.count > 0, , drop = FALSE]\n  if (keep == 1) return(x)\n\n  complete <- complete.cases(x[group_vars(x)])\n  x_complete <- x[complete, , drop = FALSE]\n\n  if (central) {\n    peeled <- peel_outer(x_complete, keep)\n  } else {\n    peeled <- peel_anywhere(x_complete, keep)\n  }\n\n  rbind(peeled, x[!complete, , drop = FALSE])\n}\n\npeel_anywhere <- function(x, keep) {\n  ord <- order(x$.count, decreasing = TRUE)\n  prop <- cumsum(x$.count[ord]) / sum(x$.count)\n\n  ind <- which(prop >= keep)[1]\n  x[ord[seq_len(ind)], , drop = FALSE]\n}\n\npeel_outer <- function(x, keep) {\n  d <- gcol(x)\n  if (d > 2) {\n    stop(\"Outer peeling only works with 1d or 2d data\", call. = FALSE)\n  }\n\n  n <- sum(x$.count)\n  x <- x[order(x$.count, decreasing = TRUE), ]\n  prop <- cumsum(x$.count) / n\n\n  # Peel off smallest values on chull\n  candidate <- which(prop >= keep)\n  on_hull <- intersect(candidate, chull(x[1:d]))\n  left <- sum(x$.count[-on_hull]) / n\n\n  while(left >= keep && length(on_hull) > 0) {\n    x <- x[-on_hull, , drop = FALSE]\n    prop <- prop[-on_hull]\n    candidate <- which(prop >= keep)\n    on_hull <- intersect(candidate, chull(x[1:d]))\n    left <- sum(x$.count[-on_hull]) / n\n  }\n\n  x\n}\n"
  },
  {
    "path": "R/ranged.r",
    "content": "#' A S3 class for caching the range of a vector\n#'\n#' This class is designed for dealing with large vectors, where the cost of\n#' recomputing the range multiple times is prohibitive. It provides methods\n#' for \\code{\\link{print}} and \\code{\\link{str}} that display only the range,\n#' not the contents.\n#'\n#' @section Performance:\n#' For best performance, you may want to run copy and paste the contents of\n#' this function into your function, to avoid making any copies of \\code{x}.\n#' This is probably only necessary if you're dealing with extremely large\n#' vectors, > 100 million obs.\n#'\n#' @param x a numeric vector\n#' @param range the range of the vector (excluding missing values), if known.\n#'   If unknown, it will be computed with \\code{\\link{frange}}, a fast C++\n#'   implementation of \\code{\\link{range}}.\n#' @export\n#' @examples\n#' x <- runif(1e6)\n#' y <- ranged(x)\n#' range(y)\n#' y\n#' str(y)\n#'\n#' # Modifications to the class currently destroy the cache\n#' y[1] <- 10\n#' max(y)\n#' class(y)\n#' z <- y + 10\n#' max(z)\n#' class(z)\nranged <- function(x, range = frange(x, finite = TRUE)) {\n  stopifnot(is.numeric(x))\n\n  # Reset range attribute so that lazy evaluation of range\n  # always recomputes from scratch\n  attr(x, \"range\") <- NULL\n\n  attr(x, \"range\") <- range\n  class(x) <- \"ranged\"\n  x\n}\n\n#' Test if an object is of class ranged.\n#'\n#' @export\n#' @param x object to test\n#' @keywords internal\nis.ranged <- function(x) inherits(x, \"ranged\")\n\n#' @export\nmin.ranged <- function(x, ...) attr(x, \"range\")[1]\n#' @export\nmax.ranged <- function(x, ...) attr(x, \"range\")[2]\n#' @export\nrange.ranged <- function(x, ...) attr(x, \"range\")\n\n#' @export\nprint.ranged <- function(x, ...) {\n  rng <- attr(x, \"range\")\n  # attr(x, \"range\") <- NULL\n  # attr(x, \"class\") <- NULL\n  # print.default(x)\n  cat(\"Ranged 1:\", length(x), \" [\", format(rng[1]), \", \", format(rng[2]), \"]\\n\",\n    sep = \"\")\n}\n\n#' @export\nstr.ranged <- function(object, ...) {\n  rng <- attr(object, \"range\")\n  cat(\" Ranged [1:\", length(object), \"] \", format(rng[1]), \"--\", format(rng[2]),\n    \"\\n\", sep = \"\")\n}\n\n#' @export\nOps.ranged <- function(e1, e2) {\n  attr(e1, \"range\") <- NULL\n  class(e1) <- NULL\n\n  NextMethod(e1, e2)\n}\n\n#' @export\n\"[<-.ranged\" <- function(x, ..., value) {\n  attr(x, \"range\") <- NULL\n  attr(x, \"class\") <- NULL\n  NextMethod(x, ..., value = value)\n}\n\n#' @export\nas.data.frame.ranged <- function(x, ...) {\n  n <- length(x)\n  x <- list(x)\n  class(x) <- \"data.frame\"\n  attr(x, \"row.names\") <- c(NA_integer_, -n)\n\n  x\n}\n"
  },
  {
    "path": "R/rebin.r",
    "content": "#' Transform condensed objects, collapsing unique bins.\n#'\n#' @details\n#' You don't need to use \\code{rebin} if you use transform: it will\n#' automatically rebin for you.  You will need to use it if you manually\n#' transform any grouping variables.\n#'\n#' @param data,`_data` a condensed summary\n#' @param ... named arguments evaluated in the context of the data\n#' @usage \\\\method{transform}{condensed}(`_data`, ...)\n#' @keywords internal\n#' @examples\n#' x <- runif(1e4, -1, 1)\n#' xsum <- condense(bin(x, 1 / 50))\n#'\n#' # Transforming by hand: must use rebin\n#' xsum$x <- abs(xsum$x)\n#' rebin(xsum)\n#' if (require(\"ggplot2\")) {\n#'   autoplot(xsum) + geom_point()\n#'   autoplot(rebin(xsum)) + geom_point()\n#' }\n#'\n#' #' Transforming with transform\n#' y <- x ^ 2 + runif(length(x), -0.1, 0.1)\n#' xysum <- condense(bin(x, 1 / 50), z = y)\n#' xysum <- transform(xysum, x = abs(x))\n#' if (require(\"ggplot2\")) {\n#'   autoplot(xysum)\n#' }\n#' @export\n#' @method transform condensed\ntransform.condensed <- function(`_data`, ...) {\n  df <- transform.data.frame(`_data`, ...)\n  rebin(as.condensed(df))\n}\n\n#' @export\n#' @rdname transform.condensed\nrebin <- function(data) {\n  stopifnot(is.condensed(data))\n\n  old_g <- data[group_vars(data)]\n  old_g[] <- lapply(old_g, zapsmall, digits = 3)\n  ids <- id(old_g, drop = TRUE)\n  if (!anyDuplicated(ids)) return(data)\n\n  old_s <- data[summary_vars(data)]\n  new_s <- lapply(names(old_s), function(var) rebin_var(old_s, ids, var))\n  names(new_s) <- names(old_s)\n\n  uids <- !duplicated(ids)\n  new_g <- old_g[uids, , drop = FALSE]\n  ord <- order(ids[uids], na.last = FALSE)\n\n  as.condensed(data.frame(new_g[ord, , drop = FALSE], new_s))\n}\n\nrebin_var <- function(df, ids, var) {\n  stopifnot(is.data.frame(df))\n  stopifnot(is.integer(ids), length(ids) == nrow(df))\n  stopifnot(is.character(var), length(var) == 1, var %in% names(rebinners))\n\n  rows <- split(seq_len(nrow(df)), ids)\n  f <- rebinners[[var]]\n\n  vapply(rows, function(i) f(df[i, , drop = FALSE]), numeric(1),\n    USE.NAMES = FALSE)\n}\n\nrebinners <- list(\n  .median = function(df) mean(df$.median, na.rm = TRUE),\n  .sum = function(df) sum(df$.sum, na.rm = TRUE),\n  .count = function(df) sum(df$.count, na.rm = TRUE),\n  .mean = function(df) {\n    if (is.null(df$.count)) {\n      mean(df$.mean, na.rm = TRUE)\n    } else {\n      weighted.mean(df$.mean, df$.count)\n    }\n  }\n)\n"
  },
  {
    "path": "R/rmse.r",
    "content": "#' Estimate smoothing RMSE using leave-one-out cross-validation.\n#'\n#' \\code{rmse_cv} computes the leave-one-out RMSE for a single vector of\n#' bandwidths, \\code{rmse_cvs} computes for a multiple vectors of bandwidths,\n#' stored as a data frame.\n#'\n#' @param x condensed summary table\n#' @param h,hs for \\code{rmse_cv}, a vector of bandwidths; for \\code{rmse_cv}\n#'    a data frame of bandwidths, as generated by \\code{\\link{h_grid}}.\n#' @param var variable to smooth\n#' @param ... other variables passed on to \\code{\\link{smooth}}\n#' @family bandwidth estimation functions\n#' @export\n#' @examples\n#' \\donttest{\n#' set.seed(1014)\n#' # 1d -----------------------------\n#' x <- rchallenge(1e4)\n#' xsum <- condense(bin(x, 1 / 10))\n#' cvs <- rmse_cvs(xsum)\n#'\n#' if (require(\"ggplot2\")) {\n#' autoplot(xsum)\n#' qplot(x, err, data = cvs, geom = \"line\")\n#' xsmu <- smooth(xsum, 1.3)\n#' autoplot(xsmu)\n#' autoplot(peel(xsmu))\n#' }\n#'\n#' # 2d -----------------------------\n#' y <- runif(1e4)\n#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))\n#' cvs <- rmse_cvs(xysum, h_grid(xysum, 10))\n#' if (require(\"ggplot2\")) {\n#' qplot(x, y, data = cvs, size = err)\n#' }\n#' }\nrmse_cvs <- function(x, hs = h_grid(x), ...) {\n  rmse_1 <- function(i) {\n    rmse_cv(x, as.numeric(hs[i, ]), ...)\n  }\n  err <- vapply(seq_len(nrow(hs)), rmse_1, numeric(1))\n  data.frame(hs, err)\n}\n\n#' @rdname rmse_cvs\n#' @export\nrmse_cv <- function(x, h, var = summary_vars(x)[1], ...) {\n  # can't smooth missing values, so drop.\n  x <- x[complete.cases(x), , drop = FALSE]\n  gvars <- group_vars(x)\n\n  pred_error <- function(i) {\n    out <- as.matrix(x[i, gvars, drop = FALSE])\n    smu <- smooth(x[-i, , drop = FALSE], grid = out, h = h, var = var, ...)\n    smu[[var]] - x[[var]][i]\n  }\n  err <- vapply(seq_len(nrow(x)), pred_error, numeric(1))\n  sqrt(mean(err ^ 2, na.rm = TRUE))\n}\n\n"
  },
  {
    "path": "R/smooth.r",
    "content": "#' Smooth a condensed data frame.\n#'\n#' @param x a condensed summary\n#' @param h numeric vector of bandwidths, one for each grouping variable in\n#'   \\code{x}\n#' @param var variable to smooth\n#' @param grid a data frame with the grouping colums as x.  In order for the\n#'   factored version of \\code{smooth_nd} to work, this grid must be a superset\n#'   of \\code{x}.\n#' @param type type of smoothing to use.  Current options are \\code{\"mean\"},\n#'   a kernel weighted mean; \\code{\"regression\"}, a kernel weighted local\n#'   regression; and \\code{\"robust_regression\"}, robust kernel weighted local\n#'   regression in the style of \\code{\\link{loess}}.  Unique prefixes are also\n#'   acceptable.\n#' @param factor if \\code{TRUE} compute the n-dimensional smooth by a sequence\n#'   of 1d smoothes. For \\code{type = \"mean\"} the results are always the same\n#    as \\code{FALSE}; for \\code{type = \"regress\"} they will be equal if the\n#'   grid values are uncorrelated (e.g. the grid is complete at every location);\n#'   and is very approximate for \\code{type = \"robust\"}.\n#' @export\n#' @examples\n#' x <- runif(1e5)\n#' xsum <- condense(bin(x, 1 / 100))\n#' xsmu1 <- smooth(xsum, 5 / 100)\n#' xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE)\n#'\n#' # More challenging distribution\n#' x <- rchallenge(1e4)\n#' xsum <- condense(bin(x, 0.1))\n#' xsmu <- smooth(xsum, 1)\n#'\n#' plot(xsum$x, xsum$.count, type = \"l\")\n#' lines(xsmu$x, xsmu$.count, col = \"red\")\n#'\n#' xsmu2 <- smooth(xsum, 1, type = \"regress\")\n#' plot(xsmu$x, xsmu$.count, type = \"l\", xlim = c(0, 50))\n#' lines(xsmu2$x, xsmu2$.count, col = \"red\")\n#' # Note difference in tails\nsmooth <- function(x, h, var = summary_vars(x)[1], grid = NULL, type = \"mean\",\n                   factor = TRUE) {\n  stopifnot(is.condensed(x))\n  stopifnot(is.numeric(h), all(h > 0))\n  type <- match.arg(type, c(\"mean\", \"regression\", \"robust_regression\"))\n\n  if (type != \"mean\" && !factor) {\n    stop(\"Only factored approximations available for types other than mean\",\n      call. = FALSE)\n  }\n\n  grid_in <- as.matrix(x[group_vars(x)])\n  grid_out <- grid %||% grid_in\n  stopifnot(is.matrix(grid_out), is.numeric(grid_out),\n    ncol(grid_out) == ncol(grid_in), nrow(grid_out) > 0)\n\n  z <- x[[var]]\n  w <- if (var != \".count\" && x %contains% \".count\") x$.count else numeric()\n\n  if (factor) {\n    for(i in 1:ncol(grid_in)) {\n      # smooth_nd_1 is a C++ function, so var is 0 indexed\n      z <- smooth_nd_1(grid_in, z, w, grid_out, var = i - 1, h = h[i],\n        type = type)\n    }\n  } else {\n    z <- smooth_nd(grid_in, z, w, grid_out, h)\n  }\n\n  out <- data.frame(grid_out)\n  out[[var]] <- z\n  structure(out, class = c(\"condensed\", class(out)))\n}\n\ncomplete_grid <- function(df) {\n  stopifnot(is.data.frame(df))\n\n  breaks <- function(width, origin, nbins) {\n    origin + width * seq.int(nbins)\n  }\n\n  cols <- lapply(df, function(x) do.call(breaks, attributes(x)))\n  expand.grid(cols, KEEP.OUT.ATTRS = FALSE)\n}\n"
  },
  {
    "path": "R/standardise.r",
    "content": "#' Standardise a summary to sum to one.\n#'\n#' @param x a condensed summary. Must have \\code{.count} variable.\n#' @param margin margins to standardise along.  If \\code{NULL}, the default,\n#'  standardises the whole array.\n#' @export\n#' @examples\n#' b1 <- condense(bin(movies$year, 1))\n#' d1 <- smooth(b1, 2, type = \"reg\")\n#'\n#' if (require(\"ggplot2\")) {\n#'\n#' autoplot(b1)\n#' autoplot(d1)\n#'\n#' # Note change in x-axis limits\n#' autoplot(standardise(d1))\n#' }\n#'\n#' # Can also standardise a dimension at a time\n#' b2 <- with(movies, condense(bin(year, 2), bin(length, 10)))\n#' b2 <- peel(b2, central = TRUE)\n#'\n#' if (require(\"ggplot2\")) {\n#'\n#' autoplot(b2)\n#' autoplot(standardise(b2))    # note legend\n#' autoplot(standardise(b2, \"year\"))   # each row sums to 1\n#' autoplot(standardise(b2, \"length\")) # each col sums to 1\n#'\n#' base <- ggplot(b2, aes(length, .count)) +\n#'   geom_line(aes(group = year, colour = year))\n#' base\n#' base %+% standardise(b2)  # Just affects y axis labels\n#' base %+% standardise(b2, \"year\") # Makes year comparable\n#' base %+% standardise(b2, \"length\") # Meaningless for this display\n#'\n#' }\nstandardise <- function(x, margin = integer()) {\n  stopifnot(is.condensed(x), !is.null(x$.count))\n\n  if (length(margin) == 0) {\n    x$.count <- prop(x$.count)\n  } else {\n    x$.count <- ave(x$.count, id(x[margin]), FUN = prop)\n    x$.count[is.na(x$.count)] <- 0\n  }\n\n  x\n}\n\nprop <- function(x) x / sum(x, na.rm = TRUE)\n"
  },
  {
    "path": "R/utils.r",
    "content": "\"%||%\" <- function(x, y) if (is.null(x)) y else x\n\nlast <- function(x) x[length(x)]\n\n\"%contains%\" <- function(df, var) {\n  var %in% names(df)\n}\n\nfind_fun <- function(name, env = globalenv()) {\n  if (is.function(name)) return(name)\n\n  ns_env <- asNamespace(\"bigvis\")\n  if (exists(name, ns_env, mode = \"function\")) {\n    return(get(name, ns_env))\n  }\n\n  if (exists(name, env, mode = \"function\")) {\n    return(get(name, env))\n  }\n\n  stop(\"Could not find function \", name, call. = FALSE)\n}\n"
  },
  {
    "path": "R/weighted-stats.r",
    "content": "#' Compute a weighted variance or standard deviation of a vector.\n#'\n#' @details\n#' Note that unlike the base R \\code{\\link{var}} function, these functions only\n#' work with individual vectors not matrices or data frames.\n#'\n#' @family weighted statistics\n#' @seealso \\code{\\link[stats]{weighted.mean}}\n#' @param x numeric vector of observations\n#' @param w integer vector of weights, representing the number of\n#'  time each \\code{x} was observed\n#' @param na.rm if \\code{TRUE}, missing values in both \\code{w} and \\code{x}\n#'   will be removed prior computation. Otherwise if there are missing values\n#'   the result will always be missing.\n#' @export\n#' @examples\n#' x <- c(1:5)\n#' w <- rpois(5, 5) + 1\n#' y <- x[rep(seq_along(x), w)]\n#' weighted.var(x, w)\n#' var(y)\n#'\n#' stopifnot(all.equal(weighted.var(x, w), var(y)))\nweighted.var <- function(x, w = NULL, na.rm = FALSE) {\n  if (na.rm) {\n    na <- is.na(x) | is.na(w)\n    x <- x[!na]\n    w <- w[!na]\n  }\n\n  sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1)\n}\n\n#' @export\n#' @rdname weighted.var\nweighted.sd <- function(x, w, na.rm = TRUE) sqrt(weighted.var(x, w, na.rm = TRUE))\n\n#' A weighted ecdf function.\n#'\n#' An extension of the base \\code{\\link[stats]{ecdf}} function which works\n#' with weighted data.\n#'\n#' @section S3 methods:\n#' The \\code{ecdf} class has methods for \\code{\\link{plot}},\n#' \\code{\\link{lines}}, \\code{\\link{summary}} and \\code{\\link{quantile}}.\n#' \\code{\\link{quantile}} does not currently correctly compute values for\n#' weighted ecdfs.\n#'\n#' @inheritParams weighted.var\n#' @family weighted statistics\n#' @seealso \\code{\\link[stats]{weighted.mean}}\n#' @export\n#' @examples\n#' x <- runif(200)\n#' w <- rpois(200, 5) + 1\n#'\n#' e <- weighted.ecdf(x, w)\n#' plot(e)\n#' summary(e)\n#'\n#' y <- x[rep(seq_along(x), w)]\n#' plot(ecdf(y))\nweighted.ecdf <- function(x, w) {\n  stopifnot(length(x) == length(w))\n  stopifnot(anyDuplicated(x) == 0)\n\n  ord <- order(x)\n  x <- x[ord]\n  w <- w[ord]\n\n  n <- sum(w)\n  wts <- cumsum(w / n)\n\n  f <- approxfun(x, wts, method = \"constant\", yleft = 0, yright = 1, f = 0)\n  class(f) <- c(\"wecdf\", \"ecdf\", \"stepfun\", class(f))\n  attr(f, \"call\") <- sys.call()\n  environment(f)$nobs <- n\n  f\n}\n\n#' Compute quantiles of weighted data.\n#'\n#' @details\n#' Currently only implements the type 7 algorithm, as described in\n#' \\code{\\link{quantile}}. Based on \\code{\\link{quantile}} written by R-core.\n#'\n#' @inheritParams weighted.var\n#' @param probs numeric vector of probabilities between 0 and 1\n#' @param na.rm If \\code{TRUE} will automatically remove missing values\n#'   in \\code{x} or \\code{w}.\n#' @family weighted statistics\n#' @export\n#' @examples\n#' x <- runif(200)\n#' w <- rpois(200, 5) + 1\n#' weighted.quantile(x, w)\nweighted.quantile <- function (x, w, probs = seq(0, 1, 0.25), na.rm = FALSE) {\n  stopifnot(length(x) == length(w))\n  na <- is.na(x) | is.na(w)\n  if (any(na)) {\n    if (na.rm) {\n      x <- x[!na]\n      w <- w[!na]\n    } else {\n      stop(\"Missing values not allowed when na.rm is FALSE\", call. = FALSE)\n    }\n  }\n\n  # Ensure x and w in ascending order of x\n  ord <- order(x)\n  x <- x[ord]\n  w <- w[ord]\n\n  # Find closest x just below and above index\n  n <- sum(w)\n  index <- 1 + (n - 1) * probs\n  j <- floor(index)\n\n  wts <- cumsum(w)\n  lo <- x[lowerBound(j, wts)]     # X_j\n  hi <- x[lowerBound(j + 1, wts)]\n\n  g <- index - j\n  ifelse(lo == hi, lo, (1 - g) * lo + g * hi)\n}\n# Q[i](p) = (1 - g) x[j] + g x[j+1]\n# j = floor(np + m)\n# g = np + m - j\n#\n# For type 7:\n#   m = 1 - p =>\n#   j = floor(1 + (n - 1) * p)\n#   g = (np + 1 - p) - floor(1 + (n - 1) * p)\n\n#' Compute the median of weighted data.\n#'\n#' @details This is a simple wrapper around \\code{\\link{weighted.quantile}}\n#' @inheritParams weighted.quantile\n#' @export\n#' @examples\n#' x <- runif(200)\n#' w <- rpois(200, 5) + 1\n#'\n#' median(x)\n#' weighted.median(x, w)\nweighted.median <- function(x, w, na.rm = FALSE) {\n  weighted.quantile(x, w, probs = 0.5, na.rm = na.rm)\n}\n\n#' Compute the interquartile range of weighted data.\n#'\n#' @details This is a simple wrapper around \\code{\\link{weighted.quantile}}\n#' @inheritParams weighted.quantile\n#' @export\n#' @examples\n#' x <- sort(runif(200))\n#' w <- rpois(200, seq(1, 10, length = 200)) + 1\n#'\n#' IQR(x)\n#' weighted.IQR(x, w)\nweighted.IQR <- function(x, w, na.rm = FALSE) {\n  diff(weighted.quantile(x, w, probs = c(0.25, 0.75), na.rm = na.rm))\n}\n"
  },
  {
    "path": "R/width.r",
    "content": "#' Compute a reasonable default binwidth.\n#'\n#' @param x a numeric vector. If a numeric vector of length one is supplied,\n#'   it's assumed that\n#' @param nbins desired number of bins (approximate)\n#' @export\n#' @keywords internal\n#' @family reasonable defaults\n#' @examples\n#' find_width(c(0, 5))\n#' find_width(c(0, 5.023432))\n#' find_width(c(0, 5.9))\nfind_width <- function(x, nbins = 1e4) {\n  stopifnot(is.numeric(x))\n  stopifnot(is.numeric(nbins), length(nbins) == 1, nbins > 0)\n\n  x <- diff(frange(x))\n  size <- x / nbins\n\n  # divide into order of magnitude and multiplier\n  om <- 10 ^ ceiling(log10(size))\n  mult <- size / om\n\n  # ensure number per unit is multiple of 1, 2, 3, 4, or 5\n  per_unit <- 1 / mult\n  rounders <- c(1, 2, 3, 4, 5)\n  poss <- round(per_unit / rounders) * rounders\n  poss <- poss[poss != 0]\n  width <- om / poss[which.min(abs(poss - per_unit))]\n\n  structure(width, n = ceiling(x / width), per_unit = 1 / width)\n}\n"
  },
  {
    "path": "README.md",
    "content": "# bigvis\n\n[![Travis-CI Build Status](https://travis-ci.org/hadley/bigvis.svg?branch=master)](https://travis-ci.org/hadley/bigvis)\n[![Coverage Status](https://img.shields.io/codecov/c/github/hadley/bigvis/master.svg)](https://codecov.io/github/hadley/bigvis?branch=master)\n\nThe bigvis package provides tools for exploratory data analysis of __large datasets__ (10-100 million obs). The aim is to have most operations take less than 5 seconds on commodity hardware, even for 100,000,000 data points.\n\nSince bigvis is not currently available on CRAN, the easiest way to try it out is to:\n\n```R\n# install.packages(\"devtools\")\ndevtools::install_github(\"hadley/bigvis\")\n```\n\n## Workflow\n\nThe bigvis package is structured around the following workflow:\n\n* `bin()` and `condense()` to get a compact summary of the data\n\n* if the estimates are rough, you might want to `smooth()`. See `best_h()` and `rmse_cvs()` to figure out a good starting bandwidth\n\n* if you're working with counts, you might want to `standardise()`\n\n* visualise the results with `autoplot()` (you'll need to load `ggplot2` to use this)\n\n## Weighted statistics\n\nBigvis also provides a number of standard statistics efficiently implemented on weighted/binned data: `weighted.median`, `weighted.IQR`, `weighted.var`, `weighted.sd`, `weighted.ecdf` and `weighted.quantile`. \n\n## Acknowledgements\n\nThis package wouldn't be possible without:\n\n* the fantastic [Rcpp](http://dirk.eddelbuettel.com/code/rcpp.html) package, which makes it amazingly easy to integrate R and C++\n\n* JJ Allaire and Carlos Scheidegger who have indefatigably answered my many C++ questions\n\n* the generous support of Revolution Analytics who supported the early development.\n\n* Yue Hu, who implemented a proof of concepts that showed that it might be possible to work with this much data in R.\n"
  },
  {
    "path": "bench/bin-structure.cpp",
    "content": "// How does the data structure implementing the bin affect performance.\n\n#include <Rcpp.h>\nusing namespace Rcpp;\n\nclass Grouper {\n    const Fast<NumericVector> x_;\n    double width_;\n    double origin_;\n  public:\n    Grouper (const NumericVector& x, double width, double origin = 0)\n       : x_(x), width_(width), origin_(origin) {\n    }\n\n    int bin(int i) const {\n      if (ISNAN(x_[i])) return 0;\n      \n      return (x_[i] - origin_) / width_ + 1;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n};\n\n// [[Rcpp::export]]\nstd::vector<int> count_vector(const NumericVector& x, double width, double origin = 0) {\n  Grouper grouper = Grouper(x, width, origin);\n  std::vector<int> count;\n\n  int n = grouper.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = grouper.bin(i);\n    if (bin >= count.size()) {\n      count.resize(bin + 1);\n    }\n\n    ++count[bin];\n  }\n\n  return count;\n}\n\n// [[Rcpp::export]]\nList count_map(const NumericVector& x, double width, double origin = 0) {\n  Grouper grouper = Grouper(x, width, origin);\n  std::map<int, int> count;\n\n  int n = grouper.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = grouper.bin(i);\n    ++count[bin];\n  }\n\n  IntegerVector out_x(count.size()), out_y(count.size());\n  std::map<int, int>::const_iterator count_it = count.begin(), \n    count_end = count.begin();\n  for (int i = 0; count_it != count_end; ++count_it, ++i) {\n    out_x[i] = count_it->first;\n    out_y[i] = count_it->second;\n  }\n  return List::create(_[\"x\"] = out_x, _[\"count\"] = out_y);\n}\n\n// [[Rcpp::export]]\nList count_umap(const NumericVector& x, double width, double origin = 0) {\n  Grouper grouper = Grouper(x, width, origin);\n  std::tr1::unordered_map<int, int> count;\n\n  int n = grouper.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = grouper.bin(i);\n\n    ++count[bin];\n  }\n\n  IntegerVector out_x(count.size()), out_y(count.size());\n  std::tr1::unordered_map<int, int>::iterator count_it = count.begin(), \n    count_end = count.end();\n  for (int i = 0; count_it != count_end; ++count_it, ++i) {\n    out_x[i] = count_it->first;\n    out_y[i] = count_it->second;\n  }\n  return List::create(_[\"x\"] = out_x, _[\"count\"] = out_y);\n}\n\n\ntemplate <class T>\ninline void hash_combine(std::size_t & seed, const T & v) {\n  std::tr1::hash<T> hasher;\n  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);\n}\n\nnamespace std {\n  namespace tr1 {  \n    template<typename S, typename T> struct hash<pair<S, T> > {\n      inline size_t operator()(const pair<S, T> & v) const {\n        size_t seed = 0;\n        ::hash_combine(seed, v.first);\n        ::hash_combine(seed, v.second);\n        return seed;\n      }\n    };\n  }\n}\n\n// [[Rcpp::export]]\nList count_umap2(const NumericVector& x, double width, double origin = 0) {\n  Grouper grouper = Grouper(x, width, origin);\n  std::tr1::unordered_map<std::pair<int, int>, int> count;\n\n  int n = grouper.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = grouper.bin(i);\n\n    ++count[std::make_pair(bin, bin)];\n  }\n\n  IntegerVector out_x(count.size()), out_y(count.size());\n  std::tr1::unordered_map<std::pair<int, int>, int>::iterator count_it = count.begin(), \n    count_end = count.end();\n  for (int i = 0; count_it != count_end; ++count_it, ++i) {\n    out_x[i] = count_it->first.first;\n    out_y[i] = count_it->second;\n  }\n  return List::create(_[\"x\"] = out_x, _[\"count\"] = out_y);\n}\n\n// [[Rcpp::export]]\nList count_umap2_man(const NumericVector& x, double width, double origin = 0) {\n  Grouper grouper = Grouper(x, width, origin);\n  std::tr1::unordered_map<int, int> count;\n\n  int n = grouper.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = grouper.bin(i);\n    bin = bin * 100 + bin;\n    ++count[bin];\n  }\n\n  IntegerVector out_x(count.size()), out_y(count.size());\n  std::tr1::unordered_map<int, int>::iterator count_it = count.begin(), \n    count_end = count.end();\n  for (int i = 0; count_it != count_end; ++count_it, ++i) {\n    out_x[i] = count_it->first;\n    out_y[i] = count_it->second;\n  }\n  return List::create(_[\"x\"] = out_x, _[\"count\"] = out_y);\n}\n\n\n/*** R \n  options(digits = 3)\n  library(microbenchmark)\n  x <- runif(1e5)\n\n  # As expected, for small contiguous inputs, vector is fastest, followed by\n  # unordered maps (about half as fast), with maps in a distant last place.\n  microbenchmark(\n    count_vector(x, 1 / 1000),\n    count_map(x, 1 / 1000),\n    count_umap(x, 1 / 1000)\n  )\n\n  y <- c(x, x)\n  y1 <- c(x, x + 10)\n  y2 <- c(x, x + 100)\n  y3 <- c(x, x + 1000)\n  y4 <- c(x, x + 1000)\n\n  # While using std::vector is somewhat faster, the asymptotic behaviour is\n  # much worse - count_umap is basically constant, regardless of the number\n  # of bins\n  microbenchmark(\n    count_vector(y, 1 / 1000),\n    count_vector(y1, 1 / 1000),\n    count_vector(y2, 1 / 1000),\n    count_vector(y3, 1 / 1000),\n    count_vector(y4, 1 / 1000),\n    count_umap(y, 1 / 1000),\n    count_umap(y1, 1 / 1000),\n    count_umap(y2, 1 / 1000),\n    count_umap(y3, 1 / 1000),\n    count_umap(y4, 1 / 1000),\n    times = 10\n  )\n\n  # Using umap with a pair is about twice as slow as with an int: this probably\n  # implies that I should do the hashing myself.\n  microbenchmark(\n    count_umap(x, 1 / 1000),\n    count_umap2(x, 1 / 1000),\n    count_umap2_man(x, 1 / 1000)\n  )\n\n*/"
  },
  {
    "path": "bench/bin.cpp",
    "content": "#include <Rcpp.h>\n#include <iostream>\n#include <algorithm>\nusing namespace Rcpp;\n\n//' @param breaks must be ordered and span the complete range of x. \n// [[Rcpp::export]]\nIntegerVector bin(NumericVector x, NumericVector breaks) {\n  // Put missing values in the last position\n  int n = breaks.size();\n  IntegerVector out(n + 1);\n\n  for(NumericVector::iterator it = x.begin(); it != x.end(); it++) {\n    double val = *it;\n    if (ISNAN(val)) {\n      out[n]++;\n    } else {\n      NumericVector::iterator bin_it = \n        std::upper_bound(breaks.begin(), breaks.end(), val);\n\n      int bin = std::distance(breaks.begin(), bin_it);\n      out[bin]++;\n    }\n  }\n\n  return out;\n}\n\n// [[Rcpp::export]]\nIntegerVector bin2(NumericVector x, NumericVector breaks) {\n  // Put missing values in the last position\n  int n = breaks.size(), bin;\n  IntegerVector out(n + 1);\n\n  NumericVector::iterator x_it = x.begin(), x_end, bin_it,\n    breaks_it = breaks.begin(), breaks_end = breaks.end();\n\n  for(; x_it != x.end(); ++x_it) {\n    double val = *x_it;\n    if (ISNAN(val)) {\n      ++out[n];\n    } else {\n      bin_it = std::upper_bound(breaks_it, breaks_end, val);\n      bin = std::distance(breaks_it, bin_it);\n      ++out[bin];\n    }\n  }\n\n  return out;\n}\n\n// [[Rcpp::export]]\nstd::vector<int> bin3(NumericVector x, double width, double origin = 0) {\n  int bin, nmissing = 0;\n  std::vector<int> out;\n\n  NumericVector::iterator x_it = x.begin(), x_end;\n  for(; x_it != x.end(); ++x_it) {\n    double val = *x_it;\n    if (ISNAN(val)) {\n      ++nmissing;\n    } else {\n      bin = (val - origin) / width;\n      if (bin < 0) continue;\n    \n      // Make sure there's enough space\n      if (bin >= out.size()) {\n        out.resize(bin + 1);\n      }\n      ++out[bin];\n    }\n  }\n\n  // Put missing values in the last position\n  out.push_back(nmissing);\n  return out;\n}\n\n// Create class to encapsulate binning operations ------------------------------\n\n\nclass BinFixed {\n    double width_;\n    double origin_;\n  public:\n    BinFixed (double width, double origin = 0) {\n      width_ = width;\n      origin_ = origin;\n    }\n    int inline operator() (double val) const { \n      return (val - origin_) / width_;\n    }\n};\nclass BinBreaks {\n    NumericVector breaks_;\n    NumericVector::iterator breaks_it_, breaks_end_;\n\n  public:\n    BinBreaks (NumericVector& breaks) {\n      breaks_ = breaks;\n      breaks_it_ = breaks.begin();\n      breaks_end_ = breaks.end();\n    }\n    int inline operator() (double val) const { \n      NumericVector::iterator \n        bin_it = std::upper_bound(breaks_it_, breaks_end_, val);\n\n      return std::distance(breaks_it_, bin_it);\n    }\n};\n\n\ntemplate<typename Binner>\nstd::vector<int> bin_bin(NumericVector x, Binner binner) {\n  int bin, nmissing = 0;\n  std::vector<int> out;\n\n  NumericVector::iterator x_it = x.begin(), x_end;\n  for(; x_it != x.end(); ++x_it) {\n    double val = *x_it;\n    if (ISNAN(val)) {\n      ++nmissing;\n    } else {\n      bin = binner(val);\n      if (bin < 0) continue;\n    \n      // Make sure there's enough space\n      if (bin >= out.size()) {\n        out.resize(bin + 1);\n      }\n      ++out[bin];\n    }\n  }\n\n  // Put missing values in the last position\n  out.push_back(nmissing);\n  return out;\n}\n\n// [[Rcpp::export]]\nstd::vector<int> bin_bin_fixed(NumericVector x, double width, double origin = 0) {\n  return bin_bin(x, BinFixed(width, origin));\n}\n\n// [[Rcpp::export]]\nstd::vector<int> bin_bin_breaks(NumericVector x, NumericVector breaks) {\n  return bin_bin(x, BinBreaks(breaks));\n}\n\n// Try using a Fast<NumericVector> ------------------------------\n// Considerable speed improvement for simple binning function\ntemplate<typename Binner>\nstd::vector<int> fbin_bin(NumericVector x, Binner binner) {\n  int bin, nmissing = 0;\n  std::vector<int> out;\n\n  Fast<NumericVector> fx(x);\n  int n = x.size();\n\n  for(int i = 0; i < n; ++i) {\n    double val = fx[i];\n    if (ISNAN(val)) {\n      ++nmissing;\n    } else {\n      bin = binner(val);\n      if (bin < 0) continue;\n    \n      // Make sure there's enough space\n      if (bin >= out.size()) {\n        out.resize(bin + 1);\n      }\n      ++out[bin];\n    }\n  }\n\n  // Put missing values in the last position\n  out.push_back(nmissing);\n  return out;\n}\n\n// [[Rcpp::export]]\nstd::vector<int> fbin_bin_fixed(NumericVector x, double width, double origin = 0) {\n  return fbin_bin(x, BinFixed(width, origin));\n}\n\n// [[Rcpp::export]]\nstd::vector<int> fbin_bin_breaks(NumericVector x, NumericVector breaks) {\n  return fbin_bin(x, BinBreaks(breaks));\n}\n\n/*** R \noptions(digits = 3)\nlibrary(microbenchmark)\nx <- runif(1e5)\nbreaks <- seq(0, 1, length = 100)\n\n# Breaks\nmicrobenchmark(\n  hist(x, breaks, plot = F),\n  bin(x, breaks),\n  bin2(x, breaks),\n  bin_bin_breaks(x, breaks),\n  fbin_bin_breaks(x, breaks)\n)\n\n# Fixed bins\nmicrobenchmark(\n  bin3(x, 1/100, 0),\n  bin_bin_fixed(x, 1/100, 0),\n  fbin_bin_fixed(x, 1/100, 0)\n)\n\nx6 <- runif(1e6)\nx7 <- runif(1e7)\nx8 <- runif(1e8)\n\nmicrobenchmark(\n  bin_bin_fixed(x6, 1/100, 0),\n  fbin_bin_fixed(x6, 1/100, 0),\n  bin_bin_fixed(x7, 1/100, 0),\n  fbin_bin_fixed(x7, 1/100, 0),\n  bin_bin_fixed(x8, 1/100, 0),\n  fbin_bin_fixed(x8, 1/100, 0),\n  times = 10)\n\n*/"
  },
  {
    "path": "bench/count.cpp",
    "content": "// Experiment with making the binner more generic, so that the binner\n// class also stores the variable being binned over - this is important\n// for separating the grouping from the numeric operation.\n\n#include <Rcpp.h>\n#include <iostream>\n#include <algorithm>\nusing namespace Rcpp;\n\ntemplate<typename Binner>\nstd::vector<int> count_x(const NumericVector& x, Binner binner) {\n  std::vector<int> out;\n\n  int n = x.size();\n\n  for(int i = 0; i < n; ++i) {\n    int bin = binner(x[i]);\n    if (bin < 0) continue;\n  \n    // Make sure there's enough space\n    if (bin >= out.size()) {\n      out.resize(bin + 1);\n    }\n    ++out[bin];\n  }\n\n  return out;\n}\n\ntemplate<typename Binner>\nstd::vector<int> count(Binner binner) {\n  std::vector<int> out;\n\n  int n = binner.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = binner.bin(i);\n    if (bin < 0) continue;\n  \n    // Make sure there's enough space\n    if (bin >= out.size()) {\n      out.resize(bin + 1);\n    }\n    ++out[bin];\n  }\n\n  return out;\n}\n\n\nclass BinFixed {\n    double width_;\n    double origin_;\n  public:\n    BinFixed (double width, double origin = 0) {\n      width_ = width;\n      origin_ = origin;\n    }\n\n    int inline operator() (double val) const { \n      if (ISNAN(val)) return 0;\n\n      return (val - origin_) / width_ + 1;\n    }\n};\n\nclass BinFixed2 {\n    const NumericVector& x_;\n    double width_;\n    double origin_;\n  public:\n    BinFixed2 (const NumericVector& x, double width, double origin = 0)\n       : x_(x), width_(width), origin_(origin) {\n    }\n\n    int bin(int i) const { \n      if (ISNAN(x_[i])) return 0;\n      return (x_[i] - origin_) / width_ + 1;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n};\n\n\n// [[Rcpp::export]]\nstd::vector<int> count_x2(NumericVector x, double width, double origin = 0) {\n  return count_x(x, BinFixed(width, origin));\n}\n\n// [[Rcpp::export]]\nstd::vector<int> count2(NumericVector x, double width, double origin = 0) {\n  return count(BinFixed2(x, width, origin));\n}\n\n\n/*** R \noptions(digits = 3)\nlibrary(microbenchmark)\nx <- runif(1e5)\n\n# Breaks\nmicrobenchmark(\n  count_x2(x, 1/100),  \n  count2(x, 1/100)\n)\n\n*/"
  },
  {
    "path": "bench/group-tempvar.cpp",
    "content": "// In a function like \n// \n// unsigned int bin(unsigned int i) const {\n//   if (ISNAN(x_[i])) return 0;\n//   if (x_[i] < origin_) return 0;\n// \n//   return (x_[i] - origin_) / width_ + 1;\n// }\n// \n// should I create my own temporary double val = x_[i] ?\n// \n// It looks like it saves ~0.2 ns per invocation, so probably not worth it for\n// performance reasons.\n\n#include <Rcpp.h>\nusing namespace Rcpp;\n\nclass Group1 {\n    const Fast<NumericVector> x_;\n    double width_;\n    double origin_;\n  public:\n    Group1 (const NumericVector& x, double width, double origin = 0)\n       : x_(x), width_(width), origin_(origin) {\n    }\n\n    unsigned int bin(unsigned int i) const {\n      if (ISNAN(x_[i])) return 0;\n      if (x_[i] < origin_) return 0;\n      \n      return (x_[i] - origin_) / width_ + 1;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n};\n\nclass Group2 {\n    const Fast<NumericVector> x_;\n    double width_;\n    double origin_;\n  public:\n    Group2 (const NumericVector& x, double width, double origin = 0)\n       : x_(x), width_(width), origin_(origin) {\n    }\n\n    unsigned int bin(unsigned int i) const {\n      double val = x_[i];\n      if (ISNAN(val)) return 0;\n      if (val < origin_) return 0;\n      \n      return (val - origin_) / width_ + 1;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n};\n\ntemplate<typename Group>\nIntegerVector group_out(const Group& group) {\n  int n = group.size();\n  IntegerVector out(n);\n  for(int i = 0; i < n; ++i) {\n    out[i] = group.bin(i);\n  }\n\n  return out;\n}\n\n// [[Rcpp::export]]\nIntegerVector group1(const NumericVector& x, double width, double origin = 0) {\n  return group_out(Group1(x, width, origin));\n}\n\n// [[Rcpp::export]]\nIntegerVector group2(const NumericVector& x, double width, double origin = 0) {\n  return group_out(Group2(x, width, origin));\n}\n\n\n/*** R\nx <- runif(1e6)\nlibrary(microbenchmark)\nstopifnot(all.equal(group1(x, 1/1000), group2(x, 1/1000)))\n\n(m <- microbenchmark(\n  group1(x, 1/1000),\n  group2(x, 1/1000)\n))\ndiff(summary(m)$median) / length(x) * 1e9 / 1e3\n*/"
  },
  {
    "path": "bench/kernel.cpp",
    "content": "// Differences in kernel performance\n\n#include <Rcpp.h>\nusing namespace Rcpp;\n\n// [[Rcpp::export]]\nNumericVector normal_kernel(NumericVector x) {\n  int n = x.size();\n  NumericVector out(n);\n\n  for (int i = 0; i < n; ++i) {\n    out[i] = R::dnorm(x[i], 0, 1, 0);\n  }\n\n  return out;\n}\n\n// [[Rcpp::export]]\ndouble tricube2(double x) {\n  x = fabs(x);\n  if (x > 1) return 0;\n\n  return pow(1 - pow(x, 3), 3);\n}\n\n// [[Rcpp::export]]\ndouble tricube(double x) {\n  x = fabs(x);\n  if (x > 1) return 0;\n\n  double y = 1 - x * x * x;\n  return y * y * y;\n}\n\n// [[Rcpp::export]]\nNumericVector tricube_kernel(NumericVector x) {\n  int n = x.size();\n  NumericVector out(n);\n\n  for (int i = 0; i < n; ++i) {\n    out[i] = tricube(x[i]);\n  }\n\n  return out;\n}\n\n// [[Rcpp::export]]\nNumericVector copy(NumericVector x) {\n  int n = x.size();\n  NumericVector out(n);\n\n  for (int i = 0; i < n; ++i) {\n    out[i] = x[i];\n  }\n\n  return out;\n}\n\n/*** R\noptions(digits = 3)\nlibrary(microbenchmark)\n\nx <- runif(1e4)\n\nmean(sapply(x, tricube) - sapply(x, tricube2))\n\nmicrobenchmark(\n  copy(x),\n  tricube_kernel(x),\n  normal_kernel(x)\n)\n\n*/"
  },
  {
    "path": "bench/mean.cpp",
    "content": "// Instead of counting, compute a more complicated statistic: a weighted mean\n\n#include <Rcpp.h>\n#include <iostream>\n#include <algorithm>\nusing namespace Rcpp;\n\nclass BinFixed {\n    const Fast<NumericVector> x_;\n    double width_;\n    double origin_;\n  public:\n    BinFixed (const NumericVector& x, double width, double origin = 0)\n       : x_(x), width_(width), origin_(origin) {\n    }\n\n    int bin(int i) const { \n      if (ISNAN(x_[i])) return 0;\n      return (x_[i] - origin_) / width_ + 1;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n};\n\ntemplate<typename Binner>\nNumericVector group_mean(NumericVector& y, NumericVector& weight, Binner binner) {\n  std::vector<double> count;\n  std::vector<double> sum;\n\n  int n = binner.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = binner.bin(i);\n    if (bin < 0) continue;\n  \n    // Make sure there's enough space\n    if (bin >= sum.size()) {\n      sum.resize(bin + 1);\n      count.resize(bin + 1);\n    }\n\n    count[bin] += weight[i];\n    sum[bin] += y[i];\n  }\n\n  int m = count.size();\n  NumericVector res(m);\n  for (int i = 0; i < m; ++i) {\n    res[i] = sum[i] / count[i];\n  }\n  return res;\n}\n\n\nclass StatMean {\n    double count;\n    double sum;\n\n  public:\n    StatMean () : count(0), sum(0) {\n    }\n    void push(double x, double weight) {\n      count += weight;\n      sum += x;\n    }\n\n    double compute() {\n      return sum / count;\n    }\n};\n\ntemplate<typename Binner>\nNumericVector group_mean2(NumericVector& y, NumericVector& weight, Binner binner) {\n  std::vector<StatMean> stat;\n\n  int n = binner.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = binner.bin(i);\n    if (bin < 0) continue;\n  \n    if (bin >= stat.size()) {\n      stat.resize(bin + 1);\n    }\n\n    stat[bin].push(y[i], weight[i]);\n  }\n\n  int m = stat.size();\n  NumericVector res(m);\n  for (int i = 0; i < m; ++i) {\n    res[i] = stat[i].compute();\n  }\n  return res;\n}\n\n\n// [[Rcpp::export]]\nNumericVector group_mean_(NumericVector x, NumericVector y, NumericVector weight, \n                       double width, double origin = 0) {\n  return group_mean(y, weight, BinFixed(x, width, origin));\n}\n// [[Rcpp::export]]\nNumericVector group_mean2_(NumericVector x, NumericVector y, NumericVector weight, \n                       double width, double origin = 0) {\n  return group_mean2(y, weight, BinFixed(x, width, origin));\n}\n\n\n/*** R \noptions(digits = 3)\nlibrary(microbenchmark)\nx <- runif(1e6)\ny <- runif(1e6)\nweight <- rep(1, 1e6)\n\n# Breaks\nmicrobenchmark(\n  group_mean_(x, y, weight, width = 1/100),\n  group_mean2_(x, y, weight, width = 1/100)\n)\n\n*/"
  },
  {
    "path": "bench/median.cpp",
    "content": "// Instead of counting, compute a more complicated statistic: a median\n\n#include <Rcpp.h>\n#include <iostream>\n#include <algorithm>\nusing namespace Rcpp;\n\nclass BinFixed {\n    const Fast<NumericVector> x_;\n    double width_;\n    double origin_;\n  public:\n    BinFixed (const NumericVector& x, double width, double origin = 0)\n       : x_(x), width_(width), origin_(origin) {\n    }\n\n    int bin(int i) const {\n      if (ISNAN(x_[i])) return 0;\n      return (x_[i] - origin_) / width_ + 1;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n};\nclass StatMedian {\n    std::vector<double> ys;\n\n  public:\n    void push(double x) {\n      ys.push_back(x);\n    }\n\n    // Adapted from http://stackoverflow.com/questions/1719070/\n    double compute() {\n      if (ys.empty()) return NAN;\n\n      int size = ys.size();\n      std::vector<double>::iterator upper = ys.begin() + (int) (size / 2);\n      std::nth_element(ys.begin(), upper, ys.end());\n\n      if (size % 2 == 1) {\n        return *upper;\n      } else {\n        std::vector<double>::iterator lower = upper - 1;\n        std::nth_element(ys.begin(), lower, upper);\n        return (*upper + *lower) / 2.0;\n      }\n\n    }\n};\n\ntemplate<typename Binner>\nNumericVector group_median(NumericVector& y, Binner binner) {\n  std::vector<StatMedian> stat;\n\n  int n = binner.size();\n  for(int i = 0; i < n; ++i) {\n    int bin = binner.bin(i);\n    if (bin < 0) continue;\n\n    if (bin >= stat.size()) {\n      stat.resize(bin + 1);\n    }\n\n    stat[bin].push(y[i]);\n  }\n\n  int m = stat.size();\n  NumericVector res(m);\n  for (int i = 0; i < m; ++i) {\n    res[i] = stat[i].compute();\n  }\n  return res;\n}\n\n\n// [[Rcpp::export]]\nNumericVector group_median_(NumericVector x, NumericVector y,\n                            double width, double origin = 0) {\n  return group_median(y, BinFixed(x, width, origin));\n}\n\n\n/*** R\noptions(digits = 3)\nlibrary(microbenchmark)\nx <- runif(1e5)\ny <- runif(1e5)\n\ngroup_median_tapply <- function(x, y, width, origin = 0) {\n  bins <- trunc((x - origin) / width)\n  c(NaN, unname(tapply(y, bins, median)))\n}\nmed1 <- group_median_tapply(x, y, width = 1/1000)\nmed2 <- group_median_(x, y, width = 1/1000)\nstopifnot(all.equal(med1, med2))\n\n# Breaks\nmicrobenchmark(\n# group_median_tapply(x, y, width = 1/1000),\n  group_median_(x, y, width = 1/1000)\n)\n\n*/\n"
  },
  {
    "path": "bench/smooth-1d.cpp",
    "content": "// Explore opportunities for making smooth_1d faster\n// \n// Bounding to a given range is really important, and memoisation helps offset\n// the cost of making a call back to R, but the biggest win is using a pure\n// C/C++ kernel function.\n// \n// \n#include <Rcpp.h>\nusing namespace Rcpp;\n\n// Base implementation\n// [[Rcpp::export]]\nNumericVector smooth_1d(const NumericVector& x, const NumericVector& z, \n                        const NumericVector& x_out, const Function& kernel) {\n\n  int n_in = x.size(), n_out = x_out.size();\n  NumericVector z_out(n_out);\n\n  for (int i = 0; i < n_out; i++) {\n    for (int j = 0; j < n_in; j++) {\n      double dist = x[j] - x_out[i];\n      double k = as<NumericVector>(kernel(dist))[0];\n      z_out[i] += z[j] * k;\n    }\n  }\n\n  return z_out;\n}\n\n// Memoise distance calculations\n// [[Rcpp::export]]\nNumericVector smooth_1d_memo(const NumericVector& x, const NumericVector& z, \n                        const NumericVector& x_out, const Function& kernel) {\n  int n_in = x.size(), n_out = x_out.size();\n  NumericVector z_out(n_out);\n\n  std::unordered_map<double, double> k_memo;\n\n  for (int i = 0; i < n_out; i++) {\n    for (int j = 0; j < n_in; j++) {\n      double dist = x[j] - x_out[i];\n\n      std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);\n      double k;\n      if (it == k_memo.end()) {\n        k = as<NumericVector>(kernel(dist))[0];\n        k_memo[dist] = k;\n      } else {\n        k = it->second; \n      }\n\n      z_out[i] += z[j] * k;\n    }\n  }\n\n  return z_out;\n}\n\n// Use range of kernel\n// [[Rcpp::export]]\nNumericVector smooth_1d_range(const NumericVector& x, const NumericVector& z, \n                        const NumericVector& x_out, const Function& kernel,\n                        double kmin, double kmax) {\n\n  int n_in = x.size(), n_out = x_out.size();\n  NumericVector z_out(n_out);\n\n  for (int i = 0; i < n_out; i++) {\n    for (int j = 0; j < n_in; j++) {\n      double dist = x[j] - x_out[i];\n      if (dist < kmin || dist > kmax) continue;\n\n      double k = as<NumericVector>(kernel(dist))[0];\n      z_out[i] += z[j] * k;\n    }\n  }\n\n  return z_out;\n}\n\n\n// Memoise and use range\n// [[Rcpp::export]]\nNumericVector smooth_1d_memo_range(const NumericVector& x, const NumericVector& z, \n                        const NumericVector& x_out, const Function& kernel,\n                        double kmin, double kmax) {\n  int n_in = x.size(), n_out = x_out.size();\n  NumericVector z_out(n_out);\n\n  std::unordered_map<double, double> k_memo;\n\n  for (int i = 0; i < n_out; i++) {\n    for (int j = 0; j < n_in; j++) {\n      double dist = x[j] - x_out[i];\n      if (dist < kmin || dist > kmax) continue;\n\n      std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);\n      double k;\n      if (it == k_memo.end()) {\n        k = as<NumericVector>(kernel(dist))[0];\n        k_memo[dist] = k;\n      } else {\n        k = it->second; \n      }\n\n      z_out[i] += z[j] * k;\n    }\n  }\n\n  return z_out;\n}\n\n// Memoise and use range\n// [[Rcpp::export]]\nNumericVector smooth_1d_memo_range_map(const NumericVector& x, const NumericVector& z, \n                        const NumericVector& x_out, const Function& kernel,\n                        double kmin, double kmax) {\n  int n_in = x.size(), n_out = x_out.size();\n  NumericVector z_out(n_out);\n\n  std::map<double, double> k_memo;\n\n  for (int i = 0; i < n_out; i++) {\n    for (int j = 0; j < n_in; j++) {\n      double dist = x[j] - x_out[i];\n      if (dist < kmin || dist > kmax) continue;\n\n      std::map<double, double>::const_iterator it = k_memo.find(dist);\n      double k;\n      if (it == k_memo.end()) {\n        k = as<NumericVector>(kernel(dist))[0];\n        k_memo[dist] = k;\n      } else {\n        k = it->second; \n      }\n\n      z_out[i] += z[j] * k;\n    }\n  }\n\n  return z_out;\n}\n\n// Memoise, use range & use C++ function for kernel\n// [[Rcpp::export]]\nNumericVector smooth_1d_memo_range_kcpp(const NumericVector& x, const NumericVector& z, \n                        const NumericVector& x_out, double kmin, double kmax) {\n  int n_in = x.size(), n_out = x_out.size();\n  NumericVector z_out(n_out);\n\n  std::unordered_map<double, double> k_memo;\n\n  for (int i = 0; i < n_out; i++) {\n    for (int j = 0; j < n_in; j++) {\n      double dist = x[j] - x_out[i];\n      if (dist < kmin || dist > kmax) continue;\n\n      std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);\n      double k;\n      if (it == k_memo.end()) {\n        k = R::dnorm(dist, 0.0, 0.1, 0);\n        k_memo[dist] = k;\n      } else {\n        k = it->second; \n      }\n\n      z_out[i] += z[j] * k;\n    }\n  }\n\n  return z_out;\n}\n\n// Use cpp kernel function without memoisation\n// [[Rcpp::export]]\nNumericVector smooth_1d_range_kcpp(const NumericVector& x, const NumericVector& z, \n                        const NumericVector& x_out, double kmin, double kmax) {\n\n  int n_in = x.size(), n_out = x_out.size();\n  NumericVector z_out(n_out);\n\n  for (int i = 0; i < n_out; i++) {\n    for (int j = 0; j < n_in; j++) {\n      double dist = x[j] - x_out[i];\n      if (dist < kmin || dist > kmax) continue;\n\n      double k = R::dnorm(dist, 0.0, 0.1, 0);\n      z_out[i] += z[j] * k;\n    }\n  }\n\n  return z_out;\n}\n\n\n/*** R\n  options(digits = 2)\n  x <- 1:10\n  z <- rep(c(1, 2), length = length(x))\n  k <- kernel(\"norm\", sd = 0.1)\n  krng <- range(k)\n  grid <- seq(0, 11, length = 100)\n  \n  stopifnot(all.equal(\n    smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]),\n    smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2])\n  ))\n\n  library(microbenchmark)\n  microbenchmark(\n    base = smooth_1d(x, z, grid, k),\n    memo = smooth_1d_memo(x, z, grid, k),\n    range = smooth_1d_range(x, z, grid, k, krng[1], krng[2]),\n    \"range + kcpp\" = smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2]),\n    \"range + memo\" = smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]),\n    \"range + memo + kcpp\" = smooth_1d_memo_range_kcpp(x, z, grid, krng[1], krng[2]),\n    \"range + memo + map\" = smooth_1d_memo_range_map(x, z, grid, k, krng[1], krng[2])\n  )\n  \n  # More realistic sample sizes\n  x <- 1:3e3\n  z <- rep(c(1, 2), length = length(x))\n  \n  grid3 <- seq(0, 11, length = 3e3)\n  grid4 <- seq(0, 11, length = 3e4)\n\n  microbenchmark(\n    grid3_c = smooth_1d_range_kcpp(x, z, grid3, krng[1], krng[2]),\n    grid3_r = smooth_1d_memo_range_map(x, z, grid3, k, krng[1], krng[2]),\n    grid4_c = smooth_1d_range_kcpp(x, z, grid4, krng[1], krng[2]),\n    grid4_r = smooth_1d_memo_range_map(x, z, grid4, k, krng[1], krng[2]),\n    times = 10)\n\n*/"
  },
  {
    "path": "bigvis.Rproj",
    "content": "Version: 1.0\n\nRestoreWorkspace: Default\nSaveWorkspace: Default\nAlwaysSaveHistory: Default\n\nEnableCodeIndexing: Yes\nUseSpacesForTab: Yes\nNumSpacesForTab: 2\nEncoding: UTF-8\n\nRnwWeave: Sweave\nLaTeX: pdfLaTeX\n\nAutoAppendNewline: Yes\n\nBuildType: Package\nPackageUseDevtools: Yes\nPackageInstallArgs: --no-multiarch --with-keep.source\nPackageRoxygenize: rd,collate,namespace\n"
  },
  {
    "path": "inst/include/bigvis.h",
    "content": "#include <Rcpp.h>\n#include <boost/shared_ptr.hpp>\n\nusing namespace Rcpp;\n\n// Wrapper for numeric vector that makes it easy figure to out which \n// bin each observation belongs to.\nclass BinnedVector {\n    // This should probably be a const NumericVector&, but that doesn't work\n    // with modules currently\n    NumericVector x_;\n    String name_;\n    double width_;\n    double origin_;\n  public:\n    BinnedVector(NumericVector x, String name, double width, double origin = 0)\n       : x_(x), name_(name), width_(width), origin_(origin) {\n    }\n\n    int bin_i(int i) const {\n      return bin(x_[i]);\n    }\n\n    int bin(double x) const {\n      if (ISNAN(x) || x == INFINITY || x == -INFINITY) return 0;\n      if (x < origin_) return 0;\n\n      return (x - origin_) / width_ + 1;\n    }\n\n    double unbin(int bin) const {\n      if (bin == 0) return(NA_REAL);\n      return (bin - 1) * width_ + origin_ + width_ / 2;\n    }\n\n    int nbins() const;\n\n    int size() const {\n      return x_.size();\n    }\n\n    double origin() const {\n      return origin_;\n    } \n\n    double width() const {\n      return width_;\n    }\n\n    String name() const {\n      return name_;\n    }\n\n};\n\n// This class is just boilerplate. There might be rcpp magic that does the right thing here\n// but I don't know it.\nclass BinnedVectorReference {\n    boost::shared_ptr<BinnedVector> ref;\n\n    const BinnedVector *get() const {\n        return ref.get();\n    };\n    BinnedVector *get() {\n        return ref.get();\n    };\n\npublic:\n    BinnedVectorReference() {};\n\n    BinnedVectorReference(const BinnedVectorReference &o):\n        ref(o.ref) {};\n\n    explicit BinnedVectorReference(BinnedVector *ptr) {\n        // Watch out, this takes ownership of the pointer!\n        ref = boost::shared_ptr<BinnedVector>(ptr);\n    }\n\n    BinnedVectorReference(NumericVector x, String name, double width, double origin = 0) {\n        BinnedVector *vec = new BinnedVector(x, name, width, origin);\n        ref = boost::shared_ptr<BinnedVector>(vec);\n    }\n\n    int bin_i(int i) const { return get()->bin_i(i); }\n    int bin(double x) const { return get()->bin(x); }\n    double unbin(int bin) const { return get()->unbin(bin); }\n    int nbins() const { return get()->nbins(); }\n    int size() const { return get()->size();}\n    double origin() const { return get()->origin();}\n    double width() const { return get()->width();}\n    String name() const { return get()->name();}\n};\n\n// A data structure to store multiple binned vectors\nclass BinnedVectors {\n    int size_;\n    std::vector<BinnedVectorReference> groups_;\n\n  public:\n    std::vector<int> bins_;\n    BinnedVectors () : groups_(0), bins_(0) {\n    }\n\n    BinnedVectors (List gs) : groups_(0), bins_(0) {\n      int n = gs.size();\n      for (int i = 0; i < n; ++i) {\n        add_vector(as<BinnedVectorReference>(gs[i]));\n      }\n    }\n\n    void add_vector(BinnedVectorReference g) {\n      if (groups_.empty()) {\n        bins_.push_back(1);\n        size_ = g.size();\n      } else {\n        if (g.size() != size_) stop(\"Inconsistent sizes\");\n        bins_.push_back(bins_.back() * g.nbins());\n      }\n      groups_.push_back(g);\n\n    }\n\n    int bin_i(int i) const;\n    int bin(std::vector<double> x) const;\n    std::vector<double> unbin(int bin) const;\n    \n    int nbins() const {\n      return bins_.back() * groups_.front().nbins();\n    }\n\n    int ngroups() const {\n      return bins_.size();\n    }\n\n    int size() const {\n      return size_;\n    }\n\n    String name(int j) const {\n      return groups_[j].name();\n    }\n\n};\n\n\nRCPP_EXPOSED_AS(BinnedVectorReference)\nRCPP_EXPOSED_WRAP(BinnedVectorReference)\nRCPP_EXPOSED_AS(BinnedVectors)\nRCPP_EXPOSED_WRAP(BinnedVectors)\n"
  },
  {
    "path": "inst/tests/test-binned-vectors.r",
    "content": "context(\"Binned vectors\")\n\nif (require(\"plyr\")) {\n  test_that(\"bins agree with plyr::id\", {\n    grid <- expand.grid(x = c(NA, seq(0, 0.5, by = 0.1)), y = c(NA, seq(0, 0.7, by = 0.1)))\n    x <- grid$x\n    y <- grid$y\n\n    gx <- bin(x, 0.1)\n    gy <- bin(y, 0.1)\n\n    bv <- bins(gx, gy)\n    bigvis <- sapply(seq_along(x) - 1, bv$bin_i)\n\n    bin_x <- sapply(seq_along(x) - 1, gx$bin_i)\n    bin_y <- sapply(seq_along(x) - 1, gy$bin_i)\n    plyr <- as.vector(id(list(bin_x, bin_y)))\n\n    expect_equal(bigvis + 1, plyr)\n  })\n}\n\ntest_that(\"square nbins correct\", {\n  g <- bin(1:10, 1)\n  expect_equal(bins(g)$nbins(), 11)\n  expect_equal(bins(g, g)$nbins(), 11 ^ 2)\n  expect_equal(bins(g, g, g)$nbins(), 11 ^ 3)\n})\n\ntest_that(\"rectangular nbins correct\", {\n  g11 <- bin(1:10, 1)\n  g2 <- bin(rep(1, 10), 1)\n\n  expect_equal(bins(g2, g11)$nbins(), 22)\n  expect_equal(bins(g11, g2)$nbins(), 22)\n})\n\ntest_that(\"diagonal nbins correct\", {\n  x <- runif(1e3)\n  y <- x + runif(1e3, -0.2, 0.2)\n  z <- rnorm(1e3, x)\n\n  gx <- bin(x, 0.1)\n  gy <- bin(y, 0.1)\n\n  expect_equal(gx$nbins(), 11)\n  expect_equal(gy$nbins(), 15)\n\n  bvs <- bins(gx, gy)\n  expect_equal(bvs$nbins(), 165)\n\n  bins <- vapply(seq_along(x) - 1, bvs$bin_i, integer(1))\n  expect_true(all(bins <= 165))\n})\n\ntest_that(\"bin and unbin are symmetric\", {\n  g <- bin(-10:10, 1)\n  bvs <- bins(g, g)\n\n  grid <- expand.grid(x = -10:10, y = -10:10)\n  bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y))\n  unbin <- t(vapply(bins, bvs$unbin, numeric(2)))\n  colnames(unbin) <- c(\"x\", \"y\")\n\n  expect_equal(unbin, as.matrix(grid))\n})\n\ntest_that(\"bin and unbin are symmetric with diff binning\", {\n  x <- c(-1, 5)\n  y <- c(0.1, 1)\n\n  bx <- bin(x, 1)\n  by <- bin(y, 0.1)\n  bvs <- bins(bx, by)\n\n  grid <- expand.grid(\n    x = breaks(bx)[-1] + 1 / 2,\n    y = breaks(by)[-1] + 0.1 / 2)\n\n  bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y))\n  unbin <- t(vapply(bins, bvs$unbin, numeric(2)))\n  colnames(unbin) <- c(\"x\", \"y\")\n\n  expect_equal(unbin, as.matrix(grid))\n})\n"
  },
  {
    "path": "inst/tests/test-breaks.r",
    "content": "context(\"Breaks\")\n\nlast <- function(x) x[length(x)]\n\ntest_that(\"breaks includes max value, only if on border\", {\n  expect_equal(last(breaks(10, origin = 0, binwidth = 1)), 10)\n  expect_equal(last(breaks(9.99, origin = 0, binwidth = 1)), 9)\n})\n\ntest_that(\"breaks includes max value even when origin != 0\", {\n  expect_equal(last(breaks(10.5, origin = 0.5, binwidth = 1)), 10.5)\n  expect_equal(last(breaks(10.49, origin = 0.5, binwidth = 1)), 9.5)\n})\n"
  },
  {
    "path": "inst/tests/test-condense.r",
    "content": "context(\"Condense\")\n\ntest_that(\"condense counts small vectors accurately\", {\n  x <- c(NA, 0:10)\n  s1 <- condense(bin(x, 1, -0.5), summary = \"count\")\n  # Pathological origin: need to add extra bin on end, because they're\n  # right open, left closed\n  s2 <- condense(bin(x, 1, 0), summary = \"count\")\n\n  expect_equivalent(s1$x, c(NA, 0:10))\n  expect_equivalent(s2$x, c(NA, 0:10 + 0.5))\n\n  expect_equal(s1$.count, rep(1, length(x)))\n  expect_equal(s2$.count, rep(1, length(x)))\n})\n\ntest_that(\"weights modify counts\", {\n  x <- c(NA, 0:10)\n  w <- rep(2, length(x))\n  s <- condense(bin(x, 1), w = w, summary = \"count\")\n\n  expect_equivalent(s$x, c(NA, 0:10))\n  expect_equal(s$.count, rep(2, length(x)))\n})\n\ntest_that(\"z affects sums, but not counts\", {\n  x <- c(NA, 0:10)\n  z <- 0:11\n  s <- condense(bin(x, 1), z = z, summary = \"sum\")\n\n  expect_equal(s$.count, rep(1, length(x)))\n  expect_equal(s$.sum, z)\n})\n\ntest_that(\"drop = FALSE and drop = TRUE results agree\", {\n  x <- runif(1e3)\n  y <- x + runif(1e3, -0.2, 0.2)\n  z <- rnorm(1e3, x)\n\n  gx <- bin(x, 0.1)\n  gy <- bin(y, 0.1)\n\n  count1 <- condense(gx, gy, summary = \"count\", drop = TRUE)\n  expect_equal(sum(count1$.count == 0), 0)\n\n  count2 <- condense(gx, gy, summary = \"count\", drop = FALSE)\n  expect_equivalent(count1, count2[count2$.count != 0, ])\n})\n\n# 2d tests ---------------------------------------------------------------------\n\ntest_that(\"grid counted accurately\", {\n  # expand.grid orders in opposite way to bigvis\n  grid <- expand.grid(y = c(NA, 1:2), x = c(NA, 1:2))\n  s <- condense(bin(grid$x, 1), bin(grid$y, 1))\n\n  expect_equal(s$.count, rep(1, nrow(grid)))\n  expect_equivalent(s$grid.x, grid$x)\n  expect_equivalent(s$grid.y, grid$y)\n})\n\ntest_that(\"diagonal counted correctly\", {\n  df <- data.frame(x = c(NA, 1:2), y = c(NA, 1:2))\n  s <- condense(bin(df$x, 1), bin(df$y, 1))\n\n  expect_equal(nrow(s), nrow(df))\n  expect_equal(s$df.x, s$df.y)\n})\n\ntest_that(\"random data doesn't crash\", {\n  x <- runif(1e3, 8, 4963)\n  y <- runif(1e3, 1e-2, 1e3)\n\n  gx <- bin(x, 10)\n  gy <- bin(y, 10)\n\n  condense(gx, gy)\n})\n"
  },
  {
    "path": "inst/tests/test-frange.r",
    "content": "context(\"frange\")\n\ntest_that(\"frange agrees with range\", {\n  x <- rnorm(1e4)\n  expect_equal(frange(x), range(x))\n})\n\ntest_that(\"frange uses cache if present\", {\n  x <- rnorm(1e4)\n  attr(x, \"range\") <- c(0, 10)\n  expect_equal(frange(x), c(0, 10))\n})\n\ntest_that(\"frange ignores NA and infinities by default\", {\n  x <- c(1, NA, Inf, -Inf)\n  expect_equal(frange(x), c(1, 1))\n})\n"
  },
  {
    "path": "inst/tests/test-group-1d.r",
    "content": "context(\"Grouping: 1d\")\n\ngroup <- function(x, width, origin = NULL) {\n  g <- bin(x, width, origin)\n  vapply(seq_along(x) - 1, g$bin_i, integer(1))\n}\n\ntest_that(\"NAs belong to group 0\", {\n  x <- NA_real_\n  expect_equal(group(x, 1, 0), 0L)\n})\n\ntest_that(\"Inf and -Inf belong to group 0\", {\n  x <- c(-Inf, Inf)\n  expect_equal(group(x, 1, 0), c(0, 0))\n})\n\ntest_that(\"Out of range values belong to group 0\", {\n  expect_equal(group(-10, 1, 0), 0)\n})\n\ntest_that(\"Positive integers unchanged if origin is 1\", {\n  expect_equal(group(1:10, 1, 1), 1:10)\n})\n"
  },
  {
    "path": "inst/tests/test-group-2d.r",
    "content": "context(\"Grouping: 2d\")\n\ntest_that(\"Two NAs gets bin 0\", {\n  expect_equal(group_rect(NA, NA, 1, 1, 0, 0), 0)\n})\n\ntest_that(\"Sequential locations get sequential groups\", {\n  grid <- expand.grid(x = c(NA, 1:2), y = c(NA, 1:2))\n  expect_equal(group_rect(grid$x, grid$y, 1, 1, 0.5, 0.5), 0:8)\n})\n"
  },
  {
    "path": "inst/tests/test-origin.r",
    "content": "context(\"Origin\")\n\ntest_that(\"origins close to zero rounded to zero\" ,{\n  expect_equal(find_origin(c(0.01, 1000)), 0)\n  expect_equal(find_origin(c(10, 1e6)), 0)\n})\n\ntest_that(\"origins rounded down by binwidth\", {\n  expect_equal(find_origin(c(1, 10), 1), 1)\n  expect_equal(find_origin(c(1, 10), 2), 0)\n\n  expect_equal(find_origin(c(5, 10), 2), 4)\n  expect_equal(find_origin(c(5, 10), 5), 5)\n})\n\ntest_that(\"integers have origin offset by 0.5\", {\n  expect_equal(find_origin(c(1L, 10L), 1), 0.5)\n\n  expect_equal(find_origin(c(5L, 10L), 2), 3.5)\n  expect_equal(find_origin(c(5L, 10L), 5), 4.5)\n})\n"
  },
  {
    "path": "inst/tests/test-ranged.r",
    "content": "context(\"Ranged\")\n\ntest_that(\"range attribute lost when modified\", {\n  x <- ranged(10:1)\n  expect_equal(max(x), 10)\n\n\n  x[1] <- 1\n  expect_equal(max(x), 9)\n  expect_equal(attr(x, \"range\"), NULL)\n})\n"
  },
  {
    "path": "inst/tests/test-smooth.r",
    "content": "context(\"Smooth\")\n\ntricube <- function(x) {\n  x <- abs(x)\n  ifelse(x > 1, 0, (1 - x ^ 3) ^ 3)\n}\n# plot(tricube, xlim = c(-1.5, 1.5))\n\ntest_that(\"factorised smooth equal to manual smooth\", {\n  grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE))\n  z <- rep(0, nrow(grid))\n  z[c(5, 23, 84)] <- 1\n\n  z_x <- smooth_nd_1(grid, z, numeric(), grid, 0, 3)\n  z_y <- smooth_nd_1(grid, z, numeric(), grid, 1, 3)\n\n  z_xy <- smooth_nd_1(grid, z_x, numeric(), grid, 1, 3)\n  z_yx <- smooth_nd_1(grid, z_y, numeric(), grid, 0, 3)\n  z2 <- smooth_nd(grid, z, numeric(), grid, c(3, 3))\n\n  expect_equal(z_xy, z2)\n  expect_equal(z_yx, z2)\n})\n\n# library(ggplot2)\n# qplot(grid[, 1], grid[, 2], fill = z, geom = \"raster\")\n# qplot(grid[, 1], grid[, 2], fill = z_xy, geom = \"raster\")\n# qplot(grid[, 1], grid[, 2], fill = z_yx, geom = \"raster\")\n# qplot(grid[, 1], grid[, 2], fill = z2, geom = \"raster\")\n\ntest_that(\"factorised smooth equal to manual smooth\", {\n  grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE))\n  z <- rep(0, nrow(grid))\n  z[c(5, 23, 84)] <- 1\n\n  grid <- as.data.frame(grid)\n  grid$.count <- z\n  class(grid) <- c(\"condensed\", class(grid))\n\n  z1 <- smooth(grid, c(3, 3), \".count\", factor = FALSE)\n  z2 <- smooth(grid, c(3, 3), \".count\", factor = TRUE)\n\n  expect_equal(z1, z2)\n})\n"
  },
  {
    "path": "inst/tests/test-stat.r",
    "content": "context(\"Stats\")\n\ntest_that(\"linear regression recovers slope & intercept if no errors\", {\n  x <- 1:10\n  w <- rep(1, 10)\n\n  expect_equal(regress(x, x * 2, w), c(0, 2))\n  expect_equal(regress(x, x * -2, w), c(0, -2))\n  expect_equal(regress(x, x * -2 + 5, w), c(5, -2))\n  expect_equal(regress(x, x * -2 + -5, w), c(-5, -2))\n})\n\nsimpleLm <- function(x, y, w) {\n  unname(coef(lm(y ~ x, weights = w)))\n}\n\ntest_that(\"linear regression matches lm\", {\n  x <- 1:10\n  y <- 10 + x * 2 + rnorm(10)\n  w <- rep(1, 10)\n\n  expect_equal(regress(x, y, w), simpleLm(x, y, w))\n})\n\ntest_that(\"linear regression matches lm with weights\", {\n  x <- 1:10\n  y <- 10 + x * 2 + rnorm(10)\n  w <- runif(10)\n\n  expect_equal(regress(x, y, w), simpleLm(x, y, w))\n})\n\ntest_that(\"robust regression effectively removes outlier\", {\n  x <- 1:10\n  y <- 10 + x * 2 + c(rep(0, 9), 10)\n  w <- rep(1, 10)\n\n  expect_equal(regress_robust(x, y, w, 10), c(10, 2))\n})\n\n\n\n"
  },
  {
    "path": "inst/tests/test-summary-moments.r",
    "content": "context(\"Summary: moments\")\n\ncount2 <- function(x) compute_moments(x)[1]\nmean2 <- function(x) compute_moments(x)[2]\nsd2 <- function(x) compute_moments(x)[3]\n\ntest_that(\"count agrees with length\", {\n  expect_equal(count2(1:10), 10)\n  expect_equal(count2(5), 1)\n  expect_equal(count2(numeric()), 0)\n})\n\ntest_that(\"mean agree with base::mean\", {\n  expect_equal(mean2(1:10), mean(1:10))\n\n  x <- runif(1e6)\n  expect_equal(mean2(x), mean(x))\n})\n\ntest_that(\"missing values are ignored\", {\n  x <- c(NA, 5, 5)\n  expect_equal(count2(x), 2)\n  expect_equal(mean2(x), 5)\n})\n\ntest_that(\"standard deviation agrees with sd\", {\n  expect_equal(sd2(1:10), sd(1:10))\n\n  x <- runif(1e6)\n  expect_equal(sd2(x), sd(x))\n})\n\ntest_that(\"summary statistics of zero length input are NaN\", {\n  expect_equal(compute_moments(numeric()), c(0, NaN, NaN))\n})\n"
  },
  {
    "path": "inst/tests/test-weighted-stats.r",
    "content": "context(\"Weighted statistics\")\n\ntest_that(\"weighted.var agrees with var when weights = 1\", {\n  samples <- replicate(20, runif(100), simplify = FALSE)\n\n  var <- sapply(samples, var)\n  wvar <- sapply(samples, weighted.var, w = rep(1, 100))\n\n  expect_equal(wvar, var)\n})\n\ntest_that(\"weighted.var agrees with var on repeated vector\", {\n  samples <- replicate(20, runif(100), simplify = FALSE)\n  w <- rep(1:2, 50)\n  samples_ex <- lapply(samples, rep, times = w)\n\n  var <- sapply(samples_ex, var)\n  wvar <- sapply(samples, weighted.var, w = w)\n\n  expect_equal(wvar, var)\n})\n\ntest_that(\"weighed.quantile agrees with quantile on repeated vector\", {\n  samples <- replicate(20, runif(100), simplify = FALSE)\n  w <- rep(1:2, 50)\n  samples_ex <- lapply(samples, rep, times = w)\n\n  quant <- sapply(samples_ex, quantile, probs = 0.325, names = FALSE)\n  wquant <- sapply(samples, weighted.quantile, w = w, probs = 0.325)\n\n  expect_equal(quant, wquant)\n})\n"
  },
  {
    "path": "man/autoplot.condensed.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/autoplot.r\n\\name{autoplot.condensed}\n\\alias{autoplot.condensed}\n\\title{Autoplot condensed summaries.}\n\\usage{\n\\method{autoplot}{condensed}(x, var = last(summary_vars(x)), ...)\n}\n\\arguments{\n\\item{x}{a condensed summary}\n\n\\item{var}{which summary variable to display}\n\n\\item{...}{other arguments passed on to individual methods}\n}\n\\description{\nAutoplot condensed summaries.\n}\n\\examples{\nif (require(\"ggplot2\")) {\n\n# 1d summaries -----------------------------\nx <- rchallenge(1e4)\nz <- x + rt(length(x), df = 2)\nxsum <- condense(bin(x, 0.1))\nzsum <- condense(bin(x, 0.1), z = z)\n\nautoplot(xsum)\nautoplot(peel(xsum))\n\nautoplot(zsum)\nautoplot(peel(zsum, keep = 1))\nautoplot(peel(zsum))\n\n# 2d summaries -----------------------------\ny <- runif(length(x))\nxysum <- condense(bin(x, 0.1), bin(y, 0.1))\nxyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z)\n\nautoplot(xysum)\nautoplot(peel(xysum))\nautoplot(xyzsum)\nautoplot(peel(xyzsum))\n}\n}\n\n"
  },
  {
    "path": "man/best_h.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/h.r\n\\name{best_h}\n\\alias{best_h}\n\\title{Find \"best\" smoothing parameter using leave-one-out cross validation.}\n\\usage{\nbest_h(x, h_init = NULL, ..., tol = 0.01, control = list())\n}\n\\arguments{\n\\item{x}{condensed summary to smooth}\n\n\\item{h_init}{initial values of bandwidths to start search out. If not\nspecified defaults to 5 times the binwidth of each variable.}\n\n\\item{...}{other arguments (like \\code{var}) passed on to\n\\code{\\link{rmse_cv}}}\n\n\\item{tol}{numerical tolerance, defaults to 1\\%.}\n\n\\item{control}{additional control parameters passed on to \\code{\\link{optim}}\nThe most useful argument is probably trace, which makes it possible to\nfollow the progress of the optimisation.}\n}\n\\value{\na single numeric value representing the bandwidth that minimises\n  the leave-one-out estimate of rmse. Vector has attributes\n  \\code{evaluations} giving the number of times the objective function\n  was evaluated. If the optimisation does not converge, or smoothing is not\n  needed (i.e. the estimate is on the lower bounds), a warning is thrown.\n}\n\\description{\nMinimises the leave-one-out estimate of root mean-squared error to find\nfind the \"optimal\" bandwidth for smoothing.\n}\n\\details{\nL-BFGS-B optimisation is used to constrain the bandwidths to be greater\nthan the binwidths: if the bandwidth is smaller than the binwidth it's\nimpossible to compute the rmse because no smoothing occurs. The tolerance\nis set relatively high for numerical optimisation since the precise choice\nof bandwidth makes little difference visually, and we're unlikely to have\nsufficient data to make a statistically significant choice anyway.\n}\n\\examples{\n\\donttest{\nx <- rchallenge(1e4)\nxsum <- condense(bin(x, 1 / 10))\nh <- best_h(xsum, control = list(trace = 3, REPORT = 1))\n\nif (require(\"ggplot2\")) {\nautoplot(xsum)\nautoplot(smooth(xsum, h))\n}\n}\n}\n\\seealso{\nOther bandwidth estimation functions: \\code{\\link{h_grid}};\n  \\code{\\link{rmse_cv}}, \\code{\\link{rmse_cvs}}\n}\n\n"
  },
  {
    "path": "man/bigvis.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/bigvis.r\n\\docType{package}\n\\name{bigvis}\n\\alias{as.integer,Rcpp_BinnedVector-method}\n\\alias{bigvis}\n\\alias{bigvis-package}\n\\alias{show,Rcpp_BinnedVector-method}\n\\title{The big vis package.}\n\\usage{\n\\S4method{show}{Rcpp_BinnedVector}(object)\n\n\\S4method{as.integer}{Rcpp_BinnedVector}(x, ...)\n}\n\\arguments{\n\\item{x,object,...}{Generic args}\n}\n\\description{\nThe big vis package.\n}\n\n"
  },
  {
    "path": "man/bin.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/bin.r\n\\name{bin}\n\\alias{bin}\n\\title{Create a binned variable.}\n\\usage{\nbin(x, width = find_width(x), origin = find_origin(x, width), name = NULL)\n}\n\\arguments{\n\\item{x}{numeric or integer vector}\n\n\\item{width}{bin width. If not specified, about 10,000 bins will be chosen\nusing the algorithim in \\code{\\link{find_width}}.}\n\n\\item{origin}{origin. If not specified, guessed by \\code{\\link{find_origin}}.}\n\n\\item{name}{name of original variable. This will be guessed from the input to\n\\code{group} if not supplied. Used in the output of\n\\code{\\link{condense}} etc.}\n}\n\\description{\nCreate a binned variable.\n}\n\\details{\nThis function produces an R reference class that wraps around a C++ function.\nGenerally, you should just treat this as an opaque object with reference\nsemantics, and you shouldn't call the methods on it - pass it to\n\\code{\\link{condense}} and friends.\n}\n\\examples{\nx <- runif(1e6)\nbin(x)\nbin(x, 0.01)\nbin(x, 0.01, origin = 0.5)\n}\n\n"
  },
  {
    "path": "man/breaks.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/breaks.r\n\\name{breaks}\n\\alias{breaks}\n\\title{Compute breaks given origin and width.}\n\\usage{\nbreaks(x, binwidth, origin = min(x))\n}\n\\arguments{\n\\item{x}{numeric vector}\n\n\\item{binwidth}{bin width}\n\n\\item{origin}{bin origin}\n}\n\\description{\nBreaks are right-open, left-closed [x, y), so if \\code{max(x)} is an integer\nmultiple of binwidth, then we need one more break. This function only returns\nthe left-side of the breaks.\n}\n\\details{\nThe first break is special, because it always contains missing values.\n}\n\\examples{\nbreaks(10, origin = 0, binwidth = 1)\nbreaks(9.9, origin = 0, binwidth = 1)\n\nbreaks(1:10, origin = 0, binwidth = 2)\n}\n\\keyword{internal}\n\n"
  },
  {
    "path": "man/condense.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/condense.r\n\\name{condense}\n\\alias{condense}\n\\title{Efficient binned summaries.}\n\\usage{\ncondense(..., z = NULL, summary = NULL, w = NULL, drop = NULL)\n}\n\\arguments{\n\\item{...}{group objects created by \\code{\\link{bin}}}\n\n\\item{z}{a numeric vector to summary for each group. Optional for some\nsummary statistics.}\n\n\\item{summary}{the summary statistic to use. Currently must be one of\ncount, sum, mean, median or sd. If \\code{NULL}, defaults to mean if\ny is present, count if not.}\n\n\\item{w}{a vector of weights. Not currently supported by all summary\nfunctions.}\n\n\\item{drop}{if \\code{TRUE} only locations with data will be returned.  This\nis more efficient if the data is very sparse (<1\\% of cells filled), and\nis slightly less efficient. Defaults to \\code{TRUE} if you are condensing\nover two or more dimensions, \\code{FALSE} for 1d.}\n}\n\\description{\nEfficient binned summaries.\n}\n\\examples{\nx <- runif(1e5)\ngx <- bin(x, 0.1)\ncondense(gx)\n}\n\n"
  },
  {
    "path": "man/condensed.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/condensed.r\n\\name{condensed}\n\\alias{as.condensed}\n\\alias{condensed}\n\\alias{is.condensed}\n\\title{Condensed: an S3 class for condensed summaries.}\n\\usage{\ncondensed(groups, grouped, summary)\n\nis.condensed(x)\n\nas.condensed(x)\n}\n\\arguments{\n\\item{groups}{list of \\code{\\link{bin}}ed objects}\n\n\\item{grouped,summary}{output from C++ condense function}\n\n\\item{x}{object to test or coerce}\n}\n\\description{\nThis object managed the properties of condensed (summarised) data frames.\n}\n\\section{S3 methods}{\n\n\nMathematical functions with methods for \\code{binsum} object will modify\nthe x column of the data frame and \\code{\\link{rebin}} the data, calculating\nupdated summary statistics.\n\nCurrently methods are provided for the \\code{Math} group generic,\nlogical comparison and arithmetic operators, and\n\\code{\\link[plyr]{round_any}}.\n}\n\\examples{\nif (require(\"ggplot2\")) {\n\nx <- rchallenge(1e4)\nxsum <- condense(bin(x, 1 / 10))\n\n# Basic math operations just modify the first column\nautoplot(xsum)\nautoplot(xsum * 10)\nautoplot(xsum - 30)\nautoplot(abs(xsum - 30))\n\n# Similarly, logical operations work on the first col\nautoplot(xsum[xsum > 10, ])\n}\n}\n\\keyword{internal}\n\n"
  },
  {
    "path": "man/dchallenge.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/challenge.r\n\\name{dchallenge}\n\\alias{dchallenge}\n\\alias{rchallenge}\n\\title{Density and random number generation functions for a challenging\ndistribution.}\n\\usage{\ndchallenge(x)\n\nrchallenge(n)\n}\n\\arguments{\n\\item{x}{values to evaluate pdf at}\n\n\\item{n}{number of random samples to generate}\n}\n\\description{\nThis is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom\ncentered at 15 and scaled by 2, and a gamma distribution with shape 2\nand rate 1/3. (The t-distribution is windsorised at 0, but this\nhas negligible effect.) This distribution is challenging because it\nmixes heavy tailed and asymmetric distributions.\n}\n\\examples{\nplot(dchallenge, xlim = c(-5, 60), n = 500)\n\nx <- rchallenge(1e4)\nhist(x, breaks = 1000)\nxsum <- condense(bin(x, 0.1))\nplot(xsum$x, xsum$.count, type = \"l\")\nxsmu <- smooth(xsum, 0.3)\nplot(xsmu$x, xsmu$.count, type = \"l\")\nplot(xsmu$x, xsmu$.count, type = \"l\", xlim = c(0, 30))\n}\n\n"
  },
  {
    "path": "man/dgrid.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/dgrid.r\n\\name{dgrid}\n\\alias{dgrid}\n\\alias{is.dgrid}\n\\title{dgrid: an S3 class for data grids}\n\\usage{\ndgrid(x, width, origin = 0, nbins = NULL)\n\nis.dgrid(x)\n}\n\\arguments{\n\\item{x}{a numeric vector to test or coerce.}\n\n\\item{width}{bin width}\n\n\\item{origin}{bin origins}\n\n\\item{nbins}{number of bins}\n}\n\\description{\ndgrid: an S3 class for data grids\n}\n\\examples{\ng <- dgrid(0:10 + 0.5, width = 1)\nrange(g)\nas.integer(g)\n}\n\n"
  },
  {
    "path": "man/find_origin.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/origin.r\n\\name{find_origin}\n\\alias{find_origin}\n\\title{Find the origin.}\n\\usage{\nfind_origin(x, binwidth)\n}\n\\arguments{\n\\item{x}{numeric or integer vector}\n\n\\item{binwidth}{binwidth}\n}\n\\description{\nFind the origin.\n}\n\\details{\nThis algorithm implements simple heuristics for determining the origin of\na histogram when only the binwidth is specified. It:\n\n\\itemize{\n   \\item rounds to zero, if relatively close\n   \\item subtracts 0.5 offset, if an x is integer\n   \\item ensures the origin is a multiple of the binwidth\n}\n}\n\\examples{\nfind_origin(1:10, 1)\nfind_origin(1:10, 2)\nfind_origin(c(1, 1e6), 1)\n}\n\\seealso{\nOther reasonable defaults: \\code{\\link{find_width}}\n}\n\\keyword{internal}\n\n"
  },
  {
    "path": "man/find_width.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/width.r\n\\name{find_width}\n\\alias{find_width}\n\\title{Compute a reasonable default binwidth.}\n\\usage{\nfind_width(x, nbins = 10000)\n}\n\\arguments{\n\\item{x}{a numeric vector. If a numeric vector of length one is supplied,\nit's assumed that}\n\n\\item{nbins}{desired number of bins (approximate)}\n}\n\\description{\nCompute a reasonable default binwidth.\n}\n\\examples{\nfind_width(c(0, 5))\nfind_width(c(0, 5.023432))\nfind_width(c(0, 5.9))\n}\n\\seealso{\nOther reasonable defaults: \\code{\\link{find_origin}}\n}\n\\keyword{internal}\n\n"
  },
  {
    "path": "man/frange.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/RcppExports.R\n\\name{frange}\n\\alias{frange}\n\\title{Efficient implementation of range.}\n\\usage{\nfrange(x, finite = TRUE)\n}\n\\arguments{\n\\item{x}{a numeric vector, or a \\code{\\link{ranged}} object}\n\n\\item{finite}{If \\code{TRUE} ignores missing values and infinities. Note\nthat if the vector is empty, or only contains missing values,\n\\code{frange} will return \\code{c(Inf, -Inf)} because those are the\nidentity values for \\code{\\link{min}} and \\code{\\link{max}} respectively.}\n}\n\\description{\nThis is an efficient C++ implementation of range for numeric vectors:\nit avoids S3 dispatch, and computes both min and max in a single pass\nthrough the input.\n}\n\\details{\nIf \\code{x} has a \\code{range} attribute (e.g. it's a \\code{\\link{ranged}}\nobject), it will be used instead of computing the range from scratch.\n}\n\\examples{\nx <- runif(1e6)\nsystem.time(range(x))\nsystem.time(frange(x))\n\nrx <- ranged(x)\nsystem.time(frange(rx))\n}\n\n"
  },
  {
    "path": "man/h_grid.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/h.r\n\\name{h_grid}\n\\alias{h_grid}\n\\title{Generate grid of plausible bandwidths for condensed summary.}\n\\usage{\nh_grid(x, n = 50, max = 20)\n}\n\\arguments{\n\\item{x}{a condensed summary}\n\n\\item{n}{number of bandwidths to generate (in each dimension)}\n\n\\item{max}{maximum bandwidth to generate, as multiple of binwidth.}\n}\n\\description{\nBy default, the bandwidths start at the bin width, and then continue\nup 50 (\\code{n}) steps until 20 (\\code{max}) times the bin width.\n}\n\\examples{\nx <- rchallenge(1e4)\nxsum <- condense(bin(x, 1 / 10))\nh_grid(xsum)\n\ny <- runif(1e4)\nxysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))\nh_grid(xysum, n = 10)\n}\n\\seealso{\nOther bandwidth estimation functions: \\code{\\link{best_h}};\n  \\code{\\link{rmse_cv}}, \\code{\\link{rmse_cvs}}\n}\n\n"
  },
  {
    "path": "man/is.ranged.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/ranged.r\n\\name{is.ranged}\n\\alias{is.ranged}\n\\title{Test if an object is of class ranged.}\n\\usage{\nis.ranged(x)\n}\n\\arguments{\n\\item{x}{object to test}\n}\n\\description{\nTest if an object is of class ranged.\n}\n\\keyword{internal}\n\n"
  },
  {
    "path": "man/movies.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/movies.r\n\\docType{data}\n\\name{movies}\n\\alias{movies}\n\\title{Movie information and user ratings from IMDB.com.}\n\\format{A data frame with 130,456 rows and 14 variables}\n\\usage{\ndata(movies)\n}\n\\description{\nThe internet movie database, \\url{http://imdb.com/}, is a website devoted\nto collecting movie data supplied by studios and fans.  It claims to be the\nbiggest movie database on the web and is run by amazon.  More about\ninformation imdb.com can be found online,\n\\url{http://imdb.com/help/show_leaf?about}, including information about\nthe data collection process,\n\\url{http://imdb.com/help/show_leaf?infosource}.\n}\n\\details{\nMovies were selected for inclusion if they had a known length and had been rated by at least one imdb user.  The data set contains the following fields:\n\n\\itemize{\n  \\item title.  Title of the movie.\n  \\item year.  Year of release.\n  \\item budget.  Total budget (if known) in US dollars\n  \\item length.  Length in minutes.\n  \\item rating.  Average IMDB user rating.\n  \\item votes.  Number of IMDB users who rated this movie.\n  \\item mpaa.  MPAA rating.\n  \\item action, animation, comedy, drama, documentary, romance, short:\n    \\code{TRUE} if movie belongs to that genre.\n}\n}\n\\references{\n\\url{http://had.co.nz/data/movies/}\n}\n\n"
  },
  {
    "path": "man/mt.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/mt.r\n\\name{mt}\n\\alias{inv_mt}\n\\alias{mt}\n\\alias{mt_trans}\n\\title{Modulus transformation (and its inverse).}\n\\usage{\nmt(x, lambda)\n\ninv_mt(x, lambda)\n\nmt_trans(lambda)\n}\n\\arguments{\n\\item{x}{values to transform}\n\n\\item{lambda}{degree of transformation}\n}\n\\description{\nA generalisation of the box-cox transformation that works for\nvalues with both positive and negative values.\n}\n\\details{\nThis is useful for compressing the tails of long-tailed distributions,\noften encountered with very large datasets.\n}\n\\examples{\nx <- seq(-10, 10, length = 100)\nplot(x, mt(x, 0), type = \"l\")\nplot(x, mt(x, 0.25), type = \"l\")\nplot(x, mt(x, 0.5), type = \"l\")\nplot(x, mt(x, 1), type = \"l\")\nplot(x, mt(x, 2), type = \"l\")\nplot(x, mt(x, -1), type = \"l\")\nplot(x, mt(x, -2), type = \"l\")\n}\n\\references{\nJ. John and N. Draper. \"An alternative family of\n transformations.\" Applied Statistics, pages 190-197, 1980.\n \\url{http://www.jstor.org/stable/2986305}\n}\n\n"
  },
  {
    "path": "man/peel.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/peel.r\n\\name{peel}\n\\alias{peel}\n\\title{Peel off low density regions of the data.}\n\\usage{\npeel(x, keep = 0.99, central = NULL)\n}\n\\arguments{\n\\item{x}{condensed summary}\n\n\\item{keep}{(approximate) proportion of data to keep. If \\code{1}, will\nremove all cells with counts.  All missing values will be preserved.}\n\n\\item{central}{if \\code{TRUE} peels off regions of lowest density only from\nthe outside of the data. In 2d this works by progressively peeling off\nconvex hull of the data: the current algorithm is quite slow.\nIf \\code{FALSE}, just removes the lowest density regions wherever they are\nfound. Regions with 0 density are removed regardless of location.\nDefaults to TRUE if there are two or fewer grouping variables is less.}\n}\n\\description{\nKeeps specified proportion of data by removing the lowest density regions,\neither anywhere on the plot, or for 2d, just around the edges.\n}\n\\details{\nThis is useful for visualisation, as an easy way of focussing on the regions\nwhere the majority of the data lies.\n}\n\\examples{\nx <- rt(1e5, df = 2)\ny <- rt(1e5, df = 2)\nxysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10))\nplot(xysum$x, xysum$y)\n\nplot(peel(xysum, 0.95, central = TRUE)[1:2])\nplot(peel(xysum, 0.90, central = TRUE)[1:2])\nplot(peel(xysum, 0.50, central = TRUE)[1:2])\n}\n\n"
  },
  {
    "path": "man/ranged.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/ranged.r\n\\name{ranged}\n\\alias{ranged}\n\\title{A S3 class for caching the range of a vector}\n\\usage{\nranged(x, range = frange(x, finite = TRUE))\n}\n\\arguments{\n\\item{x}{a numeric vector}\n\n\\item{range}{the range of the vector (excluding missing values), if known.\nIf unknown, it will be computed with \\code{\\link{frange}}, a fast C++\nimplementation of \\code{\\link{range}}.}\n}\n\\description{\nThis class is designed for dealing with large vectors, where the cost of\nrecomputing the range multiple times is prohibitive. It provides methods\nfor \\code{\\link{print}} and \\code{\\link{str}} that display only the range,\nnot the contents.\n}\n\\section{Performance}{\n\nFor best performance, you may want to run copy and paste the contents of\nthis function into your function, to avoid making any copies of \\code{x}.\nThis is probably only necessary if you're dealing with extremely large\nvectors, > 100 million obs.\n}\n\\examples{\nx <- runif(1e6)\ny <- ranged(x)\nrange(y)\ny\nstr(y)\n\n# Modifications to the class currently destroy the cache\ny[1] <- 10\nmax(y)\nclass(y)\nz <- y + 10\nmax(z)\nclass(z)\n}\n\n"
  },
  {
    "path": "man/rmse_cvs.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/rmse.r\n\\name{rmse_cvs}\n\\alias{rmse_cv}\n\\alias{rmse_cvs}\n\\title{Estimate smoothing RMSE using leave-one-out cross-validation.}\n\\usage{\nrmse_cvs(x, hs = h_grid(x), ...)\n\nrmse_cv(x, h, var = summary_vars(x)[1], ...)\n}\n\\arguments{\n\\item{x}{condensed summary table}\n\n\\item{...}{other variables passed on to \\code{\\link{smooth}}}\n\n\\item{h,hs}{for \\code{rmse_cv}, a vector of bandwidths; for \\code{rmse_cv}\na data frame of bandwidths, as generated by \\code{\\link{h_grid}}.}\n\n\\item{var}{variable to smooth}\n}\n\\description{\n\\code{rmse_cv} computes the leave-one-out RMSE for a single vector of\nbandwidths, \\code{rmse_cvs} computes for a multiple vectors of bandwidths,\nstored as a data frame.\n}\n\\examples{\n\\donttest{\nset.seed(1014)\n# 1d -----------------------------\nx <- rchallenge(1e4)\nxsum <- condense(bin(x, 1 / 10))\ncvs <- rmse_cvs(xsum)\n\nif (require(\"ggplot2\")) {\nautoplot(xsum)\nqplot(x, err, data = cvs, geom = \"line\")\nxsmu <- smooth(xsum, 1.3)\nautoplot(xsmu)\nautoplot(peel(xsmu))\n}\n\n# 2d -----------------------------\ny <- runif(1e4)\nxysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))\ncvs <- rmse_cvs(xysum, h_grid(xysum, 10))\nif (require(\"ggplot2\")) {\nqplot(x, y, data = cvs, size = err)\n}\n}\n}\n\\seealso{\nOther bandwidth estimation functions: \\code{\\link{best_h}};\n  \\code{\\link{h_grid}}\n}\n\n"
  },
  {
    "path": "man/round_any.condensed.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/condensed.r\n\\name{round_any.condensed}\n\\alias{round_any.condensed}\n\\title{Round any method for condensed objects}\n\\usage{\nround_any.condensed(x, accuracy, f = round)\n}\n\\arguments{\n\\item{x}{numeric or date-time (POSIXct) vector to round}\n\n\\item{accuracy}{number to round to; for POSIXct objects, a number of seconds}\n\n\\item{f}{rounding function: \\code{\\link{floor}}, \\code{\\link{ceiling}} or\n\\code{\\link{round}}}\n}\n\\description{\nRound any method for condensed objects\n}\n\\keyword{internal}\n\n"
  },
  {
    "path": "man/smooth.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/smooth.r\n\\name{smooth}\n\\alias{smooth}\n\\title{Smooth a condensed data frame.}\n\\usage{\nsmooth(x, h, var = summary_vars(x)[1], grid = NULL, type = \"mean\",\n  factor = TRUE)\n}\n\\arguments{\n\\item{x}{a condensed summary}\n\n\\item{h}{numeric vector of bandwidths, one for each grouping variable in\n\\code{x}}\n\n\\item{var}{variable to smooth}\n\n\\item{grid}{a data frame with the grouping colums as x.  In order for the\nfactored version of \\code{smooth_nd} to work, this grid must be a superset\nof \\code{x}.}\n\n\\item{type}{type of smoothing to use.  Current options are \\code{\"mean\"},\na kernel weighted mean; \\code{\"regression\"}, a kernel weighted local\nregression; and \\code{\"robust_regression\"}, robust kernel weighted local\nregression in the style of \\code{\\link{loess}}.  Unique prefixes are also\nacceptable.}\n\n\\item{factor}{if \\code{TRUE} compute the n-dimensional smooth by a sequence\nof 1d smoothes. For \\code{type = \"mean\"} the results are always the same\ngrid values are uncorrelated (e.g. the grid is complete at every location);\nand is very approximate for \\code{type = \"robust\"}.}\n}\n\\description{\nSmooth a condensed data frame.\n}\n\\examples{\nx <- runif(1e5)\nxsum <- condense(bin(x, 1 / 100))\nxsmu1 <- smooth(xsum, 5 / 100)\nxsmu2 <- smooth(xsum, 5 / 100, factor = FALSE)\n\n# More challenging distribution\nx <- rchallenge(1e4)\nxsum <- condense(bin(x, 0.1))\nxsmu <- smooth(xsum, 1)\n\nplot(xsum$x, xsum$.count, type = \"l\")\nlines(xsmu$x, xsmu$.count, col = \"red\")\n\nxsmu2 <- smooth(xsum, 1, type = \"regress\")\nplot(xsmu$x, xsmu$.count, type = \"l\", xlim = c(0, 50))\nlines(xsmu2$x, xsmu2$.count, col = \"red\")\n# Note difference in tails\n}\n\n"
  },
  {
    "path": "man/standardise.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/standardise.r\n\\name{standardise}\n\\alias{standardise}\n\\title{Standardise a summary to sum to one.}\n\\usage{\nstandardise(x, margin = integer())\n}\n\\arguments{\n\\item{x}{a condensed summary. Must have \\code{.count} variable.}\n\n\\item{margin}{margins to standardise along.  If \\code{NULL}, the default,\nstandardises the whole array.}\n}\n\\description{\nStandardise a summary to sum to one.\n}\n\\examples{\nb1 <- condense(bin(movies$year, 1))\nd1 <- smooth(b1, 2, type = \"reg\")\n\nif (require(\"ggplot2\")) {\n\nautoplot(b1)\nautoplot(d1)\n\n# Note change in x-axis limits\nautoplot(standardise(d1))\n}\n\n# Can also standardise a dimension at a time\nb2 <- with(movies, condense(bin(year, 2), bin(length, 10)))\nb2 <- peel(b2, central = TRUE)\n\nif (require(\"ggplot2\")) {\n\nautoplot(b2)\nautoplot(standardise(b2))    # note legend\nautoplot(standardise(b2, \"year\"))   # each row sums to 1\nautoplot(standardise(b2, \"length\")) # each col sums to 1\n\nbase <- ggplot(b2, aes(length, .count)) +\n  geom_line(aes(group = year, colour = year))\nbase\nbase \\%+\\% standardise(b2)  # Just affects y axis labels\nbase \\%+\\% standardise(b2, \"year\") # Makes year comparable\nbase \\%+\\% standardise(b2, \"length\") # Meaningless for this display\n\n}\n}\n\n"
  },
  {
    "path": "man/transform.condensed.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/rebin.r\n\\name{transform.condensed}\n\\alias{rebin}\n\\alias{transform.condensed}\n\\title{Transform condensed objects, collapsing unique bins.}\n\\usage{\n\\\\method{transform}{condensed}(`_data`, ...)\n\nrebin(data)\n}\n\\arguments{\n\\item{...}{named arguments evaluated in the context of the data}\n\n\\item{data,`_data`}{a condensed summary}\n}\n\\description{\nTransform condensed objects, collapsing unique bins.\n}\n\\details{\nYou don't need to use \\code{rebin} if you use transform: it will\nautomatically rebin for you.  You will need to use it if you manually\ntransform any grouping variables.\n}\n\\examples{\nx <- runif(1e4, -1, 1)\nxsum <- condense(bin(x, 1 / 50))\n\n# Transforming by hand: must use rebin\nxsum$x <- abs(xsum$x)\nrebin(xsum)\nif (require(\"ggplot2\")) {\n  autoplot(xsum) + geom_point()\n  autoplot(rebin(xsum)) + geom_point()\n}\n\n#' Transforming with transform\ny <- x ^ 2 + runif(length(x), -0.1, 0.1)\nxysum <- condense(bin(x, 1 / 50), z = y)\nxysum <- transform(xysum, x = abs(x))\nif (require(\"ggplot2\")) {\n  autoplot(xysum)\n}\n}\n\\keyword{internal}\n\n"
  },
  {
    "path": "man/weighted.IQR.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/weighted-stats.r\n\\name{weighted.IQR}\n\\alias{weighted.IQR}\n\\title{Compute the interquartile range of weighted data.}\n\\usage{\nweighted.IQR(x, w, na.rm = FALSE)\n}\n\\arguments{\n\\item{x}{numeric vector of observations}\n\n\\item{w}{integer vector of weights, representing the number of\ntime each \\code{x} was observed}\n\n\\item{na.rm}{If \\code{TRUE} will automatically remove missing values\nin \\code{x} or \\code{w}.}\n}\n\\description{\nCompute the interquartile range of weighted data.\n}\n\\details{\nThis is a simple wrapper around \\code{\\link{weighted.quantile}}\n}\n\\examples{\nx <- sort(runif(200))\nw <- rpois(200, seq(1, 10, length = 200)) + 1\n\nIQR(x)\nweighted.IQR(x, w)\n}\n\n"
  },
  {
    "path": "man/weighted.ecdf.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/weighted-stats.r\n\\name{weighted.ecdf}\n\\alias{weighted.ecdf}\n\\title{A weighted ecdf function.}\n\\usage{\nweighted.ecdf(x, w)\n}\n\\arguments{\n\\item{x}{numeric vector of observations}\n\n\\item{w}{integer vector of weights, representing the number of\ntime each \\code{x} was observed}\n}\n\\description{\nAn extension of the base \\code{\\link[stats]{ecdf}} function which works\nwith weighted data.\n}\n\\section{S3 methods}{\n\nThe \\code{ecdf} class has methods for \\code{\\link{plot}},\n\\code{\\link{lines}}, \\code{\\link{summary}} and \\code{\\link{quantile}}.\n\\code{\\link{quantile}} does not currently correctly compute values for\nweighted ecdfs.\n}\n\\examples{\nx <- runif(200)\nw <- rpois(200, 5) + 1\n\ne <- weighted.ecdf(x, w)\nplot(e)\nsummary(e)\n\ny <- x[rep(seq_along(x), w)]\nplot(ecdf(y))\n}\n\\seealso{\n\\code{\\link[stats]{weighted.mean}}\n\nOther weighted statistics: \\code{\\link{weighted.quantile}};\n  \\code{\\link{weighted.sd}}, \\code{\\link{weighted.var}}\n}\n\n"
  },
  {
    "path": "man/weighted.median.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/weighted-stats.r\n\\name{weighted.median}\n\\alias{weighted.median}\n\\title{Compute the median of weighted data.}\n\\usage{\nweighted.median(x, w, na.rm = FALSE)\n}\n\\arguments{\n\\item{x}{numeric vector of observations}\n\n\\item{w}{integer vector of weights, representing the number of\ntime each \\code{x} was observed}\n\n\\item{na.rm}{If \\code{TRUE} will automatically remove missing values\nin \\code{x} or \\code{w}.}\n}\n\\description{\nCompute the median of weighted data.\n}\n\\details{\nThis is a simple wrapper around \\code{\\link{weighted.quantile}}\n}\n\\examples{\nx <- runif(200)\nw <- rpois(200, 5) + 1\n\nmedian(x)\nweighted.median(x, w)\n}\n\n"
  },
  {
    "path": "man/weighted.quantile.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/weighted-stats.r\n\\name{weighted.quantile}\n\\alias{weighted.quantile}\n\\title{Compute quantiles of weighted data.}\n\\usage{\nweighted.quantile(x, w, probs = seq(0, 1, 0.25), na.rm = FALSE)\n}\n\\arguments{\n\\item{x}{numeric vector of observations}\n\n\\item{w}{integer vector of weights, representing the number of\ntime each \\code{x} was observed}\n\n\\item{probs}{numeric vector of probabilities between 0 and 1}\n\n\\item{na.rm}{If \\code{TRUE} will automatically remove missing values\nin \\code{x} or \\code{w}.}\n}\n\\description{\nCompute quantiles of weighted data.\n}\n\\details{\nCurrently only implements the type 7 algorithm, as described in\n\\code{\\link{quantile}}. Based on \\code{\\link{quantile}} written by R-core.\n}\n\\examples{\nx <- runif(200)\nw <- rpois(200, 5) + 1\nweighted.quantile(x, w)\n}\n\\seealso{\nOther weighted statistics: \\code{\\link{weighted.ecdf}};\n  \\code{\\link{weighted.sd}}, \\code{\\link{weighted.var}}\n}\n\n"
  },
  {
    "path": "man/weighted.var.Rd",
    "content": "% Generated by roxygen2 (4.1.1): do not edit by hand\n% Please edit documentation in R/weighted-stats.r\n\\name{weighted.var}\n\\alias{weighted.sd}\n\\alias{weighted.var}\n\\title{Compute a weighted variance or standard deviation of a vector.}\n\\usage{\nweighted.var(x, w = NULL, na.rm = FALSE)\n\nweighted.sd(x, w, na.rm = TRUE)\n}\n\\arguments{\n\\item{x}{numeric vector of observations}\n\n\\item{w}{integer vector of weights, representing the number of\ntime each \\code{x} was observed}\n\n\\item{na.rm}{if \\code{TRUE}, missing values in both \\code{w} and \\code{x}\nwill be removed prior computation. Otherwise if there are missing values\nthe result will always be missing.}\n}\n\\description{\nCompute a weighted variance or standard deviation of a vector.\n}\n\\details{\nNote that unlike the base R \\code{\\link{var}} function, these functions only\nwork with individual vectors not matrices or data frames.\n}\n\\examples{\nx <- c(1:5)\nw <- rpois(5, 5) + 1\ny <- x[rep(seq_along(x), w)]\nweighted.var(x, w)\nvar(y)\n\nstopifnot(all.equal(weighted.var(x, w), var(y)))\n}\n\\seealso{\n\\code{\\link[stats]{weighted.mean}}\n\nOther weighted statistics: \\code{\\link{weighted.ecdf}};\n  \\code{\\link{weighted.quantile}}\n}\n\n"
  },
  {
    "path": "notes.md",
    "content": "# Group\n\n* 1d, nd\n\nFuture work: linear binning\n\n# Summarise\n\n* 1d\n\n  * count, sum\n  * count, mean, sd\n  * median\n\n* 2d\n  * mean\n  * regression\n  * robust regression\n\n* nd \n  * mean\n  * regression (with eigen or armadillo)\n  * robust regression (with eigen or armadillo)\n\nFuture work:\n\n* skew?, kurt?\n* boxplot\n* weighted quantiles (C++ version of R code)\n* compute standard errors / bootstrap standard errors?\n\n* infrastructure for passing multiple z\n  * 2d: cor, lm\n  \n\n# Smooth\n\nKernel smoothing plus binned summary leads to many common statistics: density =~ bin + smooth, loess =~ mean + smooth, rqss =~ quantile + smooth\n\n* weights\n* smoothing type\n  * constant\n  * linear\n  * robust linear (lowess)\n  * (linear poisson?)\n* leave-one-out cross-validation\n* optimisations\n  * convert to integer grid & use pre-computed grid of kernel values\n  * hash in smooth_nd_1 and compute more efficiently along 1d\n* deal with missing values\n\n* smooth needs to create complete grid when factor = TRUE\n\nThink about input data structure: sparse grid, represented as a coordinate list. Binned grid class = integer vector + width/origin/nbins (0 = NA). Most transformations break the grid, in which case all you case preserve is min, max and number of bins. All smoothing methods adapted to work in terms of these integers. Need to extract out bin/unbin into own class (initialised with std::vector of bin sizes)\n\nPossible that more performance is available by switching to a sparse tensor library.\n\n# Visualise\n\n* Product plots\n* Standard errors + cut offs\n\n* Peel: implement nd version using depth"
  },
  {
    "path": "src/.gitignore",
    "content": "*.o\n*.so"
  },
  {
    "path": "src/BigVis.cpp",
    "content": "#include <bigvis.h>\nusing namespace Rcpp;\n\nRCPP_MODULE(BigVis) {\n  class_<BinnedVectorReference>(\"BinnedVector\")\n    .constructor<NumericVector, String, double, double>()\n    .const_method(\"bin_i\", &BinnedVectorReference::bin_i)\n    .const_method(\"bin\", &BinnedVectorReference::bin)\n    .const_method(\"unbin\", &BinnedVectorReference::unbin)\n    .const_method(\"nbins\", &BinnedVectorReference::nbins)\n    .const_method(\"size\", &BinnedVectorReference::size)\n    .const_method(\"origin\", &BinnedVectorReference::origin)\n    .const_method(\"width\", &BinnedVectorReference::width)\n    .const_method(\"name\", &BinnedVectorReference::name)\n  ;\n  class_<BinnedVectors>(\"BinnedVectors\")\n    .constructor<List>()\n    .method(\"add_vector\", &BinnedVectors::add_vector)\n    .field(\"bins\", &BinnedVectors::bins_)\n    .const_method(\"bin_i\", &BinnedVectors::bin_i)\n    .const_method(\"bin\", &BinnedVectors::bin)\n    .const_method(\"unbin\", &BinnedVectors::unbin)\n    .const_method(\"nbins\", &BinnedVectors::nbins)\n  ;\n}\n"
  },
  {
    "path": "src/BinnedVector.cpp",
    "content": "#include <bigvis.h>\nusing namespace Rcpp;\n\nNumericVector frange(const NumericVector& x, const bool finite = true);\n\nint BinnedVector::nbins() const {\n  double max = frange(x_)[1];\n  return bin(max) + 1; \n  // +1 bin for missing values\n}\n"
  },
  {
    "path": "src/BinnedVectors.cpp",
    "content": "#include <bigvis.h>\nusing namespace Rcpp;\n\nint BinnedVectors::bin_i(int i) const {\n  int bin = 0;\n  int ngroups = groups_.size();\n\n  for (int j = 0; j < ngroups; ++j) {\n    bin += groups_[j].bin_i(i) * bins_[(ngroups - 1) - j];\n  }\n\n  return bin;\n}\n\nint BinnedVectors::bin(std::vector<double> x) const {\n  int ngroups = groups_.size();\n  if (x.size() != ngroups) stop(\"x must be same length as groups\");\n  int bin = 0;\n\n  for (int j = 0; j < ngroups; ++j) {\n    int bin_j = groups_[j].bin(x[j]);\n    bin += bin_j * bins_[(ngroups - 1) - j];\n    // Rcout << \"group: \" << j << \" bin: \" << bin << \" bin_j: \" << bin_j << \"\\n\";\n  }\n\n  return bin;\n}\n\nstd::vector<double> BinnedVectors::unbin(int bin) const {\n  int ngroups = groups_.size();\n  std::vector<double> bins(ngroups);\n\n  // if ngroups = 3, then: \n  // bin = groups[0].bin(x[0]) * bins[2] (biggest) +\n  //       groups[1].bin(x[1]) * bins[1] +\n  //       groups[2].bin(x[2]) * bins[0] (smallest)\n  // peel off largest first\n  //   bin_j = bin %/% bin[2]\n  //   groups[0].unbin(bin_j)\n  // and that goes in last output position\n\n  for (int i = 0, j = ngroups - 1; i < ngroups - 1; ++i, --j) {\n    int bin_j = bin % bins_[j];\n    // Rcout << \"group: \" << j << \" bin: \" << bin << \" bin_j: \" << bin_j << \"\\n\";\n    bins[j] = groups_[j].unbin(bin_j);\n\n    bin = (bin - bin_j) / bins_[j];\n  }\n  // Rcout << \"group: \" << 0 << \" bin: \" << bin << \" bin_j: \" << bin << \"\\n\";\n  // Special case for last group because x %% 1 = 0\n  bins[0] = groups_[0].unbin(bin);\n\n  return bins;\n}\n\n"
  },
  {
    "path": "src/Makevars",
    "content": "PKG_CPPFLAGS=-I../inst/include\n"
  },
  {
    "path": "src/RcppExports.cpp",
    "content": "// This file was generated by Rcpp::compileAttributes\n// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393\n\n#include \"../inst/include/bigvis.h\"\n#include <Rcpp.h>\n\nusing namespace Rcpp;\n\n// condense_count\nList condense_count(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);\nRcppExport SEXP bigvis_condense_count(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);\n    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);\n    __result = Rcpp::wrap(condense_count(x, z, weight, drop));\n    return __result;\nEND_RCPP\n}\n// condense_sum\nList condense_sum(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);\nRcppExport SEXP bigvis_condense_sum(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);\n    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);\n    __result = Rcpp::wrap(condense_sum(x, z, weight, drop));\n    return __result;\nEND_RCPP\n}\n// condense_mean\nList condense_mean(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);\nRcppExport SEXP bigvis_condense_mean(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);\n    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);\n    __result = Rcpp::wrap(condense_mean(x, z, weight, drop));\n    return __result;\nEND_RCPP\n}\n// condense_sd\nList condense_sd(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);\nRcppExport SEXP bigvis_condense_sd(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);\n    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);\n    __result = Rcpp::wrap(condense_sd(x, z, weight, drop));\n    return __result;\nEND_RCPP\n}\n// condense_median\nList condense_median(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);\nRcppExport SEXP bigvis_condense_median(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const List& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);\n    Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);\n    __result = Rcpp::wrap(condense_median(x, z, weight, drop));\n    return __result;\nEND_RCPP\n}\n// double_diff_sum\nstd::vector<int> double_diff_sum(IntegerVector bin, IntegerVector count);\nRcppExport SEXP bigvis_double_diff_sum(SEXP binSEXP, SEXP countSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< IntegerVector >::type bin(binSEXP);\n    Rcpp::traits::input_parameter< IntegerVector >::type count(countSEXP);\n    __result = Rcpp::wrap(double_diff_sum(bin, count));\n    return __result;\nEND_RCPP\n}\n// frange\nNumericVector frange(const NumericVector& x, const bool finite);\nRcppExport SEXP bigvis_frange(SEXP xSEXP, SEXP finiteSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const bool >::type finite(finiteSEXP);\n    __result = Rcpp::wrap(frange(x, finite));\n    return __result;\nEND_RCPP\n}\n// group_fixed\nIntegerVector group_fixed(const NumericVector& x, double width, double origin);\nRcppExport SEXP bigvis_group_fixed(SEXP xSEXP, SEXP widthSEXP, SEXP originSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< double >::type width(widthSEXP);\n    Rcpp::traits::input_parameter< double >::type origin(originSEXP);\n    __result = Rcpp::wrap(group_fixed(x, width, origin));\n    return __result;\nEND_RCPP\n}\n// group_rect\nIntegerVector group_rect(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin);\nRcppExport SEXP bigvis_group_rect(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP);\n    Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP);\n    Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP);\n    Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP);\n    Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP);\n    __result = Rcpp::wrap(group_rect(x, y, x_width, y_width, x_origin, y_origin));\n    return __result;\nEND_RCPP\n}\n// group_hex\nIntegerVector group_hex(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin, double x_max);\nRcppExport SEXP bigvis_group_hex(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP, SEXP x_maxSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP);\n    Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP);\n    Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP);\n    Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP);\n    Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP);\n    Rcpp::traits::input_parameter< double >::type x_max(x_maxSEXP);\n    __result = Rcpp::wrap(group_hex(x, y, x_width, y_width, x_origin, y_origin, x_max));\n    return __result;\nEND_RCPP\n}\n// lowerBound\nIntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks);\nRcppExport SEXP bigvis_lowerBound(SEXP xSEXP, SEXP breaksSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type breaks(breaksSEXP);\n    __result = Rcpp::wrap(lowerBound(x, breaks));\n    return __result;\nEND_RCPP\n}\n// smooth_nd_1\nNumericVector smooth_nd_1(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const int var, const double h, const std::string type);\nRcppExport SEXP bigvis_smooth_nd_1(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP varSEXP, SEXP hSEXP, SEXP typeSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP);\n    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP);\n    Rcpp::traits::input_parameter< const int >::type var(varSEXP);\n    Rcpp::traits::input_parameter< const double >::type h(hSEXP);\n    Rcpp::traits::input_parameter< const std::string >::type type(typeSEXP);\n    __result = Rcpp::wrap(smooth_nd_1(grid_in, z_in, w_in_, grid_out, var, h, type));\n    return __result;\nEND_RCPP\n}\n// smooth_nd\nNumericVector smooth_nd(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const NumericVector h);\nRcppExport SEXP bigvis_smooth_nd(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP hSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP);\n    Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP);\n    Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP);\n    Rcpp::traits::input_parameter< const NumericVector >::type h(hSEXP);\n    __result = Rcpp::wrap(smooth_nd(grid_in, z_in, w_in_, grid_out, h));\n    return __result;\nEND_RCPP\n}\n// bisquare\ndouble bisquare(double u, double b);\nRcppExport SEXP bigvis_bisquare(SEXP uSEXP, SEXP bSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< double >::type u(uSEXP);\n    Rcpp::traits::input_parameter< double >::type b(bSEXP);\n    __result = Rcpp::wrap(bisquare(u, b));\n    return __result;\nEND_RCPP\n}\n// regress\nNumericVector regress(const std::vector<double>& x, const std::vector<double>& y, const std::vector<double>& w);\nRcppExport SEXP bigvis_regress(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const std::vector<double>& >::type y(ySEXP);\n    Rcpp::traits::input_parameter< const std::vector<double>& >::type w(wSEXP);\n    __result = Rcpp::wrap(regress(x, y, w));\n    return __result;\nEND_RCPP\n}\n// median\ndouble median(const std::vector<double>& x);\nRcppExport SEXP bigvis_median(SEXP xSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);\n    __result = Rcpp::wrap(median(x));\n    return __result;\nEND_RCPP\n}\n// regress_robust\nNumericVector regress_robust(const std::vector<double>& x, const std::vector<double>& y, const std::vector<double>& w, int iterations);\nRcppExport SEXP bigvis_regress_robust(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP, SEXP iterationsSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);\n    Rcpp::traits::input_parameter< const std::vector<double>& >::type y(ySEXP);\n    Rcpp::traits::input_parameter< const std::vector<double>& >::type w(wSEXP);\n    Rcpp::traits::input_parameter< int >::type iterations(iterationsSEXP);\n    __result = Rcpp::wrap(regress_robust(x, y, w, iterations));\n    return __result;\nEND_RCPP\n}\n// compute_moments\nNumericVector compute_moments(const NumericVector& x);\nRcppExport SEXP bigvis_compute_moments(SEXP xSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    __result = Rcpp::wrap(compute_moments(x));\n    return __result;\nEND_RCPP\n}\n// compute_sum\nNumericVector compute_sum(const NumericVector& x);\nRcppExport SEXP bigvis_compute_sum(SEXP xSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    __result = Rcpp::wrap(compute_sum(x));\n    return __result;\nEND_RCPP\n}\n// compute_median\nNumericVector compute_median(const NumericVector& x);\nRcppExport SEXP bigvis_compute_median(SEXP xSEXP) {\nBEGIN_RCPP\n    Rcpp::RObject __result;\n    Rcpp::RNGScope __rngScope;\n    Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);\n    __result = Rcpp::wrap(compute_median(x));\n    return __result;\nEND_RCPP\n}\n"
  },
  {
    "path": "src/Summary2d.cpp",
    "content": "#include <Rcpp.h>\n#include \"Summary2d.h\"\nusing namespace Rcpp;\n"
  },
  {
    "path": "src/Summary2d.h",
    "content": "#include <Rcpp.h>\n#include \"stats.h\"\nusing namespace Rcpp;\n\nclass Summary2d {\n  public:\n    virtual void push(double x, double z, double w) =0;\n    virtual double compute() =0;\n    virtual ~Summary2d() {}\n};\n\nclass Summary2dMean: public Summary2d {\n    double w_, z_;\n\n  public:\n    Summary2dMean() : w_(0), z_(0) {}\n\n    void push(double x, double z, double w) {\n      // Rcout << \"  x: \" << x << \" z: \" << z << \" w: \" << w << \"\\n\";\n      w_ += w;\n      z_ += z * w;\n    }\n\n    double compute() {\n      // Rcout << \"Result: \" << z_ / w_ << \"\\n\";\n      return z_ / w_;\n    }\n};\n\nclass Summary2dRegression: public Summary2d {\n    std::vector<double> x_, z_, w_;\n\n  public:\n    Summary2dRegression() {}\n\n    void push(double x, double z, double w) {\n      x_.push_back(x);\n      z_.push_back(z);\n      w_.push_back(w);\n    }\n\n    double compute() {\n      return simpleLinearRegression(x_, z_, w_).alpha;\n    }\n};\n\nclass Summary2dRobustRegression: public Summary2d {\n    int iterations_;\n    std::vector<double> x_, z_, w_;\n\n  public:\n    Summary2dRobustRegression() : iterations_(3) {}\n    Summary2dRobustRegression(int iterations) : iterations_(iterations) {}\n\n    void push(double x, double z, double w) {\n      x_.push_back(x);\n      z_.push_back(z);\n      w_.push_back(w);\n    }\n\n    double compute() {\n      return simpleRobustRegression(x_, z_, w_, iterations_).alpha;\n    }\n};\n"
  },
  {
    "path": "src/condense-gen.r",
    "content": "library(whisker)\n\n# Generate template specialisations for groupwise - these are the functions\n# that are called from R.\n\nsummaries <- c(\n  count = \"Sum(0)\",\n  sum = \"Sum(1)\",\n  mean = \"Moments(1)\",\n  sd = \"Moments(2)\",\n  median = \"Median()\"\n)\n\ntemplate <- \"\n// [[Rcpp::export]]\nList condense_{{name}}(const List& x, const NumericVector& z,\n                       const NumericVector& weight, bool drop = false) {\n  if (drop) {\n    return sparse_condense(BinnedVectors(x), z, weight, Summary{{summary}});\n  } else {\n    return condense(BinnedVectors(x), z, weight, Summary{{summary}});\n  }\n}\n\"\n\ncpp_fun <- function(summary) {\n  whisker.render(template, list(\n    name = tolower(summary),\n    summary = summaries[[summary]]\n  ))\n}\n\n\ngroupwise <- readLines(\"condense.cpp\")\nsplit <- which(grepl(\"// -{40,}\", groupwise))[1]\noriginal <- groupwise[1:split]\n\nwriteLines(original, \"condense.cpp\")\n\ncat(\"// Autogenerated by condense-gen.r\\n\", file = \"condense.cpp\", append = TRUE)\nfuns <- unlist(lapply(names(summaries), cpp_fun))\ncat(funs, file = \"condense.cpp\", append = TRUE, sep = \"\")\n"
  },
  {
    "path": "src/condense.cpp",
    "content": "#include <Rcpp.h>\n#include <bigvis.h>\n#include \"group.h\"\n#include \"summary.h\"\n\ntemplate<typename Stat>\nList condense(const BinnedVectors& group, const NumericVector& z, \n                        const NumericVector& weight, const Stat& stat) {\n  int n_obs = group.size();\n  int n_bins = group.nbins();\n\n  const NumericVector& weight_ = (weight.size() > 0) ? weight : \n    rep(NumericVector::create(1), n_obs);\n  const NumericVector& z_ = (z.size() > 0) ? z : \n    rep(NumericVector::create(1), n_obs);\n\n  // Push values into stats\n  std::vector<Stat> stats(n_bins, stat);\n  for(int i = 0; i < n_obs; ++i) {\n    int bin = group.bin_i(i);\n    // Rcout << \"i: \" << i << \" bin: \" << bin << \"\\n\"; \n    stats.at(bin).push(z_[i], weight_[i]);      \n  }\n\n  // Compute values from stats and determine bins\n  int n_stats = stat.size();\n  int n_groups = group.ngroups();\n  NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups);\n\n  for (int i = 0; i < n_bins; ++i) {\n    for (int j = 0; j < n_stats; ++j) {\n      out(i, j) = stats[i].compute(j);\n    }\n\n    std::vector<double> bins = group.unbin(i);\n    for (int j = 0; j < n_groups; ++j) {\n      bin(i, j) = bins[j];\n    }\n  }\n\n  // Name \n  CharacterVector out_cols(n_stats), bin_cols(n_groups);\n  for (int j = 0; j < n_stats; ++j) {\n    out_cols[j] = stat.name(j);\n  }\n  for (int j = 0; j < n_groups; ++j) {\n    bin_cols[j] = group.name(j);\n  }\n  out.attr(\"dimnames\") = List::create(CharacterVector::create(), out_cols);\n  bin.attr(\"dimnames\") = List::create(CharacterVector::create(), bin_cols);\n\n  return List::create(bin, out);\n}\n\ntemplate<typename Stat>\nList sparse_condense(const BinnedVectors& group, const NumericVector& z, \n                        const NumericVector& weight, const Stat& stat) {\n  int n_obs = group.size();\n\n  const NumericVector& weight_ = (weight.size() > 0) ? weight : \n    rep(NumericVector::create(1), n_obs);\n  const NumericVector& z_ = (z.size() > 0) ? z : \n    rep(NumericVector::create(1), n_obs);\n\n  // Push values into stats\n  typename std::map<int, Stat> stats;\n  for(int i = 0; i < n_obs; ++i) {\n    int bin = group.bin_i(i);\n    \n    typename std::map<int, Stat>::iterator loc = stats.find(bin);\n    if (loc == stats.end()) {\n      Stat new_stat(stat);\n      new_stat.push(z_[i], weight_[i]);\n      stats.insert(std::pair<int, Stat>(bin, new_stat));\n    } else {\n      (loc->second).push(z_[i], weight_[i]);\n    }\n  }\n\n  // Compute values from stats and determine bins\n  int n_bins = stats.size();\n  int n_stats = stat.size();\n  int n_groups = group.ngroups();\n  NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups);\n\n  typename std::map<int, Stat>::iterator stats_it = stats.begin(),\n    stats_end = stats.end();\n\n\n  for (int i = 0; stats_it != stats_end; ++stats_it, ++i) {\n    for (int j = 0; j < n_stats; ++j) {\n      out(i, j) = (stats_it->second).compute(j);\n    }\n\n    std::vector<double> bins = group.unbin(stats_it->first);\n    for (int j = 0; j < n_groups; ++j) {\n      bin(i, j) = bins[j];\n    }\n  }\n\n  // Name \n  CharacterVector out_cols(n_stats), bin_cols(n_groups);\n  for (int j = 0; j < n_stats; ++j) {\n    out_cols[j] = stat.name(j);\n  }\n  for (int j = 0; j < n_groups; ++j) {\n    bin_cols[j] = group.name(j);\n  }\n  out.attr(\"dimnames\") = List::create(CharacterVector::create(), out_cols);\n  bin.attr(\"dimnames\") = List::create(CharacterVector::create(), bin_cols);\n\n  return List::create(bin, out);\n}\n\n// -----------------------------------------------------------------------------\n// Autogenerated by condense-gen.r\n\n// [[Rcpp::export]]\nList condense_count(const List& x, const NumericVector& z,\n                       const NumericVector& weight, bool drop = false) {\n  if (drop) {\n    return sparse_condense(BinnedVectors(x), z, weight, SummarySum(0));\n  } else {\n    return condense(BinnedVectors(x), z, weight, SummarySum(0));\n  }\n}\n\n// [[Rcpp::export]]\nList condense_sum(const List& x, const NumericVector& z,\n                       const NumericVector& weight, bool drop = false) {\n  if (drop) {\n    return sparse_condense(BinnedVectors(x), z, weight, SummarySum(1));\n  } else {\n    return condense(BinnedVectors(x), z, weight, SummarySum(1));\n  }\n}\n\n// [[Rcpp::export]]\nList condense_mean(const List& x, const NumericVector& z,\n                       const NumericVector& weight, bool drop = false) {\n  if (drop) {\n    return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(1));\n  } else {\n    return condense(BinnedVectors(x), z, weight, SummaryMoments(1));\n  }\n}\n\n// [[Rcpp::export]]\nList condense_sd(const List& x, const NumericVector& z,\n                       const NumericVector& weight, bool drop = false) {\n  if (drop) {\n    return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(2));\n  } else {\n    return condense(BinnedVectors(x), z, weight, SummaryMoments(2));\n  }\n}\n\n// [[Rcpp::export]]\nList condense_median(const List& x, const NumericVector& z,\n                       const NumericVector& weight, bool drop = false) {\n  if (drop) {\n    return sparse_condense(BinnedVectors(x), z, weight, SummaryMedian());\n  } else {\n    return condense(BinnedVectors(x), z, weight, SummaryMedian());\n  }\n}\n"
  },
  {
    "path": "src/double-diff-sum.cpp",
    "content": "#include <Rcpp.h>\n#include <iostream>\n#include <algorithm>\nusing namespace Rcpp;\n\n// Efficiently compute \\sum \\sum abs(x_i - x_j) for binned data\n// \n// It's effectively equivalent to this R code on the ungrouped observations\n// bin <- trunc(x / bw)\n// diffs <- abs(outer(bin, bin, \"-\"))\n// tabulate(diffs + 1)\n// \n// [[Rcpp::export]]\nstd::vector<int> double_diff_sum(IntegerVector bin, IntegerVector count) {\n  int n = bin.size();\n  std::vector<int> out;\n\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < n; j++) {\n      int pos = abs(bin[i] - bin[j]);\n\n      if (pos + 1 > out.size()) {\n        out.resize(pos + 1);\n      }\n      out[pos] += count[i] * count[j];\n    }\n  }\n\n  return out;\n}\n"
  },
  {
    "path": "src/frange.cpp",
    "content": "#include <Rcpp.h>\nusing namespace Rcpp;\n\n//' Efficient implementation of range.\n//'\n//' This is an efficient C++ implementation of range for numeric vectors:\n//' it avoids S3 dispatch, and computes both min and max in a single pass\n//' through the input.\n//'\n//' If \\code{x} has a \\code{range} attribute (e.g. it's a \\code{\\link{ranged}}\n//' object), it will be used instead of computing the range from scratch.\n//' \n//' @param x a numeric vector, or a \\code{\\link{ranged}} object\n//' @param finite If \\code{TRUE} ignores missing values and infinities. Note\n//'   that if the vector is empty, or only contains missing values, \n//'   \\code{frange} will return \\code{c(Inf, -Inf)} because those are the\n//'   identity values for \\code{\\link{min}} and \\code{\\link{max}} respectively.\n//' @export\n//' @examples\n//' x <- runif(1e6)\n//' system.time(range(x))\n//' system.time(frange(x))\n//'\n//' rx <- ranged(x)\n//' system.time(frange(rx))\n// [[Rcpp::export]]\nNumericVector frange(const NumericVector& x, const bool finite = true) {\n  RObject cache = x.attr(\"range\");\n  if (cache.sexp_type() == REALSXP) return as<NumericVector>(cache);\n\n  NumericVector out(2);\n  out[0] = INFINITY;\n  out[1] = -INFINITY;\n\n  int n = x.length();\n  for(int i = 0; i < n; ++i) {\n    if (!finite && R_IsNA(x[i])) {\n      out[0] = NA_REAL;\n      out[1] = NA_REAL;\n      return out;\n    }\n\n    // If finite, skip infinite values\n    if (finite && (x[i] == INFINITY || x[i] == -INFINITY)) continue;\n\n    if (x[i] < out[0]) out[0] = x[i];\n    if (x[i] > out[1]) out[1] = x[i];\n  }\n\n  return out;\n}\n"
  },
  {
    "path": "src/group-hex.h",
    "content": "/*\n * Translated from\n * https://github.com/d3/d3-plugins/blob/master/hexbin/hexbin.js\n *\n * Copyright (C) 2013 Hadley Wickham\n * Copyright (C) 2012 Mike Bostock (mbostock at gmail dot com)\n */\nclass GroupHex {\n    const NumericVector x_;\n    const NumericVector y_;\n    double x_width_;\n    double x_origin_;\n    double y_width_;\n    double y_origin_;\n    double x_bins;\n\n  public:\n    GroupHex (const NumericVector& x, const NumericVector& y, \n                double x_width, double y_width, \n                double x_origin, double y_origin, \n                double x_max)\n       : x_(x), y_(y), x_width_(x_width), x_origin_(x_origin), \n          y_width_(y_width), y_origin_(y_origin) {\n      if (x.size() != y.size()) stop(\"x & y are not the same size\");\n      x_bins = x_max / x_width_ + 1;\n    }\n\n    int bin_i(int i) const {\n      double py = ISNAN(y_[i]) ? 0 : (y_[i] - y_origin_) / y_width_ + 1;\n      int pj = py;\n      double py1 = py - pj;\n      \n      double px = ISNAN(x_[i]) ? 0 : (x_[i] - x_origin_) / x_width_ + 1 - \n        (pj % 2 ? 0.5 : 0);\n      int pi = px;\n\n       if (fabs(py1) * 3 > 1) {\n        double px1 = px - pi,\n               pi2 = pi + (px < pi ? -1 : 1) / 2,\n               pj2 = pj + (py < pj ? -1 : 1),\n               px2 = px - pi2,\n               py2 = py - pj2;\n        if (px1 * px1 + py1 * py1 > px2 * px2 + py2 * py2) {\n          pi = pi2 + (pj % 2 ? 1 : -1) / 2;\n          pj = pj2;\n        }\n      }\n\n      return pj * x_bins + pj;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n};\n"
  },
  {
    "path": "src/group.cpp",
    "content": "#include <Rcpp.h>\n#include \"group.h\"\n#include \"group-hex.h\"\nusing namespace Rcpp;\n\ntemplate<typename Group>\nIntegerVector group_out(const Group& group) {\n  int n = group.size();\n  IntegerVector out(n);\n  for(int i = 0; i < n; ++i) {\n    out[i] = group.bin_i(i);\n  }\n\n  return out;\n}\n\nRCPP_MODULE(Group) {\n  class_<GroupFixed>(\"GroupFixed\")\n  .constructor<NumericVector, double, double>()\n  .const_method(\"bin_i\", &GroupFixed::bin_i)\n  .const_method(\"bin\", &GroupFixed::bin)\n  .const_method(\"unbin\", &GroupFixed::unbin)\n\n  .const_method(\"size\", &GroupFixed::size)\n  .const_method(\"nbins\", &GroupFixed::nbins)\n\n  .const_method(\"origin\", &GroupFixed::origin)\n  .const_method(\"width\", &GroupFixed::width)\n  ;\n}\nRCPP_EXPOSED_AS(GroupFixed)\nRCPP_EXPOSED_WRAP(GroupFixed)\n\n\n// [[Rcpp::export]]\nIntegerVector group_fixed(const NumericVector& x, double width, double origin = 0) {\n  return group_out(GroupFixed(x, width, origin));\n}\n\n// [[Rcpp::export]]\nIntegerVector group_rect(const NumericVector& x, const NumericVector& y, \n                         double x_width, double y_width,\n                         double x_origin, double y_origin) {\n  return group_out(Group2d<GroupFixed>(\n    GroupFixed(x, x_width, x_origin), \n    GroupFixed(y, y_width, y_origin)));\n}\n\n\n// [[Rcpp::export]]\nIntegerVector group_hex(const NumericVector& x, const NumericVector& y, \n                         double x_width, double y_width,\n                         double x_origin, double y_origin,\n                         double x_max) {\n  return group_out(GroupHex(x, y, x_width, y_width, x_origin, y_origin, x_max));\n}\n"
  },
  {
    "path": "src/group.h",
    "content": "#include <Rcpp.h>\nusing namespace Rcpp;\n\nNumericVector frange(const NumericVector& x, const bool finite = true);\n\nclass GroupFixed {\n    const NumericVector x_;\n    double width_;\n    double origin_;\n  public:\n    GroupFixed (NumericVector x, double width, double origin = 0)\n       : x_(x), width_(width), origin_(origin) {\n    }\n\n    int bin_i(int i) const {\n      if (ISNAN(x_[i]) || x_[i] == INFINITY || x_[i] == -INFINITY) return 0;\n      if (x_[i] < origin_) return 0;\n      \n      return bin(x_[i]);\n    }\n\n    int bin(double x) const {\n      return (x - origin_) / width_ + 1;\n    }\n\n    double unbin(int bin) const {\n      if (bin == 0) return(NAN);\n      return (bin - 1) * width_ + origin_;\n    }\n\n    double origin() const {\n      return origin_;\n    }\n    double width() const {\n      return width_;\n    }\n\n\n    int size() const {\n      return x_.size();\n    }\n\n    int nbins() const {\n      double max = frange(x_)(1);\n      double dest = floor((max - origin_) / width_) * width_ + origin_;\n\n      // + 1 for missing values\n      // + 1 if highest value is on right-open boundary\n      return (dest - origin_) / width_ + 1 + ((max >= dest) ? 1 : 0);\n    }\n\n};\n\ntemplate<typename Group>\nclass Group2d {\n    const Group& x_;\n    const Group& y_;\n    int x_bins_;\n    int y_bins_;\n\n  public:\n    Group2d (const Group& x, const Group& y) : x_(x), y_(y) {\n      if (x_.size() != y_.size()) {\n        stop(\"x and y are not equal sizes\");\n      }\n      x_bins_ = x_.nbins();\n      y_bins_ = y_.nbins();\n\n      // Rcout << \"x_bins: \" << x_bins_ << \" y_bins: \" << y_bins_ << \"\\n\";\n    }\n\n    int bin_i(int i) const {\n      int x_bin = x_.bin_i(i), y_bin = y_.bin_i(i);\n      int bin = y_bin * x_bins_ + x_bin;\n      // Rcout << i << \": (\" << x_bin << \",\" << y_bin << \") -> \" << bin << \"\\n\";\n      return bin;\n    }\n\n    int size() const {\n      return x_.size();\n    }\n\n    int nbins() const {\n      return x_bins_ * y_bins_;\n    }\n};\n\n\ntemplate<typename Group>\nclass GroupNd {\n    const std::vector<Group> groups_;\n    const int ngroups_;\n\n    int size_;\n    std::vector<int> bins_;\n\n  public:\n    GroupNd (const std::vector<Group> groups) \n        : groups_(groups), ngroups_(groups.size()) {\n      if (groups.size() == 0) {\n        stop(\"Empty groups vector passed to GroupCompound\");\n      }\n\n      size_ = groups[0].size();\n\n      bins_[0] = 1;\n      for (int i = 0; i < ngroups_ - 1; ++i) {\n        if (groups_[i].size() != size_) stop(\"Groups not equal sizes\");\n\n        bins_[i + 1] = bins_[i] * groups_[i].nbins(); \n      }\n    }\n\n    int bin_i(int i) const {\n      int bin = 0;\n\n      for (int j = 0; j < ngroups_; ++j) {\n        bin += groups_[j].bin(i) * bins_[j];\n      }\n\n      return bin;\n    }\n\n    // int nbins() const {\n    //   return bins_[ngroups_ - 1];\n    // }\n\n    int ngroups() const {\n      return groups_.size();\n    }\n\n    int size() const {\n      return size_;\n    }\n\n    std::vector<double> unbin(int bin) const {\n      std::vector<double> bins(ngroups_);\n\n      for (int j = 0; j < ngroups_; ++j) {\n        int bin_j = bin % bins_[j];\n        bins[j] = groups_[j].unbin(bin_j);\n\n        bin = bin - bin * bins_[j];\n      }\n\n      return bins;\n    }\n\n};\n"
  },
  {
    "path": "src/lowerBound.cpp",
    "content": "#include <algorithm>\n#include <Rcpp.h>\nusing namespace Rcpp;\n\n// Quick and dirty implementation of lowerBound, the complement to R's\n// findInterval\n// [[Rcpp::export]]\nIntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks) {\n  int n = x.size();\n  IntegerVector out(n);\n\n  for (int i = 0; i < n; i++) {\n    NumericVector::const_iterator it =\n      std::lower_bound(breaks.begin(), breaks.end(), x[i]);\n    if (it == breaks.end()) --it;\n    out[i] = it - breaks.begin() + 1;\n  }\n  return out;\n}\n"
  },
  {
    "path": "src/smooth-nd.cpp",
    "content": "#include <algorithm>\n#include <Rcpp.h>\n#include \"group.h\"\n#include \"Summary2d.h\"\n#include <boost/shared_ptr.hpp>\nusing namespace Rcpp;\n\nboost::shared_ptr<Summary2d> createSummary(std::string type) {\n  if (type == \"mean\") {\n    return boost::shared_ptr<Summary2d>(new Summary2dMean());\n  } else if (type == \"regression\") {\n    return boost::shared_ptr<Summary2d>(new Summary2dRegression());\n  } else if (type == \"robust_regression\") {\n    return boost::shared_ptr<Summary2d>(new Summary2dRobustRegression());\n  } else {\n    stop(\"Unknown type\");\n    // Quiet warning\n    return boost::shared_ptr<Summary2d>(new Summary2dMean());\n  }\n}\n\ndouble tricube(double x) {\n  if (NumericVector::is_na(x)) return 0;\n  x = fabs(x);\n  if (x > 1) return 0;\n\n  double y = 1 - x * x * x;\n  return y * y * y;\n}\n\nbool both_na(double x, double y) {\n  return (NumericVector::is_na(x) && NumericVector::is_na(y));\n}\n\n// [[Rcpp::export]]\nNumericVector smooth_nd_1(const NumericMatrix& grid_in, \n                          const NumericVector& z_in, \n                          const NumericVector& w_in_,\n                          const NumericMatrix& grid_out, \n                          const int var, const double h,\n                          const std::string type = \"mean\") {\n\n  if (var < 0) stop(\"var < 0\");\n  if (var >= grid_in.ncol()) stop(\"var too large\");\n  if (h <= 0) stop(\"h <= 0\");\n  if (grid_in.ncol() != grid_out.ncol()) stop(\"Incompatible grid sizes\");\n\n  int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol();\n  NumericVector w_in = (w_in_.size() > 0) ? w_in_ : \n    rep(NumericVector::create(1), n_in);\n  NumericVector z_out(n_out), w_out(n_out);\n\n  // Will be much more efficient to slice up by input dimension:\n  // and most efficient way of doing that will be to bin with / bw\n  // My data structure: sparse grids\n  // \n  // And once we're smoothing in one direction, with guaranteed e2venly spaced\n  // grid can avoid many kernel evaluations and can also compute more\n  // efficient running sum\n\n  for(int j = 0; j < n_out; ++j) {\n    boost::shared_ptr<Summary2d> summary = createSummary(type);\n    for (int i = 0; i < n_in; ++i) {\n      // Check that all variables (apart from var) are equal\n      bool equiv = true;\n      for (int k = 0; k < d; ++k) {\n        if (k == var) continue;\n\n        double in = grid_in(i, k), out = grid_out(j, k);\n        if (in != out && !both_na(in, out)) {\n          equiv = false;\n          break;\n        }\n      };\n      if (!equiv) continue;\n\n      double in = grid_in(i, var), out = grid_out(j, var);\n      double dist = both_na(in, out) ? 0 : in - out;\n      double w = tricube(dist / h) * w_in[i];\n      if (w == 0) continue;\n\n      summary->push(dist, z_in[i], w);\n    }\n    z_out[j] = summary->compute();\n  }\n\n  return z_out;\n}\n\n// [[Rcpp::export]]\nNumericVector smooth_nd(const NumericMatrix& grid_in, \n                        const NumericVector& z_in, \n                        const NumericVector& w_in_,\n                        const NumericMatrix& grid_out, \n                        const NumericVector h) {\n\n  if (grid_in.nrow() != z_in.size()) stop(\"Incompatible input lengths\");\n  if (grid_in.ncol() != grid_out.ncol()) stop(\"Incompatible grid sizes\");\n  if (h.size() != grid_in.ncol()) stop(\"Incorrect h length\");\n\n  int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol();\n  NumericVector w_in = (w_in_.size() > 0) ? w_in_ : \n    rep(NumericVector::create(1), n_in);\n  NumericVector z_out(n_out), w_out(n_out);\n\n  for (int i = 0; i < n_in; ++i) {\n    for(int j = 0; j < n_out; ++j) {\n      double w = 1;\n      for (int k = 0; k < d; ++k) {\n        double dist = (grid_in(i, k) - grid_out(j, k)) / h[k];\n        w *= tricube(dist);\n      }\n      w *= w_in[i];\n\n      w_out[j] += w;\n      z_out[j] += z_in[i] * w;\n    }\n  }\n\n  for(int j = 0; j < n_out; ++j) {\n    z_out[j] /= w_out[j];\n  }\n\n  return z_out;\n}\n"
  },
  {
    "path": "src/stats.cpp",
    "content": "#include <Rcpp.h>\nusing namespace Rcpp;\n\nstruct Regression {\n  double alpha, beta;\n};\n\n// [[Rcpp::export]]\ndouble bisquare(double u, double b) {\n  u = fabs(u);\n  return (u < b) ? pow(1 - pow(u / b, 2), 2) : 0;\n}\n\nRegression simpleLinearRegression(const std::vector<double>& x, \n                                  const std::vector<double>& y,\n                                  const std::vector<double>& w) {\n  int n = x.size();\n\n  double x_wsum = 0, y_wsum = 0, w_sum = 0;\n  for (int i = 0; i < n; ++i) {\n    x_wsum += x[i] * w[i];\n    y_wsum += y[i] * w[i];\n    w_sum += w[i];\n  };\n  double x_mean = x_wsum / w_sum, y_mean = y_wsum / w_sum;\n\n  double var_xy = 0, var_x = 0;\n  for (int i = 0; i < n; ++i) {\n    var_xy += w[i] * (x[i] - x_mean) * (y[i] - y_mean);\n    var_x += w[i] * pow((x[i] - x_mean), 2);\n  }\n\n  Regression results;\n  results.beta = (var_xy / var_x);\n  results.alpha = y_mean - results.beta * x_mean;\n  return results;\n}\n\n// [[Rcpp::export]]\nNumericVector regress(const std::vector<double>& x, \n                      const std::vector<double>& y,\n                      const std::vector<double>& w) {\n  Regression regression = simpleLinearRegression(x, y, w);\n  return NumericVector::create(regression.alpha, regression.beta);\n}\n\ndouble median(std::vector<double>* x) {\n  if (x->empty()) return NAN;\n\n  int size = x->size();\n  std::vector<double>::iterator upper = x->begin() + (int) (size / 2);\n  std::nth_element(x->begin(), upper, x->end());\n\n  if (size % 2 == 1) {\n    return *upper;\n  } else {\n    std::vector<double>::iterator lower = upper - 1;\n    std::nth_element(x->begin(), lower, upper);\n    return (*upper + *lower) / 2.0;\n  }  \n}\n\n// [[Rcpp::export(\"medianC\")]]\ndouble median(const std::vector<double>& x) {\n  std::vector<double> x_(x);\n  return median(&x_);\n}\n\n\nRegression simpleRobustRegression(const std::vector<double>& x, \n                       const std::vector<double>& y,\n                       const std::vector<double>& w,\n                       int iterations = 3) {\n  int n = x.size();\n  Regression prev = simpleLinearRegression(x, y, w);\n\n  for (int k = 0; k < iterations; ++k) {\n    std::vector<double> resid(n);\n    for (int i = 0; i < n; ++i) {\n      resid[i] = fabs(y[i] - (prev.alpha + prev.beta * x[i]));\n    }\n\n    std::vector<double> w_(w);\n    double b = 6 * median(resid);\n    if (b < 1e-20) break;\n    for (int i = 0; i < n; ++i) {\n      w_[i] *= bisquare(resid[i], b);\n    }\n\n    prev = simpleLinearRegression(x, y, w_);\n  }\n\n  return prev;\n}\n\n// [[Rcpp::export]]\nNumericVector regress_robust(const std::vector<double>& x, \n                             const std::vector<double>& y,\n                             const std::vector<double>& w,\n                             int iterations = 3) {\n  Regression regression = simpleRobustRegression(x, y, w, iterations);\n  return NumericVector::create(regression.alpha, regression.beta);\n}\n"
  },
  {
    "path": "src/stats.h",
    "content": "struct Regression {\n  double alpha, beta;\n};\n\ndouble bisquare(double u, double b);\n\nRegression simpleLinearRegression(const std::vector<double>& x, \n                                  const std::vector<double>& y,\n                                  const std::vector<double>& w);\n\nRegression simpleRobustRegression(const std::vector<double>& x, \n                       const std::vector<double>& y,\n                       const std::vector<double>& w,\n                       int iterations = 3);\n\ndouble median(const std::vector<double>& x);\ndouble median(std::vector<double>* x);\n"
  },
  {
    "path": "src/summary.cpp",
    "content": "#include <Rcpp.h>\n#include \"summary.h\"\nusing namespace Rcpp;\n\ntemplate<typename Summary>\nNumericVector summary_compute(const NumericVector& x, Summary summary) {\n  int n = x.size();\n  for(int i = 0; i < n; ++i) {\n    summary.push(x[i], 1);\n  }\n\n  int m = summary.size();\n  NumericVector out(m);\n  for(int i = 0; i < m; ++i) {\n    out[i] = summary.compute(i);\n  }\n\n  return out;\n}\n\n// [[Rcpp::export]]\nNumericVector compute_moments(const NumericVector& x) {\n  return summary_compute(x, SummaryMoments(2));\n}\n\n// [[Rcpp::export]]\nNumericVector compute_sum(const NumericVector& x) {\n  return summary_compute(x, SummarySum(1));\n}\n\n// [[Rcpp::export]]\nNumericVector compute_median(const NumericVector& x) {\n  return summary_compute(x, SummaryMedian());\n}\n"
  },
  {
    "path": "src/summary.h",
    "content": "#include <Rcpp.h>\n#include \"stats.h\"\nusing namespace Rcpp;\n\nclass SummaryMoments {\n    int i_;\n    double weight;\n    double mean;\n    double m2;\n\n  public:\n    SummaryMoments (int i) : i_(i), weight(0), mean(0), m2(0) {\n      if (i > 2) stop(\"Invalid moment\");\n    }\n\n    // Algorithm adapted from \n    // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm\n    void push(double y, double w) {\n      if (NumericVector::is_na(y)) return;\n\n      // counts and weights\n      weight += w;\n\n      // mean\n      if (i_ < 1) return;\n      double delta = y - mean;\n      mean += delta * w / weight;\n\n      // variance\n      if (i_ < 2) return;      \n      m2 += delta * delta * w * (1 - w / weight);\n\n      return;\n    }\n\n    const int size() const {\n      return i_ + 1;\n    }\n\n    double compute(int i) const {\n      switch (i) {\n        case 0: return weight;\n        case 1: return (weight == 0) ? NAN : mean;\n        case 2: return (weight == 0) ? NAN : pow(m2 / (weight - 1), 0.5);\n        default: \n          stop(\"Invalid output requested\");\n          return NAN;\n      }\n    }\n\n    std::string name(int i) const {\n      switch (i) {\n        case 0: return \"count\";\n        case 1: return \"mean\";\n        case 2: return \"sd\";\n        default: \n          stop(\"Invalid output requested\");\n          return \"\";\n      }\n    }\n};\n\nclass SummarySum {\n    int i_;\n    int weight;\n    double sum;\n\n  public:\n    SummarySum (int i) : i_(i), weight(0), sum(0) {\n      if (i > 1 || i < 0) stop(\"Invalid moment\");\n    }\n\n    void push(double y, double w) {\n      if (NumericVector::is_na(y)) return;\n\n      weight += w;\n      if (i_ < 1) return;\n\n      sum += y * w;\n    }\n\n    const int size() const {\n      return i_ + 1;\n    }\n\n    double compute(int i) const  {\n      switch (i) {\n        case 0: return weight;\n        case 1: return sum;\n        default: \n          stop(\"Invalid output requested\");\n          return NAN;\n      }\n    }\n\n    std::string name(int i) const {\n      switch (i) {\n        case 0: return \"count\";\n        case 1: return \"sum\";\n        default: \n          stop(\"Invalid output requested\");\n          return \"\";\n      }\n    }\n\n};\n\nclass SummaryMedian {\n    std::vector<double> ys;\n\n  public:\n    void push(double y, double w) {\n      if (NumericVector::is_na(y)) return;\n\n      ys.push_back(y);\n    }\n\n    int size() const {\n      return 1;\n    }\n\n    // Adapted from http://stackoverflow.com/questions/1719070/\n    double compute(int i) {\n      return median(&ys);\n    }\n\n    std::string name(int i) const {\n      return \"median\";\n    }\n};\n"
  }
]