Repository: hadley/bigvis Branch: master Commit: 9cce240578a2 Files: 103 Total size: 162.1 KB Directory structure: gitextract_t_3h8d36/ ├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── R/ │ ├── RcppExports.R │ ├── adjust.r │ ├── autoplot.r │ ├── bigvis.r │ ├── bin.r │ ├── breaks.r │ ├── challenge.r │ ├── condense.r │ ├── condensed.r │ ├── dgrid.r │ ├── h.r │ ├── id.r │ ├── movies.r │ ├── mt.r │ ├── origin.r │ ├── peel.r │ ├── ranged.r │ ├── rebin.r │ ├── rmse.r │ ├── smooth.r │ ├── standardise.r │ ├── utils.r │ ├── weighted-stats.r │ └── width.r ├── README.md ├── bench/ │ ├── bin-structure.cpp │ ├── bin.cpp │ ├── count.cpp │ ├── group-tempvar.cpp │ ├── kernel.cpp │ ├── mean.cpp │ ├── median.cpp │ └── smooth-1d.cpp ├── bigvis.Rproj ├── data/ │ └── movies.rdata ├── inst/ │ ├── include/ │ │ └── bigvis.h │ └── tests/ │ ├── test-binned-vectors.r │ ├── test-breaks.r │ ├── test-condense.r │ ├── test-frange.r │ ├── test-group-1d.r │ ├── test-group-2d.r │ ├── test-origin.r │ ├── test-ranged.r │ ├── test-smooth.r │ ├── test-stat.r │ ├── test-summary-moments.r │ └── test-weighted-stats.r ├── man/ │ ├── autoplot.condensed.Rd │ ├── best_h.Rd │ ├── bigvis.Rd │ ├── bin.Rd │ ├── breaks.Rd │ ├── condense.Rd │ ├── condensed.Rd │ ├── dchallenge.Rd │ ├── dgrid.Rd │ ├── find_origin.Rd │ ├── find_width.Rd │ ├── frange.Rd │ ├── h_grid.Rd │ ├── is.ranged.Rd │ ├── movies.Rd │ ├── mt.Rd │ ├── peel.Rd │ ├── ranged.Rd │ ├── rmse_cvs.Rd │ ├── round_any.condensed.Rd │ ├── smooth.Rd │ ├── standardise.Rd │ ├── transform.condensed.Rd │ ├── weighted.IQR.Rd │ ├── weighted.ecdf.Rd │ ├── weighted.median.Rd │ ├── weighted.quantile.Rd │ └── weighted.var.Rd ├── notes.md └── src/ ├── .gitignore ├── BigVis.cpp ├── BinnedVector.cpp ├── BinnedVectors.cpp ├── Makevars ├── RcppExports.cpp ├── Summary2d.cpp ├── Summary2d.h ├── condense-gen.r ├── condense.cpp ├── double-diff-sum.cpp ├── frange.cpp ├── group-hex.h ├── group.cpp ├── group.h ├── lowerBound.cpp ├── smooth-nd.cpp ├── stats.cpp ├── stats.h ├── summary.cpp └── summary.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .Rbuildignore ================================================ bench notes.md ^.*\.Rproj$ ^\.Rproj\.user$ ^\.travis\.yml$ ^src/condense-gen\.r$ ================================================ FILE: .gitignore ================================================ .Rproj.user .Rhistory .RData src/*.o src/*.so src/*.dll ================================================ FILE: .travis.yml ================================================ # Sample .travis.yml for R projects language: r warnings_are_errors: true sudo: required r_github_packages: - jimhester/covr after_success: - Rscript -e 'covr::codecov()' ================================================ FILE: DESCRIPTION ================================================ Package: bigvis Version: 0.1.0.9000 Title: Tools for visualisation of big data sets Description: Tools for visualising large datasets. Authors@R: c( person("Hadley", "Wickham", role = c("aut", "cre"), , "hadley@rstudio.com"), person("Yue", "Hue", role = "aut"), person("R Core team", role = "ctb", comment = "guess_bandwidth adapted from stats::bw.SJ") ) Depends: Rcpp Imports: methods Suggests: plyr, ggplot2, scales LazyData: true LinkingTo: Rcpp, BH License: GPL (>= 2) Collate: 'standardise.r' 'movies.r' 'RcppExports.R' 'adjust.r' 'ranged.r' 'bigvis.r' 'rebin.r' 'autoplot.r' 'origin.r' 'utils.r' 'breaks.r' 'weighted-stats.r' 'condense.r' 'condensed.r' 'bin.r' 'smooth.r' 'challenge.r' 'peel.r' 'id.r' 'rmse.r' 'width.r' 'h.r' 'mt.r' 'dgrid.r' ================================================ FILE: NAMESPACE ================================================ # Generated by roxygen2 (4.1.1): do not edit by hand S3method("[",dgrid) S3method("[<-",ranged) S3method(Math,condensed) S3method(Ops,condensed) S3method(Ops,ranged) S3method(as.condensed,condensed) S3method(as.condensed,data.frame) S3method(as.data.frame,dgrid) S3method(as.data.frame,ranged) S3method(as.integer,dgrid) S3method(max,dgrid) S3method(max,ranged) S3method(min,dgrid) S3method(min,ranged) S3method(print,ranged) S3method(range,dgrid) S3method(range,ranged) S3method(str,ranged) S3method(transform,condensed) export(as.condensed) export(autoplot.condensed) export(best_h) export(bin) export(breaks) export(condense) export(dchallenge) export(dgrid) export(find_origin) export(find_width) export(frange) export(h_grid) export(inv_mt) export(is.condensed) export(is.dgrid) export(is.ranged) export(mt) export(mt_trans) export(peel) export(ranged) export(rchallenge) export(rebin) export(rmse_cv) export(rmse_cvs) export(round_any.condensed) export(smooth) export(standardise) export(weighted.IQR) export(weighted.ecdf) export(weighted.median) export(weighted.quantile) export(weighted.sd) export(weighted.var) exportMethods(as.integer) exportMethods(show) importFrom(Rcpp,compileAttributes) importFrom(Rcpp,cpp_object_initializer) importFrom(methods,new) useDynLib(bigvis) ================================================ FILE: R/RcppExports.R ================================================ # This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 condense_count <- function(x, z, weight, drop = FALSE) { .Call('bigvis_condense_count', PACKAGE = 'bigvis', x, z, weight, drop) } condense_sum <- function(x, z, weight, drop = FALSE) { .Call('bigvis_condense_sum', PACKAGE = 'bigvis', x, z, weight, drop) } condense_mean <- function(x, z, weight, drop = FALSE) { .Call('bigvis_condense_mean', PACKAGE = 'bigvis', x, z, weight, drop) } condense_sd <- function(x, z, weight, drop = FALSE) { .Call('bigvis_condense_sd', PACKAGE = 'bigvis', x, z, weight, drop) } condense_median <- function(x, z, weight, drop = FALSE) { .Call('bigvis_condense_median', PACKAGE = 'bigvis', x, z, weight, drop) } double_diff_sum <- function(bin, count) { .Call('bigvis_double_diff_sum', PACKAGE = 'bigvis', bin, count) } #' Efficient implementation of range. #' #' This is an efficient C++ implementation of range for numeric vectors: #' it avoids S3 dispatch, and computes both min and max in a single pass #' through the input. #' #' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}} #' object), it will be used instead of computing the range from scratch. #' #' @param x a numeric vector, or a \code{\link{ranged}} object #' @param finite If \code{TRUE} ignores missing values and infinities. Note #' that if the vector is empty, or only contains missing values, #' \code{frange} will return \code{c(Inf, -Inf)} because those are the #' identity values for \code{\link{min}} and \code{\link{max}} respectively. #' @export #' @examples #' x <- runif(1e6) #' system.time(range(x)) #' system.time(frange(x)) #' #' rx <- ranged(x) #' system.time(frange(rx)) frange <- function(x, finite = TRUE) { .Call('bigvis_frange', PACKAGE = 'bigvis', x, finite) } group_fixed <- function(x, width, origin = 0) { .Call('bigvis_group_fixed', PACKAGE = 'bigvis', x, width, origin) } group_rect <- function(x, y, x_width, y_width, x_origin, y_origin) { .Call('bigvis_group_rect', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin) } group_hex <- function(x, y, x_width, y_width, x_origin, y_origin, x_max) { .Call('bigvis_group_hex', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin, x_max) } lowerBound <- function(x, breaks) { .Call('bigvis_lowerBound', PACKAGE = 'bigvis', x, breaks) } smooth_nd_1 <- function(grid_in, z_in, w_in_, grid_out, var, h, type = "mean") { .Call('bigvis_smooth_nd_1', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, var, h, type) } smooth_nd <- function(grid_in, z_in, w_in_, grid_out, h) { .Call('bigvis_smooth_nd', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, h) } bisquare <- function(u, b) { .Call('bigvis_bisquare', PACKAGE = 'bigvis', u, b) } regress <- function(x, y, w) { .Call('bigvis_regress', PACKAGE = 'bigvis', x, y, w) } medianC <- function(x) { .Call('bigvis_median', PACKAGE = 'bigvis', x) } regress_robust <- function(x, y, w, iterations = 3L) { .Call('bigvis_regress_robust', PACKAGE = 'bigvis', x, y, w, iterations) } compute_moments <- function(x) { .Call('bigvis_compute_moments', PACKAGE = 'bigvis', x) } compute_sum <- function(x) { .Call('bigvis_compute_sum', PACKAGE = 'bigvis', x) } compute_median <- function(x) { .Call('bigvis_compute_median', PACKAGE = 'bigvis', x) } ================================================ FILE: R/adjust.r ================================================ # Protect against floating point areas by slightly adjusting breaks. # Adapted from graphics::hist.default. adjust_breaks <- function(breaks, open = "right") { open <- match.arg(open, c("left", "right")) breaks <- sort(breaks) diddle <- 1e-07 * median(diff(breaks)) if (open == "left") { fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1)) } else { fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle) } breaks + fuzz } ================================================ FILE: R/autoplot.r ================================================ #' Autoplot condensed summaries. #' #' @param x a condensed summary #' @param var which summary variable to display #' @param ... other arguments passed on to individual methods #' @method autoplot condensed #' @export autoplot.condensed #' @examples #' if (require("ggplot2")) { #' #' # 1d summaries ----------------------------- #' x <- rchallenge(1e4) #' z <- x + rt(length(x), df = 2) #' xsum <- condense(bin(x, 0.1)) #' zsum <- condense(bin(x, 0.1), z = z) #' #' autoplot(xsum) #' autoplot(peel(xsum)) #' #' autoplot(zsum) #' autoplot(peel(zsum, keep = 1)) #' autoplot(peel(zsum)) #' #' # 2d summaries ----------------------------- #' y <- runif(length(x)) #' xysum <- condense(bin(x, 0.1), bin(y, 0.1)) #' xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z) #' #' autoplot(xysum) #' autoplot(peel(xysum)) #' autoplot(xyzsum) #' autoplot(peel(xyzsum)) #' } autoplot.condensed <- function(x, var = last(summary_vars(x)), ...) { stopifnot(is.condensed(x)) stopifnot(is.character(var), length(var) == 1) summaries <- c( .count = "total", .sum = "total", .mean = "summary", .sd = "summary", .median = "summary" ) if (!(var %in% names(summaries))) { stop("Unknown varible", call. = FALSE) } d <- gcol(x) if (d > 2) { stop("No autoplot methods available for more than two d") } f <- paste0("plot_", summaries[var], "_", d) find_fun(f)(x, var = var, ...) } plot_total_1 <- function(x, var = ".count", show_na = TRUE, log = "") { xvar <- names(x)[[1]] plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) + ggplot2::geom_line(na.rm = TRUE) if (show_na) { plot <- plot + na_layer(x, var) } if (logv(log, "y")) { plot <- plot + ggplot2::scale_y_continuous(trans = "log1p") } if (logv(log, "x")) { plot <- plot + ggplot2::scale_x_log10() } plot } plot_total_2 <- function(x, var = ".count", show_na = TRUE, log = "") { x <- peel(x, keep = 1) xvar <- names(x)[[1]] yvar <- names(x)[[2]] miss <- is.na(x[[1]]) + 2 * is.na(x[[2]]) fill_trans <- if (logv(log, "z")) "log1p" else "identity" plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) + ggplot2::geom_raster(ggplot2::aes_string(fill = var)) + ggplot2::scale_fill_gradient(low = "grey90", high = "black", trans = fill_trans) + ggplot2::expand_limits(fill = 0) if (show_na) { } plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10() plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10() plot } plot_summary_1 <- function(x, var = ".mean", show_na = TRUE, show_n = x %contains% ".count", log = NULL) { xvar <- names(x)[[1]] plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) + ggplot2::geom_line(na.rm = TRUE) + ggplot2::scale_size_area() if (show_n) { plot <- plot + ggplot2::geom_point(ggplot2::aes_string(color = ".count"), na.rm = TRUE) + ggplot2::scale_colour_gradient(trans = "log10") } if (show_na) { plot <- plot + na_layer(x, var) } plot } plot_summary_2 <- function(x, var = ".mean", show_na = TRUE, log = "") { x <- peel(x, keep = 1) xvar <- names(x)[[1]] yvar <- names(x)[[2]] miss <- is.na(x[[1]]) + 2 * is.na(x[[2]]) plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) + ggplot2::geom_tile(ggplot2::aes_string(fill = var)) + ggplot2::scale_fill_gradient2() if (show_na) { } plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10() plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10() plot } na_layer <- function(x, var) { val <- x[[var]][is.na(x[[1]])] if (length(val) == 0 || is.na(val) || val == 0) return() xloc <- miss_poss(x[[1]]) ggplot2::annotate("text", x = xloc, y = val, colour = "red", label = "NA", size = 3) } logv <- function(log, var) var %in% strsplit(log, "")[[1]] miss_poss <- function(x) { rng <- frange(x) rng[1] - (rng[2] - rng[1]) * 0.05 } ================================================ FILE: R/bigvis.r ================================================ #' The big vis package. #' #' @useDynLib bigvis #' @docType package #' @name bigvis NULL if (!exists("BigVis")) { BigVis <- Rcpp::Module("BigVis") } #' @param x,object,... Generic args #' @rdname bigvis #' @export setMethod("show", "Rcpp_BinnedVector", function(object) { cat("Binned [", object$size(), "]. ", "Width: ", object$width(), " Origin: ", object$origin(), "\n", sep = "") }) #' @rdname bigvis #' @export setMethod("as.integer", "Rcpp_BinnedVector", function(x, ...) { vapply(seq_len(x$size()), x$bin_i, integer(1)) }) # Silence R CMD check note #' @importFrom methods new #' @importFrom Rcpp compileAttributes cpp_object_initializer NULL ================================================ FILE: R/bin.r ================================================ #' Create a binned variable. #' #' @details #' This function produces an R reference class that wraps around a C++ function. #' Generally, you should just treat this as an opaque object with reference #' semantics, and you shouldn't call the methods on it - pass it to #' \code{\link{condense}} and friends. #' #' @param x numeric or integer vector #' @param width bin width. If not specified, about 10,000 bins will be chosen #' using the algorithim in \code{\link{find_width}}. #' @param origin origin. If not specified, guessed by \code{\link{find_origin}}. #' @param name name of original variable. This will be guessed from the input to #' \code{group} if not supplied. Used in the output of #' \code{\link{condense}} etc. #' @export #' @examples #' x <- runif(1e6) #' bin(x) #' bin(x, 0.01) #' bin(x, 0.01, origin = 0.5) bin <- function(x, width = find_width(x), origin = find_origin(x, width), name = NULL) { stopifnot(is.numeric(x)) stopifnot(is.numeric(width), length(width) == 1, width > 0) stopifnot(is.numeric(origin), length(origin) == 1) if (is.null(name)) { name <- deparse(substitute(x)) } stopifnot(is.character(name), length(name) == 1) if (!is.ranged(x)) { attr(x, "range") <- frange(x) class(x) <- "ranged" } if (origin > min(x)) { warning("Origin larger than min(x): some values will be truncated", call. = FALSE) } BigVis$BinnedVector$new(x, name, width, origin) } is.binned <- function(x) { is(x, "Rcpp_BinnedVector") } bins <- function(...) { BigVis$BinnedVectors$new(list(...)) } ================================================ FILE: R/breaks.r ================================================ #' Compute breaks given origin and width. #' #' Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer #' multiple of binwidth, then we need one more break. This function only returns #' the left-side of the breaks. #' #' The first break is special, because it always contains missing values. #' #' @param x numeric vector #' @param origin bin origin #' @param binwidth bin width #' @export #' @keywords internal #' @examples #' breaks(10, origin = 0, binwidth = 1) #' breaks(9.9, origin = 0, binwidth = 1) #' #' breaks(1:10, origin = 0, binwidth = 2) breaks <- function(x, binwidth, origin = min(x)) { if (!is.binned(x)) { x <- bin(x, binwidth, origin) } # -1 for NA bin, -1 since R is 1 indexed nbins <- x$nbins() - 2 c(NA, x$origin() + seq.int(1, nbins) * x$width()) } ================================================ FILE: R/challenge.r ================================================ #' Density and random number generation functions for a challenging #' distribution. #' #' This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom #' centered at 15 and scaled by 2, and a gamma distribution with shape 2 #' and rate 1/3. (The t-distribution is windsorised at 0, but this #' has negligible effect.) This distribution is challenging because it #' mixes heavy tailed and asymmetric distributions. #' #' @param x values to evaluate pdf at #' @param n number of random samples to generate #' @export #' @examples #' plot(dchallenge, xlim = c(-5, 60), n = 500) #' #' x <- rchallenge(1e4) #' hist(x, breaks = 1000) #' xsum <- condense(bin(x, 0.1)) #' plot(xsum$x, xsum$.count, type = "l") #' xsmu <- smooth(xsum, 0.3) #' plot(xsmu$x, xsmu$.count, type = "l") #' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30)) dchallenge <- function(x) { # Windorised t-distribution scale <- function(x) (x - 30) / 2 spike <- ifelse(x < 0, 0, dt(scale(x), df = 2)) + pt(scale(0), df = 2) * (x == 0) slope <- dgamma(x, 2, 1/3) (spike + 2 * slope) / 3 } # plot(pchallenge, xlim = c(-5, 60), n = 500) pchallenge <- function(x) { # H(y) = # = int_0^y h(x) dx # = int_0^y 1/3 f(x) + 2/3 g(x) dx # = 1/3 int_0^y f(x) dx + 2/3 int_0^y g(x) dx = # = 1/3 F(y) + 2/3 G(y) # h(x) = g((x - 30) / 2) # H(y) = int_0^y g((x - 30) / 2) dx # complete transformation scale <- function(x) (x - 30) / 2 spike <- ifelse(x < 0, 0, pt(scale(x), df = 2)) slope <- pgamma(x, 2, 1/3) (spike + 2 * slope) / 3 } qchallenge <- function(x) { # approximate pchallenge with 1000 points, and linearise # use to implement fast option to rchallenge that does inverse pdf # transformation + runif() } #' @rdname dchallenge #' @export rchallenge <- function(n) { nt <- rbinom(1, n, 1 / 3) ngamma <- n - nt spike <- 2 * rt(nt, df = 2) + 15 spike[spike < 0] <- 0 slope <- rgamma(ngamma, 2, 1/3) c(spike, slope) } ================================================ FILE: R/condense.r ================================================ #' Efficient binned summaries. #' #' @param ... group objects created by \code{\link{bin}} #' @param z a numeric vector to summary for each group. Optional for some #' summary statistics. #' @param summary the summary statistic to use. Currently must be one of #' count, sum, mean, median or sd. If \code{NULL}, defaults to mean if #' y is present, count if not. #' @param w a vector of weights. Not currently supported by all summary #' functions. #' @param drop if \code{TRUE} only locations with data will be returned. This #' is more efficient if the data is very sparse (<1\% of cells filled), and #' is slightly less efficient. Defaults to \code{TRUE} if you are condensing #' over two or more dimensions, \code{FALSE} for 1d. #' @export #' @examples #' x <- runif(1e5) #' gx <- bin(x, 0.1) #' condense(gx) condense <- function(..., z = NULL, summary = NULL, w = NULL, drop = NULL) { gs <- list(...) if (length(gs) == 1 && is.list(gs[[1]])) gs <- gs[[1]] is_binned <- vapply(gs, is.binned, logical(1)) if (!all(is_binned)) { stop("All objects passed to ... must be binned.", call. = FALSE) } drop <- drop %||% (length(gs) > 1) if (is.null(summary)) { summary <- if (is.null(z)) "count" else "mean" message("Summarising with ", summary) } # C++ code can deal with NULL inputs more efficiently than R code z <- z %||% numeric() w <- w %||% numeric() # Check lengths consistent n <- gs[[1]]$size() stopifnot(length(z) == 0 || length(z) == n) stopifnot(length(w) == 0 || length(w) == n) f <- find_fun(paste("condense", summary, sep = "_")) out <- f(gs, z, w, drop = drop) condensed(gs, out[[1]], out[[2]]) } ================================================ FILE: R/condensed.r ================================================ #' Condensed: an S3 class for condensed summaries. #' #' This object managed the properties of condensed (summarised) data frames. #' #' @section S3 methods: #' #' Mathematical functions with methods for \code{binsum} object will modify #' the x column of the data frame and \code{\link{rebin}} the data, calculating #' updated summary statistics. #' #' Currently methods are provided for the \code{Math} group generic, #' logical comparison and arithmetic operators, and #' \code{\link[plyr]{round_any}}. #' #' @param groups list of \code{\link{bin}}ed objects #' @param grouped,summary output from C++ condense function #' @keywords internal #' @examples #' if (require("ggplot2")) { #' #' x <- rchallenge(1e4) #' xsum <- condense(bin(x, 1 / 10)) #' #' # Basic math operations just modify the first column #' autoplot(xsum) #' autoplot(xsum * 10) #' autoplot(xsum - 30) #' autoplot(abs(xsum - 30)) #' #' # Similarly, logical operations work on the first col #' autoplot(xsum[xsum > 10, ]) #'} condensed <- function(groups, grouped, summary) { grouped <- as.data.frame(grouped) summary <- as.data.frame(summary) for (i in seq_along(groups)) { grouped[[i]] <- dgrid(grouped[[i]], groups[[i]]$width(), groups[[i]]$origin(), groups[[i]]$nbins()) } names(summary) <- paste0(".", names(summary)) df <- data.frame(grouped, summary) class(df) <- c("condensed", class(df)) df } #' @export #' @rdname condensed #' @param x object to test or coerce is.condensed <- function(x) inherits(x, "condensed") #' @export #' @rdname condensed as.condensed <- function(x) UseMethod("as.condensed") #' @export as.condensed.condensed <- function(x) x #' @export as.condensed.data.frame <- function(x) { structure(x, class = c("condensed", class(x))) } summary_vars <- function(x) { stopifnot(is.condensed(x)) nm <- names(x) names(x)[grepl("^\\.", names(x))] } group_vars <- function(x) { setdiff(names(x), summary_vars(x)) } gcol <- function(x) length(group_vars(x)) #' @export Math.condensed <- function(x, ...) { generic <- match.fun(.Generic) x[[1]] <- generic(x[[1]], ...) rebin(x) } #' @export Ops.condensed <- function(e1, e2) { logical_ops <- c("==", "!=", "<", "<=", ">=", ">") math_ops <- c("+", "-", "*", "/", "^", "%%", "%/%") generic <- match.fun(.Generic) if (.Generic %in% logical_ops) { l <- generic(e1[[1]], e2) l[1] <- TRUE # always preserve missings l & !is.na(l) } else if (.Generic %in% math_ops) { e1[[1]] <- generic(e1[[1]], e2) rebin(e1) } else { stop(.Generic, " not supported for condensed objects", call. = FALSE) } } #' Round any method for condensed objects #' #' @inheritParams plyr::round_any #' @export #' @keywords internal round_any.condensed <- function(x, accuracy, f = round) { gvars <- group_vars(x) x[gvars] <- lapply(x[gvars], plyr::round_any, accuracy = accuracy, f = f) rebin(x) } ================================================ FILE: R/dgrid.r ================================================ #' dgrid: an S3 class for data grids #' #' @param x a numeric vector to test or coerce. #' @param width bin width #' @param origin bin origins #' @param nbins number of bins #' @export #' @examples #' g <- dgrid(0:10 + 0.5, width = 1) #' range(g) #' as.integer(g) dgrid <- function(x, width, origin = 0, nbins = NULL) { stopifnot(is.numeric(x)) stopifnot(is.numeric(width), length(width) == 1, width > 0) stopifnot(is.numeric(origin), length(origin) == 1) if (is.null(nbins)) { nbins <- floor((max(x) - origin) / width) } structure(x, class = c("dgrid", "numeric"), width = width, origin = origin, nbins = nbins) } #' @export #' @rdname dgrid is.dgrid <- function(x) inherits(x, "dgrid") #' @export "[.dgrid" <- function(x, ...) { dgrid(NextMethod(), width = attr(x, "width"), origin = attr(x, "origin"), nbins = attr(x, "nbins")) } #' @export min.dgrid <- function(x, ...) attr(x, "origin") #' @export max.dgrid <- function(x, ...) { min(x) + attr(x, "nbins") * attr(x, "width") } #' @export range.dgrid <- function(x, ...) c(min(x), max(x)) #' @export as.integer.dgrid <- function(x, ...) { as.integer((unclass(x) - attr(x, "origin")) / attr(x, "width") + 1L) } #' @export as.data.frame.dgrid <- function(x, ...) { n <- length(x) list <- list(x) class(list) <- "data.frame" attr(list, "row.names") <- c(NA_integer_, -n) list } ================================================ FILE: R/h.r ================================================ #' Find "best" smoothing parameter using leave-one-out cross validation. #' #' Minimises the leave-one-out estimate of root mean-squared error to find #' find the "optimal" bandwidth for smoothing. #' #' L-BFGS-B optimisation is used to constrain the bandwidths to be greater #' than the binwidths: if the bandwidth is smaller than the binwidth it's #' impossible to compute the rmse because no smoothing occurs. The tolerance #' is set relatively high for numerical optimisation since the precise choice #' of bandwidth makes little difference visually, and we're unlikely to have #' sufficient data to make a statistically significant choice anyway. #' #' @param x condensed summary to smooth #' @param h_init initial values of bandwidths to start search out. If not #' specified defaults to 5 times the binwidth of each variable. #' @param ... other arguments (like \code{var}) passed on to #' \code{\link{rmse_cv}} #' @param tol numerical tolerance, defaults to 1\%. #' @param control additional control parameters passed on to \code{\link{optim}} #' The most useful argument is probably trace, which makes it possible to #' follow the progress of the optimisation. #' @family bandwidth estimation functions #' @return a single numeric value representing the bandwidth that minimises #' the leave-one-out estimate of rmse. Vector has attributes #' \code{evaluations} giving the number of times the objective function #' was evaluated. If the optimisation does not converge, or smoothing is not #' needed (i.e. the estimate is on the lower bounds), a warning is thrown. #' @export #' @examples #' \donttest{ #' x <- rchallenge(1e4) #' xsum <- condense(bin(x, 1 / 10)) #' h <- best_h(xsum, control = list(trace = 3, REPORT = 1)) #' #' if (require("ggplot2")) { #' autoplot(xsum) #' autoplot(smooth(xsum, h)) #' } #' } best_h <- function(x, h_init = NULL, ..., tol = 1e-2, control = list()) { stopifnot(is.condensed(x)) gvars <- group_vars(x) widths <- vapply(x[gvars], attr, "width", FUN.VALUE = numeric(1)) h_init <- h_init %||% widths * 5 stopifnot(is.numeric(h_init), length(h_init) == length(gvars)) stopifnot(is.list(control)) control <- modifyList(list(factr = tol / .Machine$double.eps), control) # Optimise rmse <- function(h) { rmse_cv(x, h, ...) } res <- optim(h_init, rmse, method = "L-BFGS-B", lower = widths * 1.01, control = control) h <- unname(res$par) # Feedback if (res$convergence != 0) { warning("Failed to converge: ", res$message, call. = FALSE) } else if (rel_dist(h, widths) < 1e-3) { warning("h close to lower bound: smoothing not needed", call. = FALSE) } structure(h, evaluations = res$counts[1], conv = res$convergence) } rel_dist <- function(x, y) { mean(abs(x - y) / abs(x + y)) } #' Generate grid of plausible bandwidths for condensed summary. #' #' By default, the bandwidths start at the bin width, and then continue #' up 50 (\code{n}) steps until 20 (\code{max}) times the bin width. #' #' @param x a condensed summary #' @param n number of bandwidths to generate (in each dimension) #' @param max maximum bandwidth to generate, as multiple of binwidth. #' @family bandwidth estimation functions #' @export #' @examples #' x <- rchallenge(1e4) #' xsum <- condense(bin(x, 1 / 10)) #' h_grid(xsum) #' #' y <- runif(1e4) #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) #' h_grid(xysum, n = 10) h_grid <- function(x, n = 50, max = 20) { stopifnot(is.condensed(x)) stopifnot(is.numeric(n), length(n) == 1, n > 0) stopifnot(is.numeric(max), length(max) == 1, max > 0) gs <- x[group_vars(x)] widths <- vapply(gs, attr, "width", FUN.VALUE = numeric(1)) hs <- lapply(widths, function(w) w * seq(2, max, length = n)) expand.grid(hs, KEEP.OUT.ATTRS = FALSE) } ================================================ FILE: R/id.r ================================================ # Copied and pasted from plyr to avoid dependency id <- function(.variables, drop = FALSE) { # Drop all zero length inputs lengths <- vapply(.variables, length, integer(1)) .variables <- .variables[lengths != 0] if (length(.variables) == 0) { n <- nrow(.variables) %||% 0L return(structure(seq_len(n), n = n)) } # Special case for single variable if (length(.variables) == 1) { return(id_var(.variables[[1]], drop = drop)) } # Calculate individual ids ids <- rev(lapply(.variables, id_var, drop = drop)) p <- length(ids) # Calculate dimensions ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1), USE.NAMES = FALSE) n <- prod(ndistinct) if (n > 2 ^ 31) { # Too big for integers, have to use strings, which will be much slower :( char_id <- do.call("paste", c(ids, sep = "\r")) res <- match(char_id, unique(char_id)) } else { combs <- c(1, cumprod(ndistinct[-p])) mat <- do.call("cbind", ids) res <- c((mat - 1L) %*% combs + 1L) } attr(res, "n") <- n if (drop) { id_var(res, drop = TRUE) } else { structure(as.integer(res), n = attr(res, "n")) } } id_var <- function(x, drop = FALSE) { if (length(x) == 0) return(structure(integer(), n = 0L)) if (!is.null(attr(x, "n")) && !drop) return(x) if (is.factor(x) && !drop) { id <- as.integer(addNA(x, ifany = TRUE)) n <- length(levels(x)) } else { levels <- sort(unique(x), na.last = TRUE) id <- match(x, levels) n <- max(id) } structure(id, n = n) } ================================================ FILE: R/movies.r ================================================ #' Movie information and user ratings from IMDB.com. #' #' The internet movie database, \url{http://imdb.com/}, is a website devoted #' to collecting movie data supplied by studios and fans. It claims to be the #' biggest movie database on the web and is run by amazon. More about #' information imdb.com can be found online, #' \url{http://imdb.com/help/show_leaf?about}, including information about #' the data collection process, #' \url{http://imdb.com/help/show_leaf?infosource}. #' #' Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user. The data set contains the following fields: #' #' \itemize{ #' \item title. Title of the movie. #' \item year. Year of release. #' \item budget. Total budget (if known) in US dollars #' \item length. Length in minutes. #' \item rating. Average IMDB user rating. #' \item votes. Number of IMDB users who rated this movie. #' \item mpaa. MPAA rating. #' \item action, animation, comedy, drama, documentary, romance, short: #' \code{TRUE} if movie belongs to that genre. #' } #' #' @docType data #' @usage data(movies) #' @name movies #' @format A data frame with 130,456 rows and 14 variables #' @references \url{http://had.co.nz/data/movies/} NULL ================================================ FILE: R/mt.r ================================================ #' Modulus transformation (and its inverse). #' #' A generalisation of the box-cox transformation that works for #' values with both positive and negative values. #' #' This is useful for compressing the tails of long-tailed distributions, #' often encountered with very large datasets. #' #' @param x values to transform #' @param lambda degree of transformation #' @export #' @references J. John and N. Draper. "An alternative family of #' transformations." Applied Statistics, pages 190-197, 1980. #' \url{http://www.jstor.org/stable/2986305} #' @examples #' x <- seq(-10, 10, length = 100) #' plot(x, mt(x, 0), type = "l") #' plot(x, mt(x, 0.25), type = "l") #' plot(x, mt(x, 0.5), type = "l") #' plot(x, mt(x, 1), type = "l") #' plot(x, mt(x, 2), type = "l") #' plot(x, mt(x, -1), type = "l") #' plot(x, mt(x, -2), type = "l") mt <- function(x, lambda) { stopifnot(is.numeric(x)) stopifnot(is.numeric(lambda), length(lambda) == 1) if (lambda == 0) { sign(x) * log(abs(x) + 1) } else { sign(x) * ((abs(x) + 1) ^ lambda - 1) / lambda } } #' @rdname mt #' @export inv_mt <- function(x, lambda) { stopifnot(is.numeric(x)) stopifnot(is.numeric(lambda), length(lambda) == 1) if (lambda == 0) { sign(x) * (exp(abs(x)) - 1) } else { sign(x) * ((abs(x) * lambda + 1) ^ (1 / lambda) - 1) } } #' @rdname mt #' @export mt_trans <- function(lambda) { scales::trans_new("modulo", function(x) mt(x, lambda), function(x) inv_mt(x, lambda) ) } ================================================ FILE: R/origin.r ================================================ #' Find the origin. #' #' @details #' This algorithm implements simple heuristics for determining the origin of #' a histogram when only the binwidth is specified. It: #' #' \itemize{ #' \item rounds to zero, if relatively close #' \item subtracts 0.5 offset, if an x is integer #' \item ensures the origin is a multiple of the binwidth #' } #' @param x numeric or integer vector #' @param binwidth binwidth #' @export #' @keywords internal #' @family reasonable defaults #' @examples #' find_origin(1:10, 1) #' find_origin(1:10, 2) #' find_origin(c(1, 1e6), 1) find_origin <- function(x, binwidth) { rng <- frange(x, finite = TRUE) if (!all(is.finite(rng))) stop("No valid values in x", call. = FALSE) offset <- is.integer(x) * 0.5 if (close_to_zero(rng[1], rng)) { 0 - offset } else { floor_any(rng[1], binwidth) - offset } } close_to_zero <- function(x, rng) { (abs(x) / abs(rng[2] - rng[1])) < 1e-3 } floor_any <- function(x, accuracy) { floor(x / accuracy) * accuracy } ================================================ FILE: R/peel.r ================================================ #' Peel off low density regions of the data. #' #' Keeps specified proportion of data by removing the lowest density regions, #' either anywhere on the plot, or for 2d, just around the edges. #' #' This is useful for visualisation, as an easy way of focussing on the regions #' where the majority of the data lies. #' #' @param x condensed summary #' @param keep (approximate) proportion of data to keep. If \code{1}, will #' remove all cells with counts. All missing values will be preserved. #' @param central if \code{TRUE} peels off regions of lowest density only from #' the outside of the data. In 2d this works by progressively peeling off #' convex hull of the data: the current algorithm is quite slow. #' If \code{FALSE}, just removes the lowest density regions wherever they are #' found. Regions with 0 density are removed regardless of location. #' Defaults to TRUE if there are two or fewer grouping variables is less. #' @export #' @examples #' x <- rt(1e5, df = 2) #' y <- rt(1e5, df = 2) #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10)) #' plot(xysum$x, xysum$y) #' #' plot(peel(xysum, 0.95, central = TRUE)[1:2]) #' plot(peel(xysum, 0.90, central = TRUE)[1:2]) #' plot(peel(xysum, 0.50, central = TRUE)[1:2]) peel <- function(x, keep = 0.99, central = NULL) { stopifnot(is.condensed(x)) stopifnot(is.numeric(keep), length(keep) == 1, keep > 0, keep <= 1) central <- central %||% (gcol(x) <= 2) stopifnot(is.logical(central), length(central) == 1) if (is.null(x$.count)) { stop("Can only peel objects with .count variable", call. = FALSE) } x <- x[x$.count > 0, , drop = FALSE] if (keep == 1) return(x) complete <- complete.cases(x[group_vars(x)]) x_complete <- x[complete, , drop = FALSE] if (central) { peeled <- peel_outer(x_complete, keep) } else { peeled <- peel_anywhere(x_complete, keep) } rbind(peeled, x[!complete, , drop = FALSE]) } peel_anywhere <- function(x, keep) { ord <- order(x$.count, decreasing = TRUE) prop <- cumsum(x$.count[ord]) / sum(x$.count) ind <- which(prop >= keep)[1] x[ord[seq_len(ind)], , drop = FALSE] } peel_outer <- function(x, keep) { d <- gcol(x) if (d > 2) { stop("Outer peeling only works with 1d or 2d data", call. = FALSE) } n <- sum(x$.count) x <- x[order(x$.count, decreasing = TRUE), ] prop <- cumsum(x$.count) / n # Peel off smallest values on chull candidate <- which(prop >= keep) on_hull <- intersect(candidate, chull(x[1:d])) left <- sum(x$.count[-on_hull]) / n while(left >= keep && length(on_hull) > 0) { x <- x[-on_hull, , drop = FALSE] prop <- prop[-on_hull] candidate <- which(prop >= keep) on_hull <- intersect(candidate, chull(x[1:d])) left <- sum(x$.count[-on_hull]) / n } x } ================================================ FILE: R/ranged.r ================================================ #' A S3 class for caching the range of a vector #' #' This class is designed for dealing with large vectors, where the cost of #' recomputing the range multiple times is prohibitive. It provides methods #' for \code{\link{print}} and \code{\link{str}} that display only the range, #' not the contents. #' #' @section Performance: #' For best performance, you may want to run copy and paste the contents of #' this function into your function, to avoid making any copies of \code{x}. #' This is probably only necessary if you're dealing with extremely large #' vectors, > 100 million obs. #' #' @param x a numeric vector #' @param range the range of the vector (excluding missing values), if known. #' If unknown, it will be computed with \code{\link{frange}}, a fast C++ #' implementation of \code{\link{range}}. #' @export #' @examples #' x <- runif(1e6) #' y <- ranged(x) #' range(y) #' y #' str(y) #' #' # Modifications to the class currently destroy the cache #' y[1] <- 10 #' max(y) #' class(y) #' z <- y + 10 #' max(z) #' class(z) ranged <- function(x, range = frange(x, finite = TRUE)) { stopifnot(is.numeric(x)) # Reset range attribute so that lazy evaluation of range # always recomputes from scratch attr(x, "range") <- NULL attr(x, "range") <- range class(x) <- "ranged" x } #' Test if an object is of class ranged. #' #' @export #' @param x object to test #' @keywords internal is.ranged <- function(x) inherits(x, "ranged") #' @export min.ranged <- function(x, ...) attr(x, "range")[1] #' @export max.ranged <- function(x, ...) attr(x, "range")[2] #' @export range.ranged <- function(x, ...) attr(x, "range") #' @export print.ranged <- function(x, ...) { rng <- attr(x, "range") # attr(x, "range") <- NULL # attr(x, "class") <- NULL # print.default(x) cat("Ranged 1:", length(x), " [", format(rng[1]), ", ", format(rng[2]), "]\n", sep = "") } #' @export str.ranged <- function(object, ...) { rng <- attr(object, "range") cat(" Ranged [1:", length(object), "] ", format(rng[1]), "--", format(rng[2]), "\n", sep = "") } #' @export Ops.ranged <- function(e1, e2) { attr(e1, "range") <- NULL class(e1) <- NULL NextMethod(e1, e2) } #' @export "[<-.ranged" <- function(x, ..., value) { attr(x, "range") <- NULL attr(x, "class") <- NULL NextMethod(x, ..., value = value) } #' @export as.data.frame.ranged <- function(x, ...) { n <- length(x) x <- list(x) class(x) <- "data.frame" attr(x, "row.names") <- c(NA_integer_, -n) x } ================================================ FILE: R/rebin.r ================================================ #' Transform condensed objects, collapsing unique bins. #' #' @details #' You don't need to use \code{rebin} if you use transform: it will #' automatically rebin for you. You will need to use it if you manually #' transform any grouping variables. #' #' @param data,`_data` a condensed summary #' @param ... named arguments evaluated in the context of the data #' @usage \\method{transform}{condensed}(`_data`, ...) #' @keywords internal #' @examples #' x <- runif(1e4, -1, 1) #' xsum <- condense(bin(x, 1 / 50)) #' #' # Transforming by hand: must use rebin #' xsum$x <- abs(xsum$x) #' rebin(xsum) #' if (require("ggplot2")) { #' autoplot(xsum) + geom_point() #' autoplot(rebin(xsum)) + geom_point() #' } #' #' #' Transforming with transform #' y <- x ^ 2 + runif(length(x), -0.1, 0.1) #' xysum <- condense(bin(x, 1 / 50), z = y) #' xysum <- transform(xysum, x = abs(x)) #' if (require("ggplot2")) { #' autoplot(xysum) #' } #' @export #' @method transform condensed transform.condensed <- function(`_data`, ...) { df <- transform.data.frame(`_data`, ...) rebin(as.condensed(df)) } #' @export #' @rdname transform.condensed rebin <- function(data) { stopifnot(is.condensed(data)) old_g <- data[group_vars(data)] old_g[] <- lapply(old_g, zapsmall, digits = 3) ids <- id(old_g, drop = TRUE) if (!anyDuplicated(ids)) return(data) old_s <- data[summary_vars(data)] new_s <- lapply(names(old_s), function(var) rebin_var(old_s, ids, var)) names(new_s) <- names(old_s) uids <- !duplicated(ids) new_g <- old_g[uids, , drop = FALSE] ord <- order(ids[uids], na.last = FALSE) as.condensed(data.frame(new_g[ord, , drop = FALSE], new_s)) } rebin_var <- function(df, ids, var) { stopifnot(is.data.frame(df)) stopifnot(is.integer(ids), length(ids) == nrow(df)) stopifnot(is.character(var), length(var) == 1, var %in% names(rebinners)) rows <- split(seq_len(nrow(df)), ids) f <- rebinners[[var]] vapply(rows, function(i) f(df[i, , drop = FALSE]), numeric(1), USE.NAMES = FALSE) } rebinners <- list( .median = function(df) mean(df$.median, na.rm = TRUE), .sum = function(df) sum(df$.sum, na.rm = TRUE), .count = function(df) sum(df$.count, na.rm = TRUE), .mean = function(df) { if (is.null(df$.count)) { mean(df$.mean, na.rm = TRUE) } else { weighted.mean(df$.mean, df$.count) } } ) ================================================ FILE: R/rmse.r ================================================ #' Estimate smoothing RMSE using leave-one-out cross-validation. #' #' \code{rmse_cv} computes the leave-one-out RMSE for a single vector of #' bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths, #' stored as a data frame. #' #' @param x condensed summary table #' @param h,hs for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv} #' a data frame of bandwidths, as generated by \code{\link{h_grid}}. #' @param var variable to smooth #' @param ... other variables passed on to \code{\link{smooth}} #' @family bandwidth estimation functions #' @export #' @examples #' \donttest{ #' set.seed(1014) #' # 1d ----------------------------- #' x <- rchallenge(1e4) #' xsum <- condense(bin(x, 1 / 10)) #' cvs <- rmse_cvs(xsum) #' #' if (require("ggplot2")) { #' autoplot(xsum) #' qplot(x, err, data = cvs, geom = "line") #' xsmu <- smooth(xsum, 1.3) #' autoplot(xsmu) #' autoplot(peel(xsmu)) #' } #' #' # 2d ----------------------------- #' y <- runif(1e4) #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) #' cvs <- rmse_cvs(xysum, h_grid(xysum, 10)) #' if (require("ggplot2")) { #' qplot(x, y, data = cvs, size = err) #' } #' } rmse_cvs <- function(x, hs = h_grid(x), ...) { rmse_1 <- function(i) { rmse_cv(x, as.numeric(hs[i, ]), ...) } err <- vapply(seq_len(nrow(hs)), rmse_1, numeric(1)) data.frame(hs, err) } #' @rdname rmse_cvs #' @export rmse_cv <- function(x, h, var = summary_vars(x)[1], ...) { # can't smooth missing values, so drop. x <- x[complete.cases(x), , drop = FALSE] gvars <- group_vars(x) pred_error <- function(i) { out <- as.matrix(x[i, gvars, drop = FALSE]) smu <- smooth(x[-i, , drop = FALSE], grid = out, h = h, var = var, ...) smu[[var]] - x[[var]][i] } err <- vapply(seq_len(nrow(x)), pred_error, numeric(1)) sqrt(mean(err ^ 2, na.rm = TRUE)) } ================================================ FILE: R/smooth.r ================================================ #' Smooth a condensed data frame. #' #' @param x a condensed summary #' @param h numeric vector of bandwidths, one for each grouping variable in #' \code{x} #' @param var variable to smooth #' @param grid a data frame with the grouping colums as x. In order for the #' factored version of \code{smooth_nd} to work, this grid must be a superset #' of \code{x}. #' @param type type of smoothing to use. Current options are \code{"mean"}, #' a kernel weighted mean; \code{"regression"}, a kernel weighted local #' regression; and \code{"robust_regression"}, robust kernel weighted local #' regression in the style of \code{\link{loess}}. Unique prefixes are also #' acceptable. #' @param factor if \code{TRUE} compute the n-dimensional smooth by a sequence #' of 1d smoothes. For \code{type = "mean"} the results are always the same # as \code{FALSE}; for \code{type = "regress"} they will be equal if the #' grid values are uncorrelated (e.g. the grid is complete at every location); #' and is very approximate for \code{type = "robust"}. #' @export #' @examples #' x <- runif(1e5) #' xsum <- condense(bin(x, 1 / 100)) #' xsmu1 <- smooth(xsum, 5 / 100) #' xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE) #' #' # More challenging distribution #' x <- rchallenge(1e4) #' xsum <- condense(bin(x, 0.1)) #' xsmu <- smooth(xsum, 1) #' #' plot(xsum$x, xsum$.count, type = "l") #' lines(xsmu$x, xsmu$.count, col = "red") #' #' xsmu2 <- smooth(xsum, 1, type = "regress") #' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50)) #' lines(xsmu2$x, xsmu2$.count, col = "red") #' # Note difference in tails smooth <- function(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean", factor = TRUE) { stopifnot(is.condensed(x)) stopifnot(is.numeric(h), all(h > 0)) type <- match.arg(type, c("mean", "regression", "robust_regression")) if (type != "mean" && !factor) { stop("Only factored approximations available for types other than mean", call. = FALSE) } grid_in <- as.matrix(x[group_vars(x)]) grid_out <- grid %||% grid_in stopifnot(is.matrix(grid_out), is.numeric(grid_out), ncol(grid_out) == ncol(grid_in), nrow(grid_out) > 0) z <- x[[var]] w <- if (var != ".count" && x %contains% ".count") x$.count else numeric() if (factor) { for(i in 1:ncol(grid_in)) { # smooth_nd_1 is a C++ function, so var is 0 indexed z <- smooth_nd_1(grid_in, z, w, grid_out, var = i - 1, h = h[i], type = type) } } else { z <- smooth_nd(grid_in, z, w, grid_out, h) } out <- data.frame(grid_out) out[[var]] <- z structure(out, class = c("condensed", class(out))) } complete_grid <- function(df) { stopifnot(is.data.frame(df)) breaks <- function(width, origin, nbins) { origin + width * seq.int(nbins) } cols <- lapply(df, function(x) do.call(breaks, attributes(x))) expand.grid(cols, KEEP.OUT.ATTRS = FALSE) } ================================================ FILE: R/standardise.r ================================================ #' Standardise a summary to sum to one. #' #' @param x a condensed summary. Must have \code{.count} variable. #' @param margin margins to standardise along. If \code{NULL}, the default, #' standardises the whole array. #' @export #' @examples #' b1 <- condense(bin(movies$year, 1)) #' d1 <- smooth(b1, 2, type = "reg") #' #' if (require("ggplot2")) { #' #' autoplot(b1) #' autoplot(d1) #' #' # Note change in x-axis limits #' autoplot(standardise(d1)) #' } #' #' # Can also standardise a dimension at a time #' b2 <- with(movies, condense(bin(year, 2), bin(length, 10))) #' b2 <- peel(b2, central = TRUE) #' #' if (require("ggplot2")) { #' #' autoplot(b2) #' autoplot(standardise(b2)) # note legend #' autoplot(standardise(b2, "year")) # each row sums to 1 #' autoplot(standardise(b2, "length")) # each col sums to 1 #' #' base <- ggplot(b2, aes(length, .count)) + #' geom_line(aes(group = year, colour = year)) #' base #' base %+% standardise(b2) # Just affects y axis labels #' base %+% standardise(b2, "year") # Makes year comparable #' base %+% standardise(b2, "length") # Meaningless for this display #' #' } standardise <- function(x, margin = integer()) { stopifnot(is.condensed(x), !is.null(x$.count)) if (length(margin) == 0) { x$.count <- prop(x$.count) } else { x$.count <- ave(x$.count, id(x[margin]), FUN = prop) x$.count[is.na(x$.count)] <- 0 } x } prop <- function(x) x / sum(x, na.rm = TRUE) ================================================ FILE: R/utils.r ================================================ "%||%" <- function(x, y) if (is.null(x)) y else x last <- function(x) x[length(x)] "%contains%" <- function(df, var) { var %in% names(df) } find_fun <- function(name, env = globalenv()) { if (is.function(name)) return(name) ns_env <- asNamespace("bigvis") if (exists(name, ns_env, mode = "function")) { return(get(name, ns_env)) } if (exists(name, env, mode = "function")) { return(get(name, env)) } stop("Could not find function ", name, call. = FALSE) } ================================================ FILE: R/weighted-stats.r ================================================ #' Compute a weighted variance or standard deviation of a vector. #' #' @details #' Note that unlike the base R \code{\link{var}} function, these functions only #' work with individual vectors not matrices or data frames. #' #' @family weighted statistics #' @seealso \code{\link[stats]{weighted.mean}} #' @param x numeric vector of observations #' @param w integer vector of weights, representing the number of #' time each \code{x} was observed #' @param na.rm if \code{TRUE}, missing values in both \code{w} and \code{x} #' will be removed prior computation. Otherwise if there are missing values #' the result will always be missing. #' @export #' @examples #' x <- c(1:5) #' w <- rpois(5, 5) + 1 #' y <- x[rep(seq_along(x), w)] #' weighted.var(x, w) #' var(y) #' #' stopifnot(all.equal(weighted.var(x, w), var(y))) weighted.var <- function(x, w = NULL, na.rm = FALSE) { if (na.rm) { na <- is.na(x) | is.na(w) x <- x[!na] w <- w[!na] } sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1) } #' @export #' @rdname weighted.var weighted.sd <- function(x, w, na.rm = TRUE) sqrt(weighted.var(x, w, na.rm = TRUE)) #' A weighted ecdf function. #' #' An extension of the base \code{\link[stats]{ecdf}} function which works #' with weighted data. #' #' @section S3 methods: #' The \code{ecdf} class has methods for \code{\link{plot}}, #' \code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}. #' \code{\link{quantile}} does not currently correctly compute values for #' weighted ecdfs. #' #' @inheritParams weighted.var #' @family weighted statistics #' @seealso \code{\link[stats]{weighted.mean}} #' @export #' @examples #' x <- runif(200) #' w <- rpois(200, 5) + 1 #' #' e <- weighted.ecdf(x, w) #' plot(e) #' summary(e) #' #' y <- x[rep(seq_along(x), w)] #' plot(ecdf(y)) weighted.ecdf <- function(x, w) { stopifnot(length(x) == length(w)) stopifnot(anyDuplicated(x) == 0) ord <- order(x) x <- x[ord] w <- w[ord] n <- sum(w) wts <- cumsum(w / n) f <- approxfun(x, wts, method = "constant", yleft = 0, yright = 1, f = 0) class(f) <- c("wecdf", "ecdf", "stepfun", class(f)) attr(f, "call") <- sys.call() environment(f)$nobs <- n f } #' Compute quantiles of weighted data. #' #' @details #' Currently only implements the type 7 algorithm, as described in #' \code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core. #' #' @inheritParams weighted.var #' @param probs numeric vector of probabilities between 0 and 1 #' @param na.rm If \code{TRUE} will automatically remove missing values #' in \code{x} or \code{w}. #' @family weighted statistics #' @export #' @examples #' x <- runif(200) #' w <- rpois(200, 5) + 1 #' weighted.quantile(x, w) weighted.quantile <- function (x, w, probs = seq(0, 1, 0.25), na.rm = FALSE) { stopifnot(length(x) == length(w)) na <- is.na(x) | is.na(w) if (any(na)) { if (na.rm) { x <- x[!na] w <- w[!na] } else { stop("Missing values not allowed when na.rm is FALSE", call. = FALSE) } } # Ensure x and w in ascending order of x ord <- order(x) x <- x[ord] w <- w[ord] # Find closest x just below and above index n <- sum(w) index <- 1 + (n - 1) * probs j <- floor(index) wts <- cumsum(w) lo <- x[lowerBound(j, wts)] # X_j hi <- x[lowerBound(j + 1, wts)] g <- index - j ifelse(lo == hi, lo, (1 - g) * lo + g * hi) } # Q[i](p) = (1 - g) x[j] + g x[j+1] # j = floor(np + m) # g = np + m - j # # For type 7: # m = 1 - p => # j = floor(1 + (n - 1) * p) # g = (np + 1 - p) - floor(1 + (n - 1) * p) #' Compute the median of weighted data. #' #' @details This is a simple wrapper around \code{\link{weighted.quantile}} #' @inheritParams weighted.quantile #' @export #' @examples #' x <- runif(200) #' w <- rpois(200, 5) + 1 #' #' median(x) #' weighted.median(x, w) weighted.median <- function(x, w, na.rm = FALSE) { weighted.quantile(x, w, probs = 0.5, na.rm = na.rm) } #' Compute the interquartile range of weighted data. #' #' @details This is a simple wrapper around \code{\link{weighted.quantile}} #' @inheritParams weighted.quantile #' @export #' @examples #' x <- sort(runif(200)) #' w <- rpois(200, seq(1, 10, length = 200)) + 1 #' #' IQR(x) #' weighted.IQR(x, w) weighted.IQR <- function(x, w, na.rm = FALSE) { diff(weighted.quantile(x, w, probs = c(0.25, 0.75), na.rm = na.rm)) } ================================================ FILE: R/width.r ================================================ #' Compute a reasonable default binwidth. #' #' @param x a numeric vector. If a numeric vector of length one is supplied, #' it's assumed that #' @param nbins desired number of bins (approximate) #' @export #' @keywords internal #' @family reasonable defaults #' @examples #' find_width(c(0, 5)) #' find_width(c(0, 5.023432)) #' find_width(c(0, 5.9)) find_width <- function(x, nbins = 1e4) { stopifnot(is.numeric(x)) stopifnot(is.numeric(nbins), length(nbins) == 1, nbins > 0) x <- diff(frange(x)) size <- x / nbins # divide into order of magnitude and multiplier om <- 10 ^ ceiling(log10(size)) mult <- size / om # ensure number per unit is multiple of 1, 2, 3, 4, or 5 per_unit <- 1 / mult rounders <- c(1, 2, 3, 4, 5) poss <- round(per_unit / rounders) * rounders poss <- poss[poss != 0] width <- om / poss[which.min(abs(poss - per_unit))] structure(width, n = ceiling(x / width), per_unit = 1 / width) } ================================================ FILE: README.md ================================================ # bigvis [![Travis-CI Build Status](https://travis-ci.org/hadley/bigvis.svg?branch=master)](https://travis-ci.org/hadley/bigvis) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/bigvis/master.svg)](https://codecov.io/github/hadley/bigvis?branch=master) The bigvis package provides tools for exploratory data analysis of __large datasets__ (10-100 million obs). The aim is to have most operations take less than 5 seconds on commodity hardware, even for 100,000,000 data points. Since bigvis is not currently available on CRAN, the easiest way to try it out is to: ```R # install.packages("devtools") devtools::install_github("hadley/bigvis") ``` ## Workflow The bigvis package is structured around the following workflow: * `bin()` and `condense()` to get a compact summary of the data * if the estimates are rough, you might want to `smooth()`. See `best_h()` and `rmse_cvs()` to figure out a good starting bandwidth * if you're working with counts, you might want to `standardise()` * visualise the results with `autoplot()` (you'll need to load `ggplot2` to use this) ## Weighted statistics Bigvis also provides a number of standard statistics efficiently implemented on weighted/binned data: `weighted.median`, `weighted.IQR`, `weighted.var`, `weighted.sd`, `weighted.ecdf` and `weighted.quantile`. ## Acknowledgements This package wouldn't be possible without: * the fantastic [Rcpp](http://dirk.eddelbuettel.com/code/rcpp.html) package, which makes it amazingly easy to integrate R and C++ * JJ Allaire and Carlos Scheidegger who have indefatigably answered my many C++ questions * the generous support of Revolution Analytics who supported the early development. * Yue Hu, who implemented a proof of concepts that showed that it might be possible to work with this much data in R. ================================================ FILE: bench/bin-structure.cpp ================================================ // How does the data structure implementing the bin affect performance. #include using namespace Rcpp; class Grouper { const Fast x_; double width_; double origin_; public: Grouper (const NumericVector& x, double width, double origin = 0) : x_(x), width_(width), origin_(origin) { } int bin(int i) const { if (ISNAN(x_[i])) return 0; return (x_[i] - origin_) / width_ + 1; } int size() const { return x_.size(); } }; // [[Rcpp::export]] std::vector count_vector(const NumericVector& x, double width, double origin = 0) { Grouper grouper = Grouper(x, width, origin); std::vector count; int n = grouper.size(); for(int i = 0; i < n; ++i) { int bin = grouper.bin(i); if (bin >= count.size()) { count.resize(bin + 1); } ++count[bin]; } return count; } // [[Rcpp::export]] List count_map(const NumericVector& x, double width, double origin = 0) { Grouper grouper = Grouper(x, width, origin); std::map count; int n = grouper.size(); for(int i = 0; i < n; ++i) { int bin = grouper.bin(i); ++count[bin]; } IntegerVector out_x(count.size()), out_y(count.size()); std::map::const_iterator count_it = count.begin(), count_end = count.begin(); for (int i = 0; count_it != count_end; ++count_it, ++i) { out_x[i] = count_it->first; out_y[i] = count_it->second; } return List::create(_["x"] = out_x, _["count"] = out_y); } // [[Rcpp::export]] List count_umap(const NumericVector& x, double width, double origin = 0) { Grouper grouper = Grouper(x, width, origin); std::tr1::unordered_map count; int n = grouper.size(); for(int i = 0; i < n; ++i) { int bin = grouper.bin(i); ++count[bin]; } IntegerVector out_x(count.size()), out_y(count.size()); std::tr1::unordered_map::iterator count_it = count.begin(), count_end = count.end(); for (int i = 0; count_it != count_end; ++count_it, ++i) { out_x[i] = count_it->first; out_y[i] = count_it->second; } return List::create(_["x"] = out_x, _["count"] = out_y); } template inline void hash_combine(std::size_t & seed, const T & v) { std::tr1::hash hasher; seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); } namespace std { namespace tr1 { template struct hash > { inline size_t operator()(const pair & v) const { size_t seed = 0; ::hash_combine(seed, v.first); ::hash_combine(seed, v.second); return seed; } }; } } // [[Rcpp::export]] List count_umap2(const NumericVector& x, double width, double origin = 0) { Grouper grouper = Grouper(x, width, origin); std::tr1::unordered_map, int> count; int n = grouper.size(); for(int i = 0; i < n; ++i) { int bin = grouper.bin(i); ++count[std::make_pair(bin, bin)]; } IntegerVector out_x(count.size()), out_y(count.size()); std::tr1::unordered_map, int>::iterator count_it = count.begin(), count_end = count.end(); for (int i = 0; count_it != count_end; ++count_it, ++i) { out_x[i] = count_it->first.first; out_y[i] = count_it->second; } return List::create(_["x"] = out_x, _["count"] = out_y); } // [[Rcpp::export]] List count_umap2_man(const NumericVector& x, double width, double origin = 0) { Grouper grouper = Grouper(x, width, origin); std::tr1::unordered_map count; int n = grouper.size(); for(int i = 0; i < n; ++i) { int bin = grouper.bin(i); bin = bin * 100 + bin; ++count[bin]; } IntegerVector out_x(count.size()), out_y(count.size()); std::tr1::unordered_map::iterator count_it = count.begin(), count_end = count.end(); for (int i = 0; count_it != count_end; ++count_it, ++i) { out_x[i] = count_it->first; out_y[i] = count_it->second; } return List::create(_["x"] = out_x, _["count"] = out_y); } /*** R options(digits = 3) library(microbenchmark) x <- runif(1e5) # As expected, for small contiguous inputs, vector is fastest, followed by # unordered maps (about half as fast), with maps in a distant last place. microbenchmark( count_vector(x, 1 / 1000), count_map(x, 1 / 1000), count_umap(x, 1 / 1000) ) y <- c(x, x) y1 <- c(x, x + 10) y2 <- c(x, x + 100) y3 <- c(x, x + 1000) y4 <- c(x, x + 1000) # While using std::vector is somewhat faster, the asymptotic behaviour is # much worse - count_umap is basically constant, regardless of the number # of bins microbenchmark( count_vector(y, 1 / 1000), count_vector(y1, 1 / 1000), count_vector(y2, 1 / 1000), count_vector(y3, 1 / 1000), count_vector(y4, 1 / 1000), count_umap(y, 1 / 1000), count_umap(y1, 1 / 1000), count_umap(y2, 1 / 1000), count_umap(y3, 1 / 1000), count_umap(y4, 1 / 1000), times = 10 ) # Using umap with a pair is about twice as slow as with an int: this probably # implies that I should do the hashing myself. microbenchmark( count_umap(x, 1 / 1000), count_umap2(x, 1 / 1000), count_umap2_man(x, 1 / 1000) ) */ ================================================ FILE: bench/bin.cpp ================================================ #include #include #include using namespace Rcpp; //' @param breaks must be ordered and span the complete range of x. // [[Rcpp::export]] IntegerVector bin(NumericVector x, NumericVector breaks) { // Put missing values in the last position int n = breaks.size(); IntegerVector out(n + 1); for(NumericVector::iterator it = x.begin(); it != x.end(); it++) { double val = *it; if (ISNAN(val)) { out[n]++; } else { NumericVector::iterator bin_it = std::upper_bound(breaks.begin(), breaks.end(), val); int bin = std::distance(breaks.begin(), bin_it); out[bin]++; } } return out; } // [[Rcpp::export]] IntegerVector bin2(NumericVector x, NumericVector breaks) { // Put missing values in the last position int n = breaks.size(), bin; IntegerVector out(n + 1); NumericVector::iterator x_it = x.begin(), x_end, bin_it, breaks_it = breaks.begin(), breaks_end = breaks.end(); for(; x_it != x.end(); ++x_it) { double val = *x_it; if (ISNAN(val)) { ++out[n]; } else { bin_it = std::upper_bound(breaks_it, breaks_end, val); bin = std::distance(breaks_it, bin_it); ++out[bin]; } } return out; } // [[Rcpp::export]] std::vector bin3(NumericVector x, double width, double origin = 0) { int bin, nmissing = 0; std::vector out; NumericVector::iterator x_it = x.begin(), x_end; for(; x_it != x.end(); ++x_it) { double val = *x_it; if (ISNAN(val)) { ++nmissing; } else { bin = (val - origin) / width; if (bin < 0) continue; // Make sure there's enough space if (bin >= out.size()) { out.resize(bin + 1); } ++out[bin]; } } // Put missing values in the last position out.push_back(nmissing); return out; } // Create class to encapsulate binning operations ------------------------------ class BinFixed { double width_; double origin_; public: BinFixed (double width, double origin = 0) { width_ = width; origin_ = origin; } int inline operator() (double val) const { return (val - origin_) / width_; } }; class BinBreaks { NumericVector breaks_; NumericVector::iterator breaks_it_, breaks_end_; public: BinBreaks (NumericVector& breaks) { breaks_ = breaks; breaks_it_ = breaks.begin(); breaks_end_ = breaks.end(); } int inline operator() (double val) const { NumericVector::iterator bin_it = std::upper_bound(breaks_it_, breaks_end_, val); return std::distance(breaks_it_, bin_it); } }; template std::vector bin_bin(NumericVector x, Binner binner) { int bin, nmissing = 0; std::vector out; NumericVector::iterator x_it = x.begin(), x_end; for(; x_it != x.end(); ++x_it) { double val = *x_it; if (ISNAN(val)) { ++nmissing; } else { bin = binner(val); if (bin < 0) continue; // Make sure there's enough space if (bin >= out.size()) { out.resize(bin + 1); } ++out[bin]; } } // Put missing values in the last position out.push_back(nmissing); return out; } // [[Rcpp::export]] std::vector bin_bin_fixed(NumericVector x, double width, double origin = 0) { return bin_bin(x, BinFixed(width, origin)); } // [[Rcpp::export]] std::vector bin_bin_breaks(NumericVector x, NumericVector breaks) { return bin_bin(x, BinBreaks(breaks)); } // Try using a Fast ------------------------------ // Considerable speed improvement for simple binning function template std::vector fbin_bin(NumericVector x, Binner binner) { int bin, nmissing = 0; std::vector out; Fast fx(x); int n = x.size(); for(int i = 0; i < n; ++i) { double val = fx[i]; if (ISNAN(val)) { ++nmissing; } else { bin = binner(val); if (bin < 0) continue; // Make sure there's enough space if (bin >= out.size()) { out.resize(bin + 1); } ++out[bin]; } } // Put missing values in the last position out.push_back(nmissing); return out; } // [[Rcpp::export]] std::vector fbin_bin_fixed(NumericVector x, double width, double origin = 0) { return fbin_bin(x, BinFixed(width, origin)); } // [[Rcpp::export]] std::vector fbin_bin_breaks(NumericVector x, NumericVector breaks) { return fbin_bin(x, BinBreaks(breaks)); } /*** R options(digits = 3) library(microbenchmark) x <- runif(1e5) breaks <- seq(0, 1, length = 100) # Breaks microbenchmark( hist(x, breaks, plot = F), bin(x, breaks), bin2(x, breaks), bin_bin_breaks(x, breaks), fbin_bin_breaks(x, breaks) ) # Fixed bins microbenchmark( bin3(x, 1/100, 0), bin_bin_fixed(x, 1/100, 0), fbin_bin_fixed(x, 1/100, 0) ) x6 <- runif(1e6) x7 <- runif(1e7) x8 <- runif(1e8) microbenchmark( bin_bin_fixed(x6, 1/100, 0), fbin_bin_fixed(x6, 1/100, 0), bin_bin_fixed(x7, 1/100, 0), fbin_bin_fixed(x7, 1/100, 0), bin_bin_fixed(x8, 1/100, 0), fbin_bin_fixed(x8, 1/100, 0), times = 10) */ ================================================ FILE: bench/count.cpp ================================================ // Experiment with making the binner more generic, so that the binner // class also stores the variable being binned over - this is important // for separating the grouping from the numeric operation. #include #include #include using namespace Rcpp; template std::vector count_x(const NumericVector& x, Binner binner) { std::vector out; int n = x.size(); for(int i = 0; i < n; ++i) { int bin = binner(x[i]); if (bin < 0) continue; // Make sure there's enough space if (bin >= out.size()) { out.resize(bin + 1); } ++out[bin]; } return out; } template std::vector count(Binner binner) { std::vector out; int n = binner.size(); for(int i = 0; i < n; ++i) { int bin = binner.bin(i); if (bin < 0) continue; // Make sure there's enough space if (bin >= out.size()) { out.resize(bin + 1); } ++out[bin]; } return out; } class BinFixed { double width_; double origin_; public: BinFixed (double width, double origin = 0) { width_ = width; origin_ = origin; } int inline operator() (double val) const { if (ISNAN(val)) return 0; return (val - origin_) / width_ + 1; } }; class BinFixed2 { const NumericVector& x_; double width_; double origin_; public: BinFixed2 (const NumericVector& x, double width, double origin = 0) : x_(x), width_(width), origin_(origin) { } int bin(int i) const { if (ISNAN(x_[i])) return 0; return (x_[i] - origin_) / width_ + 1; } int size() const { return x_.size(); } }; // [[Rcpp::export]] std::vector count_x2(NumericVector x, double width, double origin = 0) { return count_x(x, BinFixed(width, origin)); } // [[Rcpp::export]] std::vector count2(NumericVector x, double width, double origin = 0) { return count(BinFixed2(x, width, origin)); } /*** R options(digits = 3) library(microbenchmark) x <- runif(1e5) # Breaks microbenchmark( count_x2(x, 1/100), count2(x, 1/100) ) */ ================================================ FILE: bench/group-tempvar.cpp ================================================ // In a function like // // unsigned int bin(unsigned int i) const { // if (ISNAN(x_[i])) return 0; // if (x_[i] < origin_) return 0; // // return (x_[i] - origin_) / width_ + 1; // } // // should I create my own temporary double val = x_[i] ? // // It looks like it saves ~0.2 ns per invocation, so probably not worth it for // performance reasons. #include using namespace Rcpp; class Group1 { const Fast x_; double width_; double origin_; public: Group1 (const NumericVector& x, double width, double origin = 0) : x_(x), width_(width), origin_(origin) { } unsigned int bin(unsigned int i) const { if (ISNAN(x_[i])) return 0; if (x_[i] < origin_) return 0; return (x_[i] - origin_) / width_ + 1; } int size() const { return x_.size(); } }; class Group2 { const Fast x_; double width_; double origin_; public: Group2 (const NumericVector& x, double width, double origin = 0) : x_(x), width_(width), origin_(origin) { } unsigned int bin(unsigned int i) const { double val = x_[i]; if (ISNAN(val)) return 0; if (val < origin_) return 0; return (val - origin_) / width_ + 1; } int size() const { return x_.size(); } }; template IntegerVector group_out(const Group& group) { int n = group.size(); IntegerVector out(n); for(int i = 0; i < n; ++i) { out[i] = group.bin(i); } return out; } // [[Rcpp::export]] IntegerVector group1(const NumericVector& x, double width, double origin = 0) { return group_out(Group1(x, width, origin)); } // [[Rcpp::export]] IntegerVector group2(const NumericVector& x, double width, double origin = 0) { return group_out(Group2(x, width, origin)); } /*** R x <- runif(1e6) library(microbenchmark) stopifnot(all.equal(group1(x, 1/1000), group2(x, 1/1000))) (m <- microbenchmark( group1(x, 1/1000), group2(x, 1/1000) )) diff(summary(m)$median) / length(x) * 1e9 / 1e3 */ ================================================ FILE: bench/kernel.cpp ================================================ // Differences in kernel performance #include using namespace Rcpp; // [[Rcpp::export]] NumericVector normal_kernel(NumericVector x) { int n = x.size(); NumericVector out(n); for (int i = 0; i < n; ++i) { out[i] = R::dnorm(x[i], 0, 1, 0); } return out; } // [[Rcpp::export]] double tricube2(double x) { x = fabs(x); if (x > 1) return 0; return pow(1 - pow(x, 3), 3); } // [[Rcpp::export]] double tricube(double x) { x = fabs(x); if (x > 1) return 0; double y = 1 - x * x * x; return y * y * y; } // [[Rcpp::export]] NumericVector tricube_kernel(NumericVector x) { int n = x.size(); NumericVector out(n); for (int i = 0; i < n; ++i) { out[i] = tricube(x[i]); } return out; } // [[Rcpp::export]] NumericVector copy(NumericVector x) { int n = x.size(); NumericVector out(n); for (int i = 0; i < n; ++i) { out[i] = x[i]; } return out; } /*** R options(digits = 3) library(microbenchmark) x <- runif(1e4) mean(sapply(x, tricube) - sapply(x, tricube2)) microbenchmark( copy(x), tricube_kernel(x), normal_kernel(x) ) */ ================================================ FILE: bench/mean.cpp ================================================ // Instead of counting, compute a more complicated statistic: a weighted mean #include #include #include using namespace Rcpp; class BinFixed { const Fast x_; double width_; double origin_; public: BinFixed (const NumericVector& x, double width, double origin = 0) : x_(x), width_(width), origin_(origin) { } int bin(int i) const { if (ISNAN(x_[i])) return 0; return (x_[i] - origin_) / width_ + 1; } int size() const { return x_.size(); } }; template NumericVector group_mean(NumericVector& y, NumericVector& weight, Binner binner) { std::vector count; std::vector sum; int n = binner.size(); for(int i = 0; i < n; ++i) { int bin = binner.bin(i); if (bin < 0) continue; // Make sure there's enough space if (bin >= sum.size()) { sum.resize(bin + 1); count.resize(bin + 1); } count[bin] += weight[i]; sum[bin] += y[i]; } int m = count.size(); NumericVector res(m); for (int i = 0; i < m; ++i) { res[i] = sum[i] / count[i]; } return res; } class StatMean { double count; double sum; public: StatMean () : count(0), sum(0) { } void push(double x, double weight) { count += weight; sum += x; } double compute() { return sum / count; } }; template NumericVector group_mean2(NumericVector& y, NumericVector& weight, Binner binner) { std::vector stat; int n = binner.size(); for(int i = 0; i < n; ++i) { int bin = binner.bin(i); if (bin < 0) continue; if (bin >= stat.size()) { stat.resize(bin + 1); } stat[bin].push(y[i], weight[i]); } int m = stat.size(); NumericVector res(m); for (int i = 0; i < m; ++i) { res[i] = stat[i].compute(); } return res; } // [[Rcpp::export]] NumericVector group_mean_(NumericVector x, NumericVector y, NumericVector weight, double width, double origin = 0) { return group_mean(y, weight, BinFixed(x, width, origin)); } // [[Rcpp::export]] NumericVector group_mean2_(NumericVector x, NumericVector y, NumericVector weight, double width, double origin = 0) { return group_mean2(y, weight, BinFixed(x, width, origin)); } /*** R options(digits = 3) library(microbenchmark) x <- runif(1e6) y <- runif(1e6) weight <- rep(1, 1e6) # Breaks microbenchmark( group_mean_(x, y, weight, width = 1/100), group_mean2_(x, y, weight, width = 1/100) ) */ ================================================ FILE: bench/median.cpp ================================================ // Instead of counting, compute a more complicated statistic: a median #include #include #include using namespace Rcpp; class BinFixed { const Fast x_; double width_; double origin_; public: BinFixed (const NumericVector& x, double width, double origin = 0) : x_(x), width_(width), origin_(origin) { } int bin(int i) const { if (ISNAN(x_[i])) return 0; return (x_[i] - origin_) / width_ + 1; } int size() const { return x_.size(); } }; class StatMedian { std::vector ys; public: void push(double x) { ys.push_back(x); } // Adapted from http://stackoverflow.com/questions/1719070/ double compute() { if (ys.empty()) return NAN; int size = ys.size(); std::vector::iterator upper = ys.begin() + (int) (size / 2); std::nth_element(ys.begin(), upper, ys.end()); if (size % 2 == 1) { return *upper; } else { std::vector::iterator lower = upper - 1; std::nth_element(ys.begin(), lower, upper); return (*upper + *lower) / 2.0; } } }; template NumericVector group_median(NumericVector& y, Binner binner) { std::vector stat; int n = binner.size(); for(int i = 0; i < n; ++i) { int bin = binner.bin(i); if (bin < 0) continue; if (bin >= stat.size()) { stat.resize(bin + 1); } stat[bin].push(y[i]); } int m = stat.size(); NumericVector res(m); for (int i = 0; i < m; ++i) { res[i] = stat[i].compute(); } return res; } // [[Rcpp::export]] NumericVector group_median_(NumericVector x, NumericVector y, double width, double origin = 0) { return group_median(y, BinFixed(x, width, origin)); } /*** R options(digits = 3) library(microbenchmark) x <- runif(1e5) y <- runif(1e5) group_median_tapply <- function(x, y, width, origin = 0) { bins <- trunc((x - origin) / width) c(NaN, unname(tapply(y, bins, median))) } med1 <- group_median_tapply(x, y, width = 1/1000) med2 <- group_median_(x, y, width = 1/1000) stopifnot(all.equal(med1, med2)) # Breaks microbenchmark( # group_median_tapply(x, y, width = 1/1000), group_median_(x, y, width = 1/1000) ) */ ================================================ FILE: bench/smooth-1d.cpp ================================================ // Explore opportunities for making smooth_1d faster // // Bounding to a given range is really important, and memoisation helps offset // the cost of making a call back to R, but the biggest win is using a pure // C/C++ kernel function. // // #include using namespace Rcpp; // Base implementation // [[Rcpp::export]] NumericVector smooth_1d(const NumericVector& x, const NumericVector& z, const NumericVector& x_out, const Function& kernel) { int n_in = x.size(), n_out = x_out.size(); NumericVector z_out(n_out); for (int i = 0; i < n_out; i++) { for (int j = 0; j < n_in; j++) { double dist = x[j] - x_out[i]; double k = as(kernel(dist))[0]; z_out[i] += z[j] * k; } } return z_out; } // Memoise distance calculations // [[Rcpp::export]] NumericVector smooth_1d_memo(const NumericVector& x, const NumericVector& z, const NumericVector& x_out, const Function& kernel) { int n_in = x.size(), n_out = x_out.size(); NumericVector z_out(n_out); std::unordered_map k_memo; for (int i = 0; i < n_out; i++) { for (int j = 0; j < n_in; j++) { double dist = x[j] - x_out[i]; std::unordered_map::const_iterator it = k_memo.find(dist); double k; if (it == k_memo.end()) { k = as(kernel(dist))[0]; k_memo[dist] = k; } else { k = it->second; } z_out[i] += z[j] * k; } } return z_out; } // Use range of kernel // [[Rcpp::export]] NumericVector smooth_1d_range(const NumericVector& x, const NumericVector& z, const NumericVector& x_out, const Function& kernel, double kmin, double kmax) { int n_in = x.size(), n_out = x_out.size(); NumericVector z_out(n_out); for (int i = 0; i < n_out; i++) { for (int j = 0; j < n_in; j++) { double dist = x[j] - x_out[i]; if (dist < kmin || dist > kmax) continue; double k = as(kernel(dist))[0]; z_out[i] += z[j] * k; } } return z_out; } // Memoise and use range // [[Rcpp::export]] NumericVector smooth_1d_memo_range(const NumericVector& x, const NumericVector& z, const NumericVector& x_out, const Function& kernel, double kmin, double kmax) { int n_in = x.size(), n_out = x_out.size(); NumericVector z_out(n_out); std::unordered_map k_memo; for (int i = 0; i < n_out; i++) { for (int j = 0; j < n_in; j++) { double dist = x[j] - x_out[i]; if (dist < kmin || dist > kmax) continue; std::unordered_map::const_iterator it = k_memo.find(dist); double k; if (it == k_memo.end()) { k = as(kernel(dist))[0]; k_memo[dist] = k; } else { k = it->second; } z_out[i] += z[j] * k; } } return z_out; } // Memoise and use range // [[Rcpp::export]] NumericVector smooth_1d_memo_range_map(const NumericVector& x, const NumericVector& z, const NumericVector& x_out, const Function& kernel, double kmin, double kmax) { int n_in = x.size(), n_out = x_out.size(); NumericVector z_out(n_out); std::map k_memo; for (int i = 0; i < n_out; i++) { for (int j = 0; j < n_in; j++) { double dist = x[j] - x_out[i]; if (dist < kmin || dist > kmax) continue; std::map::const_iterator it = k_memo.find(dist); double k; if (it == k_memo.end()) { k = as(kernel(dist))[0]; k_memo[dist] = k; } else { k = it->second; } z_out[i] += z[j] * k; } } return z_out; } // Memoise, use range & use C++ function for kernel // [[Rcpp::export]] NumericVector smooth_1d_memo_range_kcpp(const NumericVector& x, const NumericVector& z, const NumericVector& x_out, double kmin, double kmax) { int n_in = x.size(), n_out = x_out.size(); NumericVector z_out(n_out); std::unordered_map k_memo; for (int i = 0; i < n_out; i++) { for (int j = 0; j < n_in; j++) { double dist = x[j] - x_out[i]; if (dist < kmin || dist > kmax) continue; std::unordered_map::const_iterator it = k_memo.find(dist); double k; if (it == k_memo.end()) { k = R::dnorm(dist, 0.0, 0.1, 0); k_memo[dist] = k; } else { k = it->second; } z_out[i] += z[j] * k; } } return z_out; } // Use cpp kernel function without memoisation // [[Rcpp::export]] NumericVector smooth_1d_range_kcpp(const NumericVector& x, const NumericVector& z, const NumericVector& x_out, double kmin, double kmax) { int n_in = x.size(), n_out = x_out.size(); NumericVector z_out(n_out); for (int i = 0; i < n_out; i++) { for (int j = 0; j < n_in; j++) { double dist = x[j] - x_out[i]; if (dist < kmin || dist > kmax) continue; double k = R::dnorm(dist, 0.0, 0.1, 0); z_out[i] += z[j] * k; } } return z_out; } /*** R options(digits = 2) x <- 1:10 z <- rep(c(1, 2), length = length(x)) k <- kernel("norm", sd = 0.1) krng <- range(k) grid <- seq(0, 11, length = 100) stopifnot(all.equal( smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]), smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2]) )) library(microbenchmark) microbenchmark( base = smooth_1d(x, z, grid, k), memo = smooth_1d_memo(x, z, grid, k), range = smooth_1d_range(x, z, grid, k, krng[1], krng[2]), "range + kcpp" = smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2]), "range + memo" = smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]), "range + memo + kcpp" = smooth_1d_memo_range_kcpp(x, z, grid, krng[1], krng[2]), "range + memo + map" = smooth_1d_memo_range_map(x, z, grid, k, krng[1], krng[2]) ) # More realistic sample sizes x <- 1:3e3 z <- rep(c(1, 2), length = length(x)) grid3 <- seq(0, 11, length = 3e3) grid4 <- seq(0, 11, length = 3e4) microbenchmark( grid3_c = smooth_1d_range_kcpp(x, z, grid3, krng[1], krng[2]), grid3_r = smooth_1d_memo_range_map(x, z, grid3, k, krng[1], krng[2]), grid4_c = smooth_1d_range_kcpp(x, z, grid4, krng[1], krng[2]), grid4_r = smooth_1d_memo_range_map(x, z, grid4, k, krng[1], krng[2]), times = 10) */ ================================================ FILE: bigvis.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX AutoAppendNewline: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace ================================================ FILE: inst/include/bigvis.h ================================================ #include #include using namespace Rcpp; // Wrapper for numeric vector that makes it easy figure to out which // bin each observation belongs to. class BinnedVector { // This should probably be a const NumericVector&, but that doesn't work // with modules currently NumericVector x_; String name_; double width_; double origin_; public: BinnedVector(NumericVector x, String name, double width, double origin = 0) : x_(x), name_(name), width_(width), origin_(origin) { } int bin_i(int i) const { return bin(x_[i]); } int bin(double x) const { if (ISNAN(x) || x == INFINITY || x == -INFINITY) return 0; if (x < origin_) return 0; return (x - origin_) / width_ + 1; } double unbin(int bin) const { if (bin == 0) return(NA_REAL); return (bin - 1) * width_ + origin_ + width_ / 2; } int nbins() const; int size() const { return x_.size(); } double origin() const { return origin_; } double width() const { return width_; } String name() const { return name_; } }; // This class is just boilerplate. There might be rcpp magic that does the right thing here // but I don't know it. class BinnedVectorReference { boost::shared_ptr ref; const BinnedVector *get() const { return ref.get(); }; BinnedVector *get() { return ref.get(); }; public: BinnedVectorReference() {}; BinnedVectorReference(const BinnedVectorReference &o): ref(o.ref) {}; explicit BinnedVectorReference(BinnedVector *ptr) { // Watch out, this takes ownership of the pointer! ref = boost::shared_ptr(ptr); } BinnedVectorReference(NumericVector x, String name, double width, double origin = 0) { BinnedVector *vec = new BinnedVector(x, name, width, origin); ref = boost::shared_ptr(vec); } int bin_i(int i) const { return get()->bin_i(i); } int bin(double x) const { return get()->bin(x); } double unbin(int bin) const { return get()->unbin(bin); } int nbins() const { return get()->nbins(); } int size() const { return get()->size();} double origin() const { return get()->origin();} double width() const { return get()->width();} String name() const { return get()->name();} }; // A data structure to store multiple binned vectors class BinnedVectors { int size_; std::vector groups_; public: std::vector bins_; BinnedVectors () : groups_(0), bins_(0) { } BinnedVectors (List gs) : groups_(0), bins_(0) { int n = gs.size(); for (int i = 0; i < n; ++i) { add_vector(as(gs[i])); } } void add_vector(BinnedVectorReference g) { if (groups_.empty()) { bins_.push_back(1); size_ = g.size(); } else { if (g.size() != size_) stop("Inconsistent sizes"); bins_.push_back(bins_.back() * g.nbins()); } groups_.push_back(g); } int bin_i(int i) const; int bin(std::vector x) const; std::vector unbin(int bin) const; int nbins() const { return bins_.back() * groups_.front().nbins(); } int ngroups() const { return bins_.size(); } int size() const { return size_; } String name(int j) const { return groups_[j].name(); } }; RCPP_EXPOSED_AS(BinnedVectorReference) RCPP_EXPOSED_WRAP(BinnedVectorReference) RCPP_EXPOSED_AS(BinnedVectors) RCPP_EXPOSED_WRAP(BinnedVectors) ================================================ FILE: inst/tests/test-binned-vectors.r ================================================ context("Binned vectors") if (require("plyr")) { test_that("bins agree with plyr::id", { grid <- expand.grid(x = c(NA, seq(0, 0.5, by = 0.1)), y = c(NA, seq(0, 0.7, by = 0.1))) x <- grid$x y <- grid$y gx <- bin(x, 0.1) gy <- bin(y, 0.1) bv <- bins(gx, gy) bigvis <- sapply(seq_along(x) - 1, bv$bin_i) bin_x <- sapply(seq_along(x) - 1, gx$bin_i) bin_y <- sapply(seq_along(x) - 1, gy$bin_i) plyr <- as.vector(id(list(bin_x, bin_y))) expect_equal(bigvis + 1, plyr) }) } test_that("square nbins correct", { g <- bin(1:10, 1) expect_equal(bins(g)$nbins(), 11) expect_equal(bins(g, g)$nbins(), 11 ^ 2) expect_equal(bins(g, g, g)$nbins(), 11 ^ 3) }) test_that("rectangular nbins correct", { g11 <- bin(1:10, 1) g2 <- bin(rep(1, 10), 1) expect_equal(bins(g2, g11)$nbins(), 22) expect_equal(bins(g11, g2)$nbins(), 22) }) test_that("diagonal nbins correct", { x <- runif(1e3) y <- x + runif(1e3, -0.2, 0.2) z <- rnorm(1e3, x) gx <- bin(x, 0.1) gy <- bin(y, 0.1) expect_equal(gx$nbins(), 11) expect_equal(gy$nbins(), 15) bvs <- bins(gx, gy) expect_equal(bvs$nbins(), 165) bins <- vapply(seq_along(x) - 1, bvs$bin_i, integer(1)) expect_true(all(bins <= 165)) }) test_that("bin and unbin are symmetric", { g <- bin(-10:10, 1) bvs <- bins(g, g) grid <- expand.grid(x = -10:10, y = -10:10) bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y)) unbin <- t(vapply(bins, bvs$unbin, numeric(2))) colnames(unbin) <- c("x", "y") expect_equal(unbin, as.matrix(grid)) }) test_that("bin and unbin are symmetric with diff binning", { x <- c(-1, 5) y <- c(0.1, 1) bx <- bin(x, 1) by <- bin(y, 0.1) bvs <- bins(bx, by) grid <- expand.grid( x = breaks(bx)[-1] + 1 / 2, y = breaks(by)[-1] + 0.1 / 2) bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y)) unbin <- t(vapply(bins, bvs$unbin, numeric(2))) colnames(unbin) <- c("x", "y") expect_equal(unbin, as.matrix(grid)) }) ================================================ FILE: inst/tests/test-breaks.r ================================================ context("Breaks") last <- function(x) x[length(x)] test_that("breaks includes max value, only if on border", { expect_equal(last(breaks(10, origin = 0, binwidth = 1)), 10) expect_equal(last(breaks(9.99, origin = 0, binwidth = 1)), 9) }) test_that("breaks includes max value even when origin != 0", { expect_equal(last(breaks(10.5, origin = 0.5, binwidth = 1)), 10.5) expect_equal(last(breaks(10.49, origin = 0.5, binwidth = 1)), 9.5) }) ================================================ FILE: inst/tests/test-condense.r ================================================ context("Condense") test_that("condense counts small vectors accurately", { x <- c(NA, 0:10) s1 <- condense(bin(x, 1, -0.5), summary = "count") # Pathological origin: need to add extra bin on end, because they're # right open, left closed s2 <- condense(bin(x, 1, 0), summary = "count") expect_equivalent(s1$x, c(NA, 0:10)) expect_equivalent(s2$x, c(NA, 0:10 + 0.5)) expect_equal(s1$.count, rep(1, length(x))) expect_equal(s2$.count, rep(1, length(x))) }) test_that("weights modify counts", { x <- c(NA, 0:10) w <- rep(2, length(x)) s <- condense(bin(x, 1), w = w, summary = "count") expect_equivalent(s$x, c(NA, 0:10)) expect_equal(s$.count, rep(2, length(x))) }) test_that("z affects sums, but not counts", { x <- c(NA, 0:10) z <- 0:11 s <- condense(bin(x, 1), z = z, summary = "sum") expect_equal(s$.count, rep(1, length(x))) expect_equal(s$.sum, z) }) test_that("drop = FALSE and drop = TRUE results agree", { x <- runif(1e3) y <- x + runif(1e3, -0.2, 0.2) z <- rnorm(1e3, x) gx <- bin(x, 0.1) gy <- bin(y, 0.1) count1 <- condense(gx, gy, summary = "count", drop = TRUE) expect_equal(sum(count1$.count == 0), 0) count2 <- condense(gx, gy, summary = "count", drop = FALSE) expect_equivalent(count1, count2[count2$.count != 0, ]) }) # 2d tests --------------------------------------------------------------------- test_that("grid counted accurately", { # expand.grid orders in opposite way to bigvis grid <- expand.grid(y = c(NA, 1:2), x = c(NA, 1:2)) s <- condense(bin(grid$x, 1), bin(grid$y, 1)) expect_equal(s$.count, rep(1, nrow(grid))) expect_equivalent(s$grid.x, grid$x) expect_equivalent(s$grid.y, grid$y) }) test_that("diagonal counted correctly", { df <- data.frame(x = c(NA, 1:2), y = c(NA, 1:2)) s <- condense(bin(df$x, 1), bin(df$y, 1)) expect_equal(nrow(s), nrow(df)) expect_equal(s$df.x, s$df.y) }) test_that("random data doesn't crash", { x <- runif(1e3, 8, 4963) y <- runif(1e3, 1e-2, 1e3) gx <- bin(x, 10) gy <- bin(y, 10) condense(gx, gy) }) ================================================ FILE: inst/tests/test-frange.r ================================================ context("frange") test_that("frange agrees with range", { x <- rnorm(1e4) expect_equal(frange(x), range(x)) }) test_that("frange uses cache if present", { x <- rnorm(1e4) attr(x, "range") <- c(0, 10) expect_equal(frange(x), c(0, 10)) }) test_that("frange ignores NA and infinities by default", { x <- c(1, NA, Inf, -Inf) expect_equal(frange(x), c(1, 1)) }) ================================================ FILE: inst/tests/test-group-1d.r ================================================ context("Grouping: 1d") group <- function(x, width, origin = NULL) { g <- bin(x, width, origin) vapply(seq_along(x) - 1, g$bin_i, integer(1)) } test_that("NAs belong to group 0", { x <- NA_real_ expect_equal(group(x, 1, 0), 0L) }) test_that("Inf and -Inf belong to group 0", { x <- c(-Inf, Inf) expect_equal(group(x, 1, 0), c(0, 0)) }) test_that("Out of range values belong to group 0", { expect_equal(group(-10, 1, 0), 0) }) test_that("Positive integers unchanged if origin is 1", { expect_equal(group(1:10, 1, 1), 1:10) }) ================================================ FILE: inst/tests/test-group-2d.r ================================================ context("Grouping: 2d") test_that("Two NAs gets bin 0", { expect_equal(group_rect(NA, NA, 1, 1, 0, 0), 0) }) test_that("Sequential locations get sequential groups", { grid <- expand.grid(x = c(NA, 1:2), y = c(NA, 1:2)) expect_equal(group_rect(grid$x, grid$y, 1, 1, 0.5, 0.5), 0:8) }) ================================================ FILE: inst/tests/test-origin.r ================================================ context("Origin") test_that("origins close to zero rounded to zero" ,{ expect_equal(find_origin(c(0.01, 1000)), 0) expect_equal(find_origin(c(10, 1e6)), 0) }) test_that("origins rounded down by binwidth", { expect_equal(find_origin(c(1, 10), 1), 1) expect_equal(find_origin(c(1, 10), 2), 0) expect_equal(find_origin(c(5, 10), 2), 4) expect_equal(find_origin(c(5, 10), 5), 5) }) test_that("integers have origin offset by 0.5", { expect_equal(find_origin(c(1L, 10L), 1), 0.5) expect_equal(find_origin(c(5L, 10L), 2), 3.5) expect_equal(find_origin(c(5L, 10L), 5), 4.5) }) ================================================ FILE: inst/tests/test-ranged.r ================================================ context("Ranged") test_that("range attribute lost when modified", { x <- ranged(10:1) expect_equal(max(x), 10) x[1] <- 1 expect_equal(max(x), 9) expect_equal(attr(x, "range"), NULL) }) ================================================ FILE: inst/tests/test-smooth.r ================================================ context("Smooth") tricube <- function(x) { x <- abs(x) ifelse(x > 1, 0, (1 - x ^ 3) ^ 3) } # plot(tricube, xlim = c(-1.5, 1.5)) test_that("factorised smooth equal to manual smooth", { grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE)) z <- rep(0, nrow(grid)) z[c(5, 23, 84)] <- 1 z_x <- smooth_nd_1(grid, z, numeric(), grid, 0, 3) z_y <- smooth_nd_1(grid, z, numeric(), grid, 1, 3) z_xy <- smooth_nd_1(grid, z_x, numeric(), grid, 1, 3) z_yx <- smooth_nd_1(grid, z_y, numeric(), grid, 0, 3) z2 <- smooth_nd(grid, z, numeric(), grid, c(3, 3)) expect_equal(z_xy, z2) expect_equal(z_yx, z2) }) # library(ggplot2) # qplot(grid[, 1], grid[, 2], fill = z, geom = "raster") # qplot(grid[, 1], grid[, 2], fill = z_xy, geom = "raster") # qplot(grid[, 1], grid[, 2], fill = z_yx, geom = "raster") # qplot(grid[, 1], grid[, 2], fill = z2, geom = "raster") test_that("factorised smooth equal to manual smooth", { grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE)) z <- rep(0, nrow(grid)) z[c(5, 23, 84)] <- 1 grid <- as.data.frame(grid) grid$.count <- z class(grid) <- c("condensed", class(grid)) z1 <- smooth(grid, c(3, 3), ".count", factor = FALSE) z2 <- smooth(grid, c(3, 3), ".count", factor = TRUE) expect_equal(z1, z2) }) ================================================ FILE: inst/tests/test-stat.r ================================================ context("Stats") test_that("linear regression recovers slope & intercept if no errors", { x <- 1:10 w <- rep(1, 10) expect_equal(regress(x, x * 2, w), c(0, 2)) expect_equal(regress(x, x * -2, w), c(0, -2)) expect_equal(regress(x, x * -2 + 5, w), c(5, -2)) expect_equal(regress(x, x * -2 + -5, w), c(-5, -2)) }) simpleLm <- function(x, y, w) { unname(coef(lm(y ~ x, weights = w))) } test_that("linear regression matches lm", { x <- 1:10 y <- 10 + x * 2 + rnorm(10) w <- rep(1, 10) expect_equal(regress(x, y, w), simpleLm(x, y, w)) }) test_that("linear regression matches lm with weights", { x <- 1:10 y <- 10 + x * 2 + rnorm(10) w <- runif(10) expect_equal(regress(x, y, w), simpleLm(x, y, w)) }) test_that("robust regression effectively removes outlier", { x <- 1:10 y <- 10 + x * 2 + c(rep(0, 9), 10) w <- rep(1, 10) expect_equal(regress_robust(x, y, w, 10), c(10, 2)) }) ================================================ FILE: inst/tests/test-summary-moments.r ================================================ context("Summary: moments") count2 <- function(x) compute_moments(x)[1] mean2 <- function(x) compute_moments(x)[2] sd2 <- function(x) compute_moments(x)[3] test_that("count agrees with length", { expect_equal(count2(1:10), 10) expect_equal(count2(5), 1) expect_equal(count2(numeric()), 0) }) test_that("mean agree with base::mean", { expect_equal(mean2(1:10), mean(1:10)) x <- runif(1e6) expect_equal(mean2(x), mean(x)) }) test_that("missing values are ignored", { x <- c(NA, 5, 5) expect_equal(count2(x), 2) expect_equal(mean2(x), 5) }) test_that("standard deviation agrees with sd", { expect_equal(sd2(1:10), sd(1:10)) x <- runif(1e6) expect_equal(sd2(x), sd(x)) }) test_that("summary statistics of zero length input are NaN", { expect_equal(compute_moments(numeric()), c(0, NaN, NaN)) }) ================================================ FILE: inst/tests/test-weighted-stats.r ================================================ context("Weighted statistics") test_that("weighted.var agrees with var when weights = 1", { samples <- replicate(20, runif(100), simplify = FALSE) var <- sapply(samples, var) wvar <- sapply(samples, weighted.var, w = rep(1, 100)) expect_equal(wvar, var) }) test_that("weighted.var agrees with var on repeated vector", { samples <- replicate(20, runif(100), simplify = FALSE) w <- rep(1:2, 50) samples_ex <- lapply(samples, rep, times = w) var <- sapply(samples_ex, var) wvar <- sapply(samples, weighted.var, w = w) expect_equal(wvar, var) }) test_that("weighed.quantile agrees with quantile on repeated vector", { samples <- replicate(20, runif(100), simplify = FALSE) w <- rep(1:2, 50) samples_ex <- lapply(samples, rep, times = w) quant <- sapply(samples_ex, quantile, probs = 0.325, names = FALSE) wquant <- sapply(samples, weighted.quantile, w = w, probs = 0.325) expect_equal(quant, wquant) }) ================================================ FILE: man/autoplot.condensed.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/autoplot.r \name{autoplot.condensed} \alias{autoplot.condensed} \title{Autoplot condensed summaries.} \usage{ \method{autoplot}{condensed}(x, var = last(summary_vars(x)), ...) } \arguments{ \item{x}{a condensed summary} \item{var}{which summary variable to display} \item{...}{other arguments passed on to individual methods} } \description{ Autoplot condensed summaries. } \examples{ if (require("ggplot2")) { # 1d summaries ----------------------------- x <- rchallenge(1e4) z <- x + rt(length(x), df = 2) xsum <- condense(bin(x, 0.1)) zsum <- condense(bin(x, 0.1), z = z) autoplot(xsum) autoplot(peel(xsum)) autoplot(zsum) autoplot(peel(zsum, keep = 1)) autoplot(peel(zsum)) # 2d summaries ----------------------------- y <- runif(length(x)) xysum <- condense(bin(x, 0.1), bin(y, 0.1)) xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z) autoplot(xysum) autoplot(peel(xysum)) autoplot(xyzsum) autoplot(peel(xyzsum)) } } ================================================ FILE: man/best_h.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/h.r \name{best_h} \alias{best_h} \title{Find "best" smoothing parameter using leave-one-out cross validation.} \usage{ best_h(x, h_init = NULL, ..., tol = 0.01, control = list()) } \arguments{ \item{x}{condensed summary to smooth} \item{h_init}{initial values of bandwidths to start search out. If not specified defaults to 5 times the binwidth of each variable.} \item{...}{other arguments (like \code{var}) passed on to \code{\link{rmse_cv}}} \item{tol}{numerical tolerance, defaults to 1\%.} \item{control}{additional control parameters passed on to \code{\link{optim}} The most useful argument is probably trace, which makes it possible to follow the progress of the optimisation.} } \value{ a single numeric value representing the bandwidth that minimises the leave-one-out estimate of rmse. Vector has attributes \code{evaluations} giving the number of times the objective function was evaluated. If the optimisation does not converge, or smoothing is not needed (i.e. the estimate is on the lower bounds), a warning is thrown. } \description{ Minimises the leave-one-out estimate of root mean-squared error to find find the "optimal" bandwidth for smoothing. } \details{ L-BFGS-B optimisation is used to constrain the bandwidths to be greater than the binwidths: if the bandwidth is smaller than the binwidth it's impossible to compute the rmse because no smoothing occurs. The tolerance is set relatively high for numerical optimisation since the precise choice of bandwidth makes little difference visually, and we're unlikely to have sufficient data to make a statistically significant choice anyway. } \examples{ \donttest{ x <- rchallenge(1e4) xsum <- condense(bin(x, 1 / 10)) h <- best_h(xsum, control = list(trace = 3, REPORT = 1)) if (require("ggplot2")) { autoplot(xsum) autoplot(smooth(xsum, h)) } } } \seealso{ Other bandwidth estimation functions: \code{\link{h_grid}}; \code{\link{rmse_cv}}, \code{\link{rmse_cvs}} } ================================================ FILE: man/bigvis.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/bigvis.r \docType{package} \name{bigvis} \alias{as.integer,Rcpp_BinnedVector-method} \alias{bigvis} \alias{bigvis-package} \alias{show,Rcpp_BinnedVector-method} \title{The big vis package.} \usage{ \S4method{show}{Rcpp_BinnedVector}(object) \S4method{as.integer}{Rcpp_BinnedVector}(x, ...) } \arguments{ \item{x,object,...}{Generic args} } \description{ The big vis package. } ================================================ FILE: man/bin.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/bin.r \name{bin} \alias{bin} \title{Create a binned variable.} \usage{ bin(x, width = find_width(x), origin = find_origin(x, width), name = NULL) } \arguments{ \item{x}{numeric or integer vector} \item{width}{bin width. If not specified, about 10,000 bins will be chosen using the algorithim in \code{\link{find_width}}.} \item{origin}{origin. If not specified, guessed by \code{\link{find_origin}}.} \item{name}{name of original variable. This will be guessed from the input to \code{group} if not supplied. Used in the output of \code{\link{condense}} etc.} } \description{ Create a binned variable. } \details{ This function produces an R reference class that wraps around a C++ function. Generally, you should just treat this as an opaque object with reference semantics, and you shouldn't call the methods on it - pass it to \code{\link{condense}} and friends. } \examples{ x <- runif(1e6) bin(x) bin(x, 0.01) bin(x, 0.01, origin = 0.5) } ================================================ FILE: man/breaks.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/breaks.r \name{breaks} \alias{breaks} \title{Compute breaks given origin and width.} \usage{ breaks(x, binwidth, origin = min(x)) } \arguments{ \item{x}{numeric vector} \item{binwidth}{bin width} \item{origin}{bin origin} } \description{ Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer multiple of binwidth, then we need one more break. This function only returns the left-side of the breaks. } \details{ The first break is special, because it always contains missing values. } \examples{ breaks(10, origin = 0, binwidth = 1) breaks(9.9, origin = 0, binwidth = 1) breaks(1:10, origin = 0, binwidth = 2) } \keyword{internal} ================================================ FILE: man/condense.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/condense.r \name{condense} \alias{condense} \title{Efficient binned summaries.} \usage{ condense(..., z = NULL, summary = NULL, w = NULL, drop = NULL) } \arguments{ \item{...}{group objects created by \code{\link{bin}}} \item{z}{a numeric vector to summary for each group. Optional for some summary statistics.} \item{summary}{the summary statistic to use. Currently must be one of count, sum, mean, median or sd. If \code{NULL}, defaults to mean if y is present, count if not.} \item{w}{a vector of weights. Not currently supported by all summary functions.} \item{drop}{if \code{TRUE} only locations with data will be returned. This is more efficient if the data is very sparse (<1\% of cells filled), and is slightly less efficient. Defaults to \code{TRUE} if you are condensing over two or more dimensions, \code{FALSE} for 1d.} } \description{ Efficient binned summaries. } \examples{ x <- runif(1e5) gx <- bin(x, 0.1) condense(gx) } ================================================ FILE: man/condensed.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/condensed.r \name{condensed} \alias{as.condensed} \alias{condensed} \alias{is.condensed} \title{Condensed: an S3 class for condensed summaries.} \usage{ condensed(groups, grouped, summary) is.condensed(x) as.condensed(x) } \arguments{ \item{groups}{list of \code{\link{bin}}ed objects} \item{grouped,summary}{output from C++ condense function} \item{x}{object to test or coerce} } \description{ This object managed the properties of condensed (summarised) data frames. } \section{S3 methods}{ Mathematical functions with methods for \code{binsum} object will modify the x column of the data frame and \code{\link{rebin}} the data, calculating updated summary statistics. Currently methods are provided for the \code{Math} group generic, logical comparison and arithmetic operators, and \code{\link[plyr]{round_any}}. } \examples{ if (require("ggplot2")) { x <- rchallenge(1e4) xsum <- condense(bin(x, 1 / 10)) # Basic math operations just modify the first column autoplot(xsum) autoplot(xsum * 10) autoplot(xsum - 30) autoplot(abs(xsum - 30)) # Similarly, logical operations work on the first col autoplot(xsum[xsum > 10, ]) } } \keyword{internal} ================================================ FILE: man/dchallenge.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/challenge.r \name{dchallenge} \alias{dchallenge} \alias{rchallenge} \title{Density and random number generation functions for a challenging distribution.} \usage{ dchallenge(x) rchallenge(n) } \arguments{ \item{x}{values to evaluate pdf at} \item{n}{number of random samples to generate} } \description{ This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom centered at 15 and scaled by 2, and a gamma distribution with shape 2 and rate 1/3. (The t-distribution is windsorised at 0, but this has negligible effect.) This distribution is challenging because it mixes heavy tailed and asymmetric distributions. } \examples{ plot(dchallenge, xlim = c(-5, 60), n = 500) x <- rchallenge(1e4) hist(x, breaks = 1000) xsum <- condense(bin(x, 0.1)) plot(xsum$x, xsum$.count, type = "l") xsmu <- smooth(xsum, 0.3) plot(xsmu$x, xsmu$.count, type = "l") plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30)) } ================================================ FILE: man/dgrid.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/dgrid.r \name{dgrid} \alias{dgrid} \alias{is.dgrid} \title{dgrid: an S3 class for data grids} \usage{ dgrid(x, width, origin = 0, nbins = NULL) is.dgrid(x) } \arguments{ \item{x}{a numeric vector to test or coerce.} \item{width}{bin width} \item{origin}{bin origins} \item{nbins}{number of bins} } \description{ dgrid: an S3 class for data grids } \examples{ g <- dgrid(0:10 + 0.5, width = 1) range(g) as.integer(g) } ================================================ FILE: man/find_origin.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/origin.r \name{find_origin} \alias{find_origin} \title{Find the origin.} \usage{ find_origin(x, binwidth) } \arguments{ \item{x}{numeric or integer vector} \item{binwidth}{binwidth} } \description{ Find the origin. } \details{ This algorithm implements simple heuristics for determining the origin of a histogram when only the binwidth is specified. It: \itemize{ \item rounds to zero, if relatively close \item subtracts 0.5 offset, if an x is integer \item ensures the origin is a multiple of the binwidth } } \examples{ find_origin(1:10, 1) find_origin(1:10, 2) find_origin(c(1, 1e6), 1) } \seealso{ Other reasonable defaults: \code{\link{find_width}} } \keyword{internal} ================================================ FILE: man/find_width.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/width.r \name{find_width} \alias{find_width} \title{Compute a reasonable default binwidth.} \usage{ find_width(x, nbins = 10000) } \arguments{ \item{x}{a numeric vector. If a numeric vector of length one is supplied, it's assumed that} \item{nbins}{desired number of bins (approximate)} } \description{ Compute a reasonable default binwidth. } \examples{ find_width(c(0, 5)) find_width(c(0, 5.023432)) find_width(c(0, 5.9)) } \seealso{ Other reasonable defaults: \code{\link{find_origin}} } \keyword{internal} ================================================ FILE: man/frange.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/RcppExports.R \name{frange} \alias{frange} \title{Efficient implementation of range.} \usage{ frange(x, finite = TRUE) } \arguments{ \item{x}{a numeric vector, or a \code{\link{ranged}} object} \item{finite}{If \code{TRUE} ignores missing values and infinities. Note that if the vector is empty, or only contains missing values, \code{frange} will return \code{c(Inf, -Inf)} because those are the identity values for \code{\link{min}} and \code{\link{max}} respectively.} } \description{ This is an efficient C++ implementation of range for numeric vectors: it avoids S3 dispatch, and computes both min and max in a single pass through the input. } \details{ If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}} object), it will be used instead of computing the range from scratch. } \examples{ x <- runif(1e6) system.time(range(x)) system.time(frange(x)) rx <- ranged(x) system.time(frange(rx)) } ================================================ FILE: man/h_grid.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/h.r \name{h_grid} \alias{h_grid} \title{Generate grid of plausible bandwidths for condensed summary.} \usage{ h_grid(x, n = 50, max = 20) } \arguments{ \item{x}{a condensed summary} \item{n}{number of bandwidths to generate (in each dimension)} \item{max}{maximum bandwidth to generate, as multiple of binwidth.} } \description{ By default, the bandwidths start at the bin width, and then continue up 50 (\code{n}) steps until 20 (\code{max}) times the bin width. } \examples{ x <- rchallenge(1e4) xsum <- condense(bin(x, 1 / 10)) h_grid(xsum) y <- runif(1e4) xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) h_grid(xysum, n = 10) } \seealso{ Other bandwidth estimation functions: \code{\link{best_h}}; \code{\link{rmse_cv}}, \code{\link{rmse_cvs}} } ================================================ FILE: man/is.ranged.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/ranged.r \name{is.ranged} \alias{is.ranged} \title{Test if an object is of class ranged.} \usage{ is.ranged(x) } \arguments{ \item{x}{object to test} } \description{ Test if an object is of class ranged. } \keyword{internal} ================================================ FILE: man/movies.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/movies.r \docType{data} \name{movies} \alias{movies} \title{Movie information and user ratings from IMDB.com.} \format{A data frame with 130,456 rows and 14 variables} \usage{ data(movies) } \description{ The internet movie database, \url{http://imdb.com/}, is a website devoted to collecting movie data supplied by studios and fans. It claims to be the biggest movie database on the web and is run by amazon. More about information imdb.com can be found online, \url{http://imdb.com/help/show_leaf?about}, including information about the data collection process, \url{http://imdb.com/help/show_leaf?infosource}. } \details{ Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user. The data set contains the following fields: \itemize{ \item title. Title of the movie. \item year. Year of release. \item budget. Total budget (if known) in US dollars \item length. Length in minutes. \item rating. Average IMDB user rating. \item votes. Number of IMDB users who rated this movie. \item mpaa. MPAA rating. \item action, animation, comedy, drama, documentary, romance, short: \code{TRUE} if movie belongs to that genre. } } \references{ \url{http://had.co.nz/data/movies/} } ================================================ FILE: man/mt.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/mt.r \name{mt} \alias{inv_mt} \alias{mt} \alias{mt_trans} \title{Modulus transformation (and its inverse).} \usage{ mt(x, lambda) inv_mt(x, lambda) mt_trans(lambda) } \arguments{ \item{x}{values to transform} \item{lambda}{degree of transformation} } \description{ A generalisation of the box-cox transformation that works for values with both positive and negative values. } \details{ This is useful for compressing the tails of long-tailed distributions, often encountered with very large datasets. } \examples{ x <- seq(-10, 10, length = 100) plot(x, mt(x, 0), type = "l") plot(x, mt(x, 0.25), type = "l") plot(x, mt(x, 0.5), type = "l") plot(x, mt(x, 1), type = "l") plot(x, mt(x, 2), type = "l") plot(x, mt(x, -1), type = "l") plot(x, mt(x, -2), type = "l") } \references{ J. John and N. Draper. "An alternative family of transformations." Applied Statistics, pages 190-197, 1980. \url{http://www.jstor.org/stable/2986305} } ================================================ FILE: man/peel.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/peel.r \name{peel} \alias{peel} \title{Peel off low density regions of the data.} \usage{ peel(x, keep = 0.99, central = NULL) } \arguments{ \item{x}{condensed summary} \item{keep}{(approximate) proportion of data to keep. If \code{1}, will remove all cells with counts. All missing values will be preserved.} \item{central}{if \code{TRUE} peels off regions of lowest density only from the outside of the data. In 2d this works by progressively peeling off convex hull of the data: the current algorithm is quite slow. If \code{FALSE}, just removes the lowest density regions wherever they are found. Regions with 0 density are removed regardless of location. Defaults to TRUE if there are two or fewer grouping variables is less.} } \description{ Keeps specified proportion of data by removing the lowest density regions, either anywhere on the plot, or for 2d, just around the edges. } \details{ This is useful for visualisation, as an easy way of focussing on the regions where the majority of the data lies. } \examples{ x <- rt(1e5, df = 2) y <- rt(1e5, df = 2) xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10)) plot(xysum$x, xysum$y) plot(peel(xysum, 0.95, central = TRUE)[1:2]) plot(peel(xysum, 0.90, central = TRUE)[1:2]) plot(peel(xysum, 0.50, central = TRUE)[1:2]) } ================================================ FILE: man/ranged.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/ranged.r \name{ranged} \alias{ranged} \title{A S3 class for caching the range of a vector} \usage{ ranged(x, range = frange(x, finite = TRUE)) } \arguments{ \item{x}{a numeric vector} \item{range}{the range of the vector (excluding missing values), if known. If unknown, it will be computed with \code{\link{frange}}, a fast C++ implementation of \code{\link{range}}.} } \description{ This class is designed for dealing with large vectors, where the cost of recomputing the range multiple times is prohibitive. It provides methods for \code{\link{print}} and \code{\link{str}} that display only the range, not the contents. } \section{Performance}{ For best performance, you may want to run copy and paste the contents of this function into your function, to avoid making any copies of \code{x}. This is probably only necessary if you're dealing with extremely large vectors, > 100 million obs. } \examples{ x <- runif(1e6) y <- ranged(x) range(y) y str(y) # Modifications to the class currently destroy the cache y[1] <- 10 max(y) class(y) z <- y + 10 max(z) class(z) } ================================================ FILE: man/rmse_cvs.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/rmse.r \name{rmse_cvs} \alias{rmse_cv} \alias{rmse_cvs} \title{Estimate smoothing RMSE using leave-one-out cross-validation.} \usage{ rmse_cvs(x, hs = h_grid(x), ...) rmse_cv(x, h, var = summary_vars(x)[1], ...) } \arguments{ \item{x}{condensed summary table} \item{...}{other variables passed on to \code{\link{smooth}}} \item{h,hs}{for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv} a data frame of bandwidths, as generated by \code{\link{h_grid}}.} \item{var}{variable to smooth} } \description{ \code{rmse_cv} computes the leave-one-out RMSE for a single vector of bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths, stored as a data frame. } \examples{ \donttest{ set.seed(1014) # 1d ----------------------------- x <- rchallenge(1e4) xsum <- condense(bin(x, 1 / 10)) cvs <- rmse_cvs(xsum) if (require("ggplot2")) { autoplot(xsum) qplot(x, err, data = cvs, geom = "line") xsmu <- smooth(xsum, 1.3) autoplot(xsmu) autoplot(peel(xsmu)) } # 2d ----------------------------- y <- runif(1e4) xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) cvs <- rmse_cvs(xysum, h_grid(xysum, 10)) if (require("ggplot2")) { qplot(x, y, data = cvs, size = err) } } } \seealso{ Other bandwidth estimation functions: \code{\link{best_h}}; \code{\link{h_grid}} } ================================================ FILE: man/round_any.condensed.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/condensed.r \name{round_any.condensed} \alias{round_any.condensed} \title{Round any method for condensed objects} \usage{ round_any.condensed(x, accuracy, f = round) } \arguments{ \item{x}{numeric or date-time (POSIXct) vector to round} \item{accuracy}{number to round to; for POSIXct objects, a number of seconds} \item{f}{rounding function: \code{\link{floor}}, \code{\link{ceiling}} or \code{\link{round}}} } \description{ Round any method for condensed objects } \keyword{internal} ================================================ FILE: man/smooth.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/smooth.r \name{smooth} \alias{smooth} \title{Smooth a condensed data frame.} \usage{ smooth(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean", factor = TRUE) } \arguments{ \item{x}{a condensed summary} \item{h}{numeric vector of bandwidths, one for each grouping variable in \code{x}} \item{var}{variable to smooth} \item{grid}{a data frame with the grouping colums as x. In order for the factored version of \code{smooth_nd} to work, this grid must be a superset of \code{x}.} \item{type}{type of smoothing to use. Current options are \code{"mean"}, a kernel weighted mean; \code{"regression"}, a kernel weighted local regression; and \code{"robust_regression"}, robust kernel weighted local regression in the style of \code{\link{loess}}. Unique prefixes are also acceptable.} \item{factor}{if \code{TRUE} compute the n-dimensional smooth by a sequence of 1d smoothes. For \code{type = "mean"} the results are always the same grid values are uncorrelated (e.g. the grid is complete at every location); and is very approximate for \code{type = "robust"}.} } \description{ Smooth a condensed data frame. } \examples{ x <- runif(1e5) xsum <- condense(bin(x, 1 / 100)) xsmu1 <- smooth(xsum, 5 / 100) xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE) # More challenging distribution x <- rchallenge(1e4) xsum <- condense(bin(x, 0.1)) xsmu <- smooth(xsum, 1) plot(xsum$x, xsum$.count, type = "l") lines(xsmu$x, xsmu$.count, col = "red") xsmu2 <- smooth(xsum, 1, type = "regress") plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50)) lines(xsmu2$x, xsmu2$.count, col = "red") # Note difference in tails } ================================================ FILE: man/standardise.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/standardise.r \name{standardise} \alias{standardise} \title{Standardise a summary to sum to one.} \usage{ standardise(x, margin = integer()) } \arguments{ \item{x}{a condensed summary. Must have \code{.count} variable.} \item{margin}{margins to standardise along. If \code{NULL}, the default, standardises the whole array.} } \description{ Standardise a summary to sum to one. } \examples{ b1 <- condense(bin(movies$year, 1)) d1 <- smooth(b1, 2, type = "reg") if (require("ggplot2")) { autoplot(b1) autoplot(d1) # Note change in x-axis limits autoplot(standardise(d1)) } # Can also standardise a dimension at a time b2 <- with(movies, condense(bin(year, 2), bin(length, 10))) b2 <- peel(b2, central = TRUE) if (require("ggplot2")) { autoplot(b2) autoplot(standardise(b2)) # note legend autoplot(standardise(b2, "year")) # each row sums to 1 autoplot(standardise(b2, "length")) # each col sums to 1 base <- ggplot(b2, aes(length, .count)) + geom_line(aes(group = year, colour = year)) base base \%+\% standardise(b2) # Just affects y axis labels base \%+\% standardise(b2, "year") # Makes year comparable base \%+\% standardise(b2, "length") # Meaningless for this display } } ================================================ FILE: man/transform.condensed.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/rebin.r \name{transform.condensed} \alias{rebin} \alias{transform.condensed} \title{Transform condensed objects, collapsing unique bins.} \usage{ \\method{transform}{condensed}(`_data`, ...) rebin(data) } \arguments{ \item{...}{named arguments evaluated in the context of the data} \item{data,`_data`}{a condensed summary} } \description{ Transform condensed objects, collapsing unique bins. } \details{ You don't need to use \code{rebin} if you use transform: it will automatically rebin for you. You will need to use it if you manually transform any grouping variables. } \examples{ x <- runif(1e4, -1, 1) xsum <- condense(bin(x, 1 / 50)) # Transforming by hand: must use rebin xsum$x <- abs(xsum$x) rebin(xsum) if (require("ggplot2")) { autoplot(xsum) + geom_point() autoplot(rebin(xsum)) + geom_point() } #' Transforming with transform y <- x ^ 2 + runif(length(x), -0.1, 0.1) xysum <- condense(bin(x, 1 / 50), z = y) xysum <- transform(xysum, x = abs(x)) if (require("ggplot2")) { autoplot(xysum) } } \keyword{internal} ================================================ FILE: man/weighted.IQR.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/weighted-stats.r \name{weighted.IQR} \alias{weighted.IQR} \title{Compute the interquartile range of weighted data.} \usage{ weighted.IQR(x, w, na.rm = FALSE) } \arguments{ \item{x}{numeric vector of observations} \item{w}{integer vector of weights, representing the number of time each \code{x} was observed} \item{na.rm}{If \code{TRUE} will automatically remove missing values in \code{x} or \code{w}.} } \description{ Compute the interquartile range of weighted data. } \details{ This is a simple wrapper around \code{\link{weighted.quantile}} } \examples{ x <- sort(runif(200)) w <- rpois(200, seq(1, 10, length = 200)) + 1 IQR(x) weighted.IQR(x, w) } ================================================ FILE: man/weighted.ecdf.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/weighted-stats.r \name{weighted.ecdf} \alias{weighted.ecdf} \title{A weighted ecdf function.} \usage{ weighted.ecdf(x, w) } \arguments{ \item{x}{numeric vector of observations} \item{w}{integer vector of weights, representing the number of time each \code{x} was observed} } \description{ An extension of the base \code{\link[stats]{ecdf}} function which works with weighted data. } \section{S3 methods}{ The \code{ecdf} class has methods for \code{\link{plot}}, \code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}. \code{\link{quantile}} does not currently correctly compute values for weighted ecdfs. } \examples{ x <- runif(200) w <- rpois(200, 5) + 1 e <- weighted.ecdf(x, w) plot(e) summary(e) y <- x[rep(seq_along(x), w)] plot(ecdf(y)) } \seealso{ \code{\link[stats]{weighted.mean}} Other weighted statistics: \code{\link{weighted.quantile}}; \code{\link{weighted.sd}}, \code{\link{weighted.var}} } ================================================ FILE: man/weighted.median.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/weighted-stats.r \name{weighted.median} \alias{weighted.median} \title{Compute the median of weighted data.} \usage{ weighted.median(x, w, na.rm = FALSE) } \arguments{ \item{x}{numeric vector of observations} \item{w}{integer vector of weights, representing the number of time each \code{x} was observed} \item{na.rm}{If \code{TRUE} will automatically remove missing values in \code{x} or \code{w}.} } \description{ Compute the median of weighted data. } \details{ This is a simple wrapper around \code{\link{weighted.quantile}} } \examples{ x <- runif(200) w <- rpois(200, 5) + 1 median(x) weighted.median(x, w) } ================================================ FILE: man/weighted.quantile.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/weighted-stats.r \name{weighted.quantile} \alias{weighted.quantile} \title{Compute quantiles of weighted data.} \usage{ weighted.quantile(x, w, probs = seq(0, 1, 0.25), na.rm = FALSE) } \arguments{ \item{x}{numeric vector of observations} \item{w}{integer vector of weights, representing the number of time each \code{x} was observed} \item{probs}{numeric vector of probabilities between 0 and 1} \item{na.rm}{If \code{TRUE} will automatically remove missing values in \code{x} or \code{w}.} } \description{ Compute quantiles of weighted data. } \details{ Currently only implements the type 7 algorithm, as described in \code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core. } \examples{ x <- runif(200) w <- rpois(200, 5) + 1 weighted.quantile(x, w) } \seealso{ Other weighted statistics: \code{\link{weighted.ecdf}}; \code{\link{weighted.sd}}, \code{\link{weighted.var}} } ================================================ FILE: man/weighted.var.Rd ================================================ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/weighted-stats.r \name{weighted.var} \alias{weighted.sd} \alias{weighted.var} \title{Compute a weighted variance or standard deviation of a vector.} \usage{ weighted.var(x, w = NULL, na.rm = FALSE) weighted.sd(x, w, na.rm = TRUE) } \arguments{ \item{x}{numeric vector of observations} \item{w}{integer vector of weights, representing the number of time each \code{x} was observed} \item{na.rm}{if \code{TRUE}, missing values in both \code{w} and \code{x} will be removed prior computation. Otherwise if there are missing values the result will always be missing.} } \description{ Compute a weighted variance or standard deviation of a vector. } \details{ Note that unlike the base R \code{\link{var}} function, these functions only work with individual vectors not matrices or data frames. } \examples{ x <- c(1:5) w <- rpois(5, 5) + 1 y <- x[rep(seq_along(x), w)] weighted.var(x, w) var(y) stopifnot(all.equal(weighted.var(x, w), var(y))) } \seealso{ \code{\link[stats]{weighted.mean}} Other weighted statistics: \code{\link{weighted.ecdf}}; \code{\link{weighted.quantile}} } ================================================ FILE: notes.md ================================================ # Group * 1d, nd Future work: linear binning # Summarise * 1d * count, sum * count, mean, sd * median * 2d * mean * regression * robust regression * nd * mean * regression (with eigen or armadillo) * robust regression (with eigen or armadillo) Future work: * skew?, kurt? * boxplot * weighted quantiles (C++ version of R code) * compute standard errors / bootstrap standard errors? * infrastructure for passing multiple z * 2d: cor, lm # Smooth Kernel smoothing plus binned summary leads to many common statistics: density =~ bin + smooth, loess =~ mean + smooth, rqss =~ quantile + smooth * weights * smoothing type * constant * linear * robust linear (lowess) * (linear poisson?) * leave-one-out cross-validation * optimisations * convert to integer grid & use pre-computed grid of kernel values * hash in smooth_nd_1 and compute more efficiently along 1d * deal with missing values * smooth needs to create complete grid when factor = TRUE Think about input data structure: sparse grid, represented as a coordinate list. Binned grid class = integer vector + width/origin/nbins (0 = NA). Most transformations break the grid, in which case all you case preserve is min, max and number of bins. All smoothing methods adapted to work in terms of these integers. Need to extract out bin/unbin into own class (initialised with std::vector of bin sizes) Possible that more performance is available by switching to a sparse tensor library. # Visualise * Product plots * Standard errors + cut offs * Peel: implement nd version using depth ================================================ FILE: src/.gitignore ================================================ *.o *.so ================================================ FILE: src/BigVis.cpp ================================================ #include using namespace Rcpp; RCPP_MODULE(BigVis) { class_("BinnedVector") .constructor() .const_method("bin_i", &BinnedVectorReference::bin_i) .const_method("bin", &BinnedVectorReference::bin) .const_method("unbin", &BinnedVectorReference::unbin) .const_method("nbins", &BinnedVectorReference::nbins) .const_method("size", &BinnedVectorReference::size) .const_method("origin", &BinnedVectorReference::origin) .const_method("width", &BinnedVectorReference::width) .const_method("name", &BinnedVectorReference::name) ; class_("BinnedVectors") .constructor() .method("add_vector", &BinnedVectors::add_vector) .field("bins", &BinnedVectors::bins_) .const_method("bin_i", &BinnedVectors::bin_i) .const_method("bin", &BinnedVectors::bin) .const_method("unbin", &BinnedVectors::unbin) .const_method("nbins", &BinnedVectors::nbins) ; } ================================================ FILE: src/BinnedVector.cpp ================================================ #include using namespace Rcpp; NumericVector frange(const NumericVector& x, const bool finite = true); int BinnedVector::nbins() const { double max = frange(x_)[1]; return bin(max) + 1; // +1 bin for missing values } ================================================ FILE: src/BinnedVectors.cpp ================================================ #include using namespace Rcpp; int BinnedVectors::bin_i(int i) const { int bin = 0; int ngroups = groups_.size(); for (int j = 0; j < ngroups; ++j) { bin += groups_[j].bin_i(i) * bins_[(ngroups - 1) - j]; } return bin; } int BinnedVectors::bin(std::vector x) const { int ngroups = groups_.size(); if (x.size() != ngroups) stop("x must be same length as groups"); int bin = 0; for (int j = 0; j < ngroups; ++j) { int bin_j = groups_[j].bin(x[j]); bin += bin_j * bins_[(ngroups - 1) - j]; // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n"; } return bin; } std::vector BinnedVectors::unbin(int bin) const { int ngroups = groups_.size(); std::vector bins(ngroups); // if ngroups = 3, then: // bin = groups[0].bin(x[0]) * bins[2] (biggest) + // groups[1].bin(x[1]) * bins[1] + // groups[2].bin(x[2]) * bins[0] (smallest) // peel off largest first // bin_j = bin %/% bin[2] // groups[0].unbin(bin_j) // and that goes in last output position for (int i = 0, j = ngroups - 1; i < ngroups - 1; ++i, --j) { int bin_j = bin % bins_[j]; // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n"; bins[j] = groups_[j].unbin(bin_j); bin = (bin - bin_j) / bins_[j]; } // Rcout << "group: " << 0 << " bin: " << bin << " bin_j: " << bin << "\n"; // Special case for last group because x %% 1 = 0 bins[0] = groups_[0].unbin(bin); return bins; } ================================================ FILE: src/Makevars ================================================ PKG_CPPFLAGS=-I../inst/include ================================================ FILE: src/RcppExports.cpp ================================================ // This file was generated by Rcpp::compileAttributes // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include "../inst/include/bigvis.h" #include using namespace Rcpp; // condense_count List condense_count(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); RcppExport SEXP bigvis_condense_count(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const List& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); __result = Rcpp::wrap(condense_count(x, z, weight, drop)); return __result; END_RCPP } // condense_sum List condense_sum(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); RcppExport SEXP bigvis_condense_sum(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const List& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); __result = Rcpp::wrap(condense_sum(x, z, weight, drop)); return __result; END_RCPP } // condense_mean List condense_mean(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); RcppExport SEXP bigvis_condense_mean(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const List& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); __result = Rcpp::wrap(condense_mean(x, z, weight, drop)); return __result; END_RCPP } // condense_sd List condense_sd(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); RcppExport SEXP bigvis_condense_sd(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const List& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); __result = Rcpp::wrap(condense_sd(x, z, weight, drop)); return __result; END_RCPP } // condense_median List condense_median(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); RcppExport SEXP bigvis_condense_median(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const List& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); __result = Rcpp::wrap(condense_median(x, z, weight, drop)); return __result; END_RCPP } // double_diff_sum std::vector double_diff_sum(IntegerVector bin, IntegerVector count); RcppExport SEXP bigvis_double_diff_sum(SEXP binSEXP, SEXP countSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< IntegerVector >::type bin(binSEXP); Rcpp::traits::input_parameter< IntegerVector >::type count(countSEXP); __result = Rcpp::wrap(double_diff_sum(bin, count)); return __result; END_RCPP } // frange NumericVector frange(const NumericVector& x, const bool finite); RcppExport SEXP bigvis_frange(SEXP xSEXP, SEXP finiteSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); Rcpp::traits::input_parameter< const bool >::type finite(finiteSEXP); __result = Rcpp::wrap(frange(x, finite)); return __result; END_RCPP } // group_fixed IntegerVector group_fixed(const NumericVector& x, double width, double origin); RcppExport SEXP bigvis_group_fixed(SEXP xSEXP, SEXP widthSEXP, SEXP originSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); Rcpp::traits::input_parameter< double >::type width(widthSEXP); Rcpp::traits::input_parameter< double >::type origin(originSEXP); __result = Rcpp::wrap(group_fixed(x, width, origin)); return __result; END_RCPP } // group_rect IntegerVector group_rect(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin); RcppExport SEXP bigvis_group_rect(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP); Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP); Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP); Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP); Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP); __result = Rcpp::wrap(group_rect(x, y, x_width, y_width, x_origin, y_origin)); return __result; END_RCPP } // group_hex IntegerVector group_hex(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin, double x_max); RcppExport SEXP bigvis_group_hex(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP, SEXP x_maxSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP); Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP); Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP); Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP); Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP); Rcpp::traits::input_parameter< double >::type x_max(x_maxSEXP); __result = Rcpp::wrap(group_hex(x, y, x_width, y_width, x_origin, y_origin, x_max)); return __result; END_RCPP } // lowerBound IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks); RcppExport SEXP bigvis_lowerBound(SEXP xSEXP, SEXP breaksSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type breaks(breaksSEXP); __result = Rcpp::wrap(lowerBound(x, breaks)); return __result; END_RCPP } // smooth_nd_1 NumericVector smooth_nd_1(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const int var, const double h, const std::string type); RcppExport SEXP bigvis_smooth_nd_1(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP varSEXP, SEXP hSEXP, SEXP typeSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP); Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP); Rcpp::traits::input_parameter< const int >::type var(varSEXP); Rcpp::traits::input_parameter< const double >::type h(hSEXP); Rcpp::traits::input_parameter< const std::string >::type type(typeSEXP); __result = Rcpp::wrap(smooth_nd_1(grid_in, z_in, w_in_, grid_out, var, h, type)); return __result; END_RCPP } // smooth_nd NumericVector smooth_nd(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const NumericVector h); RcppExport SEXP bigvis_smooth_nd(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP hSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP); Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP); Rcpp::traits::input_parameter< const NumericVector >::type h(hSEXP); __result = Rcpp::wrap(smooth_nd(grid_in, z_in, w_in_, grid_out, h)); return __result; END_RCPP } // bisquare double bisquare(double u, double b); RcppExport SEXP bigvis_bisquare(SEXP uSEXP, SEXP bSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< double >::type u(uSEXP); Rcpp::traits::input_parameter< double >::type b(bSEXP); __result = Rcpp::wrap(bisquare(u, b)); return __result; END_RCPP } // regress NumericVector regress(const std::vector& x, const std::vector& y, const std::vector& w); RcppExport SEXP bigvis_regress(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const std::vector& >::type x(xSEXP); Rcpp::traits::input_parameter< const std::vector& >::type y(ySEXP); Rcpp::traits::input_parameter< const std::vector& >::type w(wSEXP); __result = Rcpp::wrap(regress(x, y, w)); return __result; END_RCPP } // median double median(const std::vector& x); RcppExport SEXP bigvis_median(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const std::vector& >::type x(xSEXP); __result = Rcpp::wrap(median(x)); return __result; END_RCPP } // regress_robust NumericVector regress_robust(const std::vector& x, const std::vector& y, const std::vector& w, int iterations); RcppExport SEXP bigvis_regress_robust(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP, SEXP iterationsSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const std::vector& >::type x(xSEXP); Rcpp::traits::input_parameter< const std::vector& >::type y(ySEXP); Rcpp::traits::input_parameter< const std::vector& >::type w(wSEXP); Rcpp::traits::input_parameter< int >::type iterations(iterationsSEXP); __result = Rcpp::wrap(regress_robust(x, y, w, iterations)); return __result; END_RCPP } // compute_moments NumericVector compute_moments(const NumericVector& x); RcppExport SEXP bigvis_compute_moments(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); __result = Rcpp::wrap(compute_moments(x)); return __result; END_RCPP } // compute_sum NumericVector compute_sum(const NumericVector& x); RcppExport SEXP bigvis_compute_sum(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); __result = Rcpp::wrap(compute_sum(x)); return __result; END_RCPP } // compute_median NumericVector compute_median(const NumericVector& x); RcppExport SEXP bigvis_compute_median(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); __result = Rcpp::wrap(compute_median(x)); return __result; END_RCPP } ================================================ FILE: src/Summary2d.cpp ================================================ #include #include "Summary2d.h" using namespace Rcpp; ================================================ FILE: src/Summary2d.h ================================================ #include #include "stats.h" using namespace Rcpp; class Summary2d { public: virtual void push(double x, double z, double w) =0; virtual double compute() =0; virtual ~Summary2d() {} }; class Summary2dMean: public Summary2d { double w_, z_; public: Summary2dMean() : w_(0), z_(0) {} void push(double x, double z, double w) { // Rcout << " x: " << x << " z: " << z << " w: " << w << "\n"; w_ += w; z_ += z * w; } double compute() { // Rcout << "Result: " << z_ / w_ << "\n"; return z_ / w_; } }; class Summary2dRegression: public Summary2d { std::vector x_, z_, w_; public: Summary2dRegression() {} void push(double x, double z, double w) { x_.push_back(x); z_.push_back(z); w_.push_back(w); } double compute() { return simpleLinearRegression(x_, z_, w_).alpha; } }; class Summary2dRobustRegression: public Summary2d { int iterations_; std::vector x_, z_, w_; public: Summary2dRobustRegression() : iterations_(3) {} Summary2dRobustRegression(int iterations) : iterations_(iterations) {} void push(double x, double z, double w) { x_.push_back(x); z_.push_back(z); w_.push_back(w); } double compute() { return simpleRobustRegression(x_, z_, w_, iterations_).alpha; } }; ================================================ FILE: src/condense-gen.r ================================================ library(whisker) # Generate template specialisations for groupwise - these are the functions # that are called from R. summaries <- c( count = "Sum(0)", sum = "Sum(1)", mean = "Moments(1)", sd = "Moments(2)", median = "Median()" ) template <- " // [[Rcpp::export]] List condense_{{name}}(const List& x, const NumericVector& z, const NumericVector& weight, bool drop = false) { if (drop) { return sparse_condense(BinnedVectors(x), z, weight, Summary{{summary}}); } else { return condense(BinnedVectors(x), z, weight, Summary{{summary}}); } } " cpp_fun <- function(summary) { whisker.render(template, list( name = tolower(summary), summary = summaries[[summary]] )) } groupwise <- readLines("condense.cpp") split <- which(grepl("// -{40,}", groupwise))[1] original <- groupwise[1:split] writeLines(original, "condense.cpp") cat("// Autogenerated by condense-gen.r\n", file = "condense.cpp", append = TRUE) funs <- unlist(lapply(names(summaries), cpp_fun)) cat(funs, file = "condense.cpp", append = TRUE, sep = "") ================================================ FILE: src/condense.cpp ================================================ #include #include #include "group.h" #include "summary.h" template List condense(const BinnedVectors& group, const NumericVector& z, const NumericVector& weight, const Stat& stat) { int n_obs = group.size(); int n_bins = group.nbins(); const NumericVector& weight_ = (weight.size() > 0) ? weight : rep(NumericVector::create(1), n_obs); const NumericVector& z_ = (z.size() > 0) ? z : rep(NumericVector::create(1), n_obs); // Push values into stats std::vector stats(n_bins, stat); for(int i = 0; i < n_obs; ++i) { int bin = group.bin_i(i); // Rcout << "i: " << i << " bin: " << bin << "\n"; stats.at(bin).push(z_[i], weight_[i]); } // Compute values from stats and determine bins int n_stats = stat.size(); int n_groups = group.ngroups(); NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups); for (int i = 0; i < n_bins; ++i) { for (int j = 0; j < n_stats; ++j) { out(i, j) = stats[i].compute(j); } std::vector bins = group.unbin(i); for (int j = 0; j < n_groups; ++j) { bin(i, j) = bins[j]; } } // Name CharacterVector out_cols(n_stats), bin_cols(n_groups); for (int j = 0; j < n_stats; ++j) { out_cols[j] = stat.name(j); } for (int j = 0; j < n_groups; ++j) { bin_cols[j] = group.name(j); } out.attr("dimnames") = List::create(CharacterVector::create(), out_cols); bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols); return List::create(bin, out); } template List sparse_condense(const BinnedVectors& group, const NumericVector& z, const NumericVector& weight, const Stat& stat) { int n_obs = group.size(); const NumericVector& weight_ = (weight.size() > 0) ? weight : rep(NumericVector::create(1), n_obs); const NumericVector& z_ = (z.size() > 0) ? z : rep(NumericVector::create(1), n_obs); // Push values into stats typename std::map stats; for(int i = 0; i < n_obs; ++i) { int bin = group.bin_i(i); typename std::map::iterator loc = stats.find(bin); if (loc == stats.end()) { Stat new_stat(stat); new_stat.push(z_[i], weight_[i]); stats.insert(std::pair(bin, new_stat)); } else { (loc->second).push(z_[i], weight_[i]); } } // Compute values from stats and determine bins int n_bins = stats.size(); int n_stats = stat.size(); int n_groups = group.ngroups(); NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups); typename std::map::iterator stats_it = stats.begin(), stats_end = stats.end(); for (int i = 0; stats_it != stats_end; ++stats_it, ++i) { for (int j = 0; j < n_stats; ++j) { out(i, j) = (stats_it->second).compute(j); } std::vector bins = group.unbin(stats_it->first); for (int j = 0; j < n_groups; ++j) { bin(i, j) = bins[j]; } } // Name CharacterVector out_cols(n_stats), bin_cols(n_groups); for (int j = 0; j < n_stats; ++j) { out_cols[j] = stat.name(j); } for (int j = 0; j < n_groups; ++j) { bin_cols[j] = group.name(j); } out.attr("dimnames") = List::create(CharacterVector::create(), out_cols); bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols); return List::create(bin, out); } // ----------------------------------------------------------------------------- // Autogenerated by condense-gen.r // [[Rcpp::export]] List condense_count(const List& x, const NumericVector& z, const NumericVector& weight, bool drop = false) { if (drop) { return sparse_condense(BinnedVectors(x), z, weight, SummarySum(0)); } else { return condense(BinnedVectors(x), z, weight, SummarySum(0)); } } // [[Rcpp::export]] List condense_sum(const List& x, const NumericVector& z, const NumericVector& weight, bool drop = false) { if (drop) { return sparse_condense(BinnedVectors(x), z, weight, SummarySum(1)); } else { return condense(BinnedVectors(x), z, weight, SummarySum(1)); } } // [[Rcpp::export]] List condense_mean(const List& x, const NumericVector& z, const NumericVector& weight, bool drop = false) { if (drop) { return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(1)); } else { return condense(BinnedVectors(x), z, weight, SummaryMoments(1)); } } // [[Rcpp::export]] List condense_sd(const List& x, const NumericVector& z, const NumericVector& weight, bool drop = false) { if (drop) { return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(2)); } else { return condense(BinnedVectors(x), z, weight, SummaryMoments(2)); } } // [[Rcpp::export]] List condense_median(const List& x, const NumericVector& z, const NumericVector& weight, bool drop = false) { if (drop) { return sparse_condense(BinnedVectors(x), z, weight, SummaryMedian()); } else { return condense(BinnedVectors(x), z, weight, SummaryMedian()); } } ================================================ FILE: src/double-diff-sum.cpp ================================================ #include #include #include using namespace Rcpp; // Efficiently compute \sum \sum abs(x_i - x_j) for binned data // // It's effectively equivalent to this R code on the ungrouped observations // bin <- trunc(x / bw) // diffs <- abs(outer(bin, bin, "-")) // tabulate(diffs + 1) // // [[Rcpp::export]] std::vector double_diff_sum(IntegerVector bin, IntegerVector count) { int n = bin.size(); std::vector out; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { int pos = abs(bin[i] - bin[j]); if (pos + 1 > out.size()) { out.resize(pos + 1); } out[pos] += count[i] * count[j]; } } return out; } ================================================ FILE: src/frange.cpp ================================================ #include using namespace Rcpp; //' Efficient implementation of range. //' //' This is an efficient C++ implementation of range for numeric vectors: //' it avoids S3 dispatch, and computes both min and max in a single pass //' through the input. //' //' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}} //' object), it will be used instead of computing the range from scratch. //' //' @param x a numeric vector, or a \code{\link{ranged}} object //' @param finite If \code{TRUE} ignores missing values and infinities. Note //' that if the vector is empty, or only contains missing values, //' \code{frange} will return \code{c(Inf, -Inf)} because those are the //' identity values for \code{\link{min}} and \code{\link{max}} respectively. //' @export //' @examples //' x <- runif(1e6) //' system.time(range(x)) //' system.time(frange(x)) //' //' rx <- ranged(x) //' system.time(frange(rx)) // [[Rcpp::export]] NumericVector frange(const NumericVector& x, const bool finite = true) { RObject cache = x.attr("range"); if (cache.sexp_type() == REALSXP) return as(cache); NumericVector out(2); out[0] = INFINITY; out[1] = -INFINITY; int n = x.length(); for(int i = 0; i < n; ++i) { if (!finite && R_IsNA(x[i])) { out[0] = NA_REAL; out[1] = NA_REAL; return out; } // If finite, skip infinite values if (finite && (x[i] == INFINITY || x[i] == -INFINITY)) continue; if (x[i] < out[0]) out[0] = x[i]; if (x[i] > out[1]) out[1] = x[i]; } return out; } ================================================ FILE: src/group-hex.h ================================================ /* * Translated from * https://github.com/d3/d3-plugins/blob/master/hexbin/hexbin.js * * Copyright (C) 2013 Hadley Wickham * Copyright (C) 2012 Mike Bostock (mbostock at gmail dot com) */ class GroupHex { const NumericVector x_; const NumericVector y_; double x_width_; double x_origin_; double y_width_; double y_origin_; double x_bins; public: GroupHex (const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin, double x_max) : x_(x), y_(y), x_width_(x_width), x_origin_(x_origin), y_width_(y_width), y_origin_(y_origin) { if (x.size() != y.size()) stop("x & y are not the same size"); x_bins = x_max / x_width_ + 1; } int bin_i(int i) const { double py = ISNAN(y_[i]) ? 0 : (y_[i] - y_origin_) / y_width_ + 1; int pj = py; double py1 = py - pj; double px = ISNAN(x_[i]) ? 0 : (x_[i] - x_origin_) / x_width_ + 1 - (pj % 2 ? 0.5 : 0); int pi = px; if (fabs(py1) * 3 > 1) { double px1 = px - pi, pi2 = pi + (px < pi ? -1 : 1) / 2, pj2 = pj + (py < pj ? -1 : 1), px2 = px - pi2, py2 = py - pj2; if (px1 * px1 + py1 * py1 > px2 * px2 + py2 * py2) { pi = pi2 + (pj % 2 ? 1 : -1) / 2; pj = pj2; } } return pj * x_bins + pj; } int size() const { return x_.size(); } }; ================================================ FILE: src/group.cpp ================================================ #include #include "group.h" #include "group-hex.h" using namespace Rcpp; template IntegerVector group_out(const Group& group) { int n = group.size(); IntegerVector out(n); for(int i = 0; i < n; ++i) { out[i] = group.bin_i(i); } return out; } RCPP_MODULE(Group) { class_("GroupFixed") .constructor() .const_method("bin_i", &GroupFixed::bin_i) .const_method("bin", &GroupFixed::bin) .const_method("unbin", &GroupFixed::unbin) .const_method("size", &GroupFixed::size) .const_method("nbins", &GroupFixed::nbins) .const_method("origin", &GroupFixed::origin) .const_method("width", &GroupFixed::width) ; } RCPP_EXPOSED_AS(GroupFixed) RCPP_EXPOSED_WRAP(GroupFixed) // [[Rcpp::export]] IntegerVector group_fixed(const NumericVector& x, double width, double origin = 0) { return group_out(GroupFixed(x, width, origin)); } // [[Rcpp::export]] IntegerVector group_rect(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin) { return group_out(Group2d( GroupFixed(x, x_width, x_origin), GroupFixed(y, y_width, y_origin))); } // [[Rcpp::export]] IntegerVector group_hex(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin, double x_max) { return group_out(GroupHex(x, y, x_width, y_width, x_origin, y_origin, x_max)); } ================================================ FILE: src/group.h ================================================ #include using namespace Rcpp; NumericVector frange(const NumericVector& x, const bool finite = true); class GroupFixed { const NumericVector x_; double width_; double origin_; public: GroupFixed (NumericVector x, double width, double origin = 0) : x_(x), width_(width), origin_(origin) { } int bin_i(int i) const { if (ISNAN(x_[i]) || x_[i] == INFINITY || x_[i] == -INFINITY) return 0; if (x_[i] < origin_) return 0; return bin(x_[i]); } int bin(double x) const { return (x - origin_) / width_ + 1; } double unbin(int bin) const { if (bin == 0) return(NAN); return (bin - 1) * width_ + origin_; } double origin() const { return origin_; } double width() const { return width_; } int size() const { return x_.size(); } int nbins() const { double max = frange(x_)(1); double dest = floor((max - origin_) / width_) * width_ + origin_; // + 1 for missing values // + 1 if highest value is on right-open boundary return (dest - origin_) / width_ + 1 + ((max >= dest) ? 1 : 0); } }; template class Group2d { const Group& x_; const Group& y_; int x_bins_; int y_bins_; public: Group2d (const Group& x, const Group& y) : x_(x), y_(y) { if (x_.size() != y_.size()) { stop("x and y are not equal sizes"); } x_bins_ = x_.nbins(); y_bins_ = y_.nbins(); // Rcout << "x_bins: " << x_bins_ << " y_bins: " << y_bins_ << "\n"; } int bin_i(int i) const { int x_bin = x_.bin_i(i), y_bin = y_.bin_i(i); int bin = y_bin * x_bins_ + x_bin; // Rcout << i << ": (" << x_bin << "," << y_bin << ") -> " << bin << "\n"; return bin; } int size() const { return x_.size(); } int nbins() const { return x_bins_ * y_bins_; } }; template class GroupNd { const std::vector groups_; const int ngroups_; int size_; std::vector bins_; public: GroupNd (const std::vector groups) : groups_(groups), ngroups_(groups.size()) { if (groups.size() == 0) { stop("Empty groups vector passed to GroupCompound"); } size_ = groups[0].size(); bins_[0] = 1; for (int i = 0; i < ngroups_ - 1; ++i) { if (groups_[i].size() != size_) stop("Groups not equal sizes"); bins_[i + 1] = bins_[i] * groups_[i].nbins(); } } int bin_i(int i) const { int bin = 0; for (int j = 0; j < ngroups_; ++j) { bin += groups_[j].bin(i) * bins_[j]; } return bin; } // int nbins() const { // return bins_[ngroups_ - 1]; // } int ngroups() const { return groups_.size(); } int size() const { return size_; } std::vector unbin(int bin) const { std::vector bins(ngroups_); for (int j = 0; j < ngroups_; ++j) { int bin_j = bin % bins_[j]; bins[j] = groups_[j].unbin(bin_j); bin = bin - bin * bins_[j]; } return bins; } }; ================================================ FILE: src/lowerBound.cpp ================================================ #include #include using namespace Rcpp; // Quick and dirty implementation of lowerBound, the complement to R's // findInterval // [[Rcpp::export]] IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks) { int n = x.size(); IntegerVector out(n); for (int i = 0; i < n; i++) { NumericVector::const_iterator it = std::lower_bound(breaks.begin(), breaks.end(), x[i]); if (it == breaks.end()) --it; out[i] = it - breaks.begin() + 1; } return out; } ================================================ FILE: src/smooth-nd.cpp ================================================ #include #include #include "group.h" #include "Summary2d.h" #include using namespace Rcpp; boost::shared_ptr createSummary(std::string type) { if (type == "mean") { return boost::shared_ptr(new Summary2dMean()); } else if (type == "regression") { return boost::shared_ptr(new Summary2dRegression()); } else if (type == "robust_regression") { return boost::shared_ptr(new Summary2dRobustRegression()); } else { stop("Unknown type"); // Quiet warning return boost::shared_ptr(new Summary2dMean()); } } double tricube(double x) { if (NumericVector::is_na(x)) return 0; x = fabs(x); if (x > 1) return 0; double y = 1 - x * x * x; return y * y * y; } bool both_na(double x, double y) { return (NumericVector::is_na(x) && NumericVector::is_na(y)); } // [[Rcpp::export]] NumericVector smooth_nd_1(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const int var, const double h, const std::string type = "mean") { if (var < 0) stop("var < 0"); if (var >= grid_in.ncol()) stop("var too large"); if (h <= 0) stop("h <= 0"); if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes"); int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol(); NumericVector w_in = (w_in_.size() > 0) ? w_in_ : rep(NumericVector::create(1), n_in); NumericVector z_out(n_out), w_out(n_out); // Will be much more efficient to slice up by input dimension: // and most efficient way of doing that will be to bin with / bw // My data structure: sparse grids // // And once we're smoothing in one direction, with guaranteed e2venly spaced // grid can avoid many kernel evaluations and can also compute more // efficient running sum for(int j = 0; j < n_out; ++j) { boost::shared_ptr summary = createSummary(type); for (int i = 0; i < n_in; ++i) { // Check that all variables (apart from var) are equal bool equiv = true; for (int k = 0; k < d; ++k) { if (k == var) continue; double in = grid_in(i, k), out = grid_out(j, k); if (in != out && !both_na(in, out)) { equiv = false; break; } }; if (!equiv) continue; double in = grid_in(i, var), out = grid_out(j, var); double dist = both_na(in, out) ? 0 : in - out; double w = tricube(dist / h) * w_in[i]; if (w == 0) continue; summary->push(dist, z_in[i], w); } z_out[j] = summary->compute(); } return z_out; } // [[Rcpp::export]] NumericVector smooth_nd(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const NumericVector h) { if (grid_in.nrow() != z_in.size()) stop("Incompatible input lengths"); if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes"); if (h.size() != grid_in.ncol()) stop("Incorrect h length"); int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol(); NumericVector w_in = (w_in_.size() > 0) ? w_in_ : rep(NumericVector::create(1), n_in); NumericVector z_out(n_out), w_out(n_out); for (int i = 0; i < n_in; ++i) { for(int j = 0; j < n_out; ++j) { double w = 1; for (int k = 0; k < d; ++k) { double dist = (grid_in(i, k) - grid_out(j, k)) / h[k]; w *= tricube(dist); } w *= w_in[i]; w_out[j] += w; z_out[j] += z_in[i] * w; } } for(int j = 0; j < n_out; ++j) { z_out[j] /= w_out[j]; } return z_out; } ================================================ FILE: src/stats.cpp ================================================ #include using namespace Rcpp; struct Regression { double alpha, beta; }; // [[Rcpp::export]] double bisquare(double u, double b) { u = fabs(u); return (u < b) ? pow(1 - pow(u / b, 2), 2) : 0; } Regression simpleLinearRegression(const std::vector& x, const std::vector& y, const std::vector& w) { int n = x.size(); double x_wsum = 0, y_wsum = 0, w_sum = 0; for (int i = 0; i < n; ++i) { x_wsum += x[i] * w[i]; y_wsum += y[i] * w[i]; w_sum += w[i]; }; double x_mean = x_wsum / w_sum, y_mean = y_wsum / w_sum; double var_xy = 0, var_x = 0; for (int i = 0; i < n; ++i) { var_xy += w[i] * (x[i] - x_mean) * (y[i] - y_mean); var_x += w[i] * pow((x[i] - x_mean), 2); } Regression results; results.beta = (var_xy / var_x); results.alpha = y_mean - results.beta * x_mean; return results; } // [[Rcpp::export]] NumericVector regress(const std::vector& x, const std::vector& y, const std::vector& w) { Regression regression = simpleLinearRegression(x, y, w); return NumericVector::create(regression.alpha, regression.beta); } double median(std::vector* x) { if (x->empty()) return NAN; int size = x->size(); std::vector::iterator upper = x->begin() + (int) (size / 2); std::nth_element(x->begin(), upper, x->end()); if (size % 2 == 1) { return *upper; } else { std::vector::iterator lower = upper - 1; std::nth_element(x->begin(), lower, upper); return (*upper + *lower) / 2.0; } } // [[Rcpp::export("medianC")]] double median(const std::vector& x) { std::vector x_(x); return median(&x_); } Regression simpleRobustRegression(const std::vector& x, const std::vector& y, const std::vector& w, int iterations = 3) { int n = x.size(); Regression prev = simpleLinearRegression(x, y, w); for (int k = 0; k < iterations; ++k) { std::vector resid(n); for (int i = 0; i < n; ++i) { resid[i] = fabs(y[i] - (prev.alpha + prev.beta * x[i])); } std::vector w_(w); double b = 6 * median(resid); if (b < 1e-20) break; for (int i = 0; i < n; ++i) { w_[i] *= bisquare(resid[i], b); } prev = simpleLinearRegression(x, y, w_); } return prev; } // [[Rcpp::export]] NumericVector regress_robust(const std::vector& x, const std::vector& y, const std::vector& w, int iterations = 3) { Regression regression = simpleRobustRegression(x, y, w, iterations); return NumericVector::create(regression.alpha, regression.beta); } ================================================ FILE: src/stats.h ================================================ struct Regression { double alpha, beta; }; double bisquare(double u, double b); Regression simpleLinearRegression(const std::vector& x, const std::vector& y, const std::vector& w); Regression simpleRobustRegression(const std::vector& x, const std::vector& y, const std::vector& w, int iterations = 3); double median(const std::vector& x); double median(std::vector* x); ================================================ FILE: src/summary.cpp ================================================ #include #include "summary.h" using namespace Rcpp; template NumericVector summary_compute(const NumericVector& x, Summary summary) { int n = x.size(); for(int i = 0; i < n; ++i) { summary.push(x[i], 1); } int m = summary.size(); NumericVector out(m); for(int i = 0; i < m; ++i) { out[i] = summary.compute(i); } return out; } // [[Rcpp::export]] NumericVector compute_moments(const NumericVector& x) { return summary_compute(x, SummaryMoments(2)); } // [[Rcpp::export]] NumericVector compute_sum(const NumericVector& x) { return summary_compute(x, SummarySum(1)); } // [[Rcpp::export]] NumericVector compute_median(const NumericVector& x) { return summary_compute(x, SummaryMedian()); } ================================================ FILE: src/summary.h ================================================ #include #include "stats.h" using namespace Rcpp; class SummaryMoments { int i_; double weight; double mean; double m2; public: SummaryMoments (int i) : i_(i), weight(0), mean(0), m2(0) { if (i > 2) stop("Invalid moment"); } // Algorithm adapted from // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm void push(double y, double w) { if (NumericVector::is_na(y)) return; // counts and weights weight += w; // mean if (i_ < 1) return; double delta = y - mean; mean += delta * w / weight; // variance if (i_ < 2) return; m2 += delta * delta * w * (1 - w / weight); return; } const int size() const { return i_ + 1; } double compute(int i) const { switch (i) { case 0: return weight; case 1: return (weight == 0) ? NAN : mean; case 2: return (weight == 0) ? NAN : pow(m2 / (weight - 1), 0.5); default: stop("Invalid output requested"); return NAN; } } std::string name(int i) const { switch (i) { case 0: return "count"; case 1: return "mean"; case 2: return "sd"; default: stop("Invalid output requested"); return ""; } } }; class SummarySum { int i_; int weight; double sum; public: SummarySum (int i) : i_(i), weight(0), sum(0) { if (i > 1 || i < 0) stop("Invalid moment"); } void push(double y, double w) { if (NumericVector::is_na(y)) return; weight += w; if (i_ < 1) return; sum += y * w; } const int size() const { return i_ + 1; } double compute(int i) const { switch (i) { case 0: return weight; case 1: return sum; default: stop("Invalid output requested"); return NAN; } } std::string name(int i) const { switch (i) { case 0: return "count"; case 1: return "sum"; default: stop("Invalid output requested"); return ""; } } }; class SummaryMedian { std::vector ys; public: void push(double y, double w) { if (NumericVector::is_na(y)) return; ys.push_back(y); } int size() const { return 1; } // Adapted from http://stackoverflow.com/questions/1719070/ double compute(int i) { return median(&ys); } std::string name(int i) const { return "median"; } };