Repository: clbustos/statsample
Branch: master
Commit: d5caf4ecf82c
Files: 157
Total size: 643.6 KB
Directory structure:
gitextract_b74amxs6/
├── .gitignore
├── .travis.yml
├── Gemfile
├── History.txt
├── LICENSE.txt
├── Manifest.txt
├── README.md
├── Rakefile
├── benchmarks/
│ ├── correlation_matrix_15_variables.rb
│ ├── correlation_matrix_5_variables.rb
│ ├── correlation_matrix_methods/
│ │ ├── correlation_matrix.ds
│ │ ├── correlation_matrix.html
│ │ ├── correlation_matrix.rb
│ │ ├── correlation_matrix.xls
│ │ ├── correlation_matrix_gsl_ruby.ods
│ │ ├── correlation_matrix_with_graphics.ods
│ │ └── results.ds
│ ├── factor_map.rb
│ └── helpers_benchmark.rb
├── data/
│ └── locale/
│ └── es/
│ └── LC_MESSAGES/
│ └── statsample.mo
├── doc_latex/
│ └── manual/
│ └── equations.tex
├── examples/
│ ├── boxplot.rb
│ ├── correlation_matrix.rb
│ ├── dataset.rb
│ ├── dominance_analysis.rb
│ ├── dominance_analysis_bootstrap.rb
│ ├── histogram.rb
│ ├── icc.rb
│ ├── levene.rb
│ ├── multiple_regression.rb
│ ├── multivariate_correlation.rb
│ ├── parallel_analysis.rb
│ ├── polychoric.rb
│ ├── principal_axis.rb
│ ├── reliability.rb
│ ├── scatterplot.rb
│ ├── t_test.rb
│ ├── tetrachoric.rb
│ ├── u_test.rb
│ ├── vector.rb
│ └── velicer_map_test.rb
├── grab_references.rb
├── lib/
│ ├── spss.rb
│ ├── statsample/
│ │ ├── analysis/
│ │ │ ├── suite.rb
│ │ │ └── suitereportbuilder.rb
│ │ ├── analysis.rb
│ │ ├── anova/
│ │ │ ├── contrast.rb
│ │ │ ├── oneway.rb
│ │ │ └── twoway.rb
│ │ ├── anova.rb
│ │ ├── bivariate/
│ │ │ └── pearson.rb
│ │ ├── bivariate.rb
│ │ ├── codification.rb
│ │ ├── converter/
│ │ │ ├── csv.rb
│ │ │ └── spss.rb
│ │ ├── converters.rb
│ │ ├── crosstab.rb
│ │ ├── dataset.rb
│ │ ├── dominanceanalysis/
│ │ │ └── bootstrap.rb
│ │ ├── dominanceanalysis.rb
│ │ ├── factor/
│ │ │ ├── map.rb
│ │ │ ├── parallelanalysis.rb
│ │ │ ├── pca.rb
│ │ │ ├── principalaxis.rb
│ │ │ └── rotation.rb
│ │ ├── factor.rb
│ │ ├── graph/
│ │ │ ├── boxplot.rb
│ │ │ ├── histogram.rb
│ │ │ └── scatterplot.rb
│ │ ├── graph.rb
│ │ ├── histogram.rb
│ │ ├── matrix.rb
│ │ ├── multiset.rb
│ │ ├── regression/
│ │ │ ├── multiple/
│ │ │ │ ├── alglibengine.rb
│ │ │ │ ├── baseengine.rb
│ │ │ │ ├── gslengine.rb
│ │ │ │ ├── matrixengine.rb
│ │ │ │ └── rubyengine.rb
│ │ │ ├── multiple.rb
│ │ │ └── simple.rb
│ │ ├── regression.rb
│ │ ├── reliability/
│ │ │ ├── icc.rb
│ │ │ ├── multiscaleanalysis.rb
│ │ │ ├── scaleanalysis.rb
│ │ │ └── skillscaleanalysis.rb
│ │ ├── reliability.rb
│ │ ├── resample.rb
│ │ ├── rserve_extension.rb
│ │ ├── shorthand.rb
│ │ ├── srs.rb
│ │ ├── test/
│ │ │ ├── bartlettsphericity.rb
│ │ │ ├── chisquare.rb
│ │ │ ├── f.rb
│ │ │ ├── kolmogorovsmirnov.rb
│ │ │ ├── levene.rb
│ │ │ ├── t.rb
│ │ │ ├── umannwhitney.rb
│ │ │ └── wilcoxonsignedrank.rb
│ │ ├── test.rb
│ │ ├── vector/
│ │ │ └── gsl.rb
│ │ ├── vector.rb
│ │ └── version.rb
│ └── statsample.rb
├── po/
│ ├── es/
│ │ ├── statsample.mo
│ │ └── statsample.po
│ └── statsample.pot
├── references.txt
├── setup.rb
├── test/
│ ├── fixtures/
│ │ ├── correlation_matrix.rb
│ │ ├── hartman_23.matrix
│ │ ├── repeated_fields.csv
│ │ ├── stock_data.csv
│ │ ├── test_csv.csv
│ │ ├── test_xls.xls
│ │ ├── tetmat_matrix.txt
│ │ └── tetmat_test.txt
│ ├── helpers_tests.rb
│ ├── test_analysis.rb
│ ├── test_anova_contrast.rb
│ ├── test_anovaoneway.rb
│ ├── test_anovatwoway.rb
│ ├── test_anovatwowaywithdataset.rb
│ ├── test_anovawithvectors.rb
│ ├── test_awesome_print_bug.rb
│ ├── test_bartlettsphericity.rb
│ ├── test_bivariate.rb
│ ├── test_codification.rb
│ ├── test_crosstab.rb
│ ├── test_csv.rb
│ ├── test_dataset.rb
│ ├── test_dominance_analysis.rb
│ ├── test_factor.rb
│ ├── test_factor_map.rb
│ ├── test_factor_pa.rb
│ ├── test_ggobi.rb
│ ├── test_gsl.rb
│ ├── test_histogram.rb
│ ├── test_matrix.rb
│ ├── test_multiset.rb
│ ├── test_regression.rb
│ ├── test_reliability.rb
│ ├── test_reliability_icc.rb
│ ├── test_reliability_skillscale.rb
│ ├── test_resample.rb
│ ├── test_rserve_extension.rb
│ ├── test_srs.rb
│ ├── test_statistics.rb
│ ├── test_stest.rb
│ ├── test_stratified.rb
│ ├── test_test_f.rb
│ ├── test_test_kolmogorovsmirnov.rb
│ ├── test_test_t.rb
│ ├── test_umannwhitney.rb
│ ├── test_vector.rb
│ ├── test_wilcoxonsignedrank.rb
│ └── test_xls.rb
└── web/
└── Rakefile
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
doc.yaml
*.swp
*.rbc
coverage
*~
agregar_adsense_a_doc.rb
pkg
doc
.yardoc
examples/images/*
examples/*.html
web/upload_task.rb
.idea
================================================
FILE: .travis.yml
================================================
language:
ruby
rvm:
- '1.9.3'
- '2.0.0'
- '2.1.1'
script:
bundle exec rake test
before_install:
- sudo apt-get update -qq
- sudo apt-get install -y libgsl0-dev r-base r-base-dev
- sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')"
================================================
FILE: Gemfile
================================================
source "https://www.rubygems.org"
gem 'minitest'
gem 'rdoc'
gem 'mocha', '0.14.0' #:require=>'mocha/setup'
gem 'shoulda','3.5.0'
gem 'shoulda-matchers','2.2.0'
gem 'hoe'
#gem 'bio-statsample-timeseries'
gem 'reportbuilder'
gem 'dirty-memoize'
gem 'distribution'
gem 'extendmatrix'
gem 'minimization'
gem 'rserve-client'
gem 'rubyvis'
gem 'spreadsheet'
gem 'rb-gsl'
gem 'awesome_print'
================================================
FILE: History.txt
================================================
=== 1.4.0 / 2014-10-11
* Replaced README.txt with README.md
* Replaced File.exists? with File.exist?
+ New Dataset.join to join two dataset based on some fields
* Deleted MLE based regression (Probit and logistic). Now all GML methods are on statsample-glm
=== 1.3.1 / 2014-06-26
* Example referred to a SimpleRegression class which doesn't exist. Updated to working example.
* Merge pull request #15 from Blahah/patch-1
* Updated Gemfile
* Updated README.txt for v1.3.0
* Updated to ruby 2.1.0
=== 1.3.0 / 2013-09-19
* Merge remote-tracking branch 'vpereira/master' into vpereira
* New Wilcoxon Signed Rank test
* Remove TimeSeries class. Now is available on gem "bio-statsample-timeseries" [GSOC 2013 project :) ]
* Update shoulda support
* added Bundle depds
* improved the csv read method (requires tests)
* open svg on mac osx
=== 1.2.0 / 2011-12-15
* Added support for time series (TimeSeries object): MA, EMA, MACD, acf, lag and delta. [Rob Britton]
* Changed summary attribute to properly display 'b' value for simple linear regression [hstove]
* Merge pull request #6 from hstove/patch-1: Changed summary attribute to properly display 'b' value for simple linear regression [Claudio Bustos]
* fix example code for CovariateMatrix [James Kebinger]
=== 1.1.0 / 2011-06-02
* New Statsample::Anova::Contrast
* Jacknife and bootstrap for Vector. Thanks to John Firebaugh for the idea
* Improved Statsample::Analysis API
* Updated CSV.read. Third argument is a Hash with options to CSV class
* Added restriction on Statsample::Excel.read
* Updated spanish po
* Better summary for Vector
* Improving summary of t related test (confidence interval and estimate output)
* Replaced c for vector on Statsample::Analysis examples
* Added Vector#median_absolute_deviation
* First implementation of Kolmogorov Smirnov test. Returns correct D value, but without Kolmogorov distribution isn't very useful.
=== 1.0.1 / 2011-01-28
* Updated spanish po.
* Update distribution gem dependence. On Ruby 1.8.7, distribution 0.2.0 raises an error.
=== 1.0.0 / 2011-01-27
* Added Statsample::Analysis, a beautiful DSL to perform fast statistical analysis using statsample. See directory /examples
* Created benchmarks directory
* Removed Distribution module from statsample and moved to a gem. Changes on code to reflect new API
* Optimized simple regression. Better library detection
* New 'should_with_gsl' to test methods with gsl. Refactored Factor::MAP
* Almost complete GSL cleanup on Vector
* Updated some doc on Vector
* Used GSL::Matrix on Factor classes when available
* SkillScaleAnalysis doesn't crash with one or more vectors with 0 variance
* Modified examples using Statsample::Analysis
* Simplified eigen calculations
* Updated some examples. Added correlation matrix speed suite
* Correlation matrix optimized. Better specs
* Optimized correlation matrix. Use gsl matrix algebra or pairwise correlations depending on empiric calculated equations. See benchmarks/correlation_matrix.rb to see implementation of calculation
* Moved tests fixtures from data to test/fixtures
* Fixed some errors on tests
* Bug fix: constant_se on binomial regression have an error
* All test should work on ruby 1.9.3
* New Vector.[] and Vector.new_scale
* Detect linearly dependent predictors on OLS.
=== 0.18.0 / 2011-01-07
* New Statsample.load_excel
* New Statsample.load_csv
* Statsample::Dataset#[] accepts an array of fields and uses clone
* New Dataset#correlation_matrix and Statsample::Dataset#covariance_matrix
* Statsample::Dataset.filter add labels to vectors
* Principal Components generation complete on PCA (covariance matrix prefered)
* Added note on Statsample::Factor::PCA about erratic signs on eigenvalues,
* Statsample::Factor::PCA.component_matrix calculated different for covariance matrix
* Improved summary for PCA using covariance matrix
* New attribute :label_angle for Statsample::Graph::Boxplot
* Fixed Scatterplots scaling problems
* New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x,
* New Statsample::Multiset#union allows to create a new dataset based on a m
* New Statsample::Multiset#each to traverse through datasets
* Bug fix: Vector#standarized and Vector#percentile crash on nil data
* Bug fix: Vector#mean and Vector#sd crash on data without valid values
* Modified methods names on Statsample::Factor::PCA : feature_vector to feature_matrix, data_transformation to principal_components
* Added Statsample::Vector.vector_centered
* Factor::MAP.with_dataset() implemented
* Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes
* Added documentation for Graph::Histogram
* Added MPA to Reliability::MultiScaleAnalysis
* Added custom names for returned vectors and datasets
* Updated spanish traslation
* Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing
* Updated Histogram class, with several new methods compatibles with GSL::Histogram
=== 0.17.0 / 2010-12-09
* Added Statsample::Graph::Histogram and Statsample::Graph::Boxplot
* Added Statsample::Reliability::SkillScaleAnalysis for analysis of skill based scales.
* Delete combination and permutation classes. Backport for ruby 1.8.7 widely available
* Deleted unused variables (thanks, ruby-head)
=== 0.16.0 / 2010-11-13
* Works on ruby 1.9.2 and HEAD. Updated Rakefile and manifest
* Removed all graph based on Svg::Graph.
* First operative version of Graph with Rubyvis
* Corrected bug on Distribution::Normal.cdf.
* Added reference on references.txt
* Ruby-based random gaussian distribution generator when gsl not available
* Added population average deviation [Al Chou]
=== 0.15.1 / 2010-10-20
* Statsample::Excel and Statsample::PlainText add name to vectors equal to field name
* Statsample::Dataset.delete_vector accept multiple fields.
* Statsample::Dataset.dup_only_valid allows duplication of specific fields
* ScaleAnalysis doesn't crash on one-item scales
* Updated references
=== 0.15.0 / 2010-09-07
* Added class Statsample::Reliability::ICC for calculation of Intra-class correlation (Shrout & Fleiss, 1979; McGraw & Wong, 1996). Tested with SPSS and R values.
* References: Updated and standarized references on many classes. Added grab_references.rb script, to create a list of references for library
* Added Spearman-Brown prophecy on Reliability module
* Distribution::F uses Gsl when available
* Added mean r.p.b. and item sd on Scale Analysis
* Corrected bug on Vector.ary_method and example of Anova Two Way using vector.
=== 0.14.1 / 2010-08-18
* Added extra information on $DEBUG=true.
* Changed ParallelAnalysis: with_random_data parameters, bootstrap_method options are data and random, resolve bug related to number of factors to preserve, resolved bug related to original eigenvalues, can support failed bootstrap of data for Tetrachoric correlation.
* Optimized eigenpairs on Matrix when GSL is available.
* Added test for parallel analysis using data bootstraping
* Updated .pot and Manifest.txt
* Added test for kmo(global and univariate), bartlett and anti-image. Kmo and Bartlett have test based on Dziuban and Shirkey with correct results
* Complete set of test to test if a correlation matrix is appropriate for factor analysis: test of sphericity, KMO and anti-image (see Dziuban and Shirkey, 1974)
* Updated Parallel Analysis to work on Principal Axis Analysis based on O'Connors formulae
* Added reference for Statsample::Factor::MAP
=== 0.14.0 / 2010-08-16
* Added Statsample::Factor::MAP, to execute Velicer's (1976) MAP to determine the number of factors to retain on EFA
* Bug fix on test suite on Ruby 1.8.7
* Horn's Parallel Analysis operational and tested for pure random data
* Fixed bug on Excel writer on Ruby1.9 (frozen string on header raises an error).
* Extra information on Factorial Analysis on summaries
* Fixed bug on Factor::Rotation when used ::Matrix without field method.
* Added Vector#vector_percentil method
* Summaries for PCA, Rotation, MultiScale and ScaleAnalysis created or improved.
* Factor::PCA could have rotation and parallel analysis on summary.
* Cronbach's alpha from covariance matrix raise an error on size<2
* MultiScaleAnalysis could have Parallel Analysis on summary.
* Added Chi Square test
* Added new information on README.txt
=== 0.13.1 / 2010-07-03
* Rserve extensions for dataset and vector operational
* On x86_64, variance from gsl is not exactly equal to sum of variance-covariance on Statsample::Reliability::Scale, but in delta 1e-10
* Updated README.txt
* Reliability::ScaleAnalysis uses covariance matrix for 'if deleted' calculations to optimize memory and speed. Test for 'if deleted' statistics
* More string translated. Added dependency on tetrachoric on parallel analysis
=== 0.13.0 / 2010-06-13
* Polychoric and Tetrachoric moved to gem statsample-bivariate-extension
* All classes left with summary method include Summarizable now. Every method which return localizable string is now parsed with _()
* Correct implementation of Reliability::MultiScaleAnalysis.
* Spanish translation for Mann-Whitney's U
* Added example for Mann-Whitney's U test
* Better summary for Mann-Whitney's U Test
* Added Statsample::Bivariate::Pearson class to retrieve complete analysis for r correlations
* Bug fix on DominanceAnalysis::Bootstrap
=== 0.12.0 / 2010-06-09
* Modified Rakefile to remove dependencies based on C extensions. These are moved to statsample-optimization
* T test with unequal variance fixed on i686
* API Change: Renamed Reliability::ItemAnalysis and moved to independent file
* New Reliability::MultiScaleAnalysis for easy analysis of scales on a same survey, includind reliability, correlation matrix and Factor Analysis
* Updated README to reflect changes on Reliability module
* SvgGraph works with reportbuilder.
* Added methods on Polychoric based on Olsson(1979): the idea is estimate using second derivatives.
* Distribution test changed (reduced precision on 32 bits system)
=== 0.11.2 / 2010-05-05
* Updated dependency for 'extendedmatrix' to 0.2 (Matrix#build method)
=== 0.11.1 / 2010-05-04
* Removed Matrix almost all Matrix extensions and replaced by dependency on 'extendmatrix' gem
* Added dependency to gsl >=1.12.109. Polychoric with joint method fails without this explicit dependency
=== 0.11.0 / 2010-04-16
New features:
* Added Statsample::Anova::TwoWay and Statsample::Anova::TwoWayWithVectors
* Added Statsample.clone_only_valid and Statsample::Dataset.clone_only_valid, for cheap copy on already clean vectors
Optimizations and bug fix
* Removed library statistics2 from package. Used gem statistics2 instead, because have a extension version
* Added example for Reliability class
* Bug fix on Statsample::DominanceAnalysis
=== 0.10.0 / 2010-04-13
API modifications
* Refactoring of Statsample::Anova module.
* Statsample::Anova::OneWay :implementation of generic ANOVA One-Way, used by Multiple Regression, for example.
* Statsample::Anova::OneWayWithVectors: implementation of ANOVA One-Way to test differences of means.
New features
* New Statsample::Factor::Parallel Analysis, to performs Horn's 'parallel analysis' to a PCA, to adjust for sample bias on retention of components.
* New Statsample.only_valid_clone and Statsample::Dataset.clone, which allows to create shallow copys of valid vector and datasets. Used by correlation matrix methods to optimize calculations
* New module Statsample::Summarizable, which add GetText and ReportBuilder support to classes. Better summaries for Vector, Dataset, Crosstab, PrincipalAxis, PCA and Regression::Multiple classes
Optimizations and bug fix
* Refactoring of Statsample::Regression::Multiple classes. Still needs works
* Bug fix on Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis
* Bug fix on Statsample::Bivariate::Polychoric.new_with_vectors. Should be defined class method, no instance method.
* Optimized correlation and covariance matrix. Only calculates the half of matrix and the other half is returned from cache
* More tests coverage. RCOV Total: 82.51% , Code: 77.83%
=== 0.9.0 / 2010-04-04
* New Statsample::Test::F. Anova::OneWay subclasses it and Regression classes uses it.
=== 0.8.2 / 2010-04-01
* Statsample::PromiseAfter replaced by external package DirtyMemoize [http://rubygems.org/gems/dirty-memoize]
=== 0.8.1 / 2010-03-29
* Fixed Regression summaries
=== 0.8.0 / 2010-03-29
* New Statsample::Test::T module, with classes and methods to do Student's t tests for one and two samples.
* Statsample::PromiseAfter module to set a number of variables without explicitly call the compute or iterate method
* All tests ported to MiniUnit
* Directory 'demo' renamed to 'examples'
* Bug fix on report_building on Statsample::Regression::Multiple classes
=== 0.7.0 / 2010-03-25
* Ported to ReportBuilder 1.x series
* Implementation of ruby based covariance and correlation changed to a clearer code
* Statsample::Vector#svggraph_frequencies accepts IO
* Some test ported to Miniunit
* CSV on Ruby1.8 uses FasterCSV
=== 0.6.7 / 2010-03-23
* Bug fix: dependency on ReportBuilder should be set to "~>0.2.0", not "0.2"
=== 0.6.6 / 2010-03-22
* Set ReportBuilder dependency to '0.2.~' version, because future API break
* Removed Alglib dependency
* Factor::PrincipalAxis and Factor::PCA reworked
* Standarization of documentation on almost every file
* New Statsample::Test::Levene, to test equality of variances
* Constant HAS_GSL replaced by Statsample.has_gsl?
* PCA and Principal Axis test based on R and SPSS results
* Bug fix on test_dataset.rb / test_saveload
* Added Rakefile
* Demos for levene, Principal Axis
=== 0.6.5 / 2010-02-24
* Bug fix on test: Use tempfile instead of tempdir
* Multiple Regression: Calculation of constant standard error , using covariance matrix.
* Calculation of R^2_yx and P^2_yx for Regresion on Multiple Dependents variables
* Dominance Analysis could use Correlation or Covariance Matrix as input.
* Dominance Analysis extension to multiple dependent variables (Azen & Budescu, 2006)
* Two-step estimate of Polychoric correlation uses minimization gem, so could be executed without rb-gsl
=== 0.6.4 / 2010-02-19
* Dominance Analysis and Dominance Analysis Bootstrap allows multivariate dependent analysis.
* Test suite for Dominance Analysis, using Azen and Budescu papers as references
* X^2 for polychoric correlation
=== 0.6.3 / 2010-02-15
* Statsample::Bivariate::Polychoric have joint estimation.
* Some extra documentation and bug fixs
=== 0.6.2 / 2010-02-11
* New Statsample::Bivariate::Polychoric. For implement: X2 and G2
* New matrix.rb, for faster development of Contingence Tables and Correlation Matrix
=== 0.6.1 / 2010-02-08
* Bug fix on DominanceAnalysis summary for Ruby1.9
* Some extra documentation
=== 0.6.0 / 2010-02-05
* New Statsample::Factor module. Include classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and rotate component matrix ( Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations
* New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
* New class Statsample::Permutation to produce permutations of a given array
* New class Statsample::Histogram, with same interface as GSL one
* New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Gives z based and exact calculation of probability
* Improved support for ReportBuilder
* Statsample::Codification module reworked
* Fixed bugs on Dominance Analysis classes
* Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
=== 0.5.1 / 2009-10-06
* New class Statsample::Bivariate::Tetrachoric, for calculation of tetrachoric correlations. See http://www.john-uebersax.com/stat/tetra.htm for information.
* New Statsample::Dataset.merge
* New Statsample::Vector.dichotomize
* New ItemReliability.item_difficulty_analysis
* New module Statsample::SPSS, to export information to SPSS. For now, only tetrachoric correlation matrix are provided
* All SpreadSheet based importers now accept repeated variable names and renames they on the fly
* MultipleRegression::BaseEngine moved to new file
* Bug fix for MultipleRegression::GslEngine checks for Alglib, not GSL
=== 0.5.0 / 2009-09-26
* Vector now uses a Hash as a third argument
* Tested on Ruby 1.8.6, 1.8.7 and 1.9.1 with multiruby
=== 0.4.1 / 2009-09-12
* More methods and usage documentation
* Logit tests
* Bug fix: rescue for requires doesn't specify LoadError
* Binomial::BaseEngine new methods: coeffs_se, coeffs, constant and constant_se
=== 0.4.0 / 2009-09-10
* New Distribution module, based on statistics2.rb by Shin-ichiro HARA. Replaces all instances of GSL distributions pdf and cdf calculations for native calculation.
* New Maximum Likelihood Estimation for Logit, Probit and Normal Distribution using Von Tessin(2005) algorithm. See MLE class and subclasses for more information.
* New Binomial regression subclasses (Logit and Probit), using MLE class
* Added tests for gsl, Distribution, MLE and Logit
* Bug fix on svggraph.rb. Added check_type for scale graphics
* Bug fix on gdchart. Replaced old Nominal, Ordinal and Scale for Vector
=== 0.3.4 / 2009-08-21
* Works with statsample-optimization 2.0.0
* Vector doesn't uses delegation. All methods are part of Vector
* Added Combination. Generates all combination of n elements taken r at a time
* Bivariate#prop_pearson now can uses as a second parameter :both, :left, :right, :positive or :negative
* Added LICENSE.txt
=== 0.3.3 / 2009-08-11
* Added i18n support. For now, only spanish translation available
* Bug fix: Test now load libraries on ../lib path
* Excel and CSV importers automatically modify type of vector to Scale when all data are numbers or nils values
=== 0.3.2 / 2009-08-04
* Added Regression::Multiple::GslEngine
* Added setup.rb
* Crosstab#row_label and #column_name
* DominanceAnalysis and DominanceAnalysisBootstrap uses Dataset#labels for Vector names.
=== 0.3.1 / 2009-08-03
* Name and logic of Regression classes changed. Now, you have Regression::Simple class and Regression::Multiple module with two engines: RubyEngine and AlglibEngne
* New Crosstab#summary
=== 0.3.0 / 2009-08-02
* RubySS renamed to Statsample
* Optimization extension goes to another gem: ruby-statsample-optimization
=== 0.2.0 / 2009-08-01
* One Way Anova on Statsample::Anova::OneWay
* Dominance Analysis!!!! The one and only reason to develop a Multiple Regression on pure ruby.
* Multiple Regression on Multiple Regression module. Pairwise (pure ruby) or MultipleRegressionPairwise and Listwise (optimized) on MultipleRegressionAlglib and
* New Dataset#to_gsl_matrix, #from_to,#[..],#bootstrap,#vector_missing_values, #vector_count_characters, #each_with_index, #collect_with_index
* New Vector#box_cox_transformation
* Module Correlation renamed to Bivariate
* Some fancy methods and classes to create Summaries
* Some documentation about Algorithm used on doc_latex
* Deleted 'distributions' extension. Ruby/GSL has all the pdf and cdf you ever need.
* Tests work without any dependency. Only nags about missing deps.
* Test for MultipleRegression, Anova, Excel, Bivariate.correlation_matrix and many others
=== 0.1.9 / 2009-05-22
* Class Vector: new method vector_standarized_pop, []=, min,max
* Class Dataset: global variable $RUBY_SS_ROW stores the row number on each() and related methods. dup() with argument returns a copy of the dataset only for given fields. New methods: standarize, vector_mean, collect, verify,collect_matrix
* Module Correlation: new methods covariance, t_pearson, t_r, prop_pearson, covariance_matrix, correlation_matrix, correlation_probability_matrix
* Module SRS: New methods estimation_n0 and estimation_n
* Module Reliability: new ItemCharacteristicCurve class
* New HtmlReport class
* New experimental SPSS Class.
* Converters: Module CSV with new options. Added write() method for GGobi module
* New Mx exporter (http://www.vcu.edu/mx/)
* Class SimpleRegression: new methods standard error
* Added tests for regression and reliability, Vector#vector_mean, Dataset#dup (partial) and Dataset#verify
=== 0.1.8 / 2008-12-10
* Added Regression and Reliability modules
* Class Vector: added methods vector_standarized, recode, inspect, ranked
* Class Dataset: added methods vector_by_calculation, vector_sum, filter_field
* Module Correlation: added methods like spearman, point biserial and tau-b
* Added tests for Vector#ranked, Vector#vector_standarized, Vector#sum_of_squared_deviation, Dataset#vector_by_calculation, Dataset#vector_sum, Dataset#filter_field and various test for Correlation module
* Added demos: item_analysis and sample_test
=== 0.1.7 / 2008-10-1
* New module for codification
* ...
=== 0.1.6 / 2008-09-26
* New modules for SRS and stratified sampling
* Statsample::Database for read and write onto databases.
You could use Database and CSV on-tandem for mass-editing and reimport
of databases
=== 0.1.5 / 2008-08-29
* New extension statsampleopt for optimizing some functions on Statsample submodules
* New submodules Correlation and Test
=== 0.1.4 / 2008-08-27
* New extension, with cdf functions for
chi-square, t, gamma and normal distributions.
Based on dcdflib (http://www.netlib.org/random/)
Also, has a function to calculate the tail for a noncentral T distribution
=== 0.1.3 / 2008-08-22
* Operational versions of Vector, Dataset, Crosstab and Resample
* Read and write CSV files
* Calculate chi-square for 2 matrixes
=== 0.1.1 - 0.1.2 / 2008-08-18
* Included several methods on Ruby::Type classes
* Organized dirs with sow
=== 0.1.0 / 2008-08-12
* First version.
================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2009-2014, Claudio Bustos
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: Manifest.txt
================================================
.travis.yml
Gemfile
Gemfile.lock
History.txt
LICENSE.txt
Manifest.txt
README.md
Rakefile
benchmarks/correlation_matrix_15_variables.rb
benchmarks/correlation_matrix_5_variables.rb
benchmarks/correlation_matrix_methods/correlation_matrix.ds
benchmarks/correlation_matrix_methods/correlation_matrix.html
benchmarks/correlation_matrix_methods/correlation_matrix.rb
benchmarks/correlation_matrix_methods/correlation_matrix.xls
benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods
benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods
benchmarks/correlation_matrix_methods/results.ds
benchmarks/factor_map.rb
benchmarks/helpers_benchmark.rb
data/locale/es/LC_MESSAGES/statsample.mo
doc_latex/manual/equations.tex
examples/boxplot.rb
examples/correlation_matrix.rb
examples/dataset.rb
examples/dominance_analysis.rb
examples/dominance_analysis_bootstrap.rb
examples/histogram.rb
examples/icc.rb
examples/levene.rb
examples/multiple_regression.rb
examples/multivariate_correlation.rb
examples/parallel_analysis.rb
examples/polychoric.rb
examples/principal_axis.rb
examples/reliability.rb
examples/scatterplot.rb
examples/t_test.rb
examples/tetrachoric.rb
examples/u_test.rb
examples/vector.rb
examples/velicer_map_test.rb
grab_references.rb
lib/spss.rb
lib/statsample.rb
lib/statsample/analysis.rb
lib/statsample/analysis/suite.rb
lib/statsample/analysis/suitereportbuilder.rb
lib/statsample/anova.rb
lib/statsample/anova/contrast.rb
lib/statsample/anova/oneway.rb
lib/statsample/anova/twoway.rb
lib/statsample/bivariate.rb
lib/statsample/bivariate/pearson.rb
lib/statsample/codification.rb
lib/statsample/converter/csv.rb
lib/statsample/converter/spss.rb
lib/statsample/converters.rb
lib/statsample/crosstab.rb
lib/statsample/dataset.rb
lib/statsample/dominanceanalysis.rb
lib/statsample/dominanceanalysis/bootstrap.rb
lib/statsample/factor.rb
lib/statsample/factor/map.rb
lib/statsample/factor/parallelanalysis.rb
lib/statsample/factor/pca.rb
lib/statsample/factor/principalaxis.rb
lib/statsample/factor/rotation.rb
lib/statsample/graph.rb
lib/statsample/graph/boxplot.rb
lib/statsample/graph/histogram.rb
lib/statsample/graph/scatterplot.rb
lib/statsample/histogram.rb
lib/statsample/matrix.rb
lib/statsample/multiset.rb
lib/statsample/regression.rb
lib/statsample/regression/multiple.rb
lib/statsample/regression/multiple/alglibengine.rb
lib/statsample/regression/multiple/baseengine.rb
lib/statsample/regression/multiple/gslengine.rb
lib/statsample/regression/multiple/matrixengine.rb
lib/statsample/regression/multiple/rubyengine.rb
lib/statsample/regression/simple.rb
lib/statsample/reliability.rb
lib/statsample/reliability/icc.rb
lib/statsample/reliability/multiscaleanalysis.rb
lib/statsample/reliability/scaleanalysis.rb
lib/statsample/reliability/skillscaleanalysis.rb
lib/statsample/resample.rb
lib/statsample/rserve_extension.rb
lib/statsample/shorthand.rb
lib/statsample/srs.rb
lib/statsample/test.rb
lib/statsample/test/bartlettsphericity.rb
lib/statsample/test/chisquare.rb
lib/statsample/test/f.rb
lib/statsample/test/kolmogorovsmirnov.rb
lib/statsample/test/levene.rb
lib/statsample/test/t.rb
lib/statsample/test/umannwhitney.rb
lib/statsample/test/wilcoxonsignedrank.rb
lib/statsample/vector.rb
lib/statsample/vector/gsl.rb
lib/statsample/version.rb
po/es/statsample.mo
po/es/statsample.po
po/statsample.pot
references.txt
setup.rb
test/fixtures/bank2.dat
test/fixtures/correlation_matrix.rb
test/fixtures/hartman_23.matrix
test/fixtures/repeated_fields.csv
test/fixtures/stock_data.csv
test/fixtures/test_csv.csv
test/fixtures/test_xls.xls
test/fixtures/tetmat_matrix.txt
test/fixtures/tetmat_test.txt
test/helpers_tests.rb
test/test_analysis.rb
test/test_anova_contrast.rb
test/test_anovaoneway.rb
test/test_anovatwoway.rb
test/test_anovatwowaywithdataset.rb
test/test_anovawithvectors.rb
test/test_bartlettsphericity.rb
test/test_bivariate.rb
test/test_codification.rb
test/test_crosstab.rb
test/test_csv.rb
test/test_dataset.rb
test/test_dominance_analysis.rb
test/test_factor.rb
test/test_factor_map.rb
test/test_factor_pa.rb
test/test_ggobi.rb
test/test_gsl.rb
test/test_histogram.rb
test/test_matrix.rb
test/test_multiset.rb
test/test_regression.rb
test/test_reliability.rb
test/test_reliability_icc.rb
test/test_reliability_skillscale.rb
test/test_resample.rb
test/test_rserve_extension.rb
test/test_srs.rb
test/test_statistics.rb
test/test_stest.rb
test/test_stratified.rb
test/test_test_f.rb
test/test_test_kolmogorovsmirnov.rb
test/test_test_t.rb
test/test_umannwhitney.rb
test/test_vector.rb
test/test_wilcoxonsignedrank.rb
test/test_xls.rb
web/Rakefile
================================================
FILE: README.md
================================================
# Statsample
Homepage :: https://github.com/sciruby/statsample
[](https://travis-ci.org/clbustos/statsample)
[](http://badge.fury.io/rb/statsample)
## DESCRIPTION
A suite for basic and advanced statistics on Ruby. Tested on Ruby 2.1.1p76 (June 2014), 1.8.7, 1.9.1, 1.9.2 (April, 2010), ruby-head(June, 2011) and JRuby 1.4 (Ruby 1.8.7 compatible).
Include:
* Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
* Imports and exports datasets from and to Excel, CSV and plain text files.
* Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlation provided by the +statsample-bivariate-extension+ gem.
* Intra-class correlation
* Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
* Tests: F, T, Levene, U-Mannwhitney.
* Regression: Simple, Multiple (OLS), Probit and Logit
* Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
* Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
* Basic time series support
* Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
* Sample calculation related formulas
* Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+
* Creates reports on text, html and rtf, using ReportBuilder gem
* Graphics: Histogram, Boxplot and Scatterplot
## Principles
* Software Design:
* One module/class for each type of analysis
* Options can be set as hash on initialize() or as setters methods
* Clean API for interactive sessions
* summary() returns all necessary information for interactive sessions
* All statistical data available through methods on objects
* All (important) methods should be tested. Better with random data.
* Statistical Design
* Results are tested against text results, SPSS and R outputs.
* Go beyond Null Hypothesis Testing, using confidence intervals and effect sizes when possible
* (When possible) All references for methods are documented, providing sensible information on documentation
## Features
* Classes for manipulation and storage of data:
* Statsample::Vector: An extension of an array, with statistical methods like sum, mean and standard deviation
* Statsample::Dataset: a group of Statsample::Vector, analogous to an Excel spreadsheet or a dataframe on R. The base of almost all operations on statsample.
* Statsample::Multiset: multiple datasets with same fields and type of vectors
* Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrast using Statsample::Anova::Contrast
* Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
* Multiple types of regression.
* Simple Regression : Statsample::Regression::Simple
* Multiple Regression: Statsample::Regression::Multiple
* Logit Regression: Statsample::Regression::Binomial::Logit
* Probit Regression: Statsample::Regression::Binomial::Probit
* Factorial Analysis algorithms on Statsample::Factor module.
* Classes for Extraction of factors:
* Statsample::Factor::PCA
* Statsample::Factor::PrincipalAxis
* Classes for Rotation of factors:
* Statsample::Factor::Varimax
* Statsample::Factor::Equimax
* Statsample::Factor::Quartimax
* Classes for calculation of factors to retain
* Statsample::Factor::ParallelAnalysis performs Horn's 'parallel analysis' to a principal components analysis to adjust for sample bias in the retention of components.
* Statsample::Factor::MAP performs Velicer's Minimum Average Partial (MAP) test, which retain components as long as the variance in the correlation matrix represents systematic variance.
* Dominance Analysis. Based on Budescu and Azen papers, dominance analysis is a method to analyze the relative importance of one predictor relative to another on multiple regression
* Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
* Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
* Module Statsample::Codification, to help to codify open questions
* Converters to import and export data:
* Statsample::Database : Can create sql to create tables, read and insert data
* Statsample::CSV : Read and write CSV files
* Statsample::Excel : Read and write Excel files
* Statsample::Mx : Write Mx Files
* Statsample::GGobi : Write Ggobi files
* Module Statsample::Crosstab provides function to create crosstab for categorical data
* Module Statsample::Reliability provides functions to analyze scales with psychometric methods.
* Class Statsample::Reliability::ScaleAnalysis provides statistics like mean, standard deviation for a scale, Cronbach's alpha and standardized Cronbach's alpha, and for each item: mean, correlation with total scale, mean if deleted, Cronbach's alpha if deleted.
* Class Statsample::Reliability::MultiScaleAnalysis provides a DSL to easily analyze reliability of multiple scales and retrieve correlation matrix and factor analysis of them.
* Class Statsample::Reliability::ICC provides intra-class correlation, using Shrout & Fleiss(1979) and McGraw & Wong (1996) formulations.
* Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several type of samples
* Module Statsample::Test provides several methods and classes to perform inferential statistics
* Statsample::Test::BartlettSphericity
* Statsample::Test::ChiSquare
* Statsample::Test::F
* Statsample::Test::KolmogorovSmirnov (only D value)
* Statsample::Test::Levene
* Statsample::Test::UMannWhitney
* Statsample::Test::T
* Statsample::Test::WilcoxonSignedRank
* Module Graph provides several classes to create beautiful graphs using rubyvis
* Statsample::Graph::Boxplot
* Statsample::Graph::Histogram
* Statsample::Graph::Scatterplot
* Gem bio-statsample-timeseries provides module Statsample::TimeSeries with support for time series, including ARIMA estimation using Kalman-Filter.
* Gem statsample-sem provides a DSL to R libraries +sem+ and +OpenMx+
* Gem statsample-glm provides you with GLM methods, to work with Logistic, Poisson and Gaussian regression, using ML or IRWLS.
* Close integration with gem reportbuilder, to easily create reports on text, html and rtf formats.
# Examples of use:
See the [examples folder](https://github.com/clbustos/statsample/tree/master/examples/) too.
## Boxplot
```ruby
require 'statsample'
ss_analysis(Statsample::Graph::Boxplot) do
n=30
a=rnorm(n-1,50,10)
b=rnorm(n, 30,5)
c=rnorm(n,5,1)
a.push(2)
boxplot(:vectors=>[a,b,c], :width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
end
Statsample::Analysis.run # Open svg file on *nix application defined
```
## Correlation matrix
```ruby
require 'statsample'
# Note R like generation of random gaussian variable
# and correlation matrix
ss_analysis("Statsample::Bivariate.correlation_matrix") do
samples=1000
ds=data_frame(
'a'=>rnorm(samples),
'b'=>rnorm(samples),
'c'=>rnorm(samples),
'd'=>rnorm(samples))
cm=cor(ds)
summary(cm)
end
Statsample::Analysis.run_batch # Echo output to console
```
# Requirements
Optional:
* Plotting: gnuplot and rbgnuplot, SVG::Graph
* Factorial analysis and polychorical correlation(joint estimate and polychoric series): gsl library and rb-gsl (https://rubygems.org/gems/rb-gsl/). You should install it using gem install rb-gsl.
*Note*: Use gsl 1.12.109 or later.
# Resources
* Source code on github :: http://github.com/clbustos/statsample
* Docs :: http://statsample.apsique.cl/
* Bug report and feature request :: http://github.com/clbustos/statsample/issues
* E-mailing list :: http://groups.google.com/group/statsample
# Installation
```bash
$ sudo gem install statsample
```
On *nix, you should install statsample-optimization to retrieve gems gsl, statistics2 and a C extension to speed some methods.
Precompiled versions are available for Ruby 1.9 on x86, x86_64 and mingw32 archs.
```bash
$ sudo gem install statsample-optimization
```
If you use Ruby 1.8, you should compile statsample-optimization, using the parameter --platform ruby
```bash
$ sudo gem install statsample-optimization --platform ruby
```
If you need to work on Structural Equation Modeling, you could see +statsample-sem+. You need R with +sem+ or +OpenMx+ [http://openmx.psyc.virginia.edu/] libraries installed
```bash
$ sudo gem install statsample-sem
```
Available setup.rb file
```bash
sudo ruby setup.rb
```
## License
BSD-3 (See LICENSE.txt)
The license could change between versions without prior warning. If you want a specific license, just choose the version that you need.
================================================
FILE: Rakefile
================================================
#!/usr/bin/ruby
# -*- ruby -*-
# -*- coding: utf-8 -*-
# Rakefile for statsample: linting, gettext catalog maintenance,
# Hoe-based gem packaging, rdoc generation and doc publishing.
$:.unshift(File.dirname(__FILE__)+'/lib/')
require 'rubygems'
require 'statsample'
require 'hoe'
require 'rdoc'

Hoe.plugin :git
Hoe.plugin :doofus

desc "Ruby Lint"
task :lint do
  # BUG FIX: Config::CONFIG was deprecated and removed in Ruby 2.2;
  # RbConfig::CONFIG is the supported replacement.
  executable = RbConfig::CONFIG['RUBY_INSTALL_NAME']
  Dir.glob("lib/**/*.rb") { |f|
    if !system %{#{executable} -w -c "#{f}"}
      puts "Error on: #{f}"
    end
  }
end

task :release do
  system %{git push origin master}
end

task "clobber_docs" do
  # Only to omit warnings
end

desc "Update pot/po files."
task "gettext:updatepo" do
  require 'gettext/tools'
  GetText.update_pofiles("statsample", Dir.glob("{lib,bin}/**/*.{rb,rhtml}"), "statsample #{Statsample::VERSION}")
end

desc "Create mo-files"
task "gettext:makemo" do
  require 'gettext/tools'
  GetText.create_mofiles()
  # GetText.create_mofiles(true, "po", "locale") # This is for "Ruby on Rails".
end

# Gem specification via Hoe.
h = Hoe.spec('statsample') do
  self.version = Statsample::VERSION
  self.urls = ["https://github.com/clbustos/statsample"]
  #self.testlib=:minitest
  self.readme_file = 'README.md'
  self.urls = ['https://github.com/clbustos/statsample']
  self.developer('Claudio Bustos', 'clbustos@gmail.com')
  self.extra_deps << ["spreadsheet","~>0.6"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client"] << ["rubyvis"] << ["distribution"]
  self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>3"] << ["minitest", "~>2"] << ["gettext", "~>0"] << ["mocha", "~>0"] << ["hoe-git", "~>0"]
  self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
  self.post_install_message = <<-EOF
***************************************************
Thanks for installing statsample.
On *nix, you could install statsample-optimization
to retrieve gems gsl, statistics2 and a C extension
to speed some methods.
$ sudo gem install statsample-optimization
On Ubuntu, install build-essential and libgsl0-dev
using apt-get. Compile ruby 1.8 or 1.9 from
source code first.
$ sudo apt-get install build-essential libgsl0-dev
*****************************************************
  EOF
  self.need_rdoc = false
end

if Rake.const_defined?(:RDocTask)
  Rake::RDocTask.new(:docs) do |rd|
    rd.main = h.readme_file
    rd.options << '-d' if (`which dot` =~ /\/dot/) unless
      ENV['NODOT'] || Hoe::WINDOZE
    rd.rdoc_dir = 'doc'
    rd.rdoc_files.include("lib/**/*.rb")
    rd.rdoc_files += h.spec.extra_rdoc_files
    rd.rdoc_files.reject! { |f| f == "Manifest.txt" }
    title = h.spec.rdoc_options.grep(/^(-t|--title)=?$/).first
    if title then
      rd.options << title
      unless title =~ /\=/ then # for ['-t', 'title here']
        # BUG FIX: `spec` was an undefined local variable here (NameError
        # whenever a separate title argument was configured); use h.spec.
        title_index = h.spec.rdoc_options.index(title)
        rd.options << h.spec.rdoc_options[title_index + 1]
      end
    else
      title = "#{h.name}-#{h.version} Documentation"
      title = "#{h.rubyforge_name}'s " + title if h.rubyforge_name != h.name
      rd.options << '--title' << title
    end
  end
end

desc 'Publish rdocs with analytics support'
task :publicar_docs => [:clean] do
  # ruby %{agregar_adsense_a_doc.rb}
  path = File.expand_path("./doc.yaml")
  config = YAML.load(File.read(path))
  host = "#{config["user"]}@#{config["host"]}"
  remote_dir = config["dir"]
  local_dir = h.local_rdoc_dir
  Dir.glob(local_dir + "/**/*") { |file|
    sh %{chmod 755 #{file}}
  }
  sh %{rsync #{h.rsync_args} #{local_dir}/ #{host}:#{remote_dir}}
end
# vim: syntax=Ruby
================================================
FILE: benchmarks/correlation_matrix_15_variables.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
extend BenchPress

cases=250
vars=20

name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
author 'Clbustos'
date '2011-01-18'
summary "
A correlation matrix could be constructed using matrix algebra or
mannualy, calculating covariances, means and sd for each pair of vectors.
In this test, we test the calculation using #{vars} variables with
#{cases} cases on each vector
"
reps 200 #number of repetitions

# Build one dataset with `vars` columns ("x0".."x19"), each holding
# `cases` uniform random values; both measures share it.
ds = (0...vars).inject({}) do |columns, i|
  columns["x#{i}"] = Statsample::Vector.new_scale(cases) { rand() }
  columns
end.to_dataset

measure "Statsample::Bivariate.correlation_matrix_optimized" do
  Statsample::Bivariate.correlation_matrix_optimized(ds)
end

measure "Statsample::Bivariate.correlation_matrix_pairwise" do
  Statsample::Bivariate.correlation_matrix_pairwise(ds)
end
================================================
FILE: benchmarks/correlation_matrix_5_variables.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
extend BenchPress

cases=500
vars=5

name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
author 'Clbustos'
date '2011-01-18'
summary "
A correlation matrix could be constructed using matrix algebra or
mannualy, calculating covariances, means and sd for each pair of vectors.
In this test, we test the calculation using #{vars} variables with
#{cases} cases on each vector
"
reps 200 #number of repetitions

# Build one dataset with `vars` columns ("x0".."x4"), each holding
# `cases` uniform random values; both measures share it.
ds = (0...vars).inject({}) do |columns, i|
  columns["x#{i}"] = Statsample::Vector.new_scale(cases) { rand() }
  columns
end.to_dataset

measure "Statsample::Bivariate.correlation_matrix_optimized" do
  Statsample::Bivariate.correlation_matrix_optimized(ds)
end

measure "Statsample::Bivariate.correlation_matrix_pairwise" do
  Statsample::Bivariate.correlation_matrix_pairwise(ds)
end
================================================
FILE: benchmarks/correlation_matrix_methods/correlation_matrix.html
================================================
Correlation matrix analysis
Correlation matrix analysis
Multiple reggresion of cases,vars,c_v on time_optimized
Engine: Statsample::Regression::Multiple::RubyEngine
Cases(listwise)=63(63)
R=0.978844
R^2=0.958137
R^2 Adj=0.956008
Std.Error R=3.092024
Equation=4.031667 + 0.018039cases + 0.244790vars + 0.001197c_v
ANOVA
ANOVA Table| source | ss | df | ms | f | p |
| Regression | 12910.098 | 3 | 4303.366 | 450.114 | 0.000 |
| Error | 564.076 | 59 | 9.561 | | |
| Total | 13474.174 | 62 | 4312.927 | | |
Beta coefficients| coeff | b | beta | se | t |
| Constant | 4.031667 | - | 0.752604 | 5.356953 |
| cases | 0.018039 | 0.381587 | 0.001961 | 9.200093 |
| vars | 0.244790 | 0.224390 | 0.036055 | 6.789335 |
| c_v | 0.001197 | 0.584174 | 0.000094 | 12.738410 |
Multiple reggresion of cases,vars,c_v on time_pairwise
Engine: Statsample::Regression::Multiple::RubyEngine
Cases(listwise)=63(63)
R=0.999637
R^2=0.999275
R^2 Adj=0.999238
Std.Error R=0.538365
Equation=-0.520303 + -0.000708cases + 1.234451vars + 0.000735c_v
ANOVA
ANOVA Table| source | ss | df | ms | f | p |
| Regression | 23554.271 | 3 | 7851.424 | 27089.134 | 0.000 |
| Error | 17.100 | 59 | 0.290 | | |
| Total | 23571.372 | 62 | 7851.714 | | |
Beta coefficients| coeff | b | beta | se | t |
| Constant | -0.520303 | - | 0.131039 | -3.970594 |
| cases | -0.000708 | -0.011324 | 0.000341 | -2.074007 |
| vars | 1.234451 | 0.855546 | 0.006278 | 196.641087 |
| c_v | 0.000735 | 0.271138 | 0.000016 | 44.912972 |
================================================
FILE: benchmarks/correlation_matrix_methods/correlation_matrix.rb
================================================
# This test create a database to adjust the best algorithm
# to use on correlation matrix
require(File.expand_path(File.dirname(__FILE__)+'/../helpers_benchmark.rb'))
require 'statsample'
require 'benchmark'
# Build a dataset of `vars` columns ("x0", "x1", ...), each a scale vector
# of `cases` standard-normal random values.
# Returns a Statsample::Dataset.
# (Removed the useless assignment to the unused local `ds`.)
def create_dataset(vars,cases)
  rng = Distribution::Normal.rng
  columns = {}
  vars.times do |i|
    columns["x#{i}"] = Statsample::Vector.new_scale(cases) { rng.call }
  end
  columns.to_dataset
end
# Predicted (scaled) running time of the pairwise algorithm for a
# dataset of `vars` variables and `cases` cases.
def prediction_pairwise(vars,cases)
  raw = Statsample::Bivariate.prediction_pairwise(vars, cases)
  raw / 10
end
# Predicted (scaled) running time of the matrix-optimized algorithm for
# a dataset of `vars` variables and `cases` cases.
def prediction_optimized(vars,cases)
  raw = Statsample::Bivariate.prediction_optimized(vars, cases)
  raw / 10
end
# Build (or load from cache) a dataset of benchmark timings for the two
# correlation-matrix algorithms over a grid of sizes, then model the
# timings with multiple regression and write an HTML report.
# BUG FIX: File.exists? was deprecated and removed in Ruby 3.2; use File.exist?.
if !File.exist?("correlation_matrix.ds") or File.mtime(__FILE__) > File.mtime("correlation_matrix.ds")
  reps=100 #number of repetitions
  ds_sizes=[5,10,30,50,100,150,200,500,1000]
  ds_vars=[3,4,5,10,20,30,40]
  #ds_sizes=[5,10]
  #ds_vars=[3,5,20]
  rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise})
  ds_sizes.each do |cases|
    ds_vars.each do |vars|
      ds=create_dataset(vars,cases)
      time_optimized= Benchmark.realtime do
        reps.times {
          Statsample::Bivariate.correlation_matrix_optimized(ds)
          ds.clear_gsl
        }
      end
      time_pairwise= Benchmark.realtime do
        reps.times {
          Statsample::Bivariate.correlation_matrix_pairwise(ds)
        }
      end
      puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)]
      # Store sqrt(ms) to linearize the time/size relation for the regression.
      rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)})
    end
  end
else
  rs=Statsample.load("correlation_matrix.ds")
end
rs.fields.each {|f| rs[f].type=:scale}
# Interaction term: cases * vars.
rs['c_v']=rs.collect {|row| row['cases']*row['vars']}
rs.update_valid_data
rs.save("correlation_matrix.ds")
Statsample::Excel.write(rs,"correlation_matrix.xls")
rb=ReportBuilder.new(:name=>"Correlation matrix analysis")
rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized','c_v']],'time_optimized', :digits=>6))
rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise','c_v']],'time_pairwise', :digits=>6))
rb.save_html("correlation_matrix.html")
================================================
FILE: benchmarks/factor_map.rb
================================================
# Benchmark: Velicer's MAP test computed with plain Ruby Matrix algebra
# versus the GSL backend, on a fixed 8x8 correlation matrix.
require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
extend BenchPress
name "Statsample::Factor::Map with and without GSL"
author 'Clbustos'
date '2011-01-18'
summary "Velicer's MAP uses a lot of Matrix algebra. How much we can improve the timing using GSL?
"
reps 20 #number of repetitions
# Fixed 8x8 correlation matrix used as input for both measures
# (two visible clusters of highly-correlated variables).
m=Matrix[
[ 1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382],
[ 0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415],
[ 0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345],
[ 0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365],
[ 0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629],
[ 0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577],
[ 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539],
[ 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1]
]
# Both measures reuse this single MAP object and toggle its backend via
# use_gsl before recomputing.
map=Statsample::Factor::MAP.new(m)
measure "Statsample::Factor::MAP without GSL" do
map.use_gsl=false
map.compute
end
measure "Statsample::Factor::MAP with GSL" do
map.use_gsl=true
map.compute
end
================================================
FILE: benchmarks/helpers_benchmark.rb
================================================
$:.unshift(File.expand_path(File.dirname(__FILE__)+'/../lib/'))
$:.unshift(File.expand_path(File.dirname(__FILE__)+'/'))
require 'statsample'
require 'bench_press'
================================================
FILE: doc_latex/manual/equations.tex
================================================
\part{Equations}
\section{Convention}
\begin{align*}
n &= \text{sample size}\\
N &= \text{population size}\\
p &= \text{proportion inside a sample}\\
P &= \text{proportion inside a population}
\end{align*}
\section{Ruby::Regression::Multiple}
To compute the standard error of coefficients, you obtain the estimated variance-covariance matrix of error.
Let $\mathbf{X}$ be the matrix of predictor data, including a constant column; $\mathbf{MSE}$ the mean square error; $SSE$ the sum of squares of errors; $n$ the number of cases; and $p$ the number of predictors.
\begin{equation}
\mathbf{MSE}=\frac{SSE}{n-p-1}
\end{equation}
\begin{equation}
\mathbf{E}=(\mathbf{X}'\mathbf{X})^{-1}\mathbf{MSE}
\end{equation}
The square roots of the diagonal elements are the standard errors.
\section{Ruby::SRS}
The finite population correction (FPC) is used in the standard error calculation for populations below 10,000. The function
\begin{verbatim}
fpc_var(sam,pop)
\end{verbatim}
calculate FPC for variance with
\begin{equation}
fpc_{var} = \frac{N-n} {N-1}
\end{equation}
with n as sam and N as pop
Function
\begin{verbatim}
fpc = fpc(sam,pop)
\end{verbatim}
calculate FPC for standard deviation with
\begin{equation}
fpc_{sd} = \sqrt{\frac{N-n} {N-1}}
\label{fpc}
\end{equation}
with n as sample size and N as population size.
\subsection{Sample Size estimation for proportions}
For infinite populations, you should use the method
\begin{verbatim}
estimation_n0(d,prop,margin=0.95)
\end{verbatim}
which uses
\begin{equation}
n = \frac{t^2(pq)}{d^2}
\label{n_i}
\end{equation}
where
\begin{align*}
t &= \text{t value for given level of confidence ( 1.96 for 95\% )}\\
d &= \text{margin of error}
\end{align*}
For finite populations, you should use
\begin{verbatim}
estimation_n(d,prop,n_pobl, margin=0.95)
\end{verbatim}
which uses
\begin{equation}
n = \frac{n_i}{1+(\frac{n_i-1}{N})}
\end{equation}
Where $n_i$ is n on \ref{n_i} and N is population size
================================================
FILE: examples/boxplot.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: boxplot of three random normal vectors, grouped in two groups.
Statsample::Analysis.store(Statsample::Graph::Boxplot) do
  n = 30
  a = rnorm(n - 1, 50, 10)
  b = rnorm(n, 30, 5)
  c = rnorm(n, 5, 1)
  # Append an artificial low outlier to the first vector.
  a.push(2)
  boxplot(:vectors => [a, b, c],
          :width   => 300,
          :height  => 300,
          :groups  => %w{first first second},
          :minimum => 0)
end

if __FILE__ == $0
  Statsample::Analysis.run # Open svg file on *nix application defined
end
================================================
FILE: examples/correlation_matrix.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: correlation matrix of four independent standard-normal
# variables (R-like generation of random gaussian vectors).
Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
  samples = 1000
  columns = {}
  %w{a b c d}.each { |field| columns[field] = rnorm(samples) }
  ds = data_frame(columns)
  cm = cor(ds)
  summary(cm)
end

if __FILE__ == $0
  Statsample::Analysis.run_batch # Echo output to console
end
================================================
FILE: examples/dataset.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: summary of a dataset whose vectors contain missing values
# (each draw is 0..4, and a 4 is recoded as nil).
Statsample::Analysis.store(Statsample::Dataset) do
  samples = 1000
  first = Statsample::Vector.new_scale(samples) do
    value = rand(5)
    value == 4 ? nil : value
  end
  second = Statsample::Vector.new_scale(samples) do
    value = rand(5)
    value == 4 ? nil : value
  end
  ds = {'a' => first, 'b' => second}.to_dataset
  summary(ds)
end

if __FILE__ == $0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/dominance_analysis.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: dominance analysis on simulated data with known true
# regression weights (y = 5a + 3b + 2cc + d + error).
Statsample::Analysis.store(Statsample::DominanceAnalysis) do
  sample=300
  a=rnorm(sample)
  b=rnorm(sample)
  c=rnorm(sample)
  d=rnorm(sample)
  ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset
  attach(ds)
  # FIX: use the shared `sample` size for the error term instead of a
  # hard-coded 300, so changing `sample` keeps all vectors conformable.
  ds['y']=a*5+b*3+cc*2+d+rnorm(sample)
  cm=cor(ds)
  summary(cm)
  lr=lr(ds,'y')
  summary(lr)
  # Dominance analysis with each predictor on its own...
  da=dominance_analysis(ds,'y')
  summary(da)
  # ...and again with 'cc' and 'd' treated as one group of predictors.
  da=dominance_analysis(ds,'y',:name=>"Dominance Analysis using group of predictors", :predictors=>['a', 'b', %w{cc d}])
  summary(da)
end
if __FILE__==$0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/dominance_analysis_bootstrap.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
# Example: bootstrap dominance analysis, first with two dependent
# variables (y1, y2), then with a single dependent variable (y1).
Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
sample=300
a=rnorm(sample)
b=rnorm(sample)
c=rnorm(sample)
d=rnorm(sample)
a.name="a"
b.name="b"
c.name="c"
d.name="d"
# NOTE(review): vector c is stored under field 'cc' although its name is
# set to "c" — presumably intentional (matches other examples); verify.
ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset
attach(ds)
# y1 depends on all four predictors; y2 depends (mostly) on `a` alone.
ds['y1']=a*5+b*2+cc*2+d*2+rnorm(sample,0,10)
ds['y2']=a*10+rnorm(sample)
# Bootstrap (100 resamples) with both dependent variables.
dab=dominance_analysis_bootstrap(ds, ['y1','y2'], :debug=>true)
dab.bootstrap(100,nil)
summary(dab)
# Repeat on the field subset 'a'..'y1' with a single dependent variable.
ds2=ds['a'..'y1']
dab2=dominance_analysis_bootstrap(ds2, 'y1', :debug=>true)
dab2.bootstrap(100,nil)
summary(dab2)
end
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: examples/histogram.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: histogram of 3000 draws from a normal distribution N(0, 20).
Statsample::Analysis.store(Statsample::Graph::Histogram) do
  values = rnorm(3000, 0, 20)
  histogram(values)
end

if __FILE__ == $0
  Statsample::Analysis.run
end
================================================
FILE: examples/icc.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
# Example: intra-class correlation (ICC) for four "raters" whose scores
# are a common base rating plus random disagreement in [-2, 2).
Statsample::Analysis.store(Statsample::Reliability::ICC) do
size=1000
a=Statsample::Vector.new_scale(size) {rand(10)}
# b, c and d are jittered copies of a.
b=a.recode{|i|i+rand(4)-2}
c=a.recode{|i|i+rand(4)-2}
d=a.recode{|i|i+rand(4)-2}
@ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
# Summarize the default ICC type, then two alternative formulations.
@icc=Statsample::Reliability::ICC.new(@ds)
summary(@icc)
@icc.type=:icc_3_1
summary(@icc)
@icc.type=:icc_a_k
summary(@icc)
end
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: examples/levene.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: Levene's test for homogeneity of variances between two groups
# (the first group contains an outlier, 100).
Statsample::Analysis.store(Statsample::Test::Levene) do
  group_a = [1, 2, 3, 4, 5, 6, 7, 8, 100, 10].to_scale
  group_b = [30, 40, 50, 60, 70, 80, 90, 100, 110, 120].to_scale
  result = levene([group_a, group_b])
  summary(result)
end

if __FILE__ == $0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/multiple_regression.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: OLS multiple regression on simulated predictors with known
# true weights (y = 5a + 3b + 2cc + d + error).
Statsample::Analysis.store(Statsample::Regression::Multiple) do
  samples = 2000
  ds = dataset('a'  => rnorm(samples),
               'b'  => rnorm(samples),
               'cc' => rnorm(samples),
               'd'  => rnorm(samples))
  attach(ds) # expose the fields as methods: a, b, cc, d
  ds['y'] = a * 5 + b * 3 + cc * 2 + d + rnorm(samples)
  regression = lr(ds, 'y')
  summary(regression)
end

if __FILE__ == $0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/multivariate_correlation.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
# NOTE(review): 'mathn' was deprecated in Ruby 2.2 and removed from the
# standard library in Ruby 2.5, so this example raises LoadError on
# modern rubies. Confirm whether it is still needed before removing —
# mathn globally changes integer division semantics.
require 'mathn'
# Example: multivariate regression (several dependent variables) from a
# fixed 9x9 correlation matrix.
Statsample::Analysis.store(Statsample::Regression::Multiple::MultipleDependent) do
complete=Matrix[
[1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08],
[0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15],
[0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12],
[0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02],
[-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02],
[0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36],
[0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05],
[-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03],
[0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]]
complete.extend Statsample::CovariateMatrix
# First three fields (adhd, cd, odd) are the dependent variables below.
complete.fields=%w{adhd cd odd sex age monly mwork mage poverty}
lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd})
echo "R^2_yx #{lr.r2yx}"
echo "P^2_yx #{lr.p2yx}"
end
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: examples/parallel_analysis.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
# Simulation parameters: 150 cases, 30 observed variables built from
# 3 latent factors, 50 parallel-analysis iterations.
samples=150
variables=30
iterations=50
Statsample::Analysis.store(Statsample::Factor::ParallelAnalysis) do
rng = Distribution::Normal.rng()
# Three latent factors.
f1=rnorm(samples)
f2=rnorm(samples)
f3=rnorm(samples)
vectors={}
variables.times do |i|
# Each observed variable mixes the three factors with weights that vary
# by index i; the third component also carries multiplicative noise.
vectors["v#{i}"]=samples.times.collect {|nv| f1[nv]*i+(f2[nv]*(15-i))+((f3[nv]*(30-i))*1.5)*rng.call}.to_scale
vectors["v#{i}"].name="Vector #{i}"
end
ds=vectors.to_dataset
pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>iterations, :debug=>true)
# Compare the traditional Kaiser criterion (PCA eigenvalues > 1)
# against parallel analysis on the same correlation matrix.
pca=pca(cor(ds))
echo "There are 3 real factors on data"
summary pca
echo "Traditional Kaiser criterion (k>1) returns #{pca.m} factors"
summary pa
echo "Parallel Analysis returns #{pa.number_of_factors} factors to preserve"
end
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: examples/polychoric.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
# FIX: removed a developer-specific load path (/home/cdx/...); the
# statsample-bivariate-extension gem provides Polychoric when installed.
require 'statsample'

# Example: polychoric correlation on a random 3x3 contingency table,
# estimated with each of the three available methods.
Statsample::Analysis.store(Statsample::Bivariate::Polychoric) do
  ct=Matrix[[rand(10)+50, rand(10)+50, rand(10)+1],
            [rand(20)+5, rand(50)+4, rand(10)+1],
            [rand(8)+1, rand(12)+1, rand(10)+1]]
  # Estimation of polychoric correlation using two-step (default)
  poly=polychoric(ct, :name=>"Polychoric with two-step", :debug=>false)
  summary poly
  # Estimation of polychoric correlation using joint method (slow)
  poly=polychoric(ct, :method=>:joint, :name=>"Polychoric with joint")
  summary poly
  # Uses polychoric series (not recommended)
  poly=polychoric(ct, :method=>:polychoric_series, :name=>"Polychoric with polychoric series")
  summary poly
end
if __FILE__==$0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/principal_axis.rb
================================================
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'

# Example: principal-axis extraction of one factor from a fixed 4x4
# correlation matrix.
Statsample::Analysis.store(Statsample::Factor::PrincipalAxis) do
  matrix = Matrix[
    [1.0,               0.709501601093587, 0.877596585880047,  0.272219316266807],
    [0.709501601093587, 1.0,               0.291633797330304,  0.871141831433844],
    [0.877596585880047, 0.291633797330304, 1.0,               -0.213373722977167],
    [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]
  ]
  matrix.extend Statsample::CovariateMatrix
  #matrix.fields=%w{a b c d}
  fa = principal_axis(matrix, :m => 1, :smc => false)
  summary fa
end

if __FILE__ == $0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/reliability.rb
================================================
#!/usr/bin/ruby
# Demo of scale reliability analysis: 20 parallel items built from one
# common factor plus small independent noise, analysed as a whole and
# split in two sub-scales.
$:.unshift(File.dirname(__FILE__)+'/../lib')
require 'statsample'
Statsample::Analysis.store(Statsample::Reliability) do
samples=100
# Common latent score shared by every item.
a=rnorm(samples)
ds=Statsample::Dataset.new
20.times do |i|
# Each item = latent score + N(0, 0.2) noise.
ds["v#{i}"]=a+rnorm(samples,0,0.2)
end
ds.update_valid_data
rel=Statsample::Reliability::ScaleAnalysis.new(ds)
summary rel
# NOTE(review): v0 is not used by either sub-scale and Scale 2 has only
# nine items (v11..v19) — possibly intentional, worth confirming.
ms=Statsample::Reliability::MultiScaleAnalysis.new(:name=>"Multi Scale analyss") do |m|
m.scale "Scale 1", ds.clone(%w{v1 v2 v3 v4 v5 v6 v7 v8 v9 v10})
m.scale "Scale 2", ds.clone(%w{v11 v12 v13 v14 v15 v16 v17 v18 v19})
end
summary ms
end
# Run every stored analysis when executed directly (not on require).
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: examples/scatterplot.rb
================================================
#!/usr/bin/ruby
# Demo of the SVG scatterplot: y is a noisy linear function of x.
$:.unshift(File.dirname(__FILE__)+'/../lib/')
# NOTE(review): hard-coded developer path; should probably be removed.
$:.unshift('/home/cdx/dev/reportbuilder/lib/')
# NOTE(review): 'benchmark' does not appear to be used in this script.
require 'benchmark'
require 'statsample'
# Captured by the analysis block as a closure variable.
n=100
Statsample::Analysis.store(Statsample::Graph::Scatterplot) do
x=rnorm(n)
y=x+rnorm(n,0.5,0.2)
scatterplot(x,y)
end
# Uses run (not run_batch) so Suite#scatterplot opens the SVG in a browser.
if __FILE__==$0
Statsample::Analysis.run
end
================================================
FILE: examples/t_test.rb
================================================
#!/usr/bin/ruby
# Demo of one-sample and two-independent-samples t tests on random data.
$:.unshift(File.dirname(__FILE__)+'/../lib')
require 'statsample'
Statsample::Analysis.store(Statsample::Test::T) do
a=rnorm(10)
# One-sample test of mean against u=50; data is ~N(0,1), so this
# should strongly reject — presumably chosen to show a significant result.
t_1=Statsample::Test.t_one_sample(a,{:u=>50})
summary t_1
# Second sample with mean 2.
b=rnorm(10,2)
t_2=Statsample::Test.t_two_samples_independent(a,b)
summary t_2
end
# Run every stored analysis when executed directly (not on require).
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: examples/tetrachoric.rb
================================================
#!/usr/bin/ruby
# Demo of the tetrachoric correlation for a fixed 2x2 contingency table.
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
Statsample::Analysis.store(Statsample::Bivariate::Tetrachoric) do
  # Cell counts of the 2x2 table, in the conventional a/b/c/d layout.
  cell_a, cell_b, cell_c, cell_d = 40, 10, 20, 30
  summary tetrachoric(cell_a, cell_b, cell_c, cell_d)
end
# Run every stored analysis when executed directly (not on require).
if __FILE__==$0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/u_test.rb
================================================
#!/usr/bin/ruby
# Demo of the Mann-Whitney U test on two random samples with clearly
# different distributions.
$:.unshift(File.dirname(__FILE__)+'/../lib')
require 'statsample'
Statsample::Analysis.store(Statsample::Test::UMannWhitney) do
  # 10 uniform draws in 0...100.
  sample_a = Array.new(10) { rand(100) }.to_scale
  # 20 draws with a shifted, skewed distribution.
  sample_b = Array.new(20) { rand(20)**2 + 50 }.to_scale
  summary Statsample::Test::UMannWhitney.new(sample_a, sample_b)
end
# Run every stored analysis when executed directly (not on require).
if __FILE__==$0
  Statsample::Analysis.run_batch
end
================================================
FILE: examples/vector.rb
================================================
#!/usr/bin/ruby
# Demo of vector summaries, including missing data and the R-like c()
# shorthand.
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
Statsample::Analysis.store(Statsample::Vector) do
# 1000 draws of 0..3; roughly one fifth of the values become nil
# (missing) because rand(5)==4 maps to nil.
a=Statsample::Vector.new_scale(1000) {r=rand(5); r==4 ? nil: r;}
summary a
# R-style c(): scalars and ranges are flattened into one vector.
b=c(1,2,3,4,6..10)
summary b
end
# Run every stored analysis when executed directly (not on require).
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: examples/velicer_map_test.rb
================================================
#!/usr/bin/ruby
# Demo of Velicer's Minimum Average Partial (MAP) test for choosing how
# many factors to retain, contrasted with the Kaiser (eigenvalue > 1)
# criterion on data generated from exactly two latent factors.
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
Statsample::Analysis.store(Statsample::Factor::MAP) do
rng=Distribution::Normal.rng
samples=100
variables=10
# Two independent latent factors.
f1=rnorm(samples)
f2=rnorm(samples)
vectors={}
variables.times do |i|
# First five observed variables load mainly on f1, the rest on f2;
# rng.call adds unique noise per observation.
vectors["v#{i}"]=samples.times.collect {|nv|
if i<5
f1[nv]*5 + f2[nv] *2 +rng.call
else
f1[nv]*2 + f2[nv] *3 +rng.call
end
}.to_scale
end
ds=vectors.to_dataset
cor=cor(ds)
pca=pca(cor)
map=Statsample::Factor::MAP.new(cor)
echo ("There are 2 real factors on data")
summary(pca)
echo("Traditional Kaiser criterion (k>1) returns #{pca.m} factors")
summary(map)
echo("Velicer's MAP Test returns #{map.number_of_factors} factors to preserve")
end
# Run every stored analysis when executed directly (not on require).
if __FILE__==$0
Statsample::Analysis.run_batch
end
================================================
FILE: grab_references.rb
================================================
#!/usr/bin/env ruby1.9
# Collects bibliographic references from every Ruby source file in the
# tree and writes them, deduplicated and sorted, to references.txt.
#
# A reference section inside a source file looks like:
#   == Reference
#   * Some citation
#   * Another citation
require 'reportbuilder'
refs = []
Dir.glob "**/*.rb" do |f|
  next if f =~ /pkg/ # skip packaging artifacts
  reference = false
  # File.foreach closes the handle when done; the original
  # File.open(f).each_line leaked one open file per scanned script.
  File.foreach(f) do |l|
    if l =~ /== Reference/
      reference = true
    elsif reference
      if l =~ /\*\s+(.+)/
        refs.push $1
      else
        # First non-bullet line ends the reference section.
        reference = false
      end
    end
  end
end
rb = ReportBuilder.new(:name=>"References") do |g|
  refs.uniq.sort.each do |r|
    g.text "* #{r}"
  end
end
rb.save_text("references.txt")
================================================
FILE: lib/spss.rb
================================================
# = spss.rb -
#
# Provides utilites for working with spss files
#
# Copyright (C) 2009 Claudio Bustos
#
# Claudio Bustos mailto:clbustos@gmail.com
module SPSS # :nodoc: all
module Dictionary
# Base class for SPSS dictionary nodes: keeps a list of child elements
# and provides mass-assignment of configuration options onto writers.
class Element
  # Append a child element (anything responding to to_s/to_xml/to_spss).
  def add(a)
    @elements.push(a)
  end
  # Render every child with +func+ (:to_s, :to_xml or :to_spss),
  # one per line, indented.
  def parse_elements(func=:to_s)
    @elements.collect{|e| " "+e.send(func)}.join("\n")
  end
  # Mass-assign +config+ pairs through the matching attribute writers,
  # silently ignoring keys without a writer.
  # Fixed: the original guard `methods.include? key.to_s` never matched
  # on Ruby >= 1.9 (where +methods+ returns Symbols), so no option was
  # ever assigned; respond_to? works on every Ruby version.
  def init_with config
    config.each {|key,value|
      self.send("#{key}=",value) if respond_to?("#{key}=")
    }
  end
  def initialize(config={})
    @config=config
    @elements=[]
  end
end
# Root dictionary node: holds the variable definitions and renders the
# whole dictionary as XML or SPSS syntax.
class Dictionary < Element
  attr_accessor :locale, :date_time, :row_count
  def initialize(config={})
    super
    defaults = {
      :locale    => "en_US",
      :date_time => Time.new().strftime("%Y-%m-%dT%H:%M:%S"),
      :row_count => 1
    }
    # Apply defaults overridden by the caller's config in one pass.
    init_with defaults.merge(config)
  end
  def to_xml
    "\n" + parse_elements(:to_xml) + "\n"
  end
  def to_spss
    parse_elements(:to_spss)
  end
end
# A missing-value declaration for a variable.
# +type+ may be nil (discrete value), "lowerBound" or "upperBound".
class MissingValue < Element
  attr_accessor :data, :type, :from, :to
  def initialize(data, type=nil)
    @data = data
    unless [nil, "lowerBound", "upperBound"].include?(type)
      raise Exception, "Incorrect value for type"
    end
    @type = type
  end
  def to_xml
    # NOTE(review): the markup appears to have been lost during
    # extraction; the original presumably emitted a missing-value tag.
    ""
  end
end
# A value-label mapping ({value => label}) shared by variables.
class LabelSet
  attr_accessor # NOTE(review): no attribute names given; this line is a no-op
  def initialize(labels)
    @labels = labels
  end
  # XML rendering. NOTE(review): the tag markup seems to have been
  # stripped during extraction; the literals below are kept verbatim.
  def parse_xml(name)
    body = @labels.collect{|key,value| ""}.join("\n ")
    "\n " + body + "\n \n"
  end
  # SPSS VALUE LABELS fragment: one "value 'label'" pair per line.
  def parse_spss()
    @labels.collect{|key,value| "#{key} '#{value}'"}.join("\n ")
  end
end
# An SPSS variable definition: formatting metadata plus an optional
# LabelSet and a list of MissingValue declarations.
# NOTE(review): @@var_number is a class variable shared across all
# Variable instances, used to auto-number default names and labels.
class Variable < Element
attr_accessor :aligment, :display_width, :label, :measurement_level, :name, :type, :decimals, :width, :type_format, :labelset, :missing_values
def initialize(config={})
super
# First instance initializes the shared counter.
@@var_number||=1
init_with({
:aligment => "left",
:display_width => 8,
:label => "Variable #{@@var_number}",
:measurement_level => "SCALE",
:name => "var#{@@var_number}",
:type => 0,
:decimals => 2,
:width => 10,
:type_format => "F",
:labelset => nil
})
# Caller-supplied options override the defaults above.
init_with config
@missing_values=[]
@@var_number+=1
end
# XML rendering. NOTE(review): the element tags appear to have been
# stripped during extraction (empty "" literals below); compare against
# the upstream statsample sources before relying on this output.
def to_xml
labelset_s=(@labelset.nil?) ? "":"\n"+@labelset.parse_xml(@name)
missing_values=(@missing_values.size>0) ? @missing_values.collect {|m| m.to_xml}.join("\n"):""
"\n\n"+parse_elements(:to_xml)+missing_values+""+labelset_s
end
# SPSS syntax rendering.
# NOTE(review): "out=<0" below is almost certainly a heredoc plus an
# "if @missing_values.size>0" guard mangled during extraction; as
# written this method does not parse. Restore from upstream sources.
def to_spss
out=<0
out << "MISSING VALUES #{@name} ("+@missing_values.collect{|m| m.data}.join(",")+") ."
end
out
end
end
end
end
# NOTE(review): this demo/driver code runs at require time and writes
# "dic_spss.sps" to the current directory as a side effect of loading
# the library file; consider guarding it with `if __FILE__ == $0`.
n=SPSS::Dictionary::Dictionary.new
# Value labels shared by var1 (1="Si", 2="No").
ls=SPSS::Dictionary::LabelSet.new({1=>"Si",2=>"No"})
var1=SPSS::Dictionary::Variable.new
var1.labelset=ls
# Declare -99 as a discrete missing value for var2.
mv1=SPSS::Dictionary::MissingValue.new("-99")
var2=SPSS::Dictionary::Variable.new
n.add(var1)
n.add(var2)
var2.missing_values=[mv1]
# Write the generated SPSS dictionary syntax to disk.
File.open("dic_spss.sps","wb") {|f|
f.puts n.to_spss
}
================================================
FILE: lib/statsample/analysis/suite.rb
================================================
module Statsample
module Analysis
# A stored analysis: wraps a user-supplied block with the
# Statsample::Shorthand DSL plus R-like attach/detach of datasets.
# Text output (echo/desc) goes to @output; summary() prints the
# object's own text summary. Plot helpers render SVG and open it in a
# browser (SuiteReportBuilder overrides these to feed a report instead).
class Suite
include Statsample::Shorthand
# IO receiving echo/desc output (defaults to STDOUT).
attr_accessor :output
# Analysis name, used on reports.
attr_accessor :name
# The stored analysis block, executed by #run.
attr_reader :block
# +opts+ may be a Hash (:name, :output) or a bare name.
def initialize(opts=Hash.new(), &block)
if !opts.is_a? Hash
opts={:name=>opts}
end
@block=block
@name=opts[:name] || "Analysis #{Time.now}"
@attached=[]
@output=opts[:output] || ::STDOUT
end
# Run the analysis block. Zero-arity blocks are instance_eval'd so the
# DSL methods resolve on this suite; blocks declaring a parameter
# receive the suite as argument instead.
def run
@block.arity<1 ? instance_eval(&@block) : @block.call(self)
end
# Provides a description of the procedure. Only appears as a commentary on
# SuiteReportBuilder outputs
def desc(d)
@output.puts("Description:")
@output.puts(" #{d}")
end
# Print arbitrary values to the output stream.
def echo(*args)
@output.puts(*args)
end
# Print the object's text summary.
def summary(obj)
obj.summary
end
# Re-run this suite's block against a report builder section.
def add_to_reportbuilder(rb)
SuiteReportBuilder.new({:name=>name, :rb=>rb}, &block)
end
# Build a report from this suite and save it to +filename+.
def generate(filename)
ar=SuiteReportBuilder.new({:name=>name}, &block)
ar.generate(filename)
end
# Build a report from this suite and return it as plain text.
def to_text
ar=SuiteReportBuilder.new({:name=>name}, &block)
ar.to_text
end
# Attach a dataset: its fields become resolvable via method_missing.
def attach(ds)
@attached.push(ds)
end
# Detach +ds+, or the most recently attached dataset when nil.
def detach(ds=nil)
if ds.nil?
@attached.pop
else
@attached.delete(ds)
end
end
# Keep Shorthand's plot builders reachable; the overrides below wrap
# them with browser display.
alias :old_boxplot :boxplot
alias :old_histogram :histogram
alias :old_scatterplot :scatterplot
# Write +svg+ to a temp file and open it with the platform's viewer.
def show_svg(svg)
require 'tmpdir'
fn=Dir.tmpdir+"/image_#{Time.now.to_f}.svg"
File.open(fn,"w") {|fp| fp.write svg}
if RUBY_PLATFORM =~/darwin/
%x(open -a safari #{fn})
else
%x(xdg-open #{fn})
end
end
def boxplot(*args)
show_svg(old_boxplot(*args).to_svg)
end
def histogram(*args)
show_svg(old_histogram(*args).to_svg)
end
def scatterplot(*args)
show_svg(old_scatterplot(*args).to_svg)
end
# R-like field lookup: resolves unknown names against attached
# datasets, most recently attached first.
# NOTE(review): no respond_to_missing? is defined, and any typo inside
# an analysis block surfaces as this generic RuntimeError.
def method_missing(name, *args,&block)
@attached.reverse.each do |ds|
return ds[name.to_s] if ds.fields.include? (name.to_s)
end
raise "Method #{name} doesn't exists"
end
end
end
end
================================================
FILE: lib/statsample/analysis/suitereportbuilder.rb
================================================
module Statsample
module Analysis
# A Suite whose output primitives feed a ReportBuilder instead of an IO:
# summary/desc/echo add entries to the report, and the plot helpers add
# the plot objects themselves rather than opening a browser.
class SuiteReportBuilder < Suite
  # The underlying ReportBuilder collecting every emitted element.
  attr_accessor :rb
  # +opts+: Hash with :name and optional :rb (an existing ReportBuilder
  # or section to append to), or a bare name.
  def initialize(opts=Hash.new,&block)
    if !opts.is_a? Hash
      opts={:name=>opts}
    end
    super(opts,&block)
    @rb=opts[:rb] || ReportBuilder.new(:name=>name)
  end
  # Run the stored block (if any) and save the report to +filename+.
  def generate(filename)
    run if @block
    @rb.save(filename)
  end
  # Run the stored block (if any) and return the report as plain text.
  def to_text
    run if @block
    @rb.to_text
  end
  def summary(o)
    @rb.add(o)
  end
  def desc(d)
    @rb.add(d)
  end
  def echo(*args)
    args.each do |a|
      @rb.add(a)
    end
  end
  def boxplot(*args)
    @rb.add(old_boxplot(*args))
  end
  def histogram(*args)
    @rb.add(old_histogram(*args))
  end
  # Fixed: this was a second, duplicate definition of boxplot, which
  # left scatterplot falling through to Suite's open-in-browser
  # behavior when building reports.
  def scatterplot(*args)
    @rb.add(old_scatterplot(*args))
  end
end
end
end
================================================
FILE: lib/statsample/analysis.rb
================================================
require 'statsample/analysis/suite'
require 'statsample/analysis/suitereportbuilder'
module Statsample
  # DSL to create analyses without hassle.
  # * Shortcut methods to avoid complete namespaces, many based on R
  # * Attach/detach vectors to workspace, like R
  # == Example
  #   an1=Statsample::Analysis.store(:first) do
  #     ds=excel('data.xls')   # Load excel file with x,y,z vectors
  #     names(ds)              # See variables on ds dataset
  #     attach(ds)             # Attach the vectors to workspace, like R
  #     # vector 'x' is attached to workspace like a method,
  #     # so you can use it like any variable
  #     mean,sd=x.mean, x.sd
  #     a=c( 1:10)             # Shameless R robbery
  #     b=c(21:30)
  #     summary(cor(ds))       # Call summary method on correlation matrix
  #   end
  #   Statsample::Analysis.run(:first)  # run the analysis by its name
  #   an1.run                           # or using the returned object
  #   # You can also generate a report using ReportBuilder.
  #   # .summary() calls 'report_building' on each object,
  #   # instead of printing a text summary
  #   an1.generate("report.html")
  module Analysis
    @@stored_analysis={}
    @@last_analysis=nil
    # Forget every stored analysis.
    def self.clear_analysis
      @@stored_analysis.clear
    end
    # Hash of name => Suite for every stored analysis.
    def self.stored_analysis
      @@stored_analysis
    end
    # The most recently stored Suite.
    def self.last
      @@stored_analysis[@@last_analysis]
    end
    # Store a new analysis block under +name+; raises when no block given.
    def self.store(name, opts=Hash.new,&block)
      raise "You should provide a block" if !block
      @@last_analysis=name
      opts={:name=>name}.merge(opts)
      @@stored_analysis[name]=Suite.new(opts,&block)
    end
    # Run analysis +*args+
    # Without arguments, run all stored analysis
    # Only 'echo' will be returned to screen
    def self.run(*args)
      args=stored_analysis.keys if args.size==0
      raise "Analysis #{args} doesn't exist" if (args - stored_analysis.keys).size>0
      args.each do |name|
        stored_analysis[name].run
      end
    end
    # Add analysis +*args+ to a ReportBuilder object.
    # Without arguments, add all stored analysis.
    # Each analysis is wrapped inside a ReportBuilder::Section object.
    # This method is used by save() and to_text().
    def self.add_to_reportbuilder(rb, *args)
      args=stored_analysis.keys if args.size==0
      # Fixed: the message interpolated +name+ (here, the module's own
      # name) instead of the offending +args+ list.
      raise "Analysis #{args} doesn't exist" if (args - stored_analysis.keys).size>0
      args.each do |name|
        section=ReportBuilder::Section.new(:name=>stored_analysis[name].name)
        rb_an=stored_analysis[name].add_to_reportbuilder(section)
        rb.add(section)
        rb_an.run
      end
    end
    # Save the analysis on a file
    # Without arguments, add all stored analysis
    def self.save(filename, *args)
      rb=ReportBuilder.new(:name=>filename)
      add_to_reportbuilder(rb, *args)
      rb.save(filename)
    end
    # Run analysis and return as string
    # output of echo callings
    # Without arguments, add all stored analysis
    def self.to_text(*args)
      rb=ReportBuilder.new(:name=>"Analysis #{Time.now}")
      add_to_reportbuilder(rb, *args)
      rb.to_text
    end
    # Run analysis and return to screen all
    # echo and summary callings
    def self.run_batch(*args)
      puts to_text(*args)
    end
  end
end
================================================
FILE: lib/statsample/anova/contrast.rb
================================================
module Statsample
module Anova
# Planned (a-priori) contrast for a one-way ANOVA over a set of vectors.
# The contrast estimate is psi = sum(c_i * mean_i); its standard error
# uses the within-groups mean square (MSW) from a OneWayWithVectors
# fitted on the same vectors.
class Contrast
attr_reader :psi
# Within-groups mean square from the underlying one-way ANOVA.
attr_reader :msw
include Summarizable
# Options: :vectors (required), :c (explicit contrast weights),
# :c1/:c2 (index groups for automatic contrast), :name, :t_options.
def initialize(opts=Hash.new)
raise "Should set at least vectors options" if opts[:vectors].nil?
@vectors=opts[:vectors]
@c=opts[:c]
@c1,@c2=opts[:c1], opts[:c2]
@t_options=opts[:t_options] || {:estimate_name=>_("Psi estimate")}
@name=opts[:name] || _("Contrast")
@psi=nil
@anova=Statsample::Anova::OneWayWithVectors.new(@vectors)
@msw=@anova.msw
end
# Hypothesis contrast, selecting index for each constrast.
# For example, to contrast x_0 against x_1 and x_2 use
#   c.c_by_index([0],[1,2])
# Weights are +1/n over the first group and -1/n over the second, so
# each side averages to magnitude one and the weights sum to zero.
def c_by_index(c1,c2)
contrast=[0]*@vectors.size
c1.each {|i| contrast[i]=1.quo(c1.size)}
c2.each {|i| contrast[i]=-1.quo(c2.size)}
@c=contrast
c(contrast)
end
# Contrast estimate, computed lazily from :c or :c1/:c2.
# (This method intentionally shadows the attr_reader above.)
def psi
if @psi.nil?
c(@c) if @c
c_by_index(@c1,@c2) if (@c1 and @c2)
end
@psi
end
# Confidence interval of psi, delegated to the t test object.
def confidence_interval(cl=nil)
t_object.confidence_interval(cl)
end
# Hypothesis contrast, using custom values
# Every parameter is a contrast value. You should use
# the same number of contrast as vectors on class and the sum
# of constrast should be 0. Without arguments, returns current weights.
def c(args=nil)
return @c if args.nil?
@c=args
raise "contrast number!=vector number" if args.size!=@vectors.size
#raise "Sum should be 0" if args.inject(0) {|ac,v| ac+v}!=0
@psi=args.size.times.inject(0) {|ac,i| ac+(args[i]*@vectors[i].mean)}
end
# SE of psi: sqrt(MSW * sum(c_i^2 / n_i)).
def standard_error
sum=@vectors.size.times.inject(0) {|ac,i|
ac+((@c[i].rationalize**2).quo(@vectors[i].size))
}
Math.sqrt(@msw*sum)
end
alias :se :standard_error
# Degrees of freedom: total N minus number of groups.
def df
@vectors.inject(0) {|ac,v| ac+v.size}-@vectors.size
end
# T test object for psi/se with df degrees of freedom.
def t_object
Statsample::Test::T.new(psi, se, df, @t_options)
end
def t
t_object.t
end
def probability
t_object.probability
end
def report_building(builder)
builder.section(:name=>@name) do |s|
s.text _("Contrast:%s") % c.join(",")
s.parse_element(t_object)
end
end
end
end
end
================================================
FILE: lib/statsample/anova/oneway.rb
================================================
module Statsample
module Anova
# = Generic Anova one-way.
# You could enter the sum of squares or the mean squares. You
# should enter the degrees of freedom for numerator and denominator.
# == Usage
# anova=Statsample::Anova::OneWay(:ss_num=>10,:ss_den=>20, :df_num=>2, :df_den=>10, @name=>"ANOVA for....")
# Generic one-way ANOVA from precomputed values.
# Requires degrees of freedom for numerator and denominator, plus
# either sums of squares (:ss_num/:ss_den) or mean squares
# (:ms_num/:ms_den); the missing pair is derived.
class OneWay
  include Summarizable
  attr_reader :df_num, :df_den, :ss_num, :ss_den, :ms_num, :ms_den, :ms_total, :df_total, :ss_total
  # Name of ANOVA Analisys
  attr_accessor :name
  attr_accessor :name_denominator
  attr_accessor :name_numerator
  def initialize(opts=Hash.new)
    @name=@name_numerator=@name_denominator=nil
    # First see if sum of squares or mean squares are entered
    raise ArgumentError, "You should set d.f." unless (opts.has_key? :df_num and opts.has_key? :df_den)
    @df_num=opts.delete :df_num
    @df_den=opts.delete :df_den
    @df_total=@df_num+@df_den
    if(opts.has_key? :ss_num and opts.has_key? :ss_den)
      @ss_num = opts.delete :ss_num
      @ss_den = opts.delete :ss_den
      @ms_num = @ss_num.quo(@df_num)
      @ms_den = @ss_den.quo(@df_den)
    elsif (opts.has_key? :ms_num and opts.has_key? :ms_den)
      @ms_num = opts.delete :ms_num
      @ms_den = opts.delete :ms_den
      @ss_num = @ms_num * @df_num
      # Fixed: @ss_den was previously derived from the (still nil)
      # @ss_den itself instead of @ms_den, crashing this branch.
      @ss_den = @ms_den * @df_den
    else
      # Fail fast instead of crashing later on nil sums of squares.
      raise ArgumentError, "You should set :ss_num/:ss_den or :ms_num/:ms_den"
    end
    @ss_total=@ss_num+@ss_den
    @ms_total=@ms_num+@ms_den
    # The numerator is the explained (between) variance; these two
    # labels were previously swapped.
    opts_default={:name=>"ANOVA",
      :name_numerator=>_("Explained variance"),
      :name_denominator=>_("Unexplained variance")}
    @opts=opts_default.merge(opts)
    # Iterate the merged options so defaults are applied too
    # (iterating the remaining +opts+ alone left default names unset).
    @opts.keys.each {|k|
      send("#{k}=", @opts[k]) if self.respond_to? "#{k}="
    }
    @f_object=Statsample::Test::F.new(@ms_num, @ms_den, @df_num,@df_den)
  end
  # F value (ms_num / ms_den).
  def f
    @f_object.f
  end
  # P-value of F test
  def probability
    @f_object.probability
  end
  def report_building(builder) #:nodoc:
    builder.section(:name=>@name) do |b|
      report_building_table(b)
    end
  end
  def report_building_table(builder) #:nodoc:
    builder.table(:name=>_("%s Table") % @name, :header=>%w{source ss df ms f p}.map {|v| _(v)}) do |t|
      t.row([@name_numerator, sprintf("%0.3f",@ss_num), @df_num, sprintf("%0.3f",@ms_num), sprintf("%0.3f",f), sprintf("%0.3f", probability)])
      t.row([@name_denominator, sprintf("%0.3f",@ss_den), @df_den, sprintf("%0.3f",@ms_den), "", ""])
      t.row([_("Total"), sprintf("%0.3f",@ss_total), @df_total, sprintf("%0.3f",@ms_total),"",""])
    end
  end
end
# One Way Anova with vectors
# Example:
# v1=[2,3,4,5,6].to_scale
# v2=[3,3,4,5,6].to_scale
# v3=[5,3,1,5,6].to_scale
# anova=Statsample::Anova::OneWayWithVectors.new([v1,v2,v3])
# anova.f
# => 0.0243902439024391
# anova.probability
# => 0.975953044203438
# anova.sst
# => 32.9333333333333
#
class OneWayWithVectors < OneWay
  # Show on summary Levene test
  attr_accessor :summary_levene
  # Show on summary descriptives for vectors
  attr_accessor :summary_descriptives
  # Show on summary of contrasts
  attr_accessor :summary_contrasts
  # Array with stored contrasts
  attr_reader :contrasts
  # Accepts either an Array of vectors as first argument, or the
  # vectors themselves as an argument list. An options Hash may follow
  # in either form (previously it was silently ignored when the vectors
  # were passed as an Array).
  def initialize(*args)
    if args[0].is_a? Array
      @vectors=args.shift
    else
      @vectors=args.find_all {|v| v.is_a? Statsample::Vector}
    end
    opts=args.find {|v| v.is_a? Hash}
    opts||=Hash.new
    opts_default={:name=>_("Anova One-Way"),
      :name_numerator=>_("Between Groups"),
      :name_denominator=>_("Within Groups"),
      :summary_descriptives=>false,
      :summary_levene=>true,
      :summary_contrasts=>true
    }
    @opts=opts_default.merge(opts).merge(:ss_num=>ssbg, :ss_den=>sswg, :df_num=>df_bg, :df_den=>df_wg)
    @contrasts=[]
    super(@opts)
  end
  alias :sst :ss_total
  alias :msb :ms_num
  alias :msw :ms_den
  # Generates and store a contrast.
  # Options should be provided as a hash
  # [:c]=>contrast vector
  # [:c1 - :c2]=>index for automatic construction of contrast
  # [:name]=>contrast name
  def contrast(opts=Hash.new)
    name=opts[:name] || _("Contrast for %s") % @name
    opts=opts.merge({:vectors=>@vectors, :name=>name})
    c=Statsample::Anova::Contrast.new(opts)
    @contrasts.push(c)
    c
  end
  # Levene test of homogeneity of variances across the groups.
  def levene
    Statsample::Test.levene(@vectors, :name=>_("Test of Homogeneity of variances (Levene)"))
  end
  # Total (grand) mean over every case of every group.
  def total_mean
    sum=@vectors.inject(0){|a,v| a+v.sum}
    sum.quo(n)
  end
  # Sum of squares within groups
  def sswg
    @sswg||=@vectors.inject(0) {|total,vector| total+vector.ss }
  end
  # Sum of squares between groups
  def ssbg
    m=total_mean
    @vectors.inject(0) do |total,vector|
      total + (vector.mean-m).square * vector.size
    end
  end
  # Degrees of freedom within groups
  def df_wg
    # Fixed memoization ivar name (was the typo @dk_wg).
    @df_wg||=n-k
  end
  # Number of groups.
  def k
    @k||=@vectors.size
  end
  # Degrees of freedom between groups
  def df_bg
    k-1
  end
  # Total number of cases
  def n
    @vectors.inject(0){|a,v| a+v.size}
  end
  def report_building(builder) # :nodoc:
    builder.section(:name=>@name) do |s|
      if summary_descriptives
        s.table(:name=>_("Descriptives"),:header=>%w{Name N Mean SD Min Max}.map {|v| _(v)}) do |t|
          @vectors.each do |v|
            t.row [v.name, v.n_valid, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
          end
        end
      end
      if summary_levene
        s.parse_element(levene)
      end
      report_building_table(s)
      if summary_contrasts and @contrasts.size>0
        @contrasts.each do |c|
          s.parse_element(c)
        end
      end
    end
  end
end
end
end
================================================
FILE: lib/statsample/anova/twoway.rb
================================================
module Statsample
module Anova
# = Generic Anova two-way.
# You could enter the sum of squares or the mean squares for a, b, axb and within.
# You should enter the degrees of freedom for a,b and within, because df_axb=df_a*df_b
# == Usage
# anova=Statsample::Anova::TwoWay(:ss_a=>10,:ss_b=>20,:ss_axb=>10, :ss_within=>20, :df_a=>2, :df_b=>3,df_within=100 @name=>"ANOVA for....")
class TwoWay
include Summarizable
attr_reader :df_a, :df_b, :df_axb, :df_within, :df_total
attr_reader :ss_a, :ss_b, :ss_axb, :ss_within, :ss_total
attr_reader :ms_a, :ms_b, :ms_axb, :ms_within, :ms_total
# Name of ANOVA Analisys
attr_accessor :name
# Name of a factor
attr_accessor :name_a
# Name of b factor
attr_accessor :name_b
# Name of within factor
attr_accessor :name_within
attr_reader :f_a_object, :f_b_object, :f_axb_object
def initialize(opts=Hash.new)
# First see if sum of squares or mean squares are entered
raise ArgumentError, "You should set all d.f." unless [:df_a, :df_b, :df_within].all? {|v| opts.has_key? v}
@df_a=opts.delete :df_a
@df_b=opts.delete :df_b
@df_axb=@df_a*@df_b
@df_within=opts.delete :df_within
@df_total=@df_a+@df_b+@df_axb+@df_within
if [:ss_a, :ss_b, :ss_axb, :ss_within].all? {|v| opts.has_key? v}
@ss_a = opts.delete :ss_a
@ss_b = opts.delete :ss_b
@ss_axb = opts.delete :ss_axb
@ss_within = opts.delete :ss_within
@ms_a =@ss_a.quo(@df_a)
@ms_b =@ss_b.quo(@df_b)
@ms_axb =@ss_axb.quo(@df_axb)
@ms_within =@ss_within.quo(@df_within)
elsif [:ms_a, :ms_b, :ms_axb, :ms_within].all? {|v| opts.has_key? v}
@ms_a = opts.delete :ms_a
@ms_b = opts.delete :ms_b
@ms_axb = opts.delete :ms_axb
@ms_within = opts.delete :ms_within
@ss_a =@ms_a*@df_a
@ss_b =@ms_b*@df_b
@ss_axb =@ms_axb*@df_axb
@ss_within =@ms_within*@df_within
else
raise "You should set all ss or ss"
end
@ss_total=@ss_a+@ss_b+@ss_axb+@ss_within
@ms_total=@ms_a+@ms_b+@ms_axb+@ms_within
opts_default={:name=>_("ANOVA Two-Way"),
:name_a=>_("A"),
:name_b=>_("B"),
:name_within=>_("Within")
}
@opts=opts_default.merge(opts)
opts_default.keys.each {|k|
send("#{k}=", @opts[k])
}
@f_a_object=Statsample::Test::F.new(@ms_a,@ms_within,@df_a,@df_within)
@f_b_object=Statsample::Test::F.new(@ms_b,@ms_within,@df_b,@df_within)
@f_axb_object=Statsample::Test::F.new(@ms_axb,@ms_within,@df_axb,@df_within)
end
def f_a
@f_a_object.f
end
def f_b
@f_b_object.f
end
def f_axb
@f_axb_object.f
end
def f_a_probability
@f_a_object.probability
end
def f_b_probability
@f_b_object.probability
end
def f_axb_probability
@f_axb_object.probability
end
def report_building(builder) #:nodoc:
builder.section(:name=>@name) do |b|
report_building_table(b)
end
end
def report_building_table(builder) #:nodoc:
builder.table(:name=>_("%s Table") % @name, :header=>%w{source ss df ms f p}.map {|v| _(v)}) do |t|
t.row([@name_a, "%0.3f" % @ss_a, @df_a, "%0.3f" % @ms_a , "%0.3f" % f_a, "%0.4f" % f_a_probability] )
t.row([@name_b, "%0.3f" % @ss_b, @df_b, "%0.3f" % @ms_b , "%0.3f" % f_b, "%0.4f" % f_b_probability] )
t.row(["%s X %s" % [@name_a, @name_b], "%0.3f" % @ss_axb, @df_axb, "%0.3f" % @ms_axb , "%0.3f" % f_axb, "%0.4f" % f_axb_probability] )
t.row([@name_within, "%0.3f" % @ss_within, @df_within, nil,nil,nil] )
t.row([_("Total"), "%0.3f" % @ss_total, @df_total, nil,nil,nil] )
end
end
end
# Two Way Anova with vectors
# Example:
# v1=[1,1,2,2].to_scale
# v2=[1,2,1,2].to_scale
# v3=[5,3,1,5].to_scale
# anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
#
class TwoWayWithVectors < TwoWay
# Show summary Levene test
attr_accessor :summary_levene
# Show summary descriptives for variables (means)
attr_accessor :summary_descriptives
attr_reader :a_var, :b_var, :dep_var
# For now, only equal sample cells allowed
def initialize(opts=Hash.new)
raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
@a_var='a'
@b_var='b'
@dep_var='dependent'
@a_vector, @b_vector, @dep_vector=Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}.to_dataset
@ds=ds.clone_only_valid
_p=@a_vector.factors.size
_q=@b_vector.factors.size
@x_general=@dep_vector.mean
@axb_means={}
@axb_sd={}
@vectors=[]
n=nil
@ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
@axb_means[k]=v.mean
@axb_sd[k]=v.sd
@vectors << v
n||=v.size
raise "All cell sizes should be equal" if n!=v.size
}
@a_means={}
@ds.to_multiset_by_split(a_var).each_vector(dep_var) {|k,v|
@a_means[k]=v.mean
}
@b_means={}
@ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
@b_means[k]=v.mean
}
ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v|
ac+(@a_means[v]-@x_general)**2
}
ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
ac+(@b_means[v]-@x_general)**2
}
ss_within=@ds.collect {|row|
(row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
}.sum
ss_axb=n*@axb_means.inject(0) {|ac,v|
j,k=v[0]
xjk=v[1]
ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
}
df_a=_p-1
df_b=_q-1
df_within=(_p*_q)*(n-1)
opts_default={:name=>_("Anova Two-Way on %s") % @ds[dep_var].name,
:name_a=>@ds[a_var].name,
:name_b=>@ds[b_var].name,
:summary_descriptives=>true,
:summary_levene=>false}
@opts=opts_default.merge(opts).merge({:ss_a=>ss_a,:ss_b=>ss_b, :ss_axb=>ss_axb, :ss_within=>ss_within, :df_a=>df_a, :df_b=>df_b, :df_within=>df_within})
super(@opts)
end
def levene
Statsample::Test.levene(@vectors, :name=>_("Test of Homogeneity of variances (Levene)"))
end
def report_building(builder) #:nodoc:#
builder.section(:name=>@name) do |s|
if summary_descriptives
s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].labeling(a)}+[_("%s Mean") % @name_b]) do |t|
@ds[b_var].factors.each do |b|
t.row([@ds[b_var].labeling(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
end
t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general])
end
end
if summary_levene
s.parse_element(levene)
end
report_building_table(s)
end
end
end
end
end
================================================
FILE: lib/statsample/anova.rb
================================================
module Statsample
  # Factory shortcuts for the ANOVA classes.
  module Anova
    # One-way ANOVA from precomputed sums of squares / mean squares.
    def self.oneway(*args)
      OneWay.new(*args)
    end
    # Two-way ANOVA from precomputed sums of squares / mean squares.
    def self.twoway(*args)
      TwoWay.new(*args)
    end
    # One-way ANOVA computed directly from group vectors.
    def self.oneway_with_vectors(*args)
      OneWayWithVectors.new(*args)
    end
    # Two-way ANOVA computed directly from factor/dependent vectors.
    def self.twoway_with_vectors(*args)
      TwoWayWithVectors.new(*args)
    end
  end
end
require 'statsample/anova/oneway'
require 'statsample/anova/contrast'
require 'statsample/anova/twoway'
================================================
FILE: lib/statsample/bivariate/pearson.rb
================================================
module Statsample
module Bivariate
# = Pearson correlation coefficient (r)
#
# The moment-product Pearson's correlation coefficient, known as 'r'
# is a measure of bivariate associate between two continous
# variables.
#
# == Usage
# a = [1,2,3,4,5,6].to_scale
# b = [2,3,4,5,6,7].to_scale
# pearson = Statsample::Bivariate::Pearson.new(a,b)
# puts pearson.r
# puts pearson.t
# puts pearson.probability
# puts pearson.summary
#
class Pearson
  include Statsample::Test
  include Summarizable
  # Name of correlation
  attr_accessor :name
  # Tails for probability (:both, :left or :right)
  attr_accessor :tails
  # Number of cases after pairwise deletion of missing values.
  attr_accessor :n
  def initialize(v1,v2,opts=Hash.new)
    @v1_name,@v2_name = v1.name,v2.name
    @v1,@v2 = Statsample.only_valid_clone(v1,v2)
    @n=@v1.size
    opts_default={
      :name=>_("Correlation (%s - %s)") % [@v1_name, @v2_name],
      :tails=>:both
    }
    # Fixed merge order: user options must override the defaults.
    # The previous opts.merge(opts_default) made :name and :tails
    # impossible to customize.
    @opts=opts_default.merge(opts)
    @opts.each{|k,v|
      # Check for the writer (not the reader) before assigning, so an
      # unexpected key can't hit a read-only method.
      self.send("#{k}=",v) if self.respond_to? "#{k}="
    }
  end
  # Pearson product-moment correlation coefficient.
  def r
    Statsample::Bivariate.pearson(@v1,@v2)
  end
  # t statistic for H0: rho = 0, with n-2 degrees of freedom.
  def t
    Statsample::Bivariate.t_pearson(@v1,@v2)
  end
  # P-value of the t statistic, honoring +tails+.
  def probability
    p_using_cdf(Distribution::T.cdf(t, @v1.size-2), tails)
  end
  def report_building(builder)
    builder.text(_("%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s tails)") % [@name, r,t, (n-2), probability, tails])
  end
end
end
end
================================================
FILE: lib/statsample/bivariate.rb
================================================
require 'statsample/bivariate/pearson'
module Statsample
# Diverse methods and classes to calculate bivariate relations
# Specific classes:
# * Statsample::Bivariate::Pearson : Pearson correlation coefficient (r)
# * Statsample::Bivariate::Tetrachoric : Tetrachoric correlation
# * Statsample::Bivariate::Polychoric : Polychoric correlation (using joint, two-step and polychoric series)
module Bivariate
autoload(:Polychoric, 'statsample/bivariate/polychoric')
autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
class << self
# Covariance between two vectors.
# Missing data is removed pairwise first; returns nil when no valid
# pairs remain. Uses GSL when available, a pure-Ruby fallback otherwise.
def covariance(v1,v2)
  a, b = Statsample.only_valid_clone(v1, v2)
  return nil if a.size == 0
  Statsample.has_gsl? ? GSL::Stats::covariance(a.gsl, b.gsl) : covariance_slow(a, b)
end
# Log-likelihood of dichotomic +real+ outcomes given the predicted
# probabilities in +pred+ (sum of Bernoulli log-likelihood terms).
def maximum_likehood_dichotomic(pred,real)
  p_v, r_v = Statsample.only_valid_clone(pred, real)
  total = 0
  p_v.each_index do |i|
    total += r_v[i] * Math::log(p_v[i]) + (1 - r_v[i]) * Math::log(1 - p_v[i])
  end
  total
end
# Pure-Ruby covariance: sum of cross-products of deviations divided by
# n-1. Callers (covariance) pass already-cleaned vectors, so this
# re-cleans redundantly.
# NOTE(review): uses Statsample.only_valid while every sibling uses
# only_valid_clone — confirm whether the non-clone variant is intentional.
def covariance_slow(v1,v2) # :nodoc:
v1a,v2a=Statsample.only_valid(v1,v2)
sum_of_squares(v1a,v2a) / (v1a.size-1)
end
# Sum of cross-products of deviations from the means
# (the numerator of the covariance).
def sum_of_squares(v1,v2)
  a, b = Statsample.only_valid_clone(v1, v2)
  mean_a = a.mean
  mean_b = b.mean
  (0...a.size).inject(0) do |acc, idx|
    acc + (a[idx] - mean_a) * (b[idx] - mean_b)
  end
end
# Pearson product-moment correlation coefficient (r) between 2 vectors.
# Missing data is removed pairwise; nil when no valid pairs remain.
def pearson(v1,v2)
  a, b = Statsample.only_valid_clone(v1, v2)
  return nil if a.size == 0
  Statsample.has_gsl? ? GSL::Stats::correlation(a.gsl, b.gsl) : pearson_slow(a, b)
end
# Pure-Ruby Pearson r: cross-product of deviations divided by the
# product of the vectors' root sums of squares.
def pearson_slow(v1,v2) # :nodoc:
  a, b = Statsample.only_valid_clone(v1, v2)
  numerator = sum_of_squares(a, b)
  denominator = Math::sqrt(a.sum_of_squares) * Math::sqrt(b.sum_of_squares)
  numerator.quo(denominator)
end
alias :correlation :pearson
# t statistic to test the null hypothesis r=0 for the Pearson
# correlation between two vectors. Returns 0 when r is exactly 1.0,
# a degenerate case that would otherwise divide by zero in t_r.
def t_pearson(v1,v2)
  a, b = Statsample.only_valid_clone(v1, v2)
  r = pearson(a, b)
  r == 1.0 ? 0 : t_r(r, a.size)
end
# t statistic for a Pearson correlation, given r and the sample size:
# t = r * sqrt((n-2) / (1-r^2)).
# Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
def t_r(r,size)
  Math::sqrt((size - 2).to_f / (1 - r**2)) * r
end
# Retrieves the probability value (a la SPSS)
# for a given t, size and number of tails.
# Uses a second parameter
# * :both or 2 : for r!=0 (default)
# * :right, :positive or 1 : for r > 0
# * :left, :negative : for r < 0
def prop_pearson(t, size, tails=:both)
  # Normalize the various accepted spellings to a canonical symbol.
  tails = case tails
          when 2 then :both
          when 1, :positive then :right
          when :negative then :left
          else tails
          end
  n_tails = (tails == :both) ? 2 : 1
  # For two-tailed tests, work on the negative side of the distribution.
  t = -t if tails == :both && t > 0
  cdf = Distribution::T.cdf(t, size - 2)
  tails == :right ? 1.0 - (cdf * n_tails) : cdf * n_tails
end
# Predicted time (ms) for a pairwise correlation matrix, from an
# empirically fitted regression.
# See benchmarks/correlation_matrix.rb to see mode of calculation
def prediction_pairwise(vars,cases)
  root = -0.518111 - 0.000746 * cases + 1.235608 * vars + 0.000740 * cases * vars
  (root ** 2) / 100
end
# Predicted time (ms) for the GSL-optimized correlation matrix, from an
# empirically fitted regression.
# See benchmarks/correlation_matrix.rb to see mode of calculation
def prediction_optimized(vars,cases)
  root = 4 + 0.018128 * cases + 0.246871 * vars + 0.001169 * vars * cases
  (root ** 2) / 100
end
# Residual scores of +from+ after removing the variance it shares with
# +del+: both vectors are standardized and r * z_del is subtracted
# from z_from. Cases missing in either vector yield nil.
# NOTE(review): iterates data_with_nils indexes but reads froms[i] /
# dels[i] directly — assumes Vector#[] addresses the same raw
# positions; confirm against Statsample::Vector.
def residuals(from,del)
r=Statsample::Bivariate.pearson(from,del)
froms, dels = from.vector_standarized, del.vector_standarized
nv=[]
froms.data_with_nils.each_index do |i|
if froms[i].nil? or dels[i].nil?
nv.push(nil)
else
nv.push(froms[i]-r*dels[i])
end
end
nv.to_vector(:scale)
end
# First-order partial correlation between v1 and v2, removing the
# linear effect of +control+ from both.
def partial_correlation(v1,v2,control)
  a, b, ctrl = Statsample.only_valid_clone(v1, v2, control)
  r_ab = pearson(a, b)
  r_ac = pearson(a, ctrl)
  r_bc = pearson(b, ctrl)
  (r_ab - (r_ac * r_bc)).quo(Math::sqrt(1 - r_ac**2) * Math::sqrt(1 - r_bc**2))
end
# Covariance matrix computed through GSL matrix algebra.
# Requires a dataset with no missing data (see #covariance_matrix,
# which dispatches here only in that case).
def covariance_matrix_optimized(ds)
x=ds.to_gsl
n=x.row_size
m=x.column_size
# Row vector of column means: (1/n) * ones(1,n) * X.
means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
# Subtract each column's mean from every entry.
centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
# Cross-product of the centered data, scaled by 1/(n-1) (sample covariance).
ss=centered.transpose*centered
s=((1/(n-1).to_f))*ss
s
end
# Covariance matrix.
# Order of rows and columns depends on Dataset#fields order
def covariance_matrix(ds)
vars,cases=ds.fields.size,ds.cases
# Use the GSL-backed path only when there is no missing data and the
# benchmark-derived cost model predicts it will be faster.
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
cm=covariance_matrix_optimized(ds)
else
cm=covariance_matrix_pairwise(ds)
end
# Tag the result so it carries its own field labels.
cm.extend(Statsample::CovariateMatrix)
cm.fields=ds.fields
cm
end
# Covariance matrix using pairwise-complete cases for each cell.
# Non-:scale vectors yield nil cells; the diagonal holds variances.
def covariance_matrix_pairwise(ds)
# The matrix is symmetric: cache each off-diagonal covariance so it
# is computed only once.
cache={}
matrix=ds.collect_matrix do |row,col|
if (ds[row].type!=:scale or ds[col].type!=:scale)
nil
elsif row==col
ds[row].variance
else
if cache[[col,row]].nil?
cov=covariance(ds[row],ds[col])
cache[[row,col]]=cov
cov
else
cache[[col,row]]
end
end
end
matrix
end
# Correlation matrix.
# Order of rows and columns depends on Dataset#fields order
def correlation_matrix(ds)
vars,cases=ds.fields.size,ds.cases
# Use the GSL-backed path only when there is no missing data and the
# benchmark-derived cost model predicts it will be faster.
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
cm=correlation_matrix_optimized(ds)
else
cm=correlation_matrix_pairwise(ds)
end
# Tag the result so it carries its own field labels.
cm.extend(Statsample::CovariateMatrix)
cm.fields=ds.fields
cm
end
# Correlation matrix derived from the GSL covariance matrix:
# R = D^-1 * S * D^-1 where D is the diagonal of standard deviations.
def correlation_matrix_optimized(ds)
s=covariance_matrix_optimized(ds)
# Diagonal matrix of 1/sd for each variable.
sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
cm=sds*s*sds
# Fix diagonal (floating point can leave values slightly off 1.0)
s.row_size.times {|i|
cm[i,i]=1.0
}
cm
end
# Correlation matrix using pairwise-complete cases for each cell.
# Non-:scale vectors yield nil cells; the diagonal is 1.0.
def correlation_matrix_pairwise(ds)
# Symmetric matrix: cache each off-diagonal r so it is computed once.
cache={}
cm=ds.collect_matrix do |row,col|
if row==col
1.0
elsif (ds[row].type!=:scale or ds[col].type!=:scale)
nil
else
if cache[[col,row]].nil?
r=pearson(ds[row],ds[col])
cache[[row,col]]=r
r
else
cache[[col,row]]
end
end
end
end
# Retrieves the n valid pairwise.
# Each cell holds the number of cases valid on both variables;
# the diagonal holds each variable's own valid count.
def n_valid_matrix(ds)
ds.collect_matrix do |row,col|
if row==col
ds[row].valid_data.size
else
# Size after restricting both vectors to their common valid cases.
rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
rowa.size
end
end
end
# Matrix of correlation probabilities.
# Order of rows and columns depends on Dataset#fields order
# +tails+ is forwarded to #prop_pearson (:both by default).
def correlation_probability_matrix(ds, tails=:both)
rows=ds.fields.collect do |row|
ds.fields.collect do |col|
# Pairwise-complete size is needed for the degrees of freedom.
v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col])
# Diagonal and non-:scale pairs get nil instead of a p value.
(row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
end
end
Matrix.rows(rows)
end
# Spearman ranked correlation coefficient (rho) between 2 vectors:
# the Pearson correlation of the ranks of the pairwise-valid cases.
def spearman(v1,v2)
  valid_a, valid_b = Statsample.only_valid_clone(v1, v2)
  pearson(valid_a.ranked(:scale), valid_b.ranked(:scale))
end
# Calculate Point biserial correlation. Equal to Pearson correlation, with
# one dichotomous value replaced by "0" and the other by "1"
def point_biserial(dichotomous,continous)
# Keep only cases valid on both vectors.
ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
# Split the continuous values by the two levels of the dichotomy.
f0=ds['d'].factors.sort[0]
m0=ds.filter_field('c') {|c| c['d']==f0}
m1=ds.filter_field('c') {|c| c['d']!=f0}
# (mean1 - mean0) / sdp * sqrt(n0*n1 / n^2)
((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
end
# Kendall Rank Correlation Coefficient (Tau a)
# Based on Hervé Abdi article.
#
# Both vectors are first reduced to their pairwise-valid cases, so the
# number of cases +n+ in the denominator must be the valid size
# (v1a.size). The previous code used v1.size, which overstates n and
# biases tau toward 1 whenever missing data is present.
def tau_a(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  n=v1a.size # was v1.size: wrong when v1 has missing values
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
  o1=ordered_pairs(v1r)
  o2=ordered_pairs(v2r)
  # delta counts discordant pairs twice: total pairs minus shared pairs.
  delta= o1.size*2-(o2 & o1).size*2
  1-(delta * 2 / (n*(n-1)).to_f)
end
# Calculates Goodman and Kruskal’s Tau b correlation.
# Tb is an asymmetric P-R-E measure of association for nominal scales
# (Mielke, X)
#
# Tau-b defines perfect association as strict monotonicity. Although it
# requires strict monotonicity to reach 1.0, it does not penalize ties as
# much as some other measures.
# NOTE(review): the formula implemented below,
# (P-Q)/sqrt((P+Q+Y)*(P+Q+X)), matches Kendall's tau-b -- confirm
# the naming against the cited reference.
# == Reference
# Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA.
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def tau_b(matrix)
  counts = pairs(matrix)
  concordant = counts['P']
  discordant = counts['Q']
  shared = concordant + discordant
  denominator = Math.sqrt((shared + counts['Y']) * (shared + counts['X']))
  (concordant - discordant).to_f / denominator.to_f
end
# Calculates Goodman and Kruskal's gamma.
#
# Gamma is the surplus of concordant pairs over discordant pairs, as a
# percentage of all pairs ignoring ties.
#
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def gamma(matrix)
  counts = pairs(matrix)
  concordant = counts['P'].to_f
  discordant = counts['Q'].to_f
  (concordant - discordant) / (concordant + discordant)
end
# Calculate indexes for a matrix; the rows and cols have to be ordered.
# Returns a hash with the weighted pair counts of the contingency table:
# 'P' concordant, 'Q' discordant, 'X' ties on rows, 'Y' ties on columns.
def pairs(matrix)
  rows = matrix.row_size
  cols = matrix.column_size
  concordant = discordant = ties_rows = ties_cols = 0
  # Every ordered pair of distinct rows; classify by column relation.
  0.upto(rows - 2) do |r1|
    (r1 + 1).upto(rows - 1) do |r2|
      0.upto(cols - 1) do |c1|
        0.upto(cols - 1) do |c2|
          product = matrix[r1, c1] * matrix[r2, c2]
          if c2 > c1
            concordant += product   # both orderings agree
          elsif c2 < c1
            discordant += product   # orderings disagree
          else
            ties_rows += product    # same column: tied on X
          end
        end
      end
    end
  end
  # Ties on Y: same row, distinct columns.
  0.upto(rows - 1) do |r|
    0.upto(cols - 2) do |c1|
      (c1 + 1).upto(cols - 1) do |c2|
        ties_cols += matrix[r, c1] * matrix[r, c2]
      end
    end
  end
  {'P'=>concordant,'Q'=>discordant,'Y'=>ties_cols,'X'=>ties_rows}
end
# Returns every ordered pair [earlier, later] of the vector's data,
# preserving original positions (used by tau_a).
def ordered_pairs(vector)
  items = vector.data
  result = []
  items.each_with_index do |value, idx|
    items[(idx + 1)..-1].to_a.each { |later| result << [value, later] }
  end
  result
end
=begin
def sum_of_codeviated(v1,v2)
v1a,v2a=Statsample.only_valid(v1,v2)
sum=0
(0...v1a.size).each{|i|
sum+=v1a[i]*v2a[i]
}
sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
end
=end
# Report the minimum number of cases valid of a covariate matrix
# based on a dataset
def min_n_valid(ds)
  smallest = ds.cases
  counts = n_valid_matrix(ds)
  counts.row_size.times do |r|
    counts.column_size.times do |c|
      smallest = counts[r, c] if counts[r, c] < smallest
    end
  end
  smallest
end
end
end
end
================================================
FILE: lib/statsample/codification.rb
================================================
require 'yaml'
module Statsample
# This module aids to code open questions
# * Select one or more vectors of a dataset, to create a yaml files, on which each vector is a hash, which keys and values are the vector's factors . If data have Statsample::SPLIT_TOKEN on a value, each value will be separated on two or more hash keys.
# * Edit the yaml and replace the values of hashes with your codes. If you need to create two or mores codes for an answer, use the separator (default Statsample::SPLIT_TOKEN)
# * Recode the vectors, loading the yaml file:
# * recode_dataset_simple!() : The new vectors have the same name of the original plus "_recoded"
# * recode_dataset_split!() : Create equal number of vectors as values. See Vector.add_vectors_by_split() for arguments
#
# Usage:
# recode_file="recodification.yaml"
# phase=:first # flag
# if phase==:first
# File.open(recode_file,"w") {|fp|
#   Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, fp, ",")
# }
# # Edit the file recodification.yaml and verify changes
# elsif phase==:second
# File.open(recode_file,"r") {|fp|
# Statsample::Codification.verify(fp,['vector1'])
# }
# # Add new vectors to the dataset
# elsif phase==:third
# File.open(recode_file,"r") {|fp|
# Statsample::Codification.recode_dataset_split!(ds,fp,"*")
# }
# end
#
module Codification
  class << self
    # Create a hash, based on vectors, to create the dictionary.
    # The keys will be vectors name on dataset and the values
    # will be hashes, with keys = values, for recodification
    def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
      raise ArgumentError,"Array should't be empty" if vectors.size==0
      pro_hash=vectors.inject({}) do |h,v_name|
        raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
        v=dataset[v_name]
        # Split multi-answer cells and normalize every answer to a String.
        split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
        # Identity mapping (answer => answer); the user edits the values later.
        factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
        h[v_name]=factors
        h
      end
      pro_hash
    end
    # Create a yaml to create a dictionary, based on vectors
    # The keys will be vectors name on dataset and the values
    # will be hashes, with keys = values, for recodification
    #
    #   v1=%w{a,b b,c d}.to_vector
    #   ds={"v1"=>v1}.to_dataset
    #   Statsample::Codification.create_yaml(ds,['v1'])
    #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
    def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
      pro_hash=create_hash(dataset, vectors, sep)
      YAML.dump(pro_hash,io)
    end
    # Create a excel to create a dictionary, based on vectors.
    # Raises an error if filename exists.
    # The rows will be:
    # * field: name of vector
    # * original: original name
    # * recoded: new code
    def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
      require 'spreadsheet'
      if File.exist?(filename)
        # Fixed message: interpolate the offending filename (the literal
        # had been corrupted) and correct the wording/typo.
        raise "A file named #{filename} already exists. Delete it before overwriting."
      end
      book = Spreadsheet::Workbook.new
      sheet = book.create_worksheet
      sheet.row(0).concat(%w{field original recoded})
      i=1
      create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
        inner_hash.sort.each do |k,v|
          # dup: the spreadsheet gem mutates cell strings.
          sheet.row(i).concat([field.dup,k.dup,v.dup])
          i+=1
        end
      end
      book.write(filename)
    end
    # From a excel generates a dictionary hash
    # to use on recode_dataset_simple!() or recode_dataset_split!().
    def excel_to_recoded_hash(filename)
      require 'spreadsheet'
      h={}
      book = Spreadsheet.open filename
      sheet= book.worksheet 0
      row_i=0
      sheet.each do |row|
        row_i+=1
        # Skip the header row and any row with a missing cell.
        next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
        h[row[0]]||={}
        h[row[0]][row[1]]=row[2]
      end
      h
    end
    # Inverts a field's recode hash: returns code => [original values].
    def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
      h.inject({}) do |a,v|
        v[1].split(sep).each do |val|
          a[val]||=[]
          a[val].push(v[0])
        end
        a
      end
    end
    # Returns original value => [codes] for a field's recode hash.
    def dictionary(h, sep=Statsample::SPLIT_TOKEN)
      h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
    end
    # Recodes vector +v+ using the dictionary built from +h+.
    # Returns an array of unique code arrays (nil cells stay nil).
    def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
      dict=dictionary(h,sep)
      new_data=v.splitted(sep)
      new_data.collect do |c|
        if c.nil?
          nil
        else
          c.collect{|value| dict[value] }.flatten.uniq
        end
      end
    end
    # Adds a "<field>_recoded" vector per recoded field.
    def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
      _recode_dataset(dataset,dictionary_hash ,sep,false)
    end
    # Adds one vector per distinct code (see Vector#split_by_separator).
    def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
      _recode_dataset(dataset, dictionary_hash, sep,true)
    end
    # Shared implementation for both public recode methods.
    def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
      v_names=h.keys
      v_names.each do |v_name|
        raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
        recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
          if c.nil?
            nil
          else
            c.join(sep)
          end
        }.to_vector
        if(split)
          recoded.split_by_separator(sep).each {|k,v|
            dataset[v_name+"_"+k]=v
          }
        else
          dataset[v_name+"_recoded"]=recoded
        end
      end
    end
    # Prints a report of the recodification on +io+ for manual checking.
    def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
      require 'pp'
      v_names||=h.keys
      v_names.each{|v_name|
        inverse=inverse_hash(h[v_name],sep)
        io.puts "- Field: #{v_name}"
        # Codes sorted by how many original values map to them (descending).
        inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
          io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'"
        }
      }
    end
  end
end
end
end
================================================
FILE: lib/statsample/converter/csv.rb
================================================
module Statsample
class CSV < SpreadsheetBase
  # Select the CSV backend: Ruby < 1.9 needs the fastercsv gem; newer
  # rubies bundle an equivalent parser in the csv standard library.
  if RUBY_VERSION<"1.9"
    require 'fastercsv'
    CSV_klass=::FasterCSV
  else
    require 'csv'
    CSV_klass=::CSV
  end
  class << self
    # Returns a Dataset from a csv file using the Ruby 1.9 CSV API.
    # +ignore_lines+ drops that many leading data rows from each column.
    def read19(filename,ignore_lines=0,csv_opts=Hash.new)
      #default first line is header
      # NOTE(review): :headers is forced to true here, so the
      # no-headers (V1,V2,...) branch below is unreachable -- confirm intent.
      csv_opts.merge!(:headers=>true, :header_converters => :symbol)
      csv = CSV_klass::Table.new(CSV_klass::read(filename,'r',csv_opts))
      csv_headers = if csv_opts[:headers]
        csv.headers
      else
        #as in R, if no header we name the headers as V1,V2,V3,V4,..
        1.upto(csv.first.length).collect { |i| "V#{i}" }
      end
      #we invert row -> column. It means csv[0] is the first column and not row. Similar to R
      csv.by_col!
      thash = {}
      csv_headers.each_with_index do |header,idx|
        thash[header] = Statsample::Vector.new(csv[idx].drop(ignore_lines))
      end
      Statsample::Dataset.new(thash)
    end
    # Returns a Dataset based on a csv file
    #
    # USE:
    #     ds=Statsample::CSV.read("test_csv.csv")
    def read(filename, empty=[''],ignore_lines=0,csv_opts=Hash.new)
      first_row=true
      fields=[]
      ds=nil
      line_number=0
      # Block form guarantees the file handle is closed even on error
      # (the previous implementation never closed it).
      CSV_klass.open(filename,'rb', csv_opts) do |csv|
        csv.each do |row|
          line_number+=1
          next if line_number<=ignore_lines
          row.collect!{|c| c.to_s }
          if first_row
            fields=extract_fields(row)
            ds=Statsample::Dataset.new(fields)
            first_row=false
          else
            rowa=process_row(row,empty)
            ds.add_case(rowa,false)
          end
        end
      end
      convert_to_scale_and_date(ds,fields)
      ds.update_valid_data
      ds
    end
    # Save a Dataset on a csv file
    #
    # USE:
    #     Statsample::CSV.write(ds,"test_csv.csv")
    def write(dataset,filename, convert_comma=false,*opts)
      # Block form closes the writer even if a row raises
      # (the previous implementation leaked on exceptions).
      CSV_klass.open(filename,'w',*opts) do |writer|
        writer << dataset.fields
        dataset.each_array do |row|
          # Optionally emit comma as decimal separator.
          row.collect!{|v| v.to_s.gsub(".",",")} if convert_comma
          writer << row
        end
      end
    end
  end
end
end
================================================
FILE: lib/statsample/converter/spss.rb
================================================
module Statsample
module SPSS
class << self
# Export a SPSS Matrix with tetrachoric correlations .
#
# Use:
# ds=Statsample::Excel.read("my_data.xls")
# puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
def tetrachoric_correlation_matrix(ds)
# Work on a copy restricted to complete cases.
dsv=ds.dup_only_valid
# Delete all vectors doesn't have variation
dsv.fields.each{|f|
if dsv[f].factors.size==1
dsv.delete_vector(f)
else
# Force each remaining vector into a 0/1 coding.
dsv[f]=dsv[f].dichotomize
end
}
tcm=Statsample::Bivariate.tetrachoric_correlation_matrix(dsv)
# Per-variable descriptives formatted for the MATRIX DATA header.
n=dsv.fields.collect {|f|
sprintf("%d",dsv[f].size)
}
meanlist=dsv.fields.collect{|f|
sprintf("%0.3f", dsv[f].mean)
}
stddevlist=dsv.fields.collect{|f|
sprintf("%0.3f", dsv[f].sd)
}
out=<<-HEREDOC
MATRIX DATA VARIABLES=ROWTYPE_ #{dsv.fields.join(",")}.
BEGIN DATA
N #{n.join(" ")}
MEAN #{meanlist.join(" ")}
STDDEV #{stddevlist.join(" ")}
HEREDOC
# Emit the lower triangle of the correlation matrix, one CORR row per
# variable, then the closing SPSS commands. Returns the whole script.
tcm.row_size.times {|i|
out +="CORR "
(i+1).times {|j|
out+=sprintf("%0.3f",tcm[i,j])+" "
}
out +="\n"
}
out+="END DATA.\nEXECUTE.\n"
end
end
end
end
================================================
FILE: lib/statsample/converters.rb
================================================
require 'statsample/converter/spss'
module Statsample
# Create and dumps Datasets on a database
module Database
class << self
# Read a database query and returns a Dataset
#
# USE:
#
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
# Statsample.read(dbh, "SELECT * FROM test")
#
def read(dbh,query)
require 'dbi'
sth=dbh.execute(query)
vectors={}
fields=[]
# Build one empty vector per result column; numeric SQL types
# become :scale vectors, everything else :nominal.
sth.column_info.each {|c|
vectors[c['name']]=Statsample::Vector.new([])
vectors[c['name']].name=c['name']
vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
fields.push(c['name'])
}
ds=Statsample::Dataset.new(vectors,fields)
sth.fetch do |row|
ds.add_case(row.to_a, false )
end
ds.update_valid_data
ds
end
# Insert each case of the Dataset on the selected table
#
# USE:
#
# ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
# Statsample::Database.insert(ds,dbh,"test")
#
def insert(ds, dbh, table)
require 'dbi'
# Parameterized statement: one "?" placeholder per field.
query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
sth=dbh.prepare(query)
ds.each_array{|c| sth.execute(*c) }
return true
end
# Create a sql, basen on a given Dataset
#
# USE:
#
# ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
# Statsample::Database.create_sql(ds,'names')
# ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
#
def create_sql(ds,table,charset="UTF8")
sql="CREATE TABLE #{table} ("
# Each vector reports its own SQL column type via Vector#db_type.
fields=ds.fields.collect{|f|
v=ds[f]
f+" "+v.db_type
}
sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
end
end
end
module Mondrian
  class << self
    # Writes +dataset+ as a tab-separated file for Mondrian:
    # header row of field names, then one row per case, with nil
    # rendered as "NA" and inner whitespace replaced by "_".
    def write(dataset,filename)
      File.open(filename,"wb") do |fp|
        fp.puts dataset.fields.join("\t")
        dataset.each_array_with_nils do |row|
          cells = row.map { |cell| cell.nil? ? "NA" : cell.to_s.gsub(/\s+/, "_") }
          fp.puts cells.join("\t")
        end
      end
    end
  end
end
class SpreadsheetBase
  class << self
    # Builds field names from a header row: nil cells become
    # var00001-style placeholders, other cells are downcased strings;
    # duplicates are made unique via Array#recode_repeated.
    def extract_fields(row)
      blank_counter = 0
      names = row.to_a.collect do |cell|
        if cell.nil?
          blank_counter += 1
          "var%05d" % blank_counter
        else
          cell.to_s.downcase
        end
      end
      names.recode_repeated
    end
    # Converts raw cells: configured "empty" markers become nil;
    # numeric-looking strings become Integer or Float (comma accepted
    # as decimal separator); everything else passes through untouched.
    def process_row(row,empty)
      row.to_a.map do |cell|
        if empty.include?(cell)
          nil
        elsif cell.is_a?(String) and cell.is_number?
          cell =~ /^\d+$/ ? cell.to_i : cell.gsub(",",".").to_f
        else
          cell
        end
      end
    end
    # After loading, promote vectors whose data looks numeric to :scale
    # and date-like vectors to :date.
    def convert_to_scale_and_date(ds,fields)
      fields.each do |f|
        if ds[f].can_be_scale?
          ds[f].type=:scale
        elsif ds[f].can_be_date?
          ds[f].type=:date
        end
      end
    end
  end
end
class PlainText < SpreadsheetBase
  class << self
    # Reads a whitespace-separated text file into a Dataset with the
    # given field names. Blank tokens become nil; a lone Ctrl-Z row
    # (DOS EOF marker) is skipped.
    def read(filename, fields)
      ds=Statsample::Dataset.new(fields)
      # Block form closes the file even on error: the previous
      # implementation opened the File and never closed it.
      File.open(filename,"r") do |fp|
        fp.each_line do |line|
          row=process_row(line.strip.split(/\s+/),[""])
          next if row==["\x1A"]
          ds.add_case_array(row)
        end
      end
      convert_to_scale_and_date(ds,fields)
      ds.update_valid_data
      fields.each {|f|
        ds[f].name=f
      }
      ds
    end
  end
end
class Excel < SpreadsheetBase
class << self
# Write a Excel spreadsheet based on a dataset
# * TODO: Format nicely date values
def write(dataset,filename)
require 'spreadsheet'
book = Spreadsheet::Workbook.new
sheet = book.create_worksheet
# Bold blue header row for the field names.
format = Spreadsheet::Format.new :color => :blue,
:weight => :bold
sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings
sheet.row(0).default_format = format
i=1
dataset.each_array{|row|
sheet.row(i).concat(row)
i+=1
}
book.write(filename)
end
# This should be fixed.
# If we have a Formula, should be resolver first
def preprocess_row(row, dates)
# Replace formula cells with their cached value (nil on Excel errors)
# and convert numeric cells in date-formatted columns to dates.
i=-1
row.collect!{|c|
i+=1
if c.is_a? Spreadsheet::Formula
if(c.value.is_a? Spreadsheet::Excel::Error)
nil
else
c.value
end
elsif dates.include? i and !c.nil? and c.is_a? Numeric
row.date(i)
else
c
end
}
end
private :process_row, :preprocess_row
# Returns a dataset based on a xls file
# USE:
# ds = Statsample::Excel.read("test.xls")
#
def read(filename, opts=Hash.new)
require 'spreadsheet'
raise "options should be Hash" unless opts.is_a? Hash
opts_default={
:worksheet_id=>0,
:ignore_lines=>0,
:empty=>['']
}
opts=opts_default.merge opts
worksheet_id=opts[:worksheet_id]
ignore_lines=opts[:ignore_lines]
empty=opts[:empty]
first_row=true
fields=[]
fields_data={}
ds=nil
line_number=0
book = Spreadsheet.open filename
sheet= book.worksheet worksheet_id
sheet.each do |row|
begin
# Columns formatted DD/MM/YYYY get date conversion in preprocess_row.
dates=[]
row.formats.each_index{|i|
if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
dates.push(i)
end
}
line_number+=1
next if(line_number<=ignore_lines)
preprocess_row(row,dates)
if first_row
fields=extract_fields(row)
ds=Statsample::Dataset.new(fields)
first_row=false
else
rowa=process_row(row,empty)
# Pad short rows with nils so every case has all fields.
(fields.size - rowa.size).times {
rowa << nil
}
ds.add_case(rowa,false)
end
rescue => e
# NOTE(review): 'error' is built but never used; the bare raise
# re-raises the original exception without the line context.
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
raise
end
end
convert_to_scale_and_date(ds, fields)
ds.update_valid_data
fields.each {|f|
ds[f].name=f
}
ds.name=filename
ds
end
end
end
module Mx
  class << self
    # Writes +dataset+ in Mx syntax to +filename+.
    # +type+ is :covariance (default; emits a full CMatrix computed with
    # Statsample::Bivariate.covariance_matrix) or :raw (emits every case,
    # with "." for missing values).
    def write(dataset,filename,type=:covariance)
      puts "Writing MX File"
      File.open(filename,"w") do |fp|
        # The interpolation in this header comment line was corrupted in
        # the repository snapshot; restore the filename reference.
        fp.puts "! #{filename}"
        fp.puts "! Output generated by Statsample"
        fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
        fp.puts "Labels "+dataset.fields.join(" ")
        case type
        when :raw
          fp.puts "Rectangular"
          dataset.each do |row|
            out=dataset.fields.collect do |f|
              if dataset[f].is_valid? row[f]
                row[f]
              else
                "."
              end
            end
            fp.puts out.join("\t")
          end
          fp.puts "End Rectangular"
        when :covariance
          fp.puts " CMatrix Full"
          cm=Statsample::Bivariate.covariance_matrix(dataset)
          d=(0...(cm.row_size)).collect {|row|
            (0...(cm.column_size)).collect{|col|
              cm[row,col].nil? ? "." : sprintf("%0.3f", cm[row,col])
            }.join(" ")
          }.join("\n")
          fp.puts d
        end
      end
    end
  end
end
module GGobi
  class << self
    # Writes +dataset+ as a GGobi XML data file to +filename+.
    def write(dataset,filename,opt={})
      File.open(filename,"w") {|fp|
        fp.write(self.out(dataset,opt))
      }
    end
    # Renders the GGobi XML document for +dataset+ and returns it as a
    # String. Options: :dataname, :description, :missing.
    # NOTE(review): the XML template below was corrupted in the
    # repository snapshot (angle-bracket content stripped, leaving an
    # invalid heredoc); it is reconstructed here from the GGobi data
    # file format -- verify tag names against upstream statsample.
    def out(dataset,opt={})
      require 'ostruct'
      default_opt = {:dataname => "Default", :description=>"", :missing=>"NA"}
      default_opt.merge! opt
      carrier=OpenStruct.new
      carrier.categorials=[]
      carrier.conversions={}
      variables_def=dataset.fields.collect{|k|
        variable_definition(carrier,dataset[k],k)
      }.join("\n")
      # Column index -> categorical field name, to translate factor
      # values into their numeric level codes on output.
      indexes=carrier.categorials.inject({}) {|s,c|
        s[dataset.fields.index(c)]=c
        s
      }
      records=""
      dataset.each_array {|c|
        indexes.each{|ik,iv|
          c[ik]=carrier.conversions[iv][c[ik]]
        }
        records << "#{values_definition(c, default_opt[:missing])}\n"
      }
      out=<<-EOC
<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">
<ggobidata count="1">
<data name="#{default_opt[:dataname]}">
<description>
#{default_opt[:description]}
</description>
<variables count="#{dataset.fields.size}">
#{variables_def}
</variables>
<records count="#{dataset.cases}" missingValue="#{default_opt[:missing]}">
#{records}
</records>
</data>
</ggobidata>
EOC
      out
    end
    # Renders one record: nil -> missing marker, numbers as-is,
    # strings with whitespace collapsed to "_", space separated.
    def values_definition(c,missing)
      c.collect{|v|
        if v.nil?
          "#{missing}"
        elsif v.is_a? Numeric
          "#{v}"
        else
          "#{v.gsub(/\s+/,"_")}"
        end
      }.join(" ")
    end
    # Outputs a string for a variable definition
    # v = vector
    # name = name of the variable
    # nickname = nickname
    def variable_definition(carrier,v,name,nickname=nil)
      nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" )
      if v.type==:nominal or v.data.find {|d| d.is_a? String }
        carrier.categorials.push(name)
        carrier.conversions[name]={}
        factors=v.factors
        out ="<categoricalvariable name=\"#{name}\" #{nickname}>\n"
        out << "<levels count=\"#{factors.size}\">\n"
        out << (1..factors.size).to_a.collect{|i|
          carrier.conversions[name][factors[i-1]]=i
          "<level value=\"#{i}\">#{v.labeling(factors[i-1])}</level>"
        }.join("\n")
        out << "\n</levels>\n</categoricalvariable>\n"
        out
      elsif v.data.find {|d| d.is_a? Float}
        "<realvariable name=\"#{name}\" #{nickname} />"
      else
        "<integervariable name=\"#{name}\" #{nickname} />"
      end
    end
  end
end
end
require 'statsample/converter/csv.rb'
================================================
FILE: lib/statsample/crosstab.rb
================================================
module Statsample
# Class to create crosstab of data
# With this, you can create reports and do chi square test
# The first vector will be at rows and the second will be the columns
#
class Crosstab
include Summarizable
attr_reader :v_rows, :v_cols
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
# Builds the crosstab from two same-sized vectors; only cases valid
# on both vectors are kept. Options set the matching accessors
# (row_label, percentage_row, etc.).
def initialize(v1, v2, opts=Hash.new)
#raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
@v_rows, @v_cols=Statsample.only_valid_clone(v1.to_vector,v2.to_vector)
@cases=@v_rows.size
@row_label=v1.name
@column_label=v2.name
@name=nil
@percentage_row = @percentage_column = @percentage_total=false
opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
@name||=_("Crosstab %s - %s") % [@row_label, @column_label]
end
# Sorted distinct values of the row vector.
def rows_names
@v_rows.factors.sort
end
# Sorted distinct values of the column vector.
def cols_names
@v_cols.factors.sort
end
# value => frequency for the row vector (row marginals).
def rows_total
@v_rows.frequencies
end
# value => frequency for the column vector (column marginals).
def cols_total
@v_cols.frequencies
end
# [row_value, col_value] => count for every cell, including
# combinations that never occur (count 0).
def frequencies
base=rows_names.inject([]){|s,row|
s+=cols_names.collect{|col| [row,col]}
}.inject({}) {|s,par|
s[par]=0
s
}
base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
end
# Contingency table as a Matrix (rows/cols in sorted value order).
def to_matrix
f=frequencies
rn=rows_names
cn=cols_names
Matrix.rows(rn.collect{|row|
cn.collect{|col| f[[row,col]]}
})
end
# row_value => { col_value => count }
def frequencies_by_row
f=frequencies
rows_names.inject({}){|sr,row|
sr[row]=cols_names.inject({}) {|sc,col| sc[col]=f[[row,col]]; sc}
sr
}
end
# col_value => { row_value => count }
def frequencies_by_col
f=frequencies
cols_names.inject({}){|sc,col|
sc[col]=rows_names.inject({}) {|sr,row| sr[row]=f[[row,col]]; sr}
sc
}
end
# Chi square, based on expected and real matrix
def chi_square
require 'statsample/test'
Statsample::Test.chi_square(self.to_matrix, matrix_expected)
end
# Useful to obtain chi square
def matrix_expected
rn=rows_names
cn=cols_names
rt=rows_total
ct=cols_total
t=@v_rows.size
# Expected cell count under independence: row_total*col_total/n.
m=rn.collect{|row|
cn.collect{|col|
(rt[row]*ct[col]).quo(t)
}
}
Matrix.rows(m)
end
# col_value => 0, used to accumulate column totals.
def cols_empty_hash
cols_names.inject({}) {|a,x| a[x]=0;a}
end
# Renders the raw-count table (plus optional percentage tables) into
# the report +builder+.
def report_building(builder)
builder.section(:name=>@name) do |generator|
fq=frequencies
rn=rows_names
cn=cols_names
total=0
total_cols=cols_empty_hash
generator.text "Chi Square: #{chi_square}"
generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
rn.each do |row|
total_row=0
t_row=[@v_rows.labeling(row)]
cn.each do |col|
data=fq[[row,col]]
total_row+=fq[[row,col]]
total+=fq[[row,col]]
total_cols[col]+=fq[[row,col]]
t_row.push(data)
end
t_row.push(total_row)
t.row(t_row)
end
t.hr
t_row=[_("Total")]
cn.each do |v|
t_row.push(total_cols[v])
end
t_row.push(total)
t.row(t_row)
generator.parse_element(t)
if(@percentage_row)
table_percentage(generator,:row)
end
if(@percentage_column)
table_percentage(generator,:column)
end
if(@percentage_total)
table_percentage(generator,:total)
end
end
end
# Renders one percentage table; +type+ is :row, :column or :total
# and selects the denominator used for each cell.
def table_percentage(generator,type)
fq=frequencies
cn=cols_names
rn=rows_names
rt=rows_total
ct=cols_total
type_name=case type
when :row then _("% Row")
when :column then _("% Column")
when :total then _("% Total")
end
t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c) } + [_("Total")])
rn.each do |row|
t_row=[@v_rows.labeling(row)]
cn.each do |col|
total=case type
when :row then rt[row]
when :column then ct[col]
when :total then @cases
end
data = sprintf("%0.2f%%", fq[[row,col]]*100.0/ total )
t_row.push(data)
end
total=case type
when :row then rt[row]
when :column then @cases
when :total then @cases
end
t_row.push(sprintf("%0.2f%%", rt[row]*100.0/total))
t.row(t_row)
end
t.hr
t_row=[_("Total")]
cn.each{|col|
total=case type
when :row then @cases
when :column then ct[col]
when :total then @cases
end
t_row.push(sprintf("%0.2f%%", ct[col]*100.0/total))
}
t_row.push("100%")
t.row(t_row)
generator.parse_element(t)
end
end
end
================================================
FILE: lib/statsample/dataset.rb
================================================
require 'statsample/vector'
class Hash
# Creates a Statsample::Dataset based on a Hash
# Extra arguments (e.g. the field order array) are forwarded to
# Statsample::Dataset.new.
def to_dataset(*args)
Statsample::Dataset.new(self, *args)
end
end
class Array
  # Returns a copy with +s+ prepended to each element's string form.
  def prefix(s) # :nodoc:
    map { |item| s + item.to_s }
  end
  # Returns a copy with +s+ appended to each element's string form.
  def suffix(s) # :nodoc:
    map { |item| item.to_s + s }
  end
end
module Statsample
# Wraps an exception raised while iterating a Dataset, adding the
# dataset row being processed to the message.
class DatasetException < RuntimeError # :nodoc:
attr_reader :ds,:exp
def initialize(ds,e)
@ds=ds
@exp=e
end
def to_s
m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
# @ds.i is the dataset's iteration pointer; nil outside iteration.
m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
m
end
end
# Set of cases with values for one or more variables,
# analog to a dataframe on R or a standard data file of SPSS.
# Every vector has #field name, which represent it. By default,
# the vectors are ordered by it field name, but you can change it
# the fields order manually.
# The Dataset work as a Hash, with keys are field names
# and values are Statsample::Vector
#
#
# ==Usage
# Create a empty dataset:
# Dataset.new()
# Create a dataset with three empty vectors, called v1, v2 and v3:
# Dataset.new(%w{v1 v2 v3})
# Create a dataset with two vectors, called v1
# and v2:
# Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
# Create a dataset with two given vectors (v1 and v2),
# with vectors on inverted order:
# Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
#
# The fast way to create a dataset uses Hash#to_dataset, with
# field order as arguments
# v1 = [1,2,3].to_scale
# v2 = [1,2,3].to_scale
# ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1})
class Dataset
include Writable
include Summarizable
# Hash of Statsample::Vector
attr_reader :vectors
# Ordered ids of vectors
attr_reader :fields
# Name of dataset
attr_accessor :name
# Number of cases
attr_reader :cases
# Location of pointer on enumerations methods (like #each)
attr_reader :i
# Generates a new dataset, using three vectors
# - Rows
# - Columns
# - Values
#
# For example, you have these values
#
# x y v
# a a 0
# a b 1
# b a 1
# b b 0
#
# You obtain
# id a b
# a 0 1
# b 1 0
#
# Useful to process outputs from databases
def self.crosstab_by_asignation(rows,columns,values)
raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
cols_values=columns.factors
cols_n=cols_values.size
# row_value => { col_value => nil } skeleton for every combination.
h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
|a1,v1| a1[v1]=nil; a1
}
;a}
# Fill each (row, column) slot with its assigned value.
values.each_index{|i|
h_rows[rows[i]][columns[i]]=values[i]
}
ds=Dataset.new(["_id"]+cols_values)
# Result columns take the type of the values vector.
cols_values.each{|c|
ds[c].type=values.type
}
rows.factors.each {|row|
n_row=Array.new(cols_n+1)
n_row[0]=row
cols_values.each_index {|i|
n_row[i+1]=h_rows[row][cols_values[i]]
}
ds.add_case_array(n_row)
}
ds.update_valid_data
ds
end
# Return true if any vector has missing data
def has_missing_data?
  @vectors.any? { |_field, vector| vector.has_missing_data? }
end
# Return a nested hash using fields as keys and
# an array constructed of hashes with other values.
# If block provided, is used to provide the
# values, with parameters +row+ of dataset,
# +current+ last hash on hierarchy and
# +name+ of the key to include
def nest(*tree_keys,&block)
tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
out=Hash.new
each do |row|
current=out
# Create tree: one nested hash level per key except the last.
tree_keys[0,tree_keys.size-1].each do |f|
root=row[f]
current[root]||=Hash.new
current=current[root]
end
name=row[tree_keys.last]
if !block
# Default leaf: array of the remaining (non-key) row values.
current[name]||=Array.new
current[name].push(row.delete_if{|key,value| tree_keys.include? key})
else
current[name]=block.call(row, current,name)
end
end
out
end
# Creates a new dataset. A dataset is a set of ordered named vectors
# of the same size.
#
# [vectors] With an array, creates a set of empty vectors named as
# values on the array. With a hash, each Vector is assigned as
# a variable of the Dataset named as its key
# [fields] Array of names for vectors. Is only used for set the
# order of variables. If empty, vectors keys on alfabethic order as
# used as fields.
def initialize(vectors={}, fields=[])
# Class-wide counter used only to build a default dataset name.
@@n_dataset||=0
@@n_dataset+=1
@name=_("Dataset %d") % @@n_dataset
@cases=0
@gsl=nil
@i=nil
if vectors.instance_of? Array
@fields=vectors.dup
@vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
else
# Check vectors
@vectors=vectors
@fields=fields
check_order
check_length
end
end
#
# Creates a copy of the given dataset, deleting all the cases with
# missing data on one of the vectors.
#
# @param array of fields to include. No value include all fields
#
def dup_only_valid(*fields_to_include)
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
fields_to_include=fields_to_include[0]
end
fields_to_include=@fields if fields_to_include.size==0
if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
ds=Dataset.new(fields_to_include)
fields_to_include.each {|f| ds[f].type=@vectors[f].type}
each {|row|
# Keep the case only when every included field is valid.
unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
ds.add_case(row_2)
end
}
else
# No missing data anywhere: a plain dup is enough.
ds=dup fields_to_include
end
ds.name= self.name
ds
end
#
# Returns a duplicate of the Dataset.
# All vectors are copied, so any modification on new
# dataset doesn't affect original dataset's vectors.
# If fields given as parameter, only include those vectors.
#
# @param array of fields to include. No value include all fields
# @return {Statsample::Dataset}
def dup(*fields_to_include)
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
fields_to_include=fields_to_include[0]
end
fields_to_include=@fields if fields_to_include.size==0
vectors={}
fields=[]
fields_to_include.each{|f|
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
# Deep-copy each vector so the copies are independent.
vectors[f]=@vectors[f].dup
fields.push(f)
}
ds=Dataset.new(vectors,fields)
ds.name= self.name
ds
end
# Returns an array with the fields from first argumen to last argument
def from_to(from,to)
  [from, to].each do |field|
    raise ArgumentError, "Field #{field} should be on dataset" unless @fields.include?(field)
  end
  @fields[@fields.index(from)..@fields.index(to)]
end
# Returns (when possible) a cheap copy of dataset.
# If no vector have missing values, returns original vectors.
# If missing values presents, uses Dataset.dup_only_valid.
#
# @param array of fields to include. No value include all fields
# @return {Statsample::Dataset}
def clone_only_valid(*fields_to_include)
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
fields_to_include=fields_to_include[0]
end
fields_to_include=@fields.dup if fields_to_include.size==0
# Deep copy only when missing data forces case filtering.
if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
dup_only_valid(fields_to_include)
else
clone(fields_to_include)
end
end
# Returns a shallow copy: a new Dataset object whose @vectors are the
# very same objects as this one's.
#
# @param fields_to_include fields to share; no value shares every field
# @return [Statsample::Dataset]
def clone(*fields_to_include)
  fields_to_include = fields_to_include[0] if fields_to_include.size == 1 and fields_to_include[0].is_a? Array
  fields_to_include = @fields.dup if fields_to_include.size == 0
  copy = Dataset.new
  fields_to_include.each do |f|
    raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
    copy[f] = @vectors[f]
  end
  copy.fields = fields_to_include
  copy.name = @name
  copy.update_valid_data
  copy
end
# Copies the dataset structure only: same fields, same vector types,
# no data.
#
# @return [Statsample::Dataset]
def dup_empty
  empty_vectors = {}
  @vectors.each { |key, vector| empty_vectors[key] = vector.dup_empty }
  Dataset.new(empty_vectors, @fields.dup)
end
# Merge vectors from two datasets with the same number of cases.
# On field-name collision the repeated names are recoded
# (x, x_2, ...) by Array#recode_repeated.
#
# @param other_ds [Statsample::Dataset] dataset to merge with
# @return [Statsample::Dataset]
# @raise [RuntimeError] if the case counts differ
def merge(other_ds)
  # FIX: error message previously had an unbalanced parenthesis.
  raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases})" unless @cases==other_ds.cases
  types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
  new_fields = (@fields+other_ds.fields).recode_repeated
  ds_new=Statsample::Dataset.new(new_fields)
  # carry over each source vector's type
  new_fields.each_index{|i|
    ds_new[new_fields[i]].type=types[i]
  }
  @cases.times {|i|
    ds_new.add_case_array(case_as_array(i)+other_ds.case_as_array(i))
  }
  ds_new.update_valid_data
  ds_new
end
# Join 2 Datasets by given fields
# type is one of :left and :inner, default is :left
#
# fields_1 are the key fields on self, fields_2 the key fields on
# other_ds. Every non-key field of other_ds is appended to matching
# cases. A :left join keeps unmatched cases of self padded with nils;
# :inner drops them. A key matching several other_ds rows produces
# one output case per match.
#
# @return {Statsample::Dataset}
def join(other_ds,fields_1=[],fields_2=[],type=:left)
fields_new = other_ds.fields - fields_2
fields = self.fields + fields_new
# index other_ds rows by key-values => array of non-key value hashes
other_ds_hash = {}
other_ds.each do |row|
key = row.select{|k,v| fields_2.include?(k)}.values
value = row.select{|k,v| fields_new.include?(k)}
if other_ds_hash[key].nil?
other_ds_hash[key] = [value]
else
other_ds_hash[key] << value
end
end
new_ds = Dataset.new(fields)
self.each do |row|
key = row.select{|k,v| fields_1.include?(k)}.values
new_case = row.dup
if other_ds_hash[key].nil?
# no match: only a left join keeps the case, padded with nils
if type == :left
fields_new.each{|field| new_case[field] = nil}
new_ds.add_case(new_case)
end
else
other_ds_hash[key].each do |new_values|
new_ds.add_case new_case.merge(new_values)
end
end
end
new_ds
end
# Returns a dataset where every vector is replaced by its
# standardized version (via Vector#vector_standarized).
#
# @return [Statsample::Dataset]
def standarize
  std = dup()
  std.fields.each { |f| std[f] = std[f].vector_standarized }
  std
end
# Builds a fields x fields Matrix whose cell (row, col) is the value
# yielded by the block for that pair of field names.
#
# @return [::Matrix]
def collect_matrix
  cells = @fields.map do |row_field|
    @fields.map { |col_field| yield row_field, col_field }
  end
  Matrix.rows(cells)
end
# Two datasets are equal when they share the same vectors and the
# same field order.
#
# @return {Boolean}
def ==(d2)
  @vectors == d2.vectors && @fields == d2.fields
end
# Returns the vector stored under name +c+ (nil when absent).
#
# @return {Statsample::Vector}
def col(c)
  @vectors[c]
end
alias_method :vector, :col
# Registers +vector+ under +name+ (same effect as Dataset[name]=vector).
#
# @raise [ArgumentError] when the vector size differs from the case count
# @return self
def add_vector(name, vector)
  raise ArgumentError, "Vector have different size" unless vector.size == @cases
  @vectors[name] = vector
  check_order
  self
end
# True when a vector named +v+ exists on the dataset.
#
# @return {Boolean}
def has_vector?(v)
  @vectors.has_key?(v)
end
# Builds a new dataset of +n+ cases sampled with replacement from
# this one (n defaults to the original case count).
#
# @return [Statsample::Dataset]
def bootstrap(n = nil)
  n ||= @cases
  resample = dup_empty
  n.times { resample.add_case_array(case_as_array(rand(n))) }
  resample.update_valid_data
  resample
end
# Fast, unchecked row insert (see #add_case for the safe version):
# pushes each value straight onto its vector's raw data.
# You SHOULD call #update_valid_data once the insertion cycle ends.
def add_case_array(v)
  v.each_with_index do |value, idx|
    @vectors[@fields[idx]].data.push(value)
  end
end
# Insert a case, using:
# * Array: size equal to number of vectors and values in the same order as fields
#   (an Array of Arrays inserts several cases at once)
# * Hash: keys equal to fields
# If uvd is false, #update_valid_data is not executed after
# inserting a case. This is very useful if you want to increase the
# performance on inserting many cases, because #update_valid_data
# performs check on vectors and on the dataset
def add_case(v,uvd=true)
case v
when Array
if (v[0].is_a? Array)
# batch insert: recurse per row, postponing validation to the end
v.each{|subv| add_case(subv,false)}
else
raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
end
when Hash
raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
@fields.each{|f| @vectors[f].add(v[f],false)}
else
raise TypeError, 'Value must be a Array or a Hash'
end
if uvd
update_valid_data
end
end
# Re-checks every vector and the case count after direct insertion.
# Use after #add_case_array or #add_case(v, false).
def update_valid_data
  @gsl = nil # any cached GSL matrix is stale now
  @fields.each { |f| @vectors[f].set_valid_data }
  check_length
end
# Removes the named vectors and their fields. Accepts several names
# or a single Array of names.
def delete_vector(*args)
  names = (args.size == 1 and args[0].is_a? Array) ? args[0] : args
  names.each do |name|
    @fields.delete(name)
    @vectors.delete(name)
  end
end
# Splits vector +name_+ by +sep+ and adds one dummy vector per
# category, under sequential names "name-1", "name-2", ...; each new
# vector's own name records the category ("name:category").
def add_vectors_by_split_recode(name_, join = '-', sep = Statsample::SPLIT_TOKEN)
  counter = 1
  @vectors[name_].split_by_separator(sep).each do |category, vector|
    field = name_ + join + counter.to_s
    vector.name = name_ + ":" + category
    add_vector(field, vector)
    counter += 1
  end
end
# Splits vector +name+ by +sep+ and adds one dummy vector per
# category, under the name "name-<category>".
def add_vectors_by_split(name, join = '-', sep = Statsample::SPLIT_TOKEN)
  @vectors[name].split_by_separator(sep).each do |category, vector|
    add_vector(name + join + category, vector)
  end
end
# Builds a vector (default type :scale) from the block's result on
# each case.
def vector_by_calculation(type = :scale)
  results = []
  each { |row| results << yield(row) }
  results.to_vector(type)
end
# Returns a vector with the per-case sum of +fields+ (all fields by
# default). A case with any missing value among those fields gets nil.
def vector_sum(fields = nil)
  fields ||= @fields
  vector = collect_with_index do |row, i|
    some_missing = fields.find { |f| !@vectors[f].data_with_nils[i] }
    some_missing ? nil : fields.inject(0) { |total, f| total + row[f].to_f }
  end
  vector.name = _("Sum from %s") % @name
  vector
end
# Validates that every entry of +fields+ exists on the dataset
# (defaults to all fields) and returns the validated list.
def check_fields(fields)
  fields ||= @fields
  unknown = fields - @fields
  raise "Fields #{unknown.join(", ")} doesn't exists on dataset" if unknown.size > 0
  fields
end
# Returns a vector with, per case, how many of +fields+ are missing.
def vector_missing_values(fields = nil)
  fields = check_fields(fields)
  collect_with_index do |row, i|
    fields.count { |f| @vectors[f].data_with_nils[i].nil? }
  end
end
# Returns a vector with, per case, the total number of characters
# over +fields+ (missing values contribute 0).
def vector_count_characters(fields = nil)
  fields = check_fields(fields)
  collect_with_index do |row, i|
    fields.inject(0) do |total, f|
      total + (@vectors[f].data_with_nils[i].nil? ? 0 : row[f].to_s.size)
    end
  end
end
# Returns a vector with the per-case mean of +fields+ (all fields by
# default). A case with more than +max_invalid+ missing values among
# those fields gets nil; otherwise the mean of its valid values.
def vector_mean(fields = nil, max_invalid = 0)
  fields = check_fields(fields)
  width = fields.size
  means = []
  each_with_index do |row, i|
    sum = 0
    invalids = 0
    fields.each do |f|
      if @vectors[f].data_with_nils[i].nil?
        invalids += 1
      else
        sum += row[f].to_f
      end
    end
    means.push(invalids > max_invalid ? nil : sum.quo(width - invalids))
  end
  means = means.to_vector(:scale)
  means.name = _("Means from %s") % @name
  means
end
# Verifies every vector is a Statsample::Vector and that all sizes
# agree, then caches that common size as the dataset's case count.
def check_length # :nodoc:
  size = nil
  @vectors.each do |key, vector|
    raise Exception, "Data #{vector.class} is not a vector on key #{key}" unless vector.is_a? Statsample::Vector
    size ||= vector.size
    if vector.size != size
      raise Exception, "Vector #{key} have size #{vector.size} and dataset have size #{size}"
    end
  end
  @cases = size
end
# Yields every [field, vector] pair, following field order.
def each_vector # :yield: |key, vector|
  @fields.each { |key| yield key, @vectors[key] }
end
# Use the optimized row fetch when STATSAMPLE__ provides one
# (presumably the native extension — confirm), otherwise the
# pure-ruby #_case_as_hash fallback.
if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
def case_as_hash(c) # :nodoc:
Statsample::STATSAMPLE__.case_as_hash(self,c)
end
else
# Retrieves case i as a hash
def case_as_hash(i)
_case_as_hash(i)
end
end
# Use the optimized row fetch when STATSAMPLE__ provides one
# (presumably the native extension — confirm), otherwise the
# pure-ruby #_case_as_array fallback.
if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
def case_as_array(c) # :nodoc:
Statsample::STATSAMPLE__.case_as_array(self,c)
end
else
# Retrieves case i as a array, ordered on #fields order
def case_as_array(i)
_case_as_array(i)
end
end
# Pure-ruby row fetch: {field => value} for case +c+.
def _case_as_hash(c) # :nodoc:
  @fields.inject({}) { |acc, f| acc[f] = @vectors[f][c]; acc }
end
# Pure-ruby row fetch: values of case +c+ in field order.
def _case_as_array(c) # :nodoc:
  @fields.map { |f| @vectors[f][c] }
end
# Yields each case as a Hash (field => value). The current row index
# is published through @i while iterating (used e.g. by #to_gsl) and
# reset to nil afterwards. Any error raised inside is re-raised
# wrapped in a DatasetException.
def each
  @i = 0
  @cases.times do |i|
    @i = i
    yield case_as_hash(i)
  end
  @i = nil
rescue => e
  raise DatasetException.new(self, e)
end
# Yields each case as a Hash plus its index. Publishes @i like #each
# and wraps any error in a DatasetException.
def each_with_index # :yield: |case, i|
  @i = 0
  @cases.times do |i|
    @i = i
    yield case_as_hash(i), i
  end
  @i = nil
rescue => e
  raise DatasetException.new(self, e)
end
# Yields each case as an array in field order, reporting missing
# values as nil. Publishes @i like #each.
def each_array_with_nils
  width = fields.size
  @cases.times do |i|
    @i = i
    row = Array.new(width)
    fields.each_index do |j|
      row[j] = @vectors[fields[j]].data_with_nils[i]
    end
    yield row
  end
  @i = nil
end
# Yields each case as an array in field order. Publishes @i like #each.
def each_array
  @cases.times do |i|
    @i = i
    yield case_as_array(i)
  end
  @i = nil
end
# Sets the field order; vectors omitted from +f+ are appended in
# alphabetical order (see #check_order).
def fields=(f)
  @fields = f
  check_order
end
# Check congruence between +fields+ attribute
# and keys on +vectors+.
# Set intersection (&) keeps the user-given order for fields that
# exist; vector keys not listed are then appended alphabetically.
def check_order #:nodoc:
if(@vectors.keys.sort!=@fields.sort)
@fields=@fields&@vectors.keys
@fields+=@vectors.keys.sort-@fields
end
end
# Field access: a name returns its vector; a Range or an Array of
# names returns a shallow #clone restricted to those fields.
def [](i)
  case i
  when Range
    clone(*from_to(i.begin, i.end))
  when Array
    clone(i)
  else
    raise Exception, "Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
    @vectors[i]
  end
end
# Builds a Statsample::Vector from the block's result on each case.
def collect(type = :scale)
  data = []
  each { |row| data << yield(row) }
  Statsample::Vector.new(data, type)
end
# Like #collect, but also passes the case index to the block.
def collect_with_index(type = :scale)
  data = []
  each_with_index { |row, i| data << yield(row, i) }
  Statsample::Vector.new(data, type)
end
# Rewrites vector +vector_name+ in place with the block's result for
# each case (yielded as a hash), then refreshes its valid data.
def recode!(vector_name)
  (0...@cases).each do |i|
    @vectors[vector_name].data[i] = yield case_as_hash(i)
  end
  @vectors[vector_name].set_valid_data
end
# Crosstab of vector +v1+ by vector +v2+.
def crosstab(v1, v2, opts = {})
  Statsample::Crosstab.new(@vectors[v1], @vectors[v2], opts)
end
# Assigns vector +v+ under field +i+, keeping field order congruent.
# @raise [ArgumentError] unless +v+ is a Statsample::Vector
def []=(i, v)
  raise ArgumentError, "Should pass a Statsample::Vector" unless v.instance_of? Statsample::Vector
  @vectors[i] = v
  check_order
end
# Returns the data as a Matrix: columns follow #fields, rows follow
# insertion order.
def to_matrix
  rows = []
  each_array { |row| rows << row }
  Matrix.rows(rows)
end
# GSL helpers, only defined when the gsl bindings are available.
if Statsample.has_gsl?
# Drops the cached GSL matrix; #to_gsl will rebuild it.
def clear_gsl
@gsl=nil
end
# Cases as a GSL::Matrix, built lazily and cached on @gsl.
# NOTE(review): relies on @i, the row index that #each_array publishes
# while iterating — fragile side channel; confirm before refactoring.
def to_gsl
if @gsl.nil?
if cases.nil?
update_valid_data
end
@gsl=GSL::Matrix.alloc(cases,fields.size)
self.each_array{|c|
@gsl.set_row(@i,c)
}
end
@gsl
end
end
# Correlation matrix over +fields+ (all fields when nil).
def correlation_matrix(fields = nil)
  target = fields ? clone(fields) : self
  Statsample::Bivariate.correlation_matrix(target)
end
# Covariance matrix over +fields+ (all fields when nil).
def covariance_matrix(fields = nil)
  target = fields ? clone(fields) : self
  Statsample::Bivariate.covariance_matrix(target)
end
# New dataset holding only the cases for which the block is true.
def filter
  filtered = dup_empty
  each { |c| filtered.add_case(c, false) if yield c }
  filtered.update_valid_data
  filtered.name = _("%s(filtered)") % @name
  filtered
end
# Vector with the values of +field+ for the cases where the block is
# true; keeps the source vector's type.
def filter_field(field)
  values = []
  each { |c| values << c[field] if yield c }
  values.to_vector(@vectors[field].type)
end
# Splits the dataset into a Statsample::Multiset, using one or more
# fields as the split criterion.
def to_multiset_by_split(*fields)
  require 'statsample/multiset'
  if fields.size == 1
    to_multiset_by_split_one_field(fields[0])
  else
    to_multiset_by_split_multiple_fields(*fields)
  end
end
# Creates a Statsample::Multiset, using one field: one dataset per
# factor of that field, each named after the factor's label and with
# vector types/names/labels copied from the source.
def to_multiset_by_split_one_field(field)
raise ArgumentError,"Should use a correct field name" if !@fields.include? field
factors=@vectors[field].factors
ms=Multiset.new_empty_vectors(@fields, factors)
# route each case to the dataset of its factor value
each {|c|
ms[c[field]].add_case(c,false)
}
# finalize each per-factor dataset
ms.datasets.each {|k,ds|
ds.update_valid_data
ds.name=@vectors[field].labeling(k)
ds.vectors.each{|k1,v1|
# carry over type, name and labels from the source vectors
v1.type=@vectors[k1].type
v1.name=@vectors[k1].name
v1.labels=@vectors[k1].labels
}
}
ms
end
# Splits the dataset into a Multiset keyed by every combination of
# the given fields' factors.
def to_multiset_by_split_multiple_fields(*fields)
  # Cartesian product of the factors of every split field
  factors_total = nil
  fields.each do |f|
    field_factors = @vectors[f].factors
    if factors_total.nil?
      factors_total = field_factors.collect { |c| [c] }
    else
      combined = []
      factors_total.each { |f1| field_factors.each { |f2| combined.push(f1 + [f2]) } }
      factors_total = combined
    end
  end
  ms = Multiset.new_empty_vectors(@fields, factors_total)
  # Route each case to the dataset matching its key values.
  # FIX: replaces a string-eval'ed Proc with an equivalent plain
  # block — same behavior, no eval.
  each { |c| ms[fields.collect { |f| c[f] }].add_case(c, false) }
  ms.datasets.each do |k, ds|
    ds.update_valid_data
    # dataset name joins the labels of its key values
    ds.name = fields.size.times.map { |i| @vectors[fields[i]].labeling(k[i]) }.join("-")
    ds.vectors.each do |k1, v1|
      # carry over type, name and labels from the source vectors
      v1.type = @vectors[k1].type
      v1.name = @vectors[k1].name
      v1.labels = @vectors[k1].labels
    end
  end
  ms
end
# Returns a vector computed by eval'ing +text+ on each case, after
# rewriting every field name into a row lookup (scale fields are read
# with to_f). Cases missing any value yield nil.
#
#   a=[1,2].to_vector(:scale)
#   b=[3,4].to_vector(:scale)
#   ds={'a'=>a,'b'=>b}.to_dataset
#   ds.compute("a+b")
#   => Vector [4,6]
#
# NOTE: +text+ is eval'ed — never pass untrusted input. The argument
# string is mutated in place (gsub!), as before.
def compute(text)
  @fields.each{|f|
    # FIX: was `if @vectors[f].type=:scale` — an assignment, so the
    # scale branch was always taken (and the vector type clobbered).
    if @vectors[f].type==:scale
      text.gsub!(f,"row['#{f}'].to_f")
    else
      text.gsub!(f,"row['#{f}']")
    end
  }
  collect_with_index {|row, i|
    invalid=@fields.any? {|f| @vectors[f].data_with_nils[i].nil? }
    if invalid
      nil
    else
      eval(text)
    end
  }
end
# Test each row with one or more tests.
# Each test is a 3-element Array:
#   [description, [fields to show on failure], Proc.new {|row| ...}]
# An optional leading String argument names the id field used to
# label failing rows (defaults to the first field).
# Returns an array with one message per failed (row, test) pair.
def verify(*tests)
if(tests[0].is_a? String)
id=tests[0]
tests.shift
else
id=@fields[0]
end
vr=[]
i=0
each do |row|
i+=1
tests.each{|test|
# test[2] is the predicate; a false result records an error
if ! test[2].call(row)
values=""
if test[1].size>0
values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
end
vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
end
}
end
vr
end
# Short textual representation: class, object id, name, fields and
# case count.
def to_s
  # FIX: the opening "#<" is now matched by a closing ">".
  "#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s+">"
end
# Uses the same representation as #to_s.
def inspect
  to_s
end
# Creates a new dataset for one to many relations
# on a dataset, based on pattern of field names.
#
# for example, you have a survey for number of children
# with this structure:
#   id, name, child_name_1, child_age_1, child_name_2, child_age_2
# with
#   ds.one_to_many(%w{id}, "child_%v_%n")
# the field of first parameters will be copied verbatim
# to new dataset, and fields which responds to second
# pattern will be added one case for each different %n.
# For example
#   cases=[
#     ['1','george','red',10,'blue',20,nil,nil],
#     ['2','fred','green',15,'orange',30,'white',20],
#     ['3','alfred',nil,nil,nil,nil,nil,nil]
#   ]
#   ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
#   cases.each {|c| ds.add_case_array c }
#   ds.one_to_many(['id'],'car_%v%n').to_matrix
#   => Matrix[
#      ["red", "1", 10],
#      ["blue", "1", 20],
#      ["green", "2", 15],
#      ["orange", "2", 30],
#      ["white", "2", 20]
#      ]
#
def one_to_many(parent_fields, pattern)
#base_pattern=pattern.gsub(/%v|%n/,"")
# %v captures the value-name part, %n the numeric suffix
re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
ds_vars=parent_fields
vars=[]
max_n=0
h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
# Adding _row_id
h['_col_id']=[].to_scale
ds_vars.push("_col_id")
# discover the %v names and the highest %n used on the dataset
@fields.each do |f|
if f=~re
if !vars.include? $1
vars.push($1)
h[$1]=Statsample::Vector.new([], @vectors[f].type)
end
max_n=$2.to_i if max_n < $2.to_i
end
end
ds=Dataset.new(h,ds_vars+vars)
each do |row|
row_out={}
parent_fields.each do |f|
row_out[f]=row[f]
end
# one output case per %n index that carries at least one value
max_n.times do |n1|
n=n1+1
any_data=false
vars.each do |v|
data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
row_out[v]=data
any_data=true if !data.nil?
end
if any_data
row_out["_col_id"]=n
ds.add_case(row_out,false)
end
end
end
ds.update_valid_data
ds
end
# Writes this dataset into a ReportBuilder section: case count plus
# every vector, in field order.
def report_building(b)
  b.section(:name=>@name) do |g|
    # FIX: translate the template first, then interpolate — before,
    # `_"Cases: %d" % cases` looked up the already-interpolated string
    # for translation, unlike every other gettext call in the file.
    g.text _("Cases: %d") % cases
    @fields.each do |f|
      g.text "Element:[#{f}]"
      g.parse_element(@vectors[f])
    end
  end
end
end
end
================================================
FILE: lib/statsample/dominanceanalysis/bootstrap.rb
================================================
module Statsample
class DominanceAnalysis
# == Goal
# Generates Bootstrap sample to identity the replicability of a Dominance Analysis. See Azen & Bodescu (2003) for more information.
#
# == Usage
#
# require 'statsample'
# a=100.times.collect {rand}.to_scale
# b=100.times.collect {rand}.to_scale
# c=100.times.collect {rand}.to_scale
# d=100.times.collect {rand}.to_scale
# ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
# ds['y']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
# dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, 'y', :debug=>true)
# dab.bootstrap(100,nil)
# puts dab.summary
# Output
# Sample size: 100
# t: 1.98421693632958
#
# Linear Regression Engine: Statsample::Regression::Multiple::MatrixEngine
# Table: Bootstrap report
# --------------------------------------------------------------------------------------------
# | pairs | sD | Dij | SE(Dij) | Pij | Pji | Pno | Reproducibility |
# --------------------------------------------------------------------------------------------
# | Complete dominance |
# --------------------------------------------------------------------------------------------
# | a - b | 1.0 | 0.6150 | 0.454 | 0.550 | 0.320 | 0.130 | 0.550 |
# | a - c | 1.0 | 0.9550 | 0.175 | 0.930 | 0.020 | 0.050 | 0.930 |
# | a - d | 1.0 | 0.9750 | 0.131 | 0.960 | 0.010 | 0.030 | 0.960 |
# | b - c | 1.0 | 0.8800 | 0.276 | 0.820 | 0.060 | 0.120 | 0.820 |
# | b - d | 1.0 | 0.9250 | 0.193 | 0.860 | 0.010 | 0.130 | 0.860 |
# | c - d | 0.5 | 0.5950 | 0.346 | 0.350 | 0.160 | 0.490 | 0.490 |
# --------------------------------------------------------------------------------------------
# | Conditional dominance |
# --------------------------------------------------------------------------------------------
# | a - b | 1.0 | 0.6300 | 0.458 | 0.580 | 0.320 | 0.100 | 0.580 |
# | a - c | 1.0 | 0.9700 | 0.156 | 0.960 | 0.020 | 0.020 | 0.960 |
# | a - d | 1.0 | 0.9800 | 0.121 | 0.970 | 0.010 | 0.020 | 0.970 |
# | b - c | 1.0 | 0.8850 | 0.283 | 0.840 | 0.070 | 0.090 | 0.840 |
# | b - d | 1.0 | 0.9500 | 0.181 | 0.920 | 0.020 | 0.060 | 0.920 |
# | c - d | 0.5 | 0.5800 | 0.360 | 0.350 | 0.190 | 0.460 | 0.460 |
# --------------------------------------------------------------------------------------------
# | General Dominance |
# --------------------------------------------------------------------------------------------
# | a - b | 1.0 | 0.6500 | 0.479 | 0.650 | 0.350 | 0.000 | 0.650 |
# | a - c | 1.0 | 0.9800 | 0.141 | 0.980 | 0.020 | 0.000 | 0.980 |
# | a - d | 1.0 | 0.9900 | 0.100 | 0.990 | 0.010 | 0.000 | 0.990 |
# | b - c | 1.0 | 0.9000 | 0.302 | 0.900 | 0.100 | 0.000 | 0.900 |
# | b - d | 1.0 | 0.9700 | 0.171 | 0.970 | 0.030 | 0.000 | 0.970 |
# | c - d | 1.0 | 0.5600 | 0.499 | 0.560 | 0.440 | 0.000 | 0.560 |
# --------------------------------------------------------------------------------------------
#
# Table: General averages
# ---------------------------------------
# | var | mean | se | p.5 | p.95 |
# ---------------------------------------
# | a | 0.133 | 0.049 | 0.062 | 0.218 |
# | b | 0.106 | 0.048 | 0.029 | 0.199 |
# | c | 0.035 | 0.032 | 0.002 | 0.106 |
# | d | 0.023 | 0.019 | 0.002 | 0.062 |
# ---------------------------------------
#
# == References:
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. Psychological Methods, 8(2), 129-148.
class Bootstrap
  include Writable
  include Summarizable
  # Total Dominance results
  attr_reader :samples_td
  # Conditional Dominance results
  attr_reader :samples_cd
  # General Dominance results
  attr_reader :samples_gd
  # General average results
  attr_reader :samples_ga
  # Name of fields
  attr_reader :fields
  # Regression class used for analysis
  attr_accessor :regression_class
  # Dataset
  attr_accessor :ds
  # Name of analysis
  attr_accessor :name
  # Alpha level of confidence. Default: ALPHA
  attr_accessor :alpha
  # Debug?
  attr_accessor :debug
  # Default level of confidence for t calculation
  ALPHA=0.95
  # Create a new Dominance Analysis Bootstrap Object
  #
  # * ds: A Dataset object
  # * y_var: Name of dependent variable (an Array selects the
  #   multivariate engine)
  # * opts: Any other attribute of the class
  def initialize(ds, y_var, opts=Hash.new)
    @ds=ds
    @y_var=y_var
    @n=ds.cases
    @n_samples=0
    @alpha=ALPHA
    @debug=false
    if y_var.is_a? Array
      @fields=ds.fields-y_var
      @regression_class=Regression::Multiple::MultipleDependent
    else
      @fields=ds.fields-[y_var]
      @regression_class=Regression::Multiple::MatrixEngine
    end
    @samples_ga=@fields.inject({}){|a,v| a[v]=[];a}
    @name=_("Bootstrap dominance Analysis: %s over %s") % [ ds.fields.join(",") , @y_var]
    opts.each{|k,v|
      # FIX: test for the *setter*; respond_to?(k) was true for
      # read-only attributes and then send("#{k}=") raised.
      self.send("#{k}=",v) if self.respond_to? "#{k}="
    }
    create_samples_pairs
  end
  # lr_class deprecated
  alias_method :lr_class, :regression_class
  # Dominance analysis over the complete dataset, built lazily.
  def da
    if @da.nil?
      @da=DominanceAnalysis.new(@ds,@y_var, :regression_class => @regression_class)
    end
    @da
  end
  # Creates n re-samples from original dataset and store result of
  # each sample on @samples_td, @samples_cd, @samples_gd, @samples_ga
  #
  # * number_samples: Number of new samples to add
  # * n: size of each new sample. If nil, equal to original sample size
  def bootstrap(number_samples,n=nil)
    number_samples.times{ |t|
      @n_samples+=1
      puts _("Bootstrap %d of %d") % [t+1, number_samples] if @debug
      ds_boot=@ds.bootstrap(n)
      da_1=DominanceAnalysis.new(ds_boot, @y_var, :regression_class => @regression_class)
      da_1.total_dominance.each{|k,v| @samples_td[k].push(v) }
      da_1.conditional_dominance.each{|k,v| @samples_cd[k].push(v) }
      da_1.general_dominance.each{|k,v| @samples_gd[k].push(v) }
      da_1.general_averages.each{|k,v| @samples_ga[k].push(v) }
    }
  end
  # Initializes the per-pair result containers for every pair of
  # predictor fields.
  def create_samples_pairs
    @samples_td={}
    @samples_cd={}
    @samples_gd={}
    @pairs=[]
    # FIX: removed stray debug print (`p data`) that unconditionally
    # wrote every index pair to stdout.
    (0...@fields.size).to_a.combination(2).each do |data|
      convert=data.collect {|i| @fields[i] }
      @pairs.push(convert)
      [@samples_td, @samples_cd, @samples_gd].each{|s| s[convert]=[] }
    end
  end
  # Two-tailed t quantile for the configured alpha at the current
  # number of bootstrap samples.
  def t
    Distribution::T.p_value(1-((1-@alpha) / 2), @n_samples - 1)
  end
  def report_building(builder) # :nodoc:
    raise "You should bootstrap first" if @n_samples==0
    builder.section(:name=>@name) do |generator|
      generator.text _("Sample size: %d\n") % @n_samples
      generator.text "t: #{t}\n"
      generator.text _("Linear Regression Engine: %s") % @regression_class.name
      table=ReportBuilder::Table.new(:name=>"Bootstrap report", :header => [_("pairs"), "sD","Dij", _("SE(Dij)"), "Pij", "Pji", "Pno", _("Reproducibility")])
      table.row([_("Complete dominance"),"","","","","","",""])
      table.hr
      @pairs.each{|pair|
        std=@samples_td[pair].to_vector(:scale)
        ttd=da.total_dominance_pairwise(pair[0],pair[1])
        table.row(summary_pairs(pair,std,ttd))
      }
      table.hr
      table.row([_("Conditional dominance"),"","","","","","",""])
      table.hr
      @pairs.each{|pair|
        std=@samples_cd[pair].to_vector(:scale)
        ttd=da.conditional_dominance_pairwise(pair[0],pair[1])
        table.row(summary_pairs(pair,std,ttd))
      }
      table.hr
      table.row([_("General Dominance"),"","","","","","",""])
      table.hr
      @pairs.each{|pair|
        std=@samples_gd[pair].to_vector(:scale)
        ttd=da.general_dominance_pairwise(pair[0],pair[1])
        table.row(summary_pairs(pair,std,ttd))
      }
      generator.parse_element(table)
      table=ReportBuilder::Table.new(:name=>_("General averages"), :header=>[_("var"), _("mean"), _("se"), _("p.5"), _("p.95")])
      @fields.each{|f|
        v=@samples_ga[f].to_vector(:scale)
        row=[@ds[f].name, sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
        table.row(row)
      }
      generator.parse_element(table)
    end
  end
  # One report row for a pair: name, sample dominance on the full
  # dataset, bootstrap mean/sd and the proportion of each outcome
  # (1, 0, 0.5) plus the reproducibility of the full-sample result.
  def summary_pairs(pair,std,ttd)
    freqs=std.proportions
    [0, 0.5, 1].each{|n|
      freqs[n]=0 if freqs[n].nil?
    }
    name="%s - %s" % [@ds[pair[0]].name, @ds[pair[1]].name]
    [name,f(ttd,1),f(std.mean,4),f(std.sd),f(freqs[1]), f(freqs[0]), f(freqs[0.5]), f(freqs[ttd])]
  end
  # Formats +v+ with +n+ decimals.
  def f(v,n=3)
    sprintf("%0.#{n}f",v)
  end
end
end
end
================================================
FILE: lib/statsample/dominanceanalysis.rb
================================================
module Statsample
# Dominance Analysis is a procedure based on an examination of the R2 values
# for all possible subset models, to identify the relevance of one or more
# predictors in the prediction of criterium.
#
# See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
#
# == Use
#
# a=1000.times.collect {rand}.to_scale
# b=1000.times.collect {rand}.to_scale
# c=1000.times.collect {rand}.to_scale
# ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
# ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()}
# da=Statsample::DominanceAnalysis.new(ds,'y')
# puts da.summary
#
# === Output:
#
# Report: Report 2010-02-08 19:10:11 -0300
# Table: Dominance Analysis result
# ------------------------------------------------------------
# | | r2 | sign | a | b | c |
# ------------------------------------------------------------
# | Model 0 | | | 0.648 | 0.265 | 0.109 |
# ------------------------------------------------------------
# | a | 0.648 | 0.000 | -- | 0.229 | 0.104 |
# | b | 0.265 | 0.000 | 0.612 | -- | 0.104 |
# | c | 0.109 | 0.000 | 0.643 | 0.260 | -- |
# ------------------------------------------------------------
# | k=1 Average | | | 0.627 | 0.244 | 0.104 |
# ------------------------------------------------------------
# | a*b | 0.877 | 0.000 | -- | -- | 0.099 |
# | a*c | 0.752 | 0.000 | -- | 0.224 | -- |
# | b*c | 0.369 | 0.000 | 0.607 | -- | -- |
# ------------------------------------------------------------
# | k=2 Average | | | 0.607 | 0.224 | 0.099 |
# ------------------------------------------------------------
# | a*b*c | 0.976 | 0.000 | -- | -- | -- |
# ------------------------------------------------------------
# | Overall averages | | | 0.628 | 0.245 | 0.104 |
# ------------------------------------------------------------
#
# Table: Pairwise dominance
# -----------------------------------------
# | Pairs | Total | Conditional | General |
# -----------------------------------------
# | a - b | 1.0 | 1.0 | 1.0 |
# | a - c | 1.0 | 1.0 | 1.0 |
# | b - c | 1.0 | 1.0 | 1.0 |
# -----------------------------------------
#
# == Reference:
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. Psychological Bulletin, 114, 542-551.
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. Psychological Methods, 8(2), 129-148.
# * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. Journal of Educational and Behavioral Statistics, 31(2), 157-180.
#
class DominanceAnalysis
include Summarizable
# Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine
attr_accessor :regression_class
# Name of analysis
attr_accessor :name
# Set to true if you want to build from dataset, not correlation matrix
attr_accessor :build_from_dataset
# Array with independent variables. You could create subarrays,
# to test groups of predictors as blocks
attr_accessor :predictors
# If you provide a matrix as input, you should set
# the number of cases to define significance of R^2
attr_accessor :cases
# Method of :regression_class used to measure association.
#
# Only necessary to change if you have multivariate dependent.
# * :r2yx (R^2_yx), the default option, is the option when distinction
# between independent and dependents variable is arbitrary
# * :p2yx is the option when the distinction between independent and dependents variables is real.
#
attr_accessor :method_association
attr_reader :dependent
UNIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MatrixEngine
MULTIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MultipleDependent
# Display name for a predictor: grouped predictors (arrays) are
# rendered as "(a,b)", single predictors are returned unchanged.
def self.predictor_name(variable)
  variable.is_a?(Array) ? sprintf("(%s)", variable.join(",")) : variable
end
# Creates a new DominanceAnalysis object
# Parameters:
# * input: A Matrix (correlation matrix) or Dataset object.
#   With a Matrix, set opts[:cases] yourself so R^2 significance can
#   be computed.
# * dependent: Name of dependent variable. Could be an array, if you want to
#   do an Multivariate Regression Analysis. If nil, set to all
#   fields on input, except criteria
# * opts: any writable attribute (e.g. :predictors, :name,
#   :regression_class, :method_association, :build_from_dataset)
def initialize(input, dependent, opts=Hash.new)
@build_from_dataset=false
# engine defaults depend on uni- vs multivariate dependent
if dependent.is_a? Array
@regression_class= MULTIVARIATE_REGRESSION_CLASS
@method_association=:r2yx
else
@regression_class= UNIVARIATE_REGRESSION_CLASS
@method_association=:r2
end
@name=nil
opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
@dependent=dependent
@dependent=[@dependent] unless @dependent.is_a? Array
# default predictor set: everything except the dependent(s)
@predictors ||= input.fields-@dependent
@name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
if input.is_a? Statsample::Dataset
@ds=input
@matrix=Statsample::Bivariate.correlation_matrix(input)
@cases=Statsample::Bivariate.min_n_valid(input)
elsif input.is_a? ::Matrix
@ds=nil
@matrix=input
else
raise ArgumentError.new("You should use a Matrix or a Dataset")
end
# lazily filled by #compute
@models=nil
@models_data=nil
@general_averages=nil
end
# Builds every submodel and fills in their pairwise contributions.
def compute
  create_models
  fill_models
end
# All predictor combinations, computed lazily on first access.
def models
  compute if @models.nil?
  @models
end
# ModelData for every combination, computed lazily on first access.
def models_data
  compute if @models_data.nil?
  @models_data
end
# Enumerates every non-empty combination of predictors and builds a
# ModelData for each, keyed by the sorted predictor list.
def create_models
@models=[]
@models_data={}
for i in 1..@predictors.size
c=(0...@predictors.size).to_a.combination(i)
c.each do |data|
independent=data.collect {|i1| @predictors[i1] }
@models.push(independent)
# regression input: sub-dataset or correlation submatrix
# (note: `data` is reused here for the regression input)
if (@build_from_dataset)
data=@ds.dup(independent.flatten+@dependent)
else
data=@matrix.submatrix(independent.flatten+@dependent)
end
modeldata=ModelData.new(independent, data, self)
models_data[independent.sort {|a,b| a.to_s<=>b.to_s}]=modeldata
end
end
end
# For every model, records how much R^2 each absent predictor would
# add (comparing the model with and without that predictor).
def fill_models
@models.each do |m|
@predictors.each do |f|
next if m.include? f
base_model=md(m)
comp_model=md(m+[f])
base_model.add_contribution(f,comp_model.r2)
end
end
end
private :create_models, :fill_models
# NOTE(review): this region was reconstructed — the original dump had
# every `<...>` span stripped by an HTML-like filter, fusing these
# methods together. Verify against upstream statsample.
#
# Dominance using the single-predictor models only:
# 1 if i alone explains more than j, 0 if less, 0.5 if tied.
def dominance_for_nil_model(i,j)
  if md([i]).r2>md([j]).r2
    1
  elsif md([i]).r2<md([j]).r2
    0
  else
    0.5
  end
end
# Returns 1 if i tD j, 0 if j tD i and 0.5 if undetermined
def total_dominance_pairwise(i,j)
  dm=dominance_for_nil_model(i,j)
  return 0.5 if dm==0.5
  dominances=[dm]
  @models_data.each do |k,m|
    if m.contributions[i] and m.contributions[j]
      if m.contributions[i]>m.contributions[j]
        dominances.push(1)
      elsif m.contributions[i]<m.contributions[j]
        dominances.push(0)
      else
        dominances.push(0.5)
      end
    end
  end
  final=dominances.uniq
  final.size>1 ? 0.5 : final[0]
end
# Returns 1 if i cD k, 0 if j cD i and 0.5 if undetermined
def conditional_dominance_pairwise(i,j)
  dm=dominance_for_nil_model(i,j)
  return 0.5 if dm==0.5
  dominances=[dm]
  for k in 1...@predictors.size
    a=average_k(k)
    if a[i]>a[j]
      dominances.push(1)
    elsif a[i]<a[j]
      dominances.push(0)
    else
      dominances.push(0.5)
    end
  end
  final=dominances.uniq
  final.size>1 ? 0.5 : final[0]
end
# Returns 1 if i gD k, 0 if j gD i and 0.5 if undetermined
def general_dominance_pairwise(i,j)
  ga=general_averages
  if ga[i]>ga[j]
    1
  elsif ga[i]<ga[j]
    0
  else
    0.5
  end
end
# Every pair of predictors (the models of size 2).
def pairs
  models.find_all {|m| m.size==2}
end
# {pair => total dominance} for every pair of predictors.
def total_dominance
  pairs.inject({}) {|a,pair| a[pair]=total_dominance_pairwise(pair[0],pair[1]); a}
end
# {pair => conditional dominance} for every pair of predictors.
def conditional_dominance
  pairs.inject({}) {|a,pair| a[pair]=conditional_dominance_pairwise(pair[0],pair[1]); a}
end
# {pair => general dominance} for every pair of predictors.
def general_dominance
  pairs.inject({}) {|a,pair| a[pair]=general_dominance_pairwise(pair[0],pair[1]); a}
end
# ModelData for the model holding predictors +m+ (order-insensitive).
def md(m)
  models_data[m.sort {|a,b| a.to_s<=>b.to_s}]
end
# ModelData for every model with exactly +k+ predictors.
def md_k(k)
  @models.select { |m| m.size == k }.map { |m| md(m) }
end
# Maps a {key => [numbers]} hash to {key => mean of the numbers}.
def get_averages(averages)
  result = {}
  averages.each { |key, values| result[key] = values.to_vector(:scale).mean }
  result
end
# Average additional contribution of each predictor over all models
# of size +k+; nil when k equals the number of predictors (no larger
# model exists to contribute to).
def average_k(k)
  return nil if k == @predictors.size
  sums = @predictors.inject({}) { |acc, p| acc[p] = []; acc }
  md_k(k).each do |m|
    @predictors.each do |f|
      contribution = m.contributions[f]
      sums[f].push(contribution) unless contribution.nil?
    end
  end
  get_averages(sums)
end
# Overall average contribution per predictor: mean of the
# single-predictor r2 and the averages at every model size.
# Memoized in @general_averages.
def general_averages
  @general_averages ||= begin
    averages = {}
    @predictors.each { |f| averages[f] = [md([f]).r2] }
    (1...@predictors.size).each do |k|
      ak = average_k(k)
      @predictors.each { |f| averages[f].push(ak[f]) }
    end
    get_averages(averages)
  end
end
# Writes the full dominance analysis report into builder +g+:
# a main table (model 0, every subset model, per-size averages and
# overall averages) and a pairwise dominance table.
def report_building(g)
  compute if @models.nil?
  g.section(:name=>@name) do |generator|
    header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }
    generator.table(:name=>_("Dominance Analysis result"), :header=>header) do |t|
      # Model 0: r2 of each predictor alone.
      row=[_("Model 0"),"",""]+@predictors.collect{|f|
        sprintf("%0.3f",md([f]).r2)
      }
      t.row(row)
      t.hr
      # One section per model size, with its average row.
      for i in 1..@predictors.size
        mk=md_k(i)
        mk.each{|m|
          t.row(m.add_table_row)
        }
        # Report averages
        a=average_k(i)
        if !a.nil?
          t.hr
          row=[_("k=%d Average") % i,"",""] + @predictors.collect{|f|
            sprintf("%0.3f",a[f])
          }
          t.row(row)
          t.hr
        end
      end
      # NOTE(review): this reuses (shadows) the builder parameter +g+;
      # harmless here because the builder is only used via +generator+.
      g=general_averages
      t.hr
      row=[_("Overall averages"),"",""]+@predictors.collect{|f|
        sprintf("%0.3f",g[f])
      }
      t.row(row)
    end
    # Pairwise total / conditional / general dominance indices.
    td=total_dominance
    cd=conditional_dominance
    gd=general_dominance
    generator.table(:name=>_("Pairwise dominance"), :header=>[_("Pairs"),_("Total"),_("Conditional"),_("General")]) do |t|
      pairs.each{|pair|
        name=pair.map{|v| v.is_a?(Array) ? "("+v.join("-")+")" : v}.join(" - ")
        row=[name, sprintf("%0.1f",td[pair]), sprintf("%0.1f",cd[pair]), sprintf("%0.1f",gd[pair])]
        t.row(row)
      }
    end
  end
end
# Holds the regression for one predictor subset of a dominance
# analysis, plus the r2 contributions of predictors added to it.
class ModelData # :nodoc:
  # Hash predictor => r2 contribution when added to this model
  # (nil until filled via #add_contribution).
  attr_reader :contributions
  # independent: Array of predictor names for this model.
  # data: dataset or matrix restricted to these predictors + dependent.
  # da: parent DominanceAnalysis supplying configuration.
  def initialize(independent, data, da)
    @independent=independent
    @data=data
    @predictors=da.predictors
    @dependent=da.dependent
    @cases=da.cases
    @method=da.method_association
    @contributions=@independent.inject({}){|a,v| a[v]=nil;a}
    r_class=da.regression_class
    # Univariate vs multivariate dependent.
    if @dependent.size==1
      @lr=r_class.new(data, @dependent[0], :cases=>@cases)
    else
      @lr=r_class.new(data, @dependent, :cases=>@cases)
    end
  end
  # Registers the contribution of predictor +f+: +v+ is the r2 of the
  # model including +f+; the stored gain is v - r2 of this model.
  def add_contribution(f, v)
    @contributions[f]=v-r2
  end
  # Association value for this model, delegated to the regression
  # object through the configured association method (usually :r2).
  def r2
    @lr.send(@method)
  end
  # Human-readable model name: predictor names joined by '*'.
  def name
    @independent.collect {|variable|
      DominanceAnalysis.predictor_name(variable)
    }.join("*")
  end
  # Row for the main report table: name, r2, significance and one
  # column per predictor contribution ("--" when not applicable).
  def add_table_row
    if @cases
      sign=sprintf("%0.3f", @lr.probability)
    else
      # Significance is unavailable without the number of cases.
      sign="???"
    end
    [name, sprintf("%0.3f",r2), sign] + @predictors.collect{|k|
      v=@contributions[k]
      if v.nil?
        "--"
      else
        sprintf("%0.3f",v)
      end
    }
  end
  # One-line text summary of the model and its contributions.
  def summary
    # BUG FIX: the format string has three placeholders, but a fourth
    # argument (@lr.sst) was passed; with $DEBUG enabled (used widely
    # in this library) Ruby raises ArgumentError ("too many arguments
    # for format string"). The stray argument has been removed.
    out=sprintf("%s: r2=%0.3f(p=%0.2f)\n",name, r2, @lr.significance)
    out << @predictors.collect{|k|
      v=@contributions[k]
      if v.nil?
        "--"
      else
        sprintf("%s=%0.3f",k,v)
      end
    }.join(" | ")
    out << "\n"
    return out
  end
end # end ModelData
end # end Dominance Analysis
end
require 'statsample/dominanceanalysis/bootstrap'
================================================
FILE: lib/statsample/factor/map.rb
================================================
module Statsample
module Factor
# = Velicer's Minimum Average Partial
#
# "Velicer’s (1976) MAP test involves a complete princi-
# pal components analysis followed by the examination of
# a series of matrices of partial correlations. Specifically,
# on the first step, the first principal component is par-
# tialed out of the correlations between the variables of in-
# terest, and the average squared coefficient in the off-
# diagonals of the resulting partial correlation matrix is
# computed. On the second step, the first two principal
# components are partialed out of the original correlation
# matrix and the average squared partial correlation is
# again computed. These computations are conducted for k
# (the number of variables) minus one steps. The average
# squared partial correlations from these steps are then
# lined up, and the number of components is determined by
# the step number in the analyses that resulted in the lowest
# average squared partial correlation. The average squared
# coefficient in the original correlation matrix is also com-
# puted, and if this coefficient happens to be lower than
# the lowest average squared partial correlation, then no
# components should be extracted from the correlation ma-
# trix. Statistically, components are retained as long as the
# variance in the correlation matrix represents systematic
# variance. Components are no longer retained when there
# is proportionately more unsystematic variance than sys-
# tematic variance." (O'Connor, 2000, p.397).
#
# Current algorithm is loosely based on SPSS O'Connor algorithm
#
# == Reference
# * O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402.
#
class MAP
  include Summarizable
  include DirtyMemoize
  # Name of analysis
  attr_accessor :name
  # Eigenvalues of the analyzed matrix (set by #compute).
  attr_reader :eigenvalues
  # Number of factors to retain
  attr_reader :number_of_factors
  # Average squared correlations
  attr_reader :fm
  # Smallest average squared correlation
  attr_reader :minfm
  # Use GSL bindings for the linear algebra when available (default true).
  attr_accessor :use_gsl
  # Convenience constructor: runs the MAP test over the correlation
  # matrix of dataset +ds+.
  def self.with_dataset(ds,opts=Hash.new)
    new(ds.correlation_matrix,opts)
  end
  # matrix: correlation matrix to analyze.
  # opts: :use_gsl, :name (see defaults below).
  def initialize(matrix, opts=Hash.new)
    @matrix=matrix
    opts_default={
      :use_gsl=>true,
      :name=>_("Velicer's MAP")
    }
    @opts=opts_default.merge(opts)
    opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
  end
  # Runs the MAP procedure: for each step m, partials the first m
  # principal components out of the matrix and records the average
  # squared off-diagonal partial correlation in fm. The retained
  # number of factors is the step with the smallest average.
  # Invoked lazily through DirtyMemoize.
  def compute
    gsl_m=(use_gsl and Statsample.has_gsl?) ? @matrix.to_gsl : @matrix
    klass_m=gsl_m.class
    eigvect,@eigenvalues=gsl_m.eigenvectors_matrix, gsl_m.eigenvalues
    eigenvalues_sqrt=@eigenvalues.collect {|v| Math.sqrt(v)}
    # Loadings = eigenvectors scaled by sqrt of the eigenvalues.
    loadings=eigvect*(klass_m.diagonal(*eigenvalues_sqrt))
    fm=Array.new(@matrix.row_size)
    ncol=@matrix.column_size
    # fm[0]: average squared off-diagonal element of the original
    # matrix (mssq is the sum of squares; subtracting ncol drops the
    # unit diagonal).
    fm[0]=(gsl_m.mssq - ncol).quo(ncol*(ncol-1))
    (ncol-1).times do |m|
      puts "MAP:Eigenvalue #{m+1}" if $DEBUG
      # First m+1 columns of the loading matrix.
      a=loadings[0..(loadings.row_size-1),0..m]
      partcov= gsl_m - (a*a.transpose)
      # Rescale the partial covariance to a partial correlation matrix.
      d=klass_m.diagonal(*(partcov.diagonal.collect {|v| Math::sqrt(1/v)}))
      pr=d*partcov*d
      fm[m+1]=(pr.mssq-ncol).quo(ncol*(ncol-1))
    end
    minfm=fm[0]
    nfactors=0
    @errors=[]
    fm.each_with_index do |v,s|
      # Complex averages arise from negative partial variances; those
      # steps are recorded in @errors and excluded from the minimum.
      if defined?(Complex) and v.is_a? ::Complex
        @errors.push(s)
      else
        if v < minfm
          minfm=v
          nfactors=s
        end
      end
    end
    @number_of_factors=nfactors
    @fm=fm
    @minfm=minfm
  end
  # Renders eigenvalues, average squared correlations and the number
  # of retained components into report builder +g+. Steps flagged in
  # @errors are shown as "*".
  def report_building(g) #:nodoc:
    g.section(:name=>@name) do |s|
      s.table(:name=>_("Eigenvalues"),:header=>[_("Value")]) do |t|
        eigenvalues.each_with_index do |e,i|
          t.row([@errors.include?(i) ? "*" : "%0.6f" % e])
        end
      end
      s.table(:name=>_("Velicer's Average Squared Correlations"), :header=>[_("number of components"),_("average square correlation")]) do |t|
        fm.each_with_index do |v,i|
          t.row(["%d" % i, @errors.include?(i) ? "*" : "%0.6f" % v])
        end
      end
      s.text(_("The smallest average squared correlation is : %0.6f" % minfm))
      s.text(_("The number of components is : %d" % number_of_factors))
    end
  end
  dirty_memoize :number_of_factors, :fm, :minfm, :eigenvalues
end
end
end
================================================
FILE: lib/statsample/factor/parallelanalysis.rb
================================================
module Statsample
module Factor
# Performs Horn's 'parallel analysis' to a principal components analysis
# to adjust for sample bias in the retention of components.
# Can create the bootstrap samples using random data, using number
# of cases and variables, parameters for actual data (mean and standard
# deviation of each variable) or bootstrap sampling for actual data.
# == Description
# "PA involves the construction of a number of correlation matrices of random variables based on the same sample size and number of variables in the real data set. The average eigenvalues from the random correlation matrices are then compared to the eigenvalues from the real data correlation matrix, such that the first observed eigenvalue is compared to the first random eigenvalue, the second observed eigenvalue is compared to the second random eigenvalue, and so on." (Hayton, Allen & Scarpello, 2004, p.194)
# == Usage
# *With real dataset*
# # ds should be any valid dataset
# pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>100, :bootstrap_method=>:data)
#
# *With number of cases and variables*
# pa=Statsample::Factor::ParallelAnalysis.with_random_data(100,8)
#
# == Reference
# * Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. Organizational Research Methods, 7 (2), 191-205.
# * O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402.
# * Liu, O., & Rijmen, F. (2008). A modified procedure for parallel analysis of ordered categorical data. Behavior Research Methods, 40(2), 556-562.
class ParallelAnalysis
  # Convenience constructor: builds a mock dataset descriptor with
  # +vars+ variables and +cases+ cases and runs a pure random-data
  # parallel analysis (no real data involved).
  def self.with_random_data(cases,vars,opts=Hash.new)
    require 'ostruct'
    ds=OpenStruct.new
    ds.fields=vars.times.map {|i| "v#{i+1}"}
    ds.cases=cases
    opts=opts.merge({:bootstrap_method=> :random, :no_data=>true})
    new(ds, opts)
  end
  include DirtyMemoize
  include Summarizable
  # Number of random sets to produce. 50 by default
  attr_accessor :iterations
  # Name of analysis
  attr_accessor :name
  # Dataset. You could use mock vectors when use bootstrap method
  attr_reader :ds
  # Bootstrap method. :random used by default
  # * :random: uses number of variables and cases for the dataset
  # * :data : sample with replacement from actual data.
  attr_accessor :bootstrap_method
  # Uses smc on diagonal of matrixes, to perform simulation
  # of a Principal Axis analysis.
  # By default, false.
  attr_accessor :smc
  # Percentile over which a bootstrap eigenvalue should be accepted.
  # 95 by default.
  attr_accessor :percentil
  # Correlation matrix used with :raw_data . :correlation_matrix used by default
  attr_accessor :matrix_method
  # Number of eigenvalues to calculate. Should be set for
  # Principal Axis Analysis.
  attr_accessor :n_variables
  # Dataset with bootstrapped eigenvalues
  attr_reader :ds_eigenvalues
  # Perform analysis without actual data.
  attr_accessor :no_data
  # Show extra information if true
  attr_accessor :debug
  # Use GSL for matrix work when available.
  attr_accessor :use_gsl
  # ds: dataset (or mock with +fields+ and +cases+).
  # opts: see opts_default below.
  def initialize(ds, opts=Hash.new)
    @ds=ds
    @fields=@ds.fields
    @n_variables=@fields.size
    @n_cases=ds.cases
    opts_default={
      :name=>_("Parallel Analysis"),
      :iterations=>50, # See Liu and Rijmen (2008)
      :bootstrap_method => :random,
      :smc=>false,
      :percentil=>95,
      :debug=>false,
      :no_data=>false,
      :matrix_method=>:correlation_matrix
    }
    @use_gsl=Statsample.has_gsl?
    @opts=opts_default.merge(opts)
    # BUG FIX: this line previously used '==' (a no-op comparison)
    # instead of assignment, so matrix_method was never actually
    # forced for the parameter-based bootstrap. Both spellings of the
    # symbol are accepted (:parameter is the one #compute checks).
    if @opts[:bootstrap_method]==:parameter or @opts[:bootstrap_method]==:parameters
      @opts[:matrix_method]=:correlation_matrix
    end
    opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
  end
  # Number of factors to retain: counts leading eigenvalues of the
  # real data that are positive and exceed the chosen percentile of
  # the bootstrap distribution, stopping at the first failure.
  def number_of_factors
    total=0
    ds_eigenvalues.fields.each_with_index do |f,i|
      if (@original[i]>0 and @original[i]>ds_eigenvalues[f].percentil(percentil))
        total+=1
      else
        break
      end
    end
    total
  end
  # Writes the parallel analysis summary (parameters plus eigenvalue
  # table) into report builder +g+.
  def report_building(g) #:nodoc:
    g.section(:name=>@name) do |s|
      s.text _("Bootstrap Method: %s") % bootstrap_method
      s.text _("Uses SMC: %s") % (smc ? _("Yes") : _("No"))
      s.text _("Correlation Matrix type : %s") % matrix_method
      s.text _("Number of variables: %d") % @n_variables
      s.text _("Number of cases: %d") % @n_cases
      s.text _("Number of iterations: %d") % @iterations
      if @no_data
        # Without real data, only the simulated distribution is shown.
        s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("generated eigenvalue"), "p.#{percentil}"]) do |t|
          ds_eigenvalues.fields.each_with_index do |f,i|
            v=ds_eigenvalues[f]
            t.row [i+1, "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), ]
          end
        end
      else
        s.text _("Number or factors to preserve: %d") % number_of_factors
        s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("data eigenvalue"), _("generated eigenvalue"),"p.#{percentil}",_("preserve?")]) do |t|
          ds_eigenvalues.fields.each_with_index do |f,i|
            v=ds_eigenvalues[f]
            t.row [i+1, "%0.4f" % @original[i], "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), (v.percentil(percentil)>0 and @original[i] > v.percentil(percentil)) ? "Yes":""]
          end
        end
      end
    end
  end
  # Perform calculation. Shouldn't be called directly by the user
  # (use the memoized readers instead).
  def compute
    @original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data
    @ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v})
    @ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale}
    if bootstrap_method==:parameter or bootstrap_method==:random
      rng = Distribution::Normal.rng
    end
    @iterations.times do |i|
      begin
        puts "#{@name}: Iteration #{i}" if $DEBUG or debug
        # Create a dataset of dummy values
        ds_bootstrap=Statsample::Dataset.new(@ds.fields)
        @fields.each do |f|
          if bootstrap_method==:random
            ds_bootstrap[f]=@n_cases.times.map {|c| rng.call}.to_scale
          elsif bootstrap_method==:data
            ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases)
          else
            # FIX: corrected the grammar of this error message.
            raise "bootstrap_method not recognized"
          end
        end
        ds_bootstrap.update_valid_data
        matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
        matrix=matrix.to_gsl if @use_gsl
        if smc
          # Replace the diagonal with squared multiple correlations to
          # simulate a Principal Axis analysis.
          smc_v=matrix.inverse.diagonal.map{|ii| 1-(1.quo(ii))}
          smc_v.each_with_index do |v,ii|
            matrix[ii,ii]=v
          end
        end
        ev=matrix.eigenvalues
        @ds_eigenvalues.add_case_array(ev)
      rescue Statsample::Bivariate::Tetrachoric::RequerimentNotMeet => e
        # Tetrachoric matrices can fail on degenerate samples; retry
        # the iteration with a fresh bootstrap sample.
        puts "Error: #{e}" if $DEBUG
        redo
      end
    end
    @ds_eigenvalues.update_valid_data
  end
  dirty_memoize :number_of_factors, :ds_eigenvalues
  dirty_writer :iterations, :bootstrap_method, :percentil, :smc
end
end
end
================================================
FILE: lib/statsample/factor/pca.rb
================================================
# encoding: UTF-8
module Statsample
module Factor
# Principal Component Analysis (PCA) of a covariance or
# correlation matrix..
#
# NOTE: Sign of second and later eigenvalues could be different
# using Ruby or GSL, so values for PCs and component matrix
# should differ, because extendmatrix and gsl's methods to calculate
# eigenvectors are different. Using R is worse, cause first
# eigenvector could have negative values!
# For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
#
# == Usage:
# require 'statsample'
# a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
# b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
# ds={'a'=>a,'b'=>b}.to_dataset
# cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
# pca=Statsample::Factor::PCA.new(cor_matrix)
# pca.m
# => 1
# pca.eigenvalues
# => [1.92592927269225, 0.0740707273077545]
# pca.component_matrix
# => GSL::Matrix
# [ 9.813e-01
# 9.813e-01 ]
# pca.communalities
# => [0.962964636346122, 0.962964636346122]
#
# == References:
# * SPSS Manual
# * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
# * Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer
#
class PCA
  include Summarizable
  # Name of analysis
  attr_accessor :name
  # Number of factors. Set by default to the number of factors
  # with eigen values > 1
  attr_accessor :m
  # Use GSL if available
  attr_accessor :use_gsl
  # Add to the summary a rotation report
  attr_accessor :summary_rotation
  # Add to the summary a parallel analysis report
  attr_accessor :summary_parallel_analysis
  # Type of rotation. By default, Statsample::Factor::Rotation::Varimax
  attr_accessor :rotation_type
  # Type of input matrix (:correlation or :covariance), detected from
  # the matrix's _type when it responds to it.
  attr_accessor :matrix_type
  # matrix: correlation or covariance matrix.
  # opts: hash setting any writable attribute (:m, :use_gsl, :name, ...).
  def initialize(matrix, opts=Hash.new)
    @use_gsl=nil
    @name=_("Principal Component Analysis")
    @matrix=matrix
    @n_variables=@matrix.column_size
    @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)}
    @matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation
    @m=nil
    @rotation_type=Statsample::Factor::Varimax
    opts.each{|k,v|
      self.send("#{k}=",v) if self.respond_to? k
    }
    if @use_gsl.nil?
      @use_gsl=Statsample.has_gsl?
    end
    # NOTE(review): this re-assignment overrides the localized fallback
    # names computed above with plain "V1".."Vn".
    if @matrix.respond_to? :fields
      @variables_names=@matrix.fields
    else
      @variables_names=@n_variables.times.map {|i| "V#{i+1}"}
    end
    calculate_eigenpairs
    if @m.nil?
      # Set number of factors with eigenvalues > 1
      @m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size
    end
  end
  # Rotation object (of @rotation_type) built over the component matrix.
  def rotation
    @rotation_type.new(component_matrix)
  end
  # Sum of all eigenvalues.
  def total_eigenvalues
    eigenvalues.inject(0) {|ac,v| ac+v}
  end
  # Builds @ds by centering every vector of @original_ds on its mean.
  # NOTE(review): @original_ds is never assigned in this class, so this
  # private helper appears to be dead code -- confirm before relying on it.
  def create_centered_ds
    h={}
    @original_ds.factors.each {|f|
      mean=@original_ds[f].mean
      h[f]=@original_ds[f].recode {|c| c-mean}
    }
    @ds=h.to_dataset
  end
  # Feature matrix for +m+ factors
  # Returns +m+ eigenvectors as columns.
  # So, i=variable, j=component
  def feature_matrix(m=nil)
    m||=@m
    if @use_gsl
      omega_m=GSL::Matrix.zeros(@n_variables,m)
      ev=eigenvectors
      m.times do |i|
        omega_m.set_column(i,ev[i])
      end
      omega_m
    else
      omega_m=::Matrix.build(@n_variables, m) {0}
      m.times do |i|
        # Matrix#column= is provided by statsample's matrix extensions.
        omega_m.column= i, @eigenpairs[i][1]
      end
      omega_m
    end
  end
  # Returns Principal Components for +input+ matrix or dataset
  # The number of PC to return is equal to parameter +m+.
  # If +m+ isn't set, m set to number of PCs selected at object creation.
  # Use covariance matrix
  def principal_components(input, m=nil)
    if @use_gsl
      data_matrix=input.to_gsl
    else
      data_matrix=input.to_matrix
    end
    m||=@m
    raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables
    fv=feature_matrix(m)
    # Project the data on the selected eigenvectors.
    pcs=(fv.transpose*data_matrix.transpose).transpose
    pcs.extend Statsample::NamedMatrix
    pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)}
    pcs.to_dataset
  end
  # Dispatches to component_matrix_correlation or
  # component_matrix_covariance depending on the input matrix type.
  def component_matrix(m=nil)
    var="component_matrix_#{matrix_type}"
    send(var,m)
  end
  # Matrix with correlations between components and
  # variables. Based on Härdle & Simar (2003, p.243)
  def component_matrix_covariance(m=nil)
    m||=@m
    raise "m should be > 0" if m<1
    ff=feature_matrix(m)
    cm=::Matrix.build(@n_variables, m) {0}
    @n_variables.times {|i|
      m.times {|j|
        # Matrix#[]= is provided by statsample's matrix extensions.
        cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i])
      }
    }
    cm.extend NamedMatrix
    cm.name=_("Component matrix (from covariance)")
    cm.fields_x = @variables_names
    cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
    cm
  end
  # Matrix with correlations between components and
  # variables
  def component_matrix_correlation(m=nil)
    m||=@m
    raise "m should be > 0" if m<1
    omega_m=::Matrix.build(@n_variables, m) {0}
    gammas=[]
    m.times {|i|
      omega_m.column=i, @eigenpairs[i][1]
      gammas.push(Math::sqrt(@eigenpairs[i][0]))
    }
    # Loadings = eigenvectors scaled by sqrt of the eigenvalues.
    gamma_m=::Matrix.diagonal(*gammas)
    cm=(omega_m*(gamma_m)).to_matrix
    cm.extend CovariateMatrix
    cm.name=_("Component matrix")
    cm.fields_x = @variables_names
    cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
    cm
  end
  # Communality of each variable: variance explained by the first
  # +m+ components.
  def communalities(m=nil)
    m||=@m
    h=[]
    @n_variables.times do |i|
      sum=0
      m.times do |j|
        sum+=(@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2)
      end
      h.push(sum)
    end
    h
  end
  # Array with eigenvalues
  def eigenvalues
    @eigenpairs.collect {|c| c[0] }
  end
  # Array of eigenvectors (GSL vectors or Statsample vectors,
  # depending on @use_gsl).
  def eigenvectors
    @eigenpairs.collect {|c|
      @use_gsl ? c[1].to_gsl : c[1].to_vector
    }
  end
  # Computes [eigenvalue, eigenvector] pairs of @matrix, using GSL
  # when enabled.
  def calculate_eigenpairs
    @eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby
  end
  # Writes communalities, explained variance and the component matrix
  # (plus optional rotation) into report +builder+.
  def report_building(builder) # :nodoc:
    builder.section(:name=>@name) do |generator|
      generator.text _("Number of factors: %d") % m
      generator.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction"), _("%")]) do |t|
        communalities(m).each_with_index {|com, i|
          perc=com*100.quo(@matrix[i,i])
          t.row([@variables_names[i], "%0.3f" % @matrix[i,i] , "%0.3f" % com, "%0.3f" % perc])
        }
      end
      te=total_eigenvalues
      generator.table(:name=>_("Total Variance Explained"), :header=>[_("Component"), _("E.Total"), _("%"), _("Cum. %")]) do |t|
        ac_eigen=0
        eigenvalues.each_with_index {|eigenvalue,i|
          ac_eigen+=eigenvalue
          t.row([_("Component %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(te)), sprintf("%0.3f",ac_eigen*100.quo(te))])
        }
      end
      generator.parse_element(component_matrix(m))
      if (summary_rotation)
        generator.parse_element(rotation)
      end
    end
  end
  private :calculate_eigenpairs, :create_centered_ds
end
end
end
================================================
FILE: lib/statsample/factor/principalaxis.rb
================================================
module Statsample
module Factor
# Principal Axis Analysis for a covariance or correlation matrix.
#
# For PCA, use Statsample::Factor::PCA
#
# == Usage:
# require 'statsample'
# a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
# b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
# ds={'a'=>a,'b'=>b}.to_dataset
# cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
# pa=Statsample::Factor::PrincipalAxis.new(cor_matrix)
# pa.iterate(1)
# pa.m
# => 1
# pa.component_matrix
# => GSL::Matrix
# [ 9.622e-01
# 9.622e-01 ]
# pa.communalities
# => [0.962964636346122, 0.962964636346122]
#
# == References:
# * SPSS Manual
# * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
#
class PrincipalAxis
  include DirtyMemoize
  include Summarizable
  # Name of analysis
  attr_accessor :name
  # Number of factors. Set by default to the number of factors
  # with eigenvalues > 1 (Kaiser criterion).
  #
  # _Warning:_ Kaiser criterion overfactors! Give yourself some time
  # and use Horn's Parallel Analysis.
  #
  attr_accessor :m
  # Number of iterations required to converge
  attr_reader :iterations
  # Initial eigenvalues
  attr_reader :initial_eigenvalues
  # Tolerance for iterations. When set, overrides the DELTA default.
  attr_accessor :epsilon
  # Use SMC(squared multiple correlations) as diagonal. If false, use 1
  attr_accessor :smc
  # Maximum number of iterations
  attr_accessor :max_iterations
  # Eigenvalues of factor analysis
  attr_reader :eigenvalues
  # Minimum difference between succesive iterations on sum of communalities
  DELTA=1e-3
  # Maximum number of iterations
  MAX_ITERATIONS=25
  # matrix: correlation or covariance matrix.
  # opts: hash setting any writable attribute (:m, :smc, :epsilon, ...).
  def initialize(matrix, opts=Hash.new)
    @matrix=matrix
    if @matrix.respond_to? :fields
      @fields=@matrix.fields
    else
      @fields=@matrix.row_size.times.map {|i| _("Variable %d") % (i+1)}
    end
    @n_variables=@matrix.row_size
    @name=""
    @m=nil
    @initial_eigenvalues=nil
    @initial_communalities=nil
    @component_matrix=nil
    @delta=DELTA
    @smc=true
    @max_iterations=MAX_ITERATIONS
    opts.each{|k,v|
      self.send("#{k}=",v) if self.respond_to? k
    }
    if @matrix.respond_to? :fields
      @variables_names=@matrix.fields
    else
      @variables_names=@n_variables.times.map {|i| "V#{i+1}"}
    end
    if @m.nil?
      # Use a plain PCA to pick m by the Kaiser criterion.
      pca=PCA.new(::Matrix.rows(@matrix.to_a))
      @m=pca.m
    end
    @clean=true
  end
  # Communality for all variables given m factors
  def communalities(m=nil)
    if m!=@m or @clean
      iterate(m)
      raise "Can't calculate comunality" if @communalities.nil?
    end
    @communalities
  end
  # Component matrix for m factors
  def component_matrix(m=nil)
    if m!=@m or @clean
      iterate(m)
    end
    @component_matrix
  end
  # Iterate to find the factors: repeatedly replaces the diagonal of
  # the working matrix with the current communalities and re-runs a
  # PCA, until the sum of communalities stabilizes within tolerance.
  def iterate(m=nil)
    @clean=false
    m||=@m
    @m=m
    t = @max_iterations
    # FIX: honor a user-supplied :epsilon tolerance. The accessor
    # existed (and is tracked by dirty_writer), but its value was
    # never used -- only @delta (DELTA) was consulted.
    delta = @epsilon || @delta
    work_matrix=@matrix.to_a
    prev_com=initial_communalities
    pca=PCA.new(::Matrix.rows(work_matrix))
    @initial_eigenvalues=pca.eigenvalues
    prev_sum=prev_com.inject(0) {|ac,v| ac+v}
    @iterations=0
    t.times do |i|
      # FIX: this debug message was built but never printed
      # (the puts was missing; cf. ParallelAnalysis#compute).
      puts "#{@name}: Iteration #{i}" if $DEBUG
      @iterations+=1
      # Substitute current communalities on the diagonal.
      prev_com.each_with_index{|v,it|
        work_matrix[it][it]=v
      }
      pca=PCA.new(::Matrix.rows(work_matrix))
      @communalities=pca.communalities(m)
      @eigenvalues=pca.eigenvalues
      com_sum = @communalities.inject(0) {|ac,v| ac+v}
      break if (com_sum-prev_sum).abs < delta
      @communalities.each_with_index do |v2,i2|
        raise "Variable #{i2} with communality > 1" if v2>1.0
      end
      prev_sum=com_sum
      prev_com=@communalities
    end
    @component_matrix=pca.component_matrix(m)
    @component_matrix.extend CovariateMatrix
    @component_matrix.name=_("Factor Matrix")
    @component_matrix.fields_x = @variables_names
    @component_matrix.fields_y = m.times.map {|i| "factor_#{i+1}"}
  end
  alias :compute :iterate
  # Starting communalities: SMC (1 - 1/diagonal of inverse) when @smc,
  # otherwise 1.0 for every variable.
  def initial_communalities
    if @initial_communalities.nil?
      if @smc
        # Based on O'Connors(2000)
        @initial_communalities=@matrix.inverse.diagonal.map{|i| 1-(1.quo(i))}
        # Alternative (equivalent) formulation, kept for reference:
        #   @initial_communalities=@matrix.column_size.times.collect {|i|
        #     rxx , rxy = PrincipalAxis.separate_matrices(@matrix,i)
        #     matrix=(rxy.t*rxx.inverse*rxy)
        #     matrix[0,0]
        #   }
      else
        @initial_communalities=[1.0]*@matrix.column_size
      end
    end
    @initial_communalities
  end
  # Returns two matrixes from a correlation matrix
  # with regressors correlation matrix and criteria xy
  # matrix.
  def self.separate_matrices(matrix, y)
    ac=[]
    matrix.column_size.times do |i|
      ac.push(matrix[y,i]) if i!=y
    end
    rxy=Matrix.columns([ac])
    rows=[]
    matrix.row_size.times do |i|
      if i!=y
        row=[]
        matrix.row_size.times do |j|
          row.push(matrix[i,j]) if j!=y
        end
        rows.push(row)
      end
    end
    rxx=Matrix.rows(rows)
    [rxx,rxy]
  end
  # Writes communalities and explained variance tables plus the
  # factor matrix into report builder +generator+.
  def report_building(generator)
    iterate if @clean
    generator.section(:name=>@name) do |s|
      s.text _("Number of factors: %d") % m
      s.text _("Iterations: %d") % @iterations
      s.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction")]) do |t|
        communalities(m).each_with_index {|com,i|
          t.row([@fields[i], sprintf("%0.4f", initial_communalities[i]), sprintf("%0.3f", com)])
        }
      end
      s.table(:name=>_("Total Variance"), :header=>[_("Factor"), _("I.E.Total"), _("I.E. %"), _("I.E.Cum. %"),
        _("S.L.Total"), _("S.L. %"), _("S.L.Cum. %")
      ]) do |t|
        ac_eigen,ac_i_eigen=0,0
        @initial_eigenvalues.each_with_index {|eigenvalue,i|
          ac_i_eigen+=eigenvalue
          ac_eigen+=@eigenvalues[i]
          new_row=[
            _("Factor %d") % (i+1),
            sprintf("%0.3f",eigenvalue),
            sprintf("%0.3f%%", eigenvalue*100.quo(@n_variables)),
            sprintf("%0.3f",ac_i_eigen*100.quo(@n_variables))
          ]
          if i<@m
            new_row.concat [
              sprintf("%0.3f", @eigenvalues[i]),
              sprintf("%0.3f%%", @eigenvalues[i]*100.quo(@n_variables)),
              sprintf("%0.3f",ac_eigen*100.quo(@n_variables))
            ]
          else
            # Extraction columns only exist for retained factors.
            new_row.concat ["","",""]
          end
          t.row new_row
        }
      end
      s.parse_element(component_matrix)
    end
  end
  dirty_writer :max_iterations, :epsilon, :smc
  dirty_memoize :eigenvalues, :iterations, :initial_eigenvalues
end
end
end
================================================
FILE: lib/statsample/factor/rotation.rb
================================================
module Statsample
module Factor
# Base class for component matrix rotation.
#
# == Reference:
# * SPSS Manual
# * Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro]
#
# Use subclasses Varimax, Equimax or Quartimax for desired type of rotation
# Use:
# a = Matrix[ [ 0.4320, 0.8129, 0.3872]
# , [ 0.7950, -0.5416, 0.2565]
# , [ 0.5944, 0.7234, -0.3441]
# , [ 0.8945, -0.3921, -0.1863] ]
# rotation = Statsample::Factor::Varimax.new(a)
# rotation.iterate
# p rotation.rotated
# p rotation.component_transformation_matrix
#
class Rotation
  EPSILON=1e-15
  MAX_ITERATIONS=25
  include Summarizable
  include DirtyMemoize
  attr_reader :iterations, :rotated, :component_transformation_matrix, :h2
  # Maximum number of iterations
  attr_accessor :max_iterations
  # Maximum precision
  attr_accessor :epsilon
  # Use GSL matrices when available.
  attr_accessor :use_gsl
  dirty_writer :max_iterations, :epsilon
  dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2
  # matrix: component/loading matrix (variables x factors).
  # opts: hash setting any writable attribute.
  def initialize(matrix, opts=Hash.new)
    @name=_("%s rotation") % rotation_name
    @matrix=matrix
    @n=@matrix.row_size # Variables, p on original
    @m=@matrix.column_size # Factors, r on original
    @component_transformation_matrix=nil
    @max_iterations=MAX_ITERATIONS
    @epsilon=EPSILON
    @rotated=nil
    # h2: communalities -- row sums of squared loadings.
    @h2=(@matrix.collect {|c| c**2} * Matrix.column_vector([1]*@m)).column(0).to_a
    @use_gsl=Statsample.has_gsl?
    opts.each{|k,v|
      self.send("#{k}=",v) if self.respond_to? k
    }
  end
  # Writes the rotated matrix and the transformation matrix into
  # report builder +g+.
  def report_building(g)
    g.section(:name=>@name) do |s|
      s.parse_element(rotated)
      s.parse_element(component_transformation_matrix)
    end
  end
  alias_method :communalities, :h2
  alias_method :rotated_component_matrix, :rotated
  # Entry point used by DirtyMemoize.
  def compute
    iterate
  end
  # Start iteration
  # Performs pairwise planar (Jacobi-style) rotations of factor
  # columns until all rotation angles fall below epsilon or the
  # iteration limit is hit. Returns the rotated loading matrix.
  def iterate
    k_matrix=@use_gsl ? GSL::Matrix : ::Matrix
    t=k_matrix.identity(@m)
    b=(@use_gsl ? @matrix.to_gsl : @matrix.dup)
    # Normalize rows by sqrt of communalities (Kaiser normalization).
    h=k_matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
    h_inverse=h.collect {|c| c!=0 ? 1/c : 0 }
    bh=h_inverse * b
    @not_converged=true
    @iterations=0
    while @not_converged
      break if @iterations>@max_iterations
      @iterations+=1
      #puts "Iteration #{iterations}"
      num_pairs=@m*(@m-1).quo(2)
      (0..(@m-2)).each do |i| #+ go through factor index 0:r-1-1 (begin)
        ((i+1)..(@m-1)).each do |j| #+ pair i to "rest" of factors (begin)
          xx = bh.column(i)
          yy = bh.column(j)
          tx = t.column(i)
          ty = t.column(j)
          # u, v and the sums a..d feed the criterion-specific
          # x()/y() terms that define the rotation angle.
          uu = @n.times.collect {|var_i| xx[var_i]**2-yy[var_i]**2}
          vv = @n.times.collect {|var_i| 2*xx[var_i]*yy[var_i]}
          a = @n.times.inject(0) {|ac,var_i| ac+ uu[var_i] }
          b = @n.times.inject(0) {|ac,var_i| ac+ vv[var_i] }
          c = @n.times.inject(0) {|ac,var_i| ac+ (uu[var_i]**2 - vv[var_i]**2) }
          d = @n.times.inject(0) {|ac,var_i| ac+ (2*uu[var_i]*vv[var_i]) }
          num=x(a,b,c,d)
          den=y(a,b,c,d)
          phi=Math::atan2(num,den) / 4.0
          # puts "#{i}-#{j}: #{phi}"
          if(Math::sin(phi.abs) >= @epsilon)
            # Rotate columns i and j of loadings and of the
            # accumulated transformation matrix by angle phi.
            xx_rot=( Math::cos(phi)*xx)+(Math::sin(phi)*yy)
            yy_rot=((-Math::sin(phi))*xx)+(Math::cos(phi)*yy)
            tx_rot=( Math::cos(phi)*tx)+(Math::sin(phi)*ty)
            ty_rot=((-Math::sin(phi))*tx)+(Math::cos(phi)*ty)
            bh=bh.to_a
            @n.times {|row_i|
              bh[row_i][i] = xx_rot[row_i]
              bh[row_i][j] = yy_rot[row_i]
            }
            t=t.to_a
            @m.times {|row_i|
              t[row_i][i]=tx_rot[row_i]
              t[row_i][j]=ty_rot[row_i]
            }
            #if @use_gsl
            bh=k_matrix.[](*bh)
            t=k_matrix.[](*t)
            #else
            # bh=Matrix.rows(bh)
            # t=Matrix.rows(t)
            #end
          else
            # Angle negligible for this pair; converged when every
            # pair in this sweep is negligible.
            num_pairs=num_pairs-1
            @not_converged=false if num_pairs==0
          end # if
        end #j
      end #i
    end # while
    # De-normalize rows to restore original communalities.
    @rotated=h*bh
    @rotated.extend CovariateMatrix
    @rotated.name=_("Rotated Component matrix")
    if @matrix.respond_to? :fields_x
      @rotated.fields_x = @matrix.fields_x
    else
      @rotated.fields_x = @n.times.map {|i| "var_#{i+1}"}
    end
    if @matrix.respond_to? :fields_y
      @rotated.fields_y = @matrix.fields_y
    else
      @rotated.fields_y = @m.times.map {|i| "var_#{i+1}"}
    end
    @component_transformation_matrix=t
    @component_transformation_matrix.extend CovariateMatrix
    @component_transformation_matrix.name=_("Component transformation matrix")
    if @matrix.respond_to? :fields_y
      @component_transformation_matrix.fields = @matrix.fields_y
    else
      @component_transformation_matrix.fields = @m.times.map {|i| "var_#{i+1}"}
    end
    @rotated
  end
end
# Varimax rotation: maximizes the variance of squared loadings
# within each factor (the most common orthogonal rotation).
class Varimax < Rotation
  # Numerator term of the rotation-angle criterion.
  def x(a, b, c, d)
    d - (2 * a * b) / @n.to_f
  end

  # Denominator term of the rotation-angle criterion.
  def y(a, b, c, d)
    c - (a**2 - b**2) / @n.to_f
  end

  def rotation_name
    "Varimax"
  end
end
# Equimax rotation: a compromise between Varimax and Quartimax,
# weighting the criterion by the number of factors.
class Equimax < Rotation
  # Numerator term of the rotation-angle criterion.
  def x(a, b, c, d)
    d - (@m * a * b) / @n.to_f
  end

  # Denominator term of the rotation-angle criterion.
  def y(a, b, c, d)
    c - @m * ((a**2 - b**2) / (2 * @n.to_f))
  end

  def rotation_name
    "Equimax"
  end
end
# Quartimax rotation: minimizes the number of factors needed to
# explain each variable (criterion uses the raw d and c terms).
class Quartimax < Rotation
  # Numerator term of the rotation-angle criterion.
  def x(a, b, c, d) = d

  # Denominator term of the rotation-angle criterion.
  def y(a, b, c, d) = c

  def rotation_name
    "Quartimax"
  end
end
end
end
================================================
FILE: lib/statsample/factor.rb
================================================
require 'statsample/factor/rotation'
require 'statsample/factor/pca'
require 'statsample/factor/principalaxis'
require 'statsample/factor/parallelanalysis'
require 'statsample/factor/map'
module Statsample
  # Factor Analysis toolbox.
  # * Classes for Extraction of factors:
  #   * Statsample::Factor::PCA
  #   * Statsample::Factor::PrincipalAxis
  # * Classes for Rotation of factors:
  #   * Statsample::Factor::Varimax
  #   * Statsample::Factor::Equimax
  #   * Statsample::Factor::Quartimax
  # * Classes for determining the number of components
  #   * Statsample::Factor::MAP
  #   * Statsample::Factor::ParallelAnalysis
  #
  # About number of components, O'Connor(2000) said:
  #   The two procedures [PA and MAP ] complement each other nicely,
  #   in that the MAP tends to err (when it does err) in the direction
  #   of underextraction, whereas parallel analysis tends to err
  #   (when it does err) in the direction of overextraction.
  #   Optimal decisions are thus likely to be made after considering
  #   the results of both analytic procedures. (p.10)
  module Factor
    # Anti-image covariance matrix.
    # Useful for inspection of desirability of data for factor analysis.
    # According to Dziuban & Shirkey (1974, p.359):
    #   "If this matrix does not exhibit many zero off-diagonal elements,
    #   the investigator has evidence that the correlation
    #   matrix is not appropriate for factor analysis."
    #
    # NOTE: Matrix.diag and other helpers used here come from
    # statsample's matrix extensions, not the plain stdlib Matrix.
    def self.anti_image_covariance_matrix(matrix)
      s2=Matrix.diag(*(matrix.inverse.diagonal)).inverse
      aicm=(s2)*matrix.inverse*(s2)
      aicm.extend(Statsample::CovariateMatrix)
      aicm.fields=matrix.fields if matrix.respond_to? :fields
      aicm
    end
    # Anti-image correlation matrix: like the anti-image covariance
    # matrix, but scaled by the square roots of the diagonal of the
    # inverse, yielding (negative) partial correlations off-diagonal.
    def self.anti_image_correlation_matrix(matrix)
      matrix=matrix.to_matrix
      s=Matrix.diag(*(matrix.inverse.diagonal)).sqrt.inverse
      aicm=s*matrix.inverse*s
      aicm.extend(Statsample::CovariateMatrix)
      aicm.fields=matrix.fields if matrix.respond_to? :fields
      aicm
    end
    # Kaiser-Meyer-Olkin measure of sampling adequacy for correlation matrix.
    #
    # Kaiser's (1974, cited on Dziuban & Shirkey, 1974) present calibration of the index is as follows :
    # * .90s — marvelous
    # * .80s — meritorious
    # * .70s — middling
    # * .60s — mediocre
    # * .50s — miserable
    # * below .50 — unacceptable
    def self.kmo(matrix)
      q=anti_image_correlation_matrix(matrix)
      n=matrix.row_size
      sum_r,sum_q=0,0
      # Sum squared off-diagonal correlations (r) and squared
      # off-diagonal anti-image correlations (q).
      n.times do |j|
        n.times do |k|
          if j!=k
            sum_r+=matrix[j,k]**2
            sum_q+=q[j,k]**2
          end
        end
      end
      sum_r.quo(sum_r+sum_q)
    end
    # Kaiser-Meyer-Olkin measure of sampling adequacy for one variable.
    # +var+ may be a field name (when the matrix has fields) or a
    # numeric column index.
    def self.kmo_univariate(matrix, var)
      if var.is_a? String
        if matrix.respond_to? :fields
          j=matrix.fields.index(var)
          raise "Matrix doesn't have field #{var}" if j.nil?
        else
          raise "Matrix doesn't respond to fields"
        end
      else
        j=var
      end
      q=anti_image_correlation_matrix(matrix)
      n=matrix.row_size
      sum_r,sum_q=0,0
      # Same sums as .kmo, but restricted to row/variable j.
      n.times do |k|
        if j!=k
          sum_r+=matrix[j,k]**2
          sum_q+=q[j,k]**2
        end
      end
      sum_r.quo(sum_r+sum_q)
    end
  end
end
================================================
FILE: lib/statsample/graph/boxplot.rb
================================================
require 'rubyvis'
module Statsample
module Graph
# = Boxplot
#
# From Wikipedia:
# In descriptive statistics, a box plot or boxplot (also known as a box-and-whisker diagram or plot) is a convenient way of graphically depicting groups of numerical data through their five-number summaries: the smallest observation (sample minimum), lower quartile (Q1), median (Q2), upper quartile (Q3), and largest observation (sample maximum). A boxplot may also indicate which observations, if any, might be considered outliers.
#
# == Usage
# === Svg output
# a=[1,2,3,4].to_scale
# b=[3,4,5,6].to_scale
# puts Statsample::Graph::Boxplot.new(:vectors=>[a,b]).to_svg
# === Using ReportBuilder
# a=[1,2,3,4].to_scale
# b=[3,4,5,6].to_scale
# rb=ReportBuilder.new
# rb.add(Statsample::Graph::Boxplot.new(:vectors=>[a,b]))
# rb.save_html('boxplot.html')
class Boxplot
include Summarizable
attr_accessor :name
# Total width of Boxplot
attr_accessor :width
# Total height of Boxplot
attr_accessor :height
# Top margin
attr_accessor :margin_top
# Bottom margin
attr_accessor :margin_bottom
# Left margin
attr_accessor :margin_left
# Right margin
attr_accessor :margin_right
# Array with assignation to groups of bars
# For example, for four vectors,
# boxplot.groups=[1,2,1,3]
# Assign same color to first and third element, and different to
# second and fourth
attr_accessor :groups
# Minimum value on y-axis. Automatically defined from data
attr_accessor :minimum
# Maximum value on y-axis. Automatically defined from data
attr_accessor :maximum
# Vectors to box-ploting
attr_accessor :vectors
# The rotation angle, in radians. Text is rotated clockwise relative
# to the anchor location. For example, with the default left alignment,
# an angle of Math.PI / 2 causes text to proceed downwards. The default angle is zero.
attr_accessor :label_angle
attr_reader :x_scale, :y_scale
# Create a new Boxplot.
# Parameters: Hash of options
# * :vectors: Array of vectors
# * :groups: Array of same size as :vectors:, with name of groups
# to colorize vectors
def initialize(opts=Hash.new)
  @vectors = opts.delete :vectors
  raise "You should define vectors" if @vectors.nil?
  # Defaults; every key below is pushed through its writer, so the
  # attribute accessors always end up populated.
  defaults = {
    :name=>_("Boxplot"),
    :groups=>nil,
    :width=>400,
    :height=>300,
    :margin_top=>10,
    :margin_bottom=>20,
    :margin_left=>20,
    :margin_right=>20,
    :minimum=>nil,
    :maximum=>nil,
    :label_angle=>0
  }
  @opts = defaults.merge(opts)
  defaults.each_key { |key| send("#{key}=", @opts[key]) }
end
# Returns a Rubyvis panel with the boxplot
# Builds (without rendering) the Rubyvis panel for the boxplot:
# y-axis rules, per-vector labels, one box per vector with median rule,
# whisker rules and outlier dots.
def rubyvis_panel # :nodoc:
  that=self
  # y-axis limits: explicit :minimum/:maximum options, else the global
  # min/max over all vectors
  min,max=@minimum, @maximum
  min||=@vectors.map {|v| v.min}.min
  max||=@vectors.map {|v| v.max}.max
  margin_hor=margin_left + margin_right
  margin_vert=margin_top + margin_bottom
  # ordinal x-scale: one band per vector, bands fill 4/5 of the width
  x_scale = pv.Scale.ordinal(@vectors.size.times.map.to_a).split_banded(0, width-margin_hor, 4.0/5)
  y_scale=Rubyvis::Scale.linear(min,max).range(0,height-margin_vert)
  y_scale.nice
  # cache data: one summary hash per vector
  colors=Rubyvis::Colors.category10
  data=@vectors.map {|v|
    out={:percentil_25=>v.percentil(25), :median=>v.median, :percentil_75=>v.percentil(75), :name=>v.name}
    out[:iqr]=out[:percentil_75] - out[:percentil_25]
    # Whisker fences at Q3 + IQR and Q1 - IQR.
    # NOTE(review): the conventional fence factor is 1.5*IQR; confirm
    # the 1.0*IQR factor here is intentional.
    irq_max=out[:percentil_75] + out[:iqr]
    irq_min=out[:percentil_25] - out[:iqr]
    # Find the last data inside the margin
    min = out[:percentil_25]
    max = out[:percentil_75]
    v.each {|d|
      min=d if d < min and d > irq_min
      max=d if d > max and d < irq_max
    }
    # Whiskers!
    out[:low_whisker]=min
    out[:high_whisker]=max
    # And now, data outside whiskers
    # NOTE(review): data_with_nils may contain nils, which would raise
    # on comparison — presumably vectors are valid-only here; confirm.
    out[:outliers]=v.data_with_nils.find_all {|d| d < min or d > max }
    out
  }
  vis=Rubyvis::Panel.new do |pan|
    pan.width width - margin_hor
    pan.height height - margin_vert
    pan.bottom margin_bottom
    pan.left margin_left
    pan.right margin_right
    pan.top margin_top
    # Y axis: grey gridline per tick, black at zero, labeled at left
    pan.rule do
      data y_scale.ticks
      bottom y_scale
      stroke_style {|d| d!=0 ? "#eee" : "#000"}
      label(:anchor=>'left') do
        text y_scale.tick_format
      end
    end
    # baseline
    pan.rule do
      bottom 0
      stroke_style 'black'
    end
    # Labels: one vector name per band, below the axis
    pan.label do |l|
      l.data data
      l.text_angle that.label_angle
      l.left {|v| x_scale[index] }
      l.bottom(-15)
      l.text {|v,x| v[:name]}
    end
    # One sub-panel per vector, positioned on its ordinal band
    pan.panel do |bp|
      bp.data data
      bp.left {|v| x_scale[index]}
      bp.width x_scale.range_band
      # Bar spanning Q1..Q3; colored by group when groups are set
      bp.bar do |b|
        b.bottom {|v| y_scale[v[:percentil_25]]}
        b.height {|v| y_scale[v[:percentil_75]] - y_scale[v[:percentil_25]] }
        b.line_width 1
        b.stroke_style {|v|
          if that.groups
            colors.scale(that.groups[parent.index]).darker
          else
            colors.scale(index).darker
          end
        }
        b.fill_style {|v|
          if that.groups
            colors.scale(that.groups[parent.index])
          else
            colors.scale(index)
          end
        }
      end
      # Median
      bp.rule do |r|
        r.bottom {|v| y_scale[v[:median]]}
        r.width x_scale.range_band
        r.line_width 2
      end
      ##
      # Whiskers
      ##
      # Low whisker: horizontal cap plus vertical stem down from Q1
      bp.rule do |r|
        r.visible {|v| v[:percentil_25] > v[:low_whisker]}
        r.bottom {|v| y_scale[v[:low_whisker]]}
      end
      bp.rule do |r|
        r.visible {|v| v[:percentil_25] > v[:low_whisker]}
        r.bottom {|v| y_scale[v[:low_whisker]]}
        r.left {|v| x_scale.range_band / 2.0}
        r.height {|v| y_scale.scale(v[:percentil_25]) - y_scale.scale(v[:low_whisker])}
      end
      # High whisker: horizontal cap plus vertical stem up from Q3
      bp.rule do |r|
        r.visible {|v| v[:percentil_75] < v[:high_whisker]}
        r.bottom {|v| y_scale.scale(v[:high_whisker])}
      end
      bp.rule do |r|
        r.visible {|v| v[:percentil_75] < v[:high_whisker]}
        r.bottom {|v| y_scale.scale(v[:percentil_75])}
        r.left {|v| x_scale.range_band / 2.0}
        r.height {|v| y_scale.scale(v[:high_whisker]) - y_scale.scale(v[:percentil_75])}
      end
      # Outliers: one dot per value outside the whiskers
      bp.dot do |dot|
        dot.shape_size 4
        dot.data {|v| v[:outliers]}
        dot.left {|v| x_scale.range_band / 2.0}
        dot.bottom {|v| y_scale.scale(v)}
        dot.title {|v| v}
      end
    end
  end
  vis
end
# Returns SVG with the boxplot
# Renders the boxplot and returns it as an SVG string.
def to_svg
  panel = rubyvis_panel
  panel.render
  panel.to_svg
end
# ReportBuilder hook: embeds the rendered SVG inside a section
# titled with this plot's name.
def report_building(builder) # :nodoc:
  builder.section(:name=>name) do |section|
    section.image(to_svg, :type=>'svg', :width=>width, :height=>height)
  end
end
end
end
end
================================================
FILE: lib/statsample/graph/histogram.rb
================================================
require 'rubyvis'
module Statsample
module Graph
# In statistics, a histogram is a graphical representation, showing a visual impression of the distribution of experimental data. It is an estimate of the probability distribution of a continuous variable and was first introduced by Karl Pearson [1]. A histogram consists of tabular frequencies, shown as adjacent rectangles, erected over discrete intervals (bins), with an area equal to the frequency of the observations in the interval. The height of a rectangle is also equal to the frequency density of the interval, i.e., the frequency divided by the width of the interval. The total area of the histogram is equal to the number of data.
#
# == Usage
# === Svg output
# a=[1,2,3,4].to_scale
# puts Statsample::Graph::Histogram.new(a).to_svg
# === Using ReportBuilder
# a=[1,2,3,4].to_scale
# rb=ReportBuilder.new
# rb.add(Statsample::Graph::Histogram.new(a))
# rb.save_html('histogram.html')
class Histogram
include Summarizable
# Histogram name
attr_accessor :name
# Total width
attr_accessor :width
# Total height
attr_accessor :height
# Top margin
attr_accessor :margin_top
# Bottom margin
attr_accessor :margin_bottom
# Left margin
attr_accessor :margin_left
# Right margin
attr_accessor :margin_right
attr_reader :hist
# Could be an array of ranges or number of bins
attr_accessor :bins
# Minimum value on x axis. Calculated automatically from data if not set
attr_accessor :minimum_x
# Maximum value on x axis. Calculated automatically from data if not set
attr_accessor :maximum_x
# Minimum value on y axis. Set to 0 if not set
attr_accessor :minimum_y
# Maximum value on y axis. Calculated automatically from data if not set.
attr_accessor :maximum_y
# Add a line showing normal distribution
attr_accessor :line_normal_distribution
# data could be a vector or a histogram
def initialize(data, opts=Hash.new)
  # Borrow the source's name for the default title when it has one.
  source_name = data.respond_to?(:name) ? data.name : ""
  defaults = {
    :name=>_("Histograma (%s)") % source_name,
    :width=>400,
    :height=>300,
    :margin_top=>10,
    :margin_bottom=>20,
    :margin_left=>30,
    :margin_right=>20,
    :minimum_x=>nil,
    :maximum_x=>nil,
    :minimum_y=>nil,
    :maximum_y=>nil,
    :bins=>nil,
    :line_normal_distribution=>false
  }
  @opts = defaults.merge(opts)
  defaults.each_key { |key| send("#{key}=", @opts[key]) }
  @data = data
end
# Prepares @hist, @mean and @sd before drawing. Accepts either a
# ready-made Statsample::Histogram or a Statsample::Vector, which is
# binned on the fly (bin count defaults to floor(sqrt(n))).
def pre_vis # :nodoc:
  case @data
  when Statsample::Histogram
    @hist = @data
    @mean = @hist.estimated_mean
    @sd   = @hist.estimated_standard_deviation
  when Statsample::Vector
    @mean = @data.mean
    @sd   = @data.sd
    @bins ||= Math.sqrt(@data.size).floor
    @hist = @data.histogram(@bins)
  end
end
# Overlays a normal-distribution curve on panel +pan+.
# The curve is scaled so its area matches the histogram total: each
# bin-width interval contributes (CDF(right) - CDF(left)) * total count.
def rubyvis_normal_distribution(pan)
  x_scale=@x_scale
  y_scale=@y_scale
  # wob: width of the first bin — assumes all bins share this width
  wob = @hist.get_range(0)[1] - @hist.get_range(0)[0]
  # nob: number of bin-width intervals across the visible x-range
  nob = ((@maximum_x-@minimum_x) / wob.to_f).floor
  sum=@hist.sum
  data=nob.times.map {|i|
    l=@minimum_x+i*wob
    r=@minimum_x+(i+1)*wob
    middle=(l+r) / 2.0
    # probability mass of N(@mean, @sd) on [l, r)
    pi=Distribution::Normal.cdf((r-@mean) / @sd) - Distribution::Normal.cdf((l-@mean) / @sd)
    {:x=>middle, :y=>pi*sum}
  }
  # smooth ("cardinal") black line through the interval midpoints
  pan.line do |l|
    l.data data
    l.interpolate "cardinal"
    l.stroke_style "black"
    l.bottom {|d| y_scale[d[:y]]}
    l.left {|d| x_scale[d[:x]]}
  end
end
# Returns a Rubyvis panel with the histogram
# Builds (without rendering) the Rubyvis panel for the histogram:
# y-axis gridlines, x-axis ticks, one bar per bin, and optionally a
# normal-distribution overlay.
def rubyvis_panel # :nodoc:
  pre_vis
  #that=self
  # axis limits default to the histogram's extent; y starts at 0
  @minimum_x||=@hist.min
  @maximum_x||=@hist.max
  @minimum_y||=0
  @maximum_y||=@hist.max_val
  margin_hor=margin_left + margin_right
  margin_vert=margin_top + margin_bottom
  x_scale = pv.Scale.linear(@minimum_x, @maximum_x).range(0, width - margin_hor)
  y_scale=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert)
  y_scale.nice
  # one {:low, :high, :value} hash per bin
  bins=@hist.bins.times.map {|i|
    {
      :low =>@hist.get_range(i)[0],
      :high=>@hist.get_range(i)[1],
      :value=>@hist.bin[i]
    }
  }
  # keep scales for rubyvis_normal_distribution
  @x_scale=x_scale
  @y_scale=y_scale
  # cache data
  vis=Rubyvis::Panel.new do |pan|
    pan.width width - margin_hor
    pan.height height - margin_vert
    pan.bottom margin_bottom
    pan.left margin_left
    pan.right margin_right
    pan.top margin_top
    # Y axis: grey gridline per tick, black at zero, labeled at left
    pan.rule do
      data y_scale.ticks
      bottom y_scale
      stroke_style {|d| d!=0 ? "#eee" : "#000"}
      label(:anchor=>'left') do
        text y_scale.tick_format
      end
    end
    # X axis: short black tick marks below the baseline
    pan.rule do
      data x_scale.ticks
      left x_scale
      stroke_style "black"
      height 5
      bottom(-5)
      label(:anchor=>'bottom') do
        text x_scale.tick_format
      end
    end
    # bars: left edge at bin low bound, width = bin span, height = count
    pan.bar do |bar|
      bar.data(bins)
      bar.left {|v| x_scale[v[:low]]}
      bar.width {|v| x_scale[v[:high]] - x_scale[v[:low]]}
      bar.bottom 0
      bar.height {|v| y_scale[v[:value]]}
      bar.stroke_style "black"
      bar.line_width 1
    end
    rubyvis_normal_distribution(pan) if @line_normal_distribution
  end
  vis
end
# Returns SVG with the histogram
# Renders the histogram and returns it as an SVG string.
def to_svg
  panel = rubyvis_panel
  panel.render
  panel.to_svg
end
# ReportBuilder hook: embeds the rendered SVG inside a section
# titled with this plot's name.
def report_building(builder) # :nodoc:
  builder.section(:name=>name) do |section|
    section.image(to_svg, :type=>'svg', :width=>width, :height=>height)
  end
end
# Text-mode report: one line per bin with a row of '*' proportional to
# the bin count, scaled down so the tallest bar stays near 40 chars.
def report_building_text(generator)
  pre_vis
  #anchor=generator.toc_entry(_("Histogram %s") % [@name])
  step = @hist.max_val > 40 ? (@hist.max_val / 40).ceil : 1
  @hist.range.each_with_index do |lower, i|
    next if i == @hist.bins
    generator.text(format("%5.2f : %s", lower, "*" * (@hist.bin[i] / step).floor))
  end
end
end
end
end
================================================
FILE: lib/statsample/graph/scatterplot.rb
================================================
require 'rubyvis'
module Statsample
module Graph
# = Scatterplot
#
# From Wikipedia:
# A scatter plot or scattergraph is a type of mathematical diagram using
# Cartesian coordinates to display values for two variables for a set of data.
#
# The data is displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis.[2] This kind of plot is also called a scatter chart, scatter diagram and scatter graph.
# == Usage
# === Svg output
# a=[1,2,3,4].to_scale
# b=[3,4,5,6].to_scale
# puts Statsample::Graph::Scatterplot.new(a,b).to_svg
# === Using ReportBuilder
# a=[1,2,3,4].to_scale
# b=[3,4,5,6].to_scale
# rb=ReportBuilder.new
# rb.add(Statsample::Graph::Scatterplot.new(a,b))
# rb.save_html('scatter.html')
class Scatterplot
include Summarizable
attr_accessor :name
# Total width of Scatterplot
attr_accessor :width
# Total height of Scatterplot
attr_accessor :height
attr_accessor :dot_alpha
# Add a line on median of x and y axis
attr_accessor :line_median
# Top margin
attr_accessor :margin_top
# Bottom margin
attr_accessor :margin_bottom
# Left margin
attr_accessor :margin_left
# Right margin
attr_accessor :margin_right
attr_reader :data
attr_reader :v1,:v2
# Array with assignation to groups of bars
# For example, for four vectors,
# boxplot.groups=[1,2,1,3]
# Assign same color to first and third element, and different to
# second and fourth
attr_accessor :groups
attr_reader :x_scale, :y_scale
# Minimum value on x axis. Calculated automatically from data if not set
attr_accessor :minimum_x
# Maximum value on x axis. Calculated automatically from data if not set
attr_accessor :maximum_x
# Minimum value on y axis. Calculated automatically from data if not set
attr_accessor :minimum_y
# Maximum value on y axis. Calculated automatically from data if not set.
attr_accessor :maximum_y
# Create a new Scatterplot.
# Params:
# * v1: Vector on X axis
# * v2: Vector on Y axis
# * opts: Hash of options. See attributes of Scatterplot
def initialize(v1, v2, opts=Hash.new)
  # Names are captured before cloning, since only_valid_clone returns
  # filtered copies; only jointly-valid cases of v1/v2 are kept.
  @v1_name, @v2_name = v1.name, v2.name
  @v1, @v2 = Statsample.only_valid_clone(v1, v2)
  defaults = {
    :name=>_("Scatterplot (%s - %s)") % [@v1_name, @v2_name],
    :width=>400,
    :height=>300,
    :dot_alpha=>0.5,
    :line_median=>false,
    :margin_top=>10,
    :margin_bottom=>20,
    :margin_left=>20,
    :margin_right=>20,
    :minimum_x=>nil,
    :maximum_x=>nil,
    :minimum_y=>nil,
    :maximum_y=>nil,
    :groups=>nil
  }
  @opts = defaults.merge(opts)
  defaults.each_key { |key| send("#{key}=", @opts[key]) }
  # One {:x, :y} point per (paired) case.
  @data = []
  @v1.each_with_index do |xval, i|
    @data << { :x=>xval, :y=>@v2[i] }
  end
end
# Add a rule on median of X and Y axis
# Add a rule on median of X and Y axis.
# Draws a semi-transparent red rule at v1's median (vertical) and at
# v2's median (horizontal), each with a tick-formatted label.
def add_line_median(vis) # :nodoc:
  that=self
  x=@x_scale
  y=@y_scale
  # execute instance-evals the block on the panel, so that/x/y must be
  # captured as locals here.
  vis.execute {
    rule do
      data [that.v1.median]
      left x
      stroke_style Rubyvis.color("#933").alpha(0.5)
      label(:anchor=>"top") do
        text x.tick_format
      end
    end
    rule do
      data [that.v2.median]
      bottom y
      stroke_style Rubyvis.color("#933").alpha(0.5)
      label(:anchor=>"right") do
        text y.tick_format
      end
    end
  }
end
# Returns a Rubyvis panel with scatterplot
# Builds (without rendering) the Rubyvis panel for the scatterplot:
# gridlines for both axes, optional median rules, and one dot per case.
def rubyvis_panel # :nodoc:
  that=self
  #p @v1.map {|v| v}
  # axis limits default to the data extents
  @minimum_x||=@v1.min
  @maximum_x||=@v1.max
  @minimum_y||=@v2.min
  @maximum_y||=@v2.max
  colors=Rubyvis::Colors.category10
  margin_hor=margin_left + margin_right
  margin_vert=margin_top + margin_bottom
  x=Rubyvis::Scale.linear(@minimum_x, @maximum_x).range(0, width - margin_hor)
  y=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert)
  # keep scales for add_line_median
  @x_scale=x
  @y_scale=y
  vis=Rubyvis::Panel.new do |pan|
    pan.width width - margin_hor
    pan.height height - margin_vert
    pan.bottom margin_bottom
    pan.left margin_left
    pan.right margin_right
    pan.top margin_top
    # horizontal gridlines at Y-axis ticks, labeled at left
    pan.rule do
      data y.ticks
      bottom y
      stroke_style {|d| d!=0 ? "#eee" : "#000"}
      label(:anchor=>'left') do
        visible {|d| d!=0 and d < that.width}
        text y.tick_format
      end
    end
    # vertical gridlines at X-axis ticks, labeled at bottom
    pan.rule do
      data x.ticks
      left x
      stroke_style {|d| d!=0 ? "#eee" : "#000"}
      label(:anchor=>'bottom') do
        visible {|d| d>0 and d < that.height}
        text x.tick_format
      end
    end
    # Add lines on median
    add_line_median(pan) if line_median
    # one sub-panel instance per data point; dots colored by group
    pan.panel do
      data(that.data)
      dot do
        left {|d| x[d[:x]]}
        bottom {|d| y[d[:y]]}
        fill_style {|v|
          # fill is a lighter (more transparent) version of the stroke
          alpha=(that.dot_alpha-0.3<=0) ? 0.1 : that.dot_alpha-0.3
          if that.groups
            colors.scale(that.groups[index]).alpha(alpha)
          else
            colors.scale(0).alpha(alpha)
          end
        }
        # NOTE(review): stroke indexes groups with parent.index while
        # fill uses index — one of the two is probably wrong; confirm
        # which index identifies the data point here.
        stroke_style {|v|
          if that.groups
            colors.scale(that.groups[parent.index]).alpha(that.dot_alpha)
          else
            colors.scale(0).alpha(that.dot_alpha)
          end
        }
        shape_radius 2
      end
    end
  end
  vis
end
# Returns SVG with scatterplot
# Renders the scatterplot and returns it as an SVG string.
def to_svg
  panel = rubyvis_panel
  panel.render
  panel.to_svg
end
# ReportBuilder hook: embeds the rendered SVG inside a section
# titled with this plot's name.
def report_building(builder) # :nodoc:
  builder.section(:name=>name) do |section|
    section.image(to_svg, :type=>'svg', :width=>width, :height=>height)
  end
end
end
end
end
================================================
FILE: lib/statsample/graph.rb
================================================
require 'statsample/graph/scatterplot'
require 'statsample/graph/boxplot'
require 'statsample/graph/histogram'
module Statsample
  # Namespace for the Rubyvis-based graphs:
  # * Statsample::Graph::Boxplot
  # * Statsample::Graph::Histogram
  # * Statsample::Graph::Scatterplot
  # The module itself is empty; each graph lives in its own file,
  # required above.
  module Graph
  end
end
================================================
FILE: lib/statsample/histogram.rb
================================================
module Statsample
# A histogram consists of a set of bins which count the
# number of events falling into a given range of a continuous variable x.
#
# This implementations follows convention of GSL
# for specification.
#
# * Verbatim: *
#
# The range for bin[i] is given by range[i] to range[i+1].
# For n bins there are n+1 entries in the array range.
# Each bin is inclusive at the lower end and exclusive at the upper end.
# Mathematically this means that the bins are defined
# by the following inequality,
#
# bin[i] corresponds to range[i] <= x < range[i+1]
#
# Here is a diagram of the correspondence between ranges and bins
# on the number-line for x,
#
#
# [ bin[0] )[ bin[1] )[ bin[2] )[ bin[3] )[ bin[4] )
# ---|---------|---------|---------|---------|---------|--- x
# r[0] r[1] r[2] r[3] r[4] r[5]
#
#
# In this picture the values of the range array are denoted by r.
# On the left-hand side of each bin the square bracket ‘[’ denotes
# an inclusive lower bound ( r <= x), and the round parentheses ‘)’
# on the right-hand side denote an exclusive upper bound (x < r).
# Thus any samples which fall on the upper end of the histogram are
# excluded.
# If you want to include this value for the last bin you will need to
# add an extra bin to your histogram.
#
#
# == Reference:
# * http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html
class Histogram
include Enumerable
class << self
  # Alloc +n_bins+ bins, optionally using +range+ (n_bins+1 boundary
  # values) as the bin edges.
  def alloc(n_bins, range=nil, opts=Hash.new)
    Histogram.new(n_bins, range, opts)
  end

  # Alloc +n_bins+ uniform-width bins spanning [min, max].
  # Accepts either alloc_uniform(n, [min, max]) or
  # alloc_uniform(n, min, max).
  def alloc_uniform(n_bins, p1=nil, p2=nil)
    min, max = p1.is_a?(Array) ? p1 : [p1, p2]
    step = (max - min) / n_bins.to_f
    boundaries = (n_bins + 1).times.map { |i| min + (step * i) }
    Histogram.new(boundaries)
  end
end
attr_accessor :name
attr_reader :bin
attr_reader :range
include GetText
bindtextdomain("statsample")
# +p1+ is either an Array of n+1 bin boundaries or an Integer bin
# count; +min_max+, when given as [min, max], derives uniform
# boundaries; +opts+ sets any writable attribute (e.g. :name).
def initialize(p1, min_max=false, opts=Hash.new)
  case p1
  when Array
    range = p1
    @n_bins = p1.size - 1
  when Integer
    @n_bins = p1
  end
  @bin = [0.0] * @n_bins
  if min_max
    low, high = min_max[0], min_max[1]
    # uniform boundaries from low to high
    range = (@n_bins + 1).times.map { |i| low + (i * (high - low).quo(@n_bins)) }
  end
  range ||= [0.0] * (@n_bins + 1)
  set_ranges(range)
  @name = ""
  opts.each { |k, v| send("#{k}=", v) if respond_to? k }
end
# Number of bins (one less than the number of entries in #range).
def bins
  @n_bins
end
#
def increment(x, w=1)
if x.respond_to? :each
x.each{|y| increment(y,w) }
elsif x.is_a? Numeric
(range.size-1).times do |i|
if x>=range[i] and xi, :low=>r[0],:high=>r[1], :middle=>(r[0]+r[1]) / 2.0, :value=>@bin[i]}
yield arg
end
end
# Variance estimated from the binned data: bin midpoints weighted by
# counts, sample form (denominator n-1).
def estimated_variance
  m = estimated_mean
  ss = 0
  n = 0
  each do |b|
    ss += b[:value] * (b[:middle] - m)**2
    n += b[:value]
  end
  ss / (n - 1)
end
# Standard deviation estimated from the binned data: square root of
# the estimated variance.
def estimated_standard_deviation
  Math.sqrt(estimated_variance)
end
# Mean estimated from the binned data: count-weighted average of the
# bin midpoints.
def estimated_mean
  total = 0
  n = 0
  each do |b|
    total += b[:value] * b[:middle]
    n += b[:value]
  end
  total / n
end
alias :mean :estimated_mean
alias :sigma :estimated_standard_deviation
# Sum of bin counts between bin indices +start+ and +_end+,
# inclusive; defaults to all bins.
def sum(start=nil, _end=nil)
  first = start || 0
  last = _end || (@n_bins - 1)
  total = 0
  (first..last).each { |i| total += @bin[i] }
  total
end
# ReportBuilder hook: renders this histogram graphically through
# Statsample::Graph::Histogram.
def report_building(generator)
  generator.parse_element(Statsample::Graph::Histogram.new(self))
end
# Text report: one "lower-bound : count" line per bin (the final
# range entry is the upper bound of the last bin and is skipped).
def report_building_text(generator)
  @range.each_with_index do |lower, i|
    next if i == @bin.size
    generator.text(format("%5.2f : %d", lower, @bin[i]))
  end
end
end
end
================================================
FILE: lib/statsample/matrix.rb
================================================
class ::Vector
  # Convert this vector into a single-column Matrix.
  def to_matrix
    ::Matrix.columns([to_a])
  end

  # A Vector already is a vector; return self.
  def to_vector
    self
  end
end
class ::Matrix
  # A Matrix already is a matrix; return self.
  def to_matrix
    self
  end
  # Convert to a Statsample::Dataset, one :scale field per column.
  # Field names come from #fields_y when available, else VAR_1..VAR_n.
  def to_dataset
    f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) }
    ds=Statsample::Dataset.new(f)
    f.each do |ff|
      ds[ff].type=:scale
      ds[ff].name=ff
    end
    row_size.times {|i|
      ds.add_case_array(self.row(i).to_a)
    }
    ds.update_valid_data
    ds.name=self.name if self.respond_to? :name
    ds
  end
  # Preserve the pure-Ruby eigenpairs (from extendmatrix) under
  # #eigenpairs_ruby before the GSL version below replaces it.
  # BUG FIX: the original used `if defined? :eigenpairs`, which tests
  # the symbol literal and is therefore always truthy — when
  # extendmatrix was absent, alias_method raised NameError at load.
  if method_defined? :eigenpairs
    alias_method :eigenpairs_ruby, :eigenpairs
  end
  if Statsample.has_gsl?
    # Optimize eigenpairs of extendmatrix module using gsl
    def eigenpairs
      to_gsl.eigenpairs
    end
  end
  # Eigenvalues, in the order eigenpairs returns them.
  def eigenvalues
    eigenpairs.collect {|v| v[0]}
  end
  # Eigenvectors, matching the order of #eigenvalues.
  def eigenvectors
    eigenpairs.collect {|v| v[1]}
  end
  # Matrix whose columns are the eigenvectors.
  def eigenvectors_matrix
    Matrix.columns(eigenvectors)
  end
  # Convert to a GSL::Matrix (requires the gsl gem).
  def to_gsl
    out=[]
    self.row_size.times{|i|
      out[i]=self.row(i).to_a
    }
    GSL::Matrix[*out]
  end
end
# Compatibility layer over the rb-gsl bindings so GSL objects answer
# the same protocol (to_matrix, to_dataset, row_size, ...) as the
# stdlib Matrix/Vector classes used elsewhere in statsample.
module GSL
  class Vector
    class Col
      # Convert this GSL column vector into a one-column ::Matrix.
      def to_matrix
        ::Matrix.columns([self.size.times.map {|i| self[i]}])
      end
      def to_ary
        to_a
      end
      # Already a GSL object; return self.
      def to_gsl
        self
      end
    end
  end
  class Matrix
    # Already a GSL matrix; return self.
    def to_gsl
      self
    end
    # Convert to a Statsample::Dataset (same protocol as
    # ::Matrix#to_dataset): one :scale field per column.
    def to_dataset
      f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) }
      ds=Statsample::Dataset.new(f)
      f.each do |ff|
        ds[ff].type=:scale
        ds[ff].name=ff
      end
      row_size.times {|i|
        ds.add_case_array(self.row(i).to_a)
      }
      ds.update_valid_data
      ds.name=self.name if self.respond_to? :name
      ds
    end
    # ::Matrix-compatible aliases over GSL's size1/size2/det.
    def row_size
      size1
    end
    def column_size
      size2
    end
    def determinant
      det
    end
    # Inverse via LU decomposition.
    def inverse
      GSL::Linalg::LU.invert(self)
    end
    def eigenvalues
      eigenpairs.collect {|v| v[0]}
    end
    def eigenvectors
      eigenpairs.collect {|v| v[1]}
    end
    # Matrix sum of squares
    def mssq
      sum=0
      to_v.each {|i| sum+=i**2}
      sum
    end
    # Eigenvectors as columns, sorted by descending eigenvalue.
    # NOTE(review): GSL::Eigen.symmv assumes self is symmetric — confirm
    # callers guarantee that.
    def eigenvectors_matrix
      eigval, eigvec= GSL::Eigen.symmv(self)
      GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
      eigvec
    end
    # [eigenvalue, eigenvector] pairs sorted by descending eigenvalue.
    # The @eigenpairs assignment stores the result but is never read
    # back — it is not an effective cache.
    def eigenpairs
      eigval, eigvec= GSL::Eigen.symmv(self)
      GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
      @eigenpairs=eigval.size.times.map {|i|
        [eigval[i],eigvec.get_col(i)]
      }
    end
    #def eigenpairs_ruby
    #  self.to_matrix.eigenpairs_ruby
    #end
    def square?
      size1==size2
    end
    # Convert to a plain ::Matrix, element by element.
    def to_matrix
      rows=self.size1
      cols=self.size2
      out=(0...rows).collect{|i| (0...cols).collect {|j| self[i,j]} }
      ::Matrix.rows(out)
    end
    # Sum of all elements.
    def total_sum
      sum=0
      size1.times {|i|
        size2.times {|j|
          sum+=self[i,j]
        }
      }
      sum
    end
  end
end
module Statsample
# Module to add names to X and Y fields
# Mixin adding X/Y field names and a display name to a Matrix.
module NamedMatrix
  include Summarizable
  # Field names for a square matrix (X and Y fields are the same).
  def fields
    raise "Should be square" unless square?
    fields_x
  end
  # Set X and Y field names at once; square matrices only.
  def fields=(v)
    raise "Matrix should be square" unless square?
    @fields_x = v
    @fields_y = v
  end
  # Set row (X) field names; size must equal row_size.
  def fields_x=(v)
    raise "Size of fields != row_size" unless v.size == row_size
    @fields_x = v
  end
  # Set column (Y) field names; size must equal column_size.
  def fields_y=(v)
    raise "Size of fields != column_size" unless v.size == column_size
    @fields_y = v
  end
  # Row field names, defaulting to X0, X1, ...
  def fields_x
    @fields_x ||= row_size.times.collect { |i| _("X%d") % i }
  end
  # Column field names, defaulting to Y0, Y1, ...
  def fields_y
    @fields_y ||= column_size.times.collect { |i| _("Y%d") % i }
  end
  # Matrix name, auto-generated on first access.
  def name
    @name ||= get_new_name
  end
  def name=(v)
    @name = v
  end
  # Sequential default name ("Matrix 1", "Matrix 2", ...). The counter
  # is a class variable shared by every object extending this module.
  def get_new_name
    @@named_matrix ||= 0
    @@named_matrix += 1
    _("Matrix %d") % @@named_matrix
  end
end
# Module to add method for variance/covariance and correlation matrices
# == Usage
# matrix=Matrix[[1,2],[2,3]]
# matrix.extend CovariateMatrix
#
module CovariateMatrix
  include NamedMatrix
  # shared counter for auto-generated matrix names
  @@covariatematrix=0
  # Get type of covariate matrix. Could be :covariance or :correlation.
  # For square matrices the type is inferred from the diagonal (any
  # entry != 1.0 => :covariance); @type is only consulted for
  # non-square submatrices.
  # NOTE(review): an explicitly assigned @type is ignored when the
  # matrix is square — confirm this precedence is intended.
  def _type
    if row_size==column_size
      if row_size.times.find {|i| self[i,i]!=1.0}
        :covariance
      else
        :correlation
      end
    else
      @type
    end
  end
  def _type=(t)
    @type=t
  end
  # Correlation matrix derived from this covariance matrix
  # (r_ij = c_ij / sqrt(c_ii * c_jj)); returns self when this is
  # already a correlation matrix.
  def correlation
    if(_type==:covariance)
      matrix=Matrix.rows(row_size.times.collect { |i|
        column_size.times.collect { |j|
          if i==j
            1.0
          else
            self[i,j].quo(Math::sqrt(self[i,i])*Math::sqrt(self[j,j]))
          end
        }
      })
      matrix.extend CovariateMatrix
      matrix.fields_x=fields_x
      matrix.fields_y=fields_y
      matrix._type=:correlation
      matrix
    else
      self
    end
  end
  # Get variance for field k
  #
  def variance(k)
    submatrix([k])[0,0]
  end
  # Sequential default name for covariate matrices (shared counter).
  def get_new_name
    @@covariatematrix+=1
    _("Covariate matrix %d") % @@covariatematrix
  end
  # Select a submatrix of factors. If you have a correlation matrix
  # with a, b and c, you could obtain a submatrix of correlations of
  # a and b, b and c or a and b
  #
  # You could use labels or index to select the factors.
  # If you don't specify columns, its will be equal to rows.
  #
  # Example:
  #   a=Matrix[[1.0, 0.3, 0.2],
  #            [0.3, 1.0, 0.5],
  #            [0.2, 0.5, 1.0]]
  #   a.extend CovariateMatrix
  #   a.fields=%w{a b c}
  #   a.submatrix(%w{c a}, %w{b})
  #   => Matrix[[0.5],[0.3]]
  #   a.submatrix(%w{c a})
  #   => Matrix[[1.0, 0.2] , [0.2, 1.0]]
  def submatrix(rows,columns=nil)
    raise ArgumentError, "rows shouldn't be empty" if rows.respond_to? :size and rows.size==0
    columns||=rows
    # Convert all fields on index
    row_index=rows.collect {|v|
      r=v.is_a?(Numeric) ? v : fields_x.index(v)
      raise "Index #{v} doesn't exists on matrix" if r.nil?
      r
    }
    column_index=columns.collect {|v|
      r=v.is_a?(Numeric) ? v : fields_y.index(v)
      raise "Index #{v} doesn't exists on matrix" if r.nil?
      r
    }
    # carry the selected field names onto the submatrix
    fx=row_index.collect {|v| fields_x[v]}
    fy=column_index.collect {|v| fields_y[v]}
    # (the `row=` assignment below is never read — block value only)
    matrix= Matrix.rows(row_index.collect {|i|
      row=column_index.collect {|j| self[i,j]}})
    matrix.extend CovariateMatrix
    matrix.fields_x=fx
    matrix.fields_y=fy
    matrix._type=_type
    matrix
  end
  # Render as a table: field names as header and row labels, values to
  # three decimals with the leading zero stripped, nils shown as "--".
  def report_building(generator)
    @name||= (_type==:correlation ? _("Correlation"):_("Covariance"))+_(" Matrix")
    generator.table(:name=>@name, :header=>[""]+fields_y) do |t|
      row_size.times {|i|
        t.row([fields_x[i]]+row(i).to_a.collect {|i1|
          i1.nil? ? "--" : sprintf("%0.3f",i1).gsub("0.",".")
        })
      }
    end
  end
end
end
================================================
FILE: lib/statsample/multiset.rb
================================================
module Statsample
# Multiset joins multiple dataset with the same fields and vectors
# but with different number of cases.
# This is the base class for stratified and cluster sampling estimation
class Multiset
  # Name of fields
  attr_reader :fields
  # Array with Statsample::Dataset
  attr_reader :datasets
  # Create a multiset from a list of field names:
  #   Multiset.new(%w{f1 f2 f3})
  def initialize(fields)
    @fields = fields
    @datasets = {}
  end
  # Build a multiset containing one empty Dataset per key in +ds_names+.
  def self.new_empty_vectors(fields, ds_names)
    multiset = Multiset.new(fields)
    ds_names.each do |key|
      multiset.add_dataset(key, Dataset.new(fields))
    end
    multiset
  end
  # Generate a new dataset as a union of the partial datasets.
  # If a block is given, it is applied to a copy of each dataset
  # before the union.
  def union(&block)
    merged = {}
    types = {}
    names = {}
    labels = {}
    each do |key, ds|
      if block
        ds = ds.dup
        yield key, ds
      end
      @fields.each do |f|
        merged[f] ||= []
        merged[f].concat(ds[f].data)
        # first dataset seen wins for type/name/labels
        types[f]  ||= ds[f].type
        names[f]  ||= ds[f].name
        labels[f] ||= ds[f].labels
      end
    end
    @fields.each do |f|
      merged[f] = merged[f].to_vector(types[f])
      merged[f].name = names[f]
      merged[f].labels = labels[f]
    end
    ds_union = merged.to_dataset
    ds_union.fields = @fields
    ds_union
  end
  # Sorted list of dataset keys.
  def datasets_names
    @datasets.keys.sort
  end
  # Number of datasets in the multiset.
  def n_datasets
    @datasets.size
  end
  # Register dataset +ds+ under +key+; its fields must equal the
  # multiset's fields.
  def add_dataset(key, ds)
    if ds.fields != @fields
      raise ArgumentError, "Dataset(#{ds.fields.to_s})must have the same fields of the Multiset(#{@fields})"
    end
    @datasets[key] = ds
  end
  # Sum, over all datasets, of the block's value for
  # (stratum_name, vector-for-field).
  def sum_field(field)
    @datasets.inject(0) do |acc, (stratum_name, ds)|
      acc + yield(stratum_name, ds[field])
    end
  end
  # Collect the block's value for each (key, vector) pair.
  def collect_vector(field)
    @datasets.map { |k, ds| yield k, ds[field] }
  end
  # Yield each (key, vector) pair.
  def each_vector(field)
    @datasets.each { |k, ds| yield k, ds[field] }
  end
  # Dataset stored under key +i+ (nil if absent).
  def [](i)
    @datasets[i]
  end
  # Yield each (key, dataset) pair, skipping datasets with no cases.
  def each(&block)
    @datasets.each do |key, ds|
      block.call(key, ds) unless ds.cases == 0
    end
  end
end
class StratifiedSample
class << self
# mean for an array of vectors
def mean(*vectors)
n_total=0
means=vectors.inject(0){|a,v|
n_total+=v.size
a+v.sum
}
means.to_f/n_total
end
def standard_error_ksd_wr(es)
n_total=0
sum=es.inject(0){|a,h|
n_total+=h['N']
a+((h['N']**2 * h['s']**2) / h['n'].to_f)
}
(1.to_f / n_total)*Math::sqrt(sum)
end
def variance_ksd_wr(es)
standard_error_ksd_wr(es)**2
end
def calculate_n_total(es)
es.inject(0) {|a,h| a+h['N'] }
end
# Source : Cochran (1972)
def variance_ksd_wor(es)
n_total=calculate_n_total(es)
es.inject(0){|a,h|
val=((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N']))
a+val
}
end
def standard_error_ksd_wor(es)
Math::sqrt(variance_ksd_wor(es))
end
def variance_esd_wor(es)
n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
val=h['N']*(h['N']-h['n'])*(h['s']**2 / h['n'].to_f)
a+val
}
(1.0/(n_total**2))*sum
end
def standard_error_esd_wor(es)
Math::sqrt(variance_ksd_wor(es))
end
# Based on http://stattrek.com/Lesson6/STRAnalysis.aspx
def variance_esd_wr(es)
n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
val= ((h['s']**2 * h['N']**2) / h['n'].to_f)
a+val
}
(1.0/(n_total**2))*sum
end
def standard_error_esd_wr(es)
Math::sqrt(variance_esd_wr(es))
end
def proportion_variance_ksd_wor(es)
n_total=calculate_n_total(es)
es.inject(0){|a,h|
val= (((h['N'].to_f / n_total)**2 * h['p']*(1-h['p'])) / (h['n'])) * (1- (h['n'].to_f / h['N']))
a+val
}
end
def proportion_sd_ksd_wor(es)
Math::sqrt(proportion_variance_ksd_wor(es))
end
def proportion_sd_ksd_wr(es)
n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
val= (h['N']**2 * h['p']*(1-h['p'])) / h['n'].to_f
a+val
}
Math::sqrt(sum) * (1.0/n_total)
end
def proportion_variance_ksd_wr(es)
proportion_variance_ksd_wor(es)**2
end
def proportion_variance_esd_wor(es)
n_total=n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
a=(h['N']**2 * (h['N']-h['n']) * h['p']*(1.0-h['p'])) / ((h['n']-1)*(h['N']-1))
a+val
}
Math::sqrt(sum) * (1.0/n_total**2)
end
def proportion_sd_esd_wor(es)
Math::sqrt(proportion_variance_ksd_wor(es))
end
end
def initialize(ms,strata_sizes)
raise TypeError,"ms should be a Multiset" unless ms.is_a? Statsample::Multiset
@ms=ms
raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names
@strata_sizes=strata_sizes
@population_size=@strata_sizes.inject(0) {|a,x| a+x[1]}
@strata_number=@ms.n_datasets
@sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases}
end
# Number of strata
def strata_number
@strata_number
end
# Population size. Equal to sum of strata sizes
# Symbol: Nh
def population_size
@population_size
end
# Sample size. Equal to sum of sample of each stratum
def sample_size
@sample_size
end
# Size of stratum x
def stratum_size(h)
@strata_sizes[h]
end
def vectors_by_field(field)
@ms.datasets.collect{|k,ds|
ds[field]
}
end
# Population proportion based on strata
def proportion(field, v=1)
@ms.sum_field(field) {|s_name,vector|
stratum_ponderation(s_name)*vector.proportion(v)
}
end
# Stratum ponderation.
# Symbol: W\h\
def stratum_ponderation(h)
@strata_sizes[h].to_f / @population_size
end
alias_method :wh, :stratum_ponderation
# Population mean based on strata
def mean(field)
@ms.sum_field(field) {|s_name,vector|
stratum_ponderation(s_name)*vector.mean
}
end
# Standard error with estimated population variance and without replacement.
# Source: Cochran (1972)
def standard_error_wor(field)
es=@ms.collect_vector(field) {|s_n, vector|
{'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
}
StratifiedSample.standard_error_esd_wor(es)
end
# Standard error with estimated population variance and without replacement.
# Source: http://stattrek.com/Lesson6/STRAnalysis.aspx
def standard_error_wor_2(field)
sum=@ms.sum_field(field) {|s_name,vector|
s_size=@strata_sizes[s_name]
(s_size**2 * (1-(vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f)
}
(1/@population_size.to_f)*Math::sqrt(sum)
end
# Standard error of the mean, with replacement, from the estimated
# standard deviation of each stratum.
def standard_error_wr(field)
  strata = @ms.collect_vector(field) do |name, vector|
    { 'N' => @strata_sizes[name], 'n' => vector.size, 's' => vector.sds }
  end
  StratifiedSample.standard_error_esd_wr(strata)
end
# Standard deviation of the estimated proportion of value +v+ on
# +field+, without replacement.
def proportion_sd_esd_wor(field, v=1)
  strata = @ms.collect_vector(field) do |name, vector|
    { 'N' => @strata_sizes[name], 'n' => vector.size, 'p' => vector.proportion(v) }
  end
  StratifiedSample.proportion_sd_esd_wor(strata)
end
# Standard error of the estimated proportion of value +v+ on +field+,
# without replacement, pooling the overall proportion across strata.
def proportion_standard_error(field,v=1)
prop=proportion(field,v)
sum=@ms.sum_field(field) {|s_name,vector|
nh=vector.size
s_size=@strata_sizes[s_name]
# nh.to_f avoids integer division: with Integer nh and s_size the
# finite population correction (1 - nh/Nh) always collapsed to 1.
(s_size**2 * (1-(nh.to_f / s_size)) * prop * (1-prop) / (nh - 1 ))
}
(1.quo(@population_size)) * Math::sqrt(sum)
end
# Variance of the estimated proportion of value +v+ on +field+ for a
# stratified sample, built from per-stratum totals.
# Cochran(1971), p. 150
def variance_pst(field,v=1)
sum=@ms.datasets.inject(0) {|a,da|
stratum_name=da[0]
ds=da[1]
# nh: sample size of the stratum; s_size: stratum population size.
nh=ds.cases.to_f
s_size=@strata_sizes[stratum_name]
prop=ds[field].proportion(v)
a + (((s_size**2 * (s_size-nh)) / (s_size-1))*(prop*(1-prop) / (nh-1)))
}
# ** binds tighter than /, so this divides by population_size squared.
(1/@population_size.to_f ** 2)*sum
end
end
end
================================================
FILE: lib/statsample/regression/multiple/alglibengine.rb
================================================
if HAS_ALGIB
module Statsample
module Regression
module Multiple
# Class for Multiple Regression Analysis
# Requires Alglib gem and uses a listwise aproach.
# Faster than GslEngine on massive prediction use, because process is c-based.
# Prefer GslEngine if you need good memory use.
# If you need pairwise, use RubyEngine
# Example:
#
# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
# lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,'y')
#
class AlglibEngine < BaseEngine
# Builds the regression from dataset +ds+ with dependent variable
# +y_var+, using only listwise-complete cases.
def initialize(ds,y_var, opts=Hash.new)
super
@ds=ds.dup_only_valid
@ds_valid=@ds
# @valid_cases is required by BaseEngine#df_e (and therefore by
# anova/se_estimate); it was previously never assigned here.
@valid_cases=@ds_valid.cases
@dy=@ds[@y_var]
@ds_indep=ds.dup(ds.fields-[y_var])
# Create a custom matrix: one column per predictor, criterion last.
columns=[]
@fields=[]
@ds.fields.each{|f|
if f!=@y_var
columns.push(@ds[f].to_a)
@fields.push(f)
end
}
@dep_columns=columns.dup
columns.push(@ds[@y_var])
matrix=Matrix.columns(columns)
@lr_s=nil
@lr=::Alglib::LinearRegression.build_from_matrix(matrix)
@coeffs=assign_names(@lr.coeffs)
end
# Marshal support: serialize only the inputs and rebuild on load.
def _dump(i)
Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
end
def self._load(data)
h=Marshal.load(data)
self.new(h['ds'], h['y_var'])
end
# Hash of raw (b) coefficients, keyed by field name.
def coeffs
@coeffs
end
# Coefficients using a constant, solved as (X'X)^-1 X' y.
# Based on http://www.xycoon.com/ols1.htm
# Returns a column Matrix: constant first, then predictors.
def matrix_resolution
# (removed unused local: the result of mse was computed and dropped)
columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
columns.unshift([1.0]*@ds.cases)
y=Matrix.columns([@dy.data.map {|i| i.to_f}])
x=Matrix.columns(columns)
xt=x.t
matrix=((xt*x)).inverse*xt
matrix*y
end
# Coefficient of determination.
def r2
r**2
end
# Multiple R: correlation between observed and predicted y.
def r
Bivariate::pearson(@dy,predicted)
end
# Total sum of squares of the criterion.
def sst
@dy.ss
end
# Value of the intercept.
def constant
@lr.constant
end
# Hash of beta (standardized) coefficients.
def standarized_coeffs
l=lr_s
assign_names(l.coeffs)
end
# Lazily-built regression over the standardized dataset.
def lr_s
if @lr_s.nil?
build_standarized
end
@lr_s
end
# Builds the Alglib regression over the standardized dataset.
def build_standarized
@ds_s=@ds.standarize
columns=[]
@ds_s.fields.each{|f|
columns.push(@ds_s[f].to_a) unless f==@y_var
}
@dep_columns_s=columns.dup
columns.push(@ds_s[@y_var])
matrix=Matrix.columns(columns)
@lr_s=Alglib::LinearRegression.build_from_matrix(matrix)
end
# Predicted y for an array +v+ of predictor values.
def process(v)
@lr.process(v)
end
# Predicted standardized y for standardized predictor values +v+.
def process_s(v)
lr_s.process(v)
end
# ???? Not equal to SPSS output
def standarized_residuals
res=residuals
red_sd=residuals.sds
res.collect {|v|
v.quo(red_sd)
}.to_vector(:scale)
end
end
end
end
end # for Statsample
end # for if
================================================
FILE: lib/statsample/regression/multiple/baseengine.rb
================================================
module Statsample
module Regression
module Multiple
# Base class for Multiple Regression Engines
# Base class for Multiple Regression Engines.
#
# Provides the shared OLS machinery: ANOVA, predicted and residual
# vectors, R^2 derivatives, coefficient standard errors and report
# building. Concrete engines must implement coeffs, constant, r, sst
# and process, and must set @dep_columns, @fields, @ds_valid and
# @valid_cases.
class BaseEngine
include Statsample::Summarizable
# Name of analysis
attr_accessor :name
# Minimum number of valid case for pairs of correlation
attr_reader :cases
# Number of valid cases (listwise)
attr_reader :valid_cases
# Number of total cases (dataset.cases)
attr_reader :total_cases
# Number of digits used on reports.
attr_accessor :digits
def self.univariate?
true
end
# * ds: dataset; last state used by subclasses.
# * y_var: name of the dependent variable.
# * opts: hash assigned through writers (e.g. :digits, :name).
def initialize(ds, y_var, opts = Hash.new)
@ds=ds
@predictors_n=@ds.fields.size-1
@total_cases=@ds.cases
@cases=@ds.cases
@y_var=y_var
@r2=nil
@name=_("Multiple Regression: %s over %s") % [ ds.fields.join(",") , @y_var]
opts_default={:digits=>3}
@opts=opts_default.merge opts
@opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
end
# Calculate F Test (memoized).
def anova
@anova||=Statsample::Anova::OneWay.new(:ss_num=>ssr, :ss_den=>sse, :df_num=>df_r, :df_den=>df_e, :name_numerator=>_("Regression"), :name_denominator=>_("Error"), :name=>"ANOVA")
end
# Standard error of estimate
def se_estimate
Math::sqrt(sse.quo(df_e))
end
# Retrieves a vector with predicted values for y.
# Rows with any nil predictor yield nil.
def predicted
@total_cases.times.collect { |i|
invalid=false
vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
if invalid
nil
else
process(vect)
end
}.to_vector(:scale)
end
# Retrieves a vector with standarized values for y
def standarized_predicted
predicted.standarized
end
# Retrieves a vector with residuals values for y.
# Rows with any nil predictor or nil criterion yield nil.
def residuals
(0...@total_cases).collect{|i|
invalid=false
vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
if invalid or @ds[@y_var][i].nil?
nil
else
@ds[@y_var][i] - process(vect)
end
}.to_vector(:scale)
end
# R Multiple
def r
raise "You should implement this"
end
# Sum of squares Total
def sst
raise "You should implement this"
end
# R^2 Adjusted.
# Estimate Population R^2 usign Ezequiel formula.
# Always lower than sample R^2
# == Reference:
# * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
def r2_adjusted
r2-((1-r2)*@predictors_n).quo(df_e)
end
# Sum of squares (regression)
def ssr
r2*sst
end
# Sum of squares (Error)
def sse
sst - ssr
end
# T values for coeffs: each b divided by its standard error.
def coeffs_t
out={}
se=coeffs_se
coeffs.each do |k,v|
out[k]=v / se[k]
end
out
end
# Mean square Regression
def msr
ssr.quo(df_r)
end
# Mean Square Error
def mse
sse.quo(df_e)
end
# Degrees of freedom for regression
def df_r
@predictors_n
end
# Degrees of freedom for error.
# Requires @valid_cases, set by subclasses.
def df_e
@valid_cases-@predictors_n-1
end
# Fisher for Anova
def f
anova.f
end
# p-value of Fisher
def probability
anova.probability
end
# Tolerance for a given variable:
# 1-R^2 of the regression of the other predictors over +var+.
# http://talkstats.com/showthread.php?t=5056
def tolerance(var)
ds=assign_names(@dep_columns)
ds.each{|k,v|
ds[k]=v.to_vector(:scale)
}
lr=self.class.new(ds.to_dataset,var)
1-lr.r2
end
# Tolerances for each coefficient
def coeffs_tolerances
@fields.inject({}) {|a,f|
a[f]=tolerance(f);
a
}
end
# Standard Error for coefficients
def coeffs_se
out={}
mse=sse.quo(df_e)
coeffs.each {|k,v|
out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares * tolerance(k)))
}
out
end
# Estandar error of R^2
# ????
def se_r2
Math::sqrt((4*r2*(1-r2)**2*(df_e)**2).quo((@cases**2-1)*(@cases+3)))
end
# Estimated Variance-Covariance Matrix
# Used for calculation of se of constant.
# Row/column order: constant first, then predictors in field order
# (y_var excluded). Each cell holds the square root of the estimate
# (nil when negative).
def estimated_variance_covariance_matrix
#mse_p=mse
columns=[]
@ds_valid.fields.each{|k|
v=@ds_valid[k]
columns.push(v.data) unless k==@y_var
}
columns.unshift([1.0]*@valid_cases)
x=Matrix.columns(columns)
matrix=((x.t*x)).inverse * mse
matrix.collect {|i| Math::sqrt(i) if i>=0 }
end
# T for constant
def constant_t
constant.to_f/constant_se
end
# Standard error for constant
def constant_se
estimated_variance_covariance_matrix[0,0]
end
# Builds the summary section: fit statistics, equation, ANOVA table
# and the coefficient table.
def report_building(b)
di="%0.#{digits}f"
b.section(:name=>@name) do |g|
c=coeffs
g.text _("Engine: %s") % self.class
g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases])
g.text _("R=")+(di % r)
g.text _("R^2=")+(di % r2)
g.text _("R^2 Adj=")+(di % r2_adjusted)
g.text _("Std.Error R=")+ (di % se_estimate)
g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') )
g.parse_element(anova)
sc=standarized_coeffs
cse=coeffs_se
g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t|
t.row([_("Constant"), sprintf(di, constant), "-", constant_se.nil? ? "": sprintf(di, constant_se), constant_t.nil? ? "" : sprintf(di, constant_t)])
@fields.each do |f|
t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))])
end
end
end
end
# Builds a Hash pairing each field name with the matching entry of
# array +c+ (same order as @fields).
def assign_names(c)
a={}
@fields.each_index {|i|
a[@fields[i]]=c[i]
}
a
end
# Sum of squares of regression
# using the predicted value minus y mean
def ssr_direct
mean=@dy.mean
cases=0
ssr=(0...@ds.cases).inject(0) {|a,i|
invalid=false
v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
if !invalid
cases+=1
a+((process(v)-mean)**2)
else
a
end
}
ssr
end
# Sum of squares of error, as total minus regression.
def sse_direct
sst-ssr
end
# Predicted y for an array +v+ of predictor values
# (constant plus weighted sum).
def process(v)
c=coeffs
total=constant
@fields.each_index{|i|
total+=c[@fields[i]]*v[i]
}
total
end
end
end
end
end
================================================
FILE: lib/statsample/regression/multiple/gslengine.rb
================================================
if Statsample.has_gsl?
module Statsample
module Regression
module Multiple
# Class for Multiple Regression Analysis
# Requires rbgsl and uses a listwise aproach.
# Slower on prediction of values than Alglib, because predict is ruby based.
# Better memory management on multiple (+1000) series of regression.
# If you need pairwise, use RubyEngine
# Example:
#
# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
# lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
#
class GslEngine < BaseEngine
# Builds the regression from dataset +ds+ with dependent variable
# +y_var+, using only listwise-complete cases and GSL::MultiFit.
def initialize(ds,y_var, opts=Hash.new)
super
@ds=ds.dup_only_valid
@ds_valid=@ds
@valid_cases=@ds_valid.cases
@dy=@ds[@y_var]
@ds_indep=ds.dup(ds.fields-[y_var])
# Create a custom matrix: predictors first, constant column last.
columns=[]
@fields=[]
max_deps = GSL::Matrix.alloc(@ds.cases, @ds.fields.size)
constant_col=@ds.fields.size-1
for i in 0...@ds.cases
max_deps.set(i,constant_col,1)
end
j=0
@ds.fields.each{|f|
if f!=@y_var
@ds[f].each_index{|i1|
max_deps.set(i1,j,@ds[f][i1])
}
columns.push(@ds[f].to_a)
@fields.push(f)
j+=1
end
}
@dep_columns=columns.dup
@lr_s=nil
c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl)
@constant=c[constant_col]
@coeffs_a=c.to_a.slice(0...constant_col)
@coeffs=assign_names(@coeffs_a)
c=nil
end
# Marshal support: serialize only the inputs and rebuild on load.
def _dump(i)
Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
end
def self._load(data)
h=Marshal.load(data)
self.new(h['ds'], h['y_var'])
end
# Hash of raw (b) coefficients, keyed by field name.
def coeffs
@coeffs
end
# Coefficients using a constant, solved as (X'X)^-1 X' y.
# Based on http://www.xycoon.com/ols1.htm
def matrix_resolution
columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
columns.unshift([1.0]*@ds.cases)
y=Matrix.columns([@dy.data.map {|i| i.to_f}])
x=Matrix.columns(columns)
xt=x.t
matrix=((xt*x)).inverse*xt
matrix*y
end
# Coefficient of determination.
def r2
r**2
end
# Multiple R: correlation between observed and predicted y.
def r
Bivariate::pearson(@dy, predicted)
end
# Total sum of squares of the criterion.
def sst
@dy.ss
end
# Value of the intercept.
def constant
@constant
end
# Hash of beta (standardized) coefficients.
def standarized_coeffs
l=lr_s
l.coeffs
end
# Lazily-built engine over the standardized dataset.
def lr_s
if @lr_s.nil?
build_standarized
end
@lr_s
end
def build_standarized
@ds_s=@ds.standarize
@lr_s=GslEngine.new(@ds_s,@y_var)
end
# Predicted standardized y for standardized predictor values +v+.
def process_s(v)
lr_s.process(v)
end
# ???? Not equal to SPSS output
def standarized_residuals
res=residuals
red_sd=residuals.sds
res.collect {|v|
v.quo(red_sd)
}.to_vector(:scale)
end
# Standard error for coeffs.
# estimated_variance_covariance_matrix is ordered as
# [constant, predictors in field order (y_var excluded)], so
# predictors are indexed with their own counter. The previous
# version reused the raw field index (i+1), which was wrong
# whenever y_var was not the last field of the dataset.
def coeffs_se
out={}
evcm=estimated_variance_covariance_matrix
mi=0
@ds_valid.fields.each do |f|
next if f==@y_var
mi+=1
out[f]=evcm[mi,mi]
end
out
end
end
end
end
end # for Statsample
end # for if
================================================
FILE: lib/statsample/regression/multiple/matrixengine.rb
================================================
module Statsample
module Regression
module Multiple
# Pure Ruby Class for Multiple Regression Analysis, based on a covariance or correlation matrix.
#
# Use Statsample::Regression::Multiple::RubyEngine if you have a
# Dataset, to avoid setting all details.
#
# Remember: NEVER use a Covariance data if you have missing data. Use only correlation matrix on that case.
#
#
# Example:
#
# matrix=[[1.0, 0.5, 0.2], [0.5, 1.0, 0.7], [0.2, 0.7, 1.0]]
#
# lr=Statsample::Regression::Multiple::MatrixEngine.new(matrix,2)
class MatrixEngine < BaseEngine
# Hash of standard deviation of predictors.
# Only useful for Correlation Matrix, because by default is set to 1
attr_accessor :x_sd
# Standard deviation of criterion
# Only useful for Correlation Matrix, because by default is set to 1
attr_accessor :y_sd
# Hash of mean for predictors. By default, set to 0
attr_accessor :x_mean
# Mean for criteria. By default, set to 0
attr_accessor :y_mean
# Number of cases
attr_writer :cases
attr_writer :digits
# Create object from a correlation or covariance +matrix+
# (extended with Statsample::CovariateMatrix) and the name of the
# dependent variable +y_var+. Raises LinearDependency when the
# predictor submatrix is (near) singular.
#
def initialize(matrix,y_var, opts=Hash.new)
matrix.extend Statsample::CovariateMatrix
raise "#{y_var} variable should be on data" unless matrix.fields.include? y_var
# For a covariance matrix both forms are kept; for a correlation
# matrix, covariance-only results (e.g. constant_se) are disabled.
if matrix._type==:covariance
@matrix_cov=matrix
@matrix_cor=matrix.correlation
@no_covariance=false
else
@matrix_cor=matrix
@matrix_cov=matrix
@no_covariance=true
end
@y_var=y_var
@fields=matrix.fields-[y_var]
@n_predictors=@fields.size
@predictors_n=@n_predictors
@matrix_x= @matrix_cor.submatrix(@fields)
@matrix_x_cov= @matrix_cov.submatrix(@fields)
raise LinearDependency, "Regressors are linearly dependent" if @matrix_x.determinant<1e-15
@matrix_y = @matrix_cor.submatrix(@fields, [y_var])
@matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var])
@y_sd=Math::sqrt(@matrix_cov.submatrix([y_var])[0,0])
@x_sd=@n_predictors.times.inject({}) {|ac,i|
ac[@matrix_x_cov.fields[i]]=Math::sqrt(@matrix_x_cov[i,i])
ac;
}
@cases=nil
@x_mean=@fields.inject({}) {|ac,f|
ac[f]=0.0
ac;
}
@y_mean=0.0
@name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]
opts_default={:digits=>3}
opts=opts_default.merge opts
opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
# b = Sxx^-1 * Sxy (raw or standardized depending on matrix type);
# the other form is derived via the standard deviations.
result_matrix=@matrix_x_cov.inverse * @matrix_y_cov
if matrix._type==:covariance
@coeffs=result_matrix.column(0).to_a
@coeffs_stan=coeffs.collect {|k,v|
coeffs[k]*@x_sd[k].quo(@y_sd)
}
else
@coeffs_stan=result_matrix.column(0).to_a
@coeffs=standarized_coeffs.collect {|k,v|
standarized_coeffs[k]*@y_sd.quo(@x_sd[k])
}
end
@total_cases=@valid_cases=@cases
end
# Number of cases; must be set (via opts or writer) before using
# case-dependent statistics.
def cases
raise "You should define the number of valid cases first" if @cases.nil?
@cases
end
# Get R^2 for the regression
# For fixed models is the coefficient of determination.
# On random models, is the 'squared-multiple correlation'
# Equal to
# * 1-(|R| / |R_x|) or
# * Sum(b_i*r_yi) <- used
def r2
@n_predictors.times.inject(0) {|ac,i| ac+@coeffs_stan[i]* @matrix_y[i,0]}
end
# Multiple correlation, on random models.
def r
Math::sqrt(r2)
end
# Value of constant
def constant
c=coeffs
@y_mean - @fields.inject(0){|a,k| a + (c[k] * @x_mean[k])}
end
# Hash of b or raw coefficients
def coeffs
assign_names(@coeffs)
end
# Hash of beta or standarized coefficients
def standarized_coeffs
assign_names(@coeffs_stan)
end
# Total sum of squares
def sst
@y_sd**2*(cases-1.0)
end
# Degrees of freedom for regression
def df_r
@n_predictors
end
# Degrees of freedom for error
def df_e
cases-@n_predictors-1
end
# Tolerance for a given variable
# defined as (1-R^2) of regression of other independent variables
# over the selected
# == Reference:
# * http://talkstats.com/showthread.php?t=5056
def tolerance(var)
return 1 if @matrix_x.column_size==1
lr=Statsample::Regression::Multiple::MatrixEngine.new(@matrix_x, var)
1-lr.r2
end
# Standard Error for coefficients.
# Standard error of a coefficients depends on
# * Tolerance of the coeffients: Higher tolerances implies higher error
# * Higher r2 implies lower error
# == Reference:
# * Cohen et al. (2003). Applied Multiple Reggression / Correlation Analysis for the Behavioral Sciences
#
def coeffs_se
out={}
#mse=sse.quo(df_e)
coeffs.each {|k,v|
out[k]=@y_sd.quo(@x_sd[k])*Math::sqrt( 1.quo(tolerance(k)))*Math::sqrt((1-r2).quo(df_e))
}
out
end
# t value for constant
def constant_t
return nil if constant_se.nil?
constant.to_f / constant_se
end
# Standard error for constant.
# This method recreates the estimaded variance-covariance matrix
# using means, standard deviation and covariance matrix.
# So, needs the covariance matrix.
# NOTE(review): adds a :constant entry to the @x_mean and @x_sd
# hashes as a side effect, so the x_mean / x_sd accessors return a
# polluted hash afterwards -- confirm whether this is intended.
def constant_se
return nil if @no_covariance
means=@x_mean
#means[@y_var]=@y_mean
means[:constant]=1
sd=@x_sd
#sd[@y_var]=@y_sd
sd[:constant]=0
fields=[:constant]+@matrix_cov.fields-[@y_var]
# Recreate X'X using the variance-covariance matrix
xt_x=Matrix.rows(fields.collect {|i|
fields.collect {|j|
if i==:constant or j==:constant
cov=0
elsif i==j
cov=sd[i]**2
else
cov=@matrix_cov.submatrix(i..i,j..j)[0,0]
end
cov*(@cases-1)+@cases*means[i]*means[j]
}
})
matrix=xt_x.inverse * mse
matrix.collect {|i| Math::sqrt(i) if i>0 }[0,0]
end
end
end
end
end
================================================
FILE: lib/statsample/regression/multiple/rubyengine.rb
================================================
module Statsample
module Regression
module Multiple
# Pure Ruby Class for Multiple Regression Analysis.
# Slower than AlglibEngine, but is pure ruby and can use a pairwise aproach for missing values.
# Coeffient calculation uses correlation matrix between the vectors
# If you need listwise aproach for missing values, use AlglibEngine, because is faster.
#
# Example:
#
# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
class RubyEngine < MatrixEngine
# Builds a MatrixEngine from the correlation matrix of +ds+,
# supplying means, standard deviations and the pairwise minimum n.
def initialize(ds,y_var, opts=Hash.new)
matrix=ds.correlation_matrix
fields_indep=ds.fields-[y_var]
default={
:y_mean=>ds[y_var].mean,
:x_mean=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].mean; ac},
:y_sd=>ds[y_var].sd,
:x_sd=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].sd; ac},
:cases=>Statsample::Bivariate.min_n_valid(ds)
}
opts=opts.merge(default)
super(matrix, y_var, opts)
@ds=ds
@dy=ds[@y_var]
@ds_valid=ds.dup_only_valid
@total_cases=@ds.cases
@valid_cases=@ds_valid.cases
@ds_indep = ds.dup(ds.fields-[y_var])
set_dep_columns
end
# Rebuilds @dep_columns (arrays with nils) from the independent
# dataset; called after any imputation.
def set_dep_columns
@dep_columns=[]
@ds_indep.each_vector{|k,v|
@dep_columns.push(v.data_with_nils)
}
end
# Imputation: rows with exactly one missing predictor get that
# value replaced by the predictor's mean.
def fix_with_mean
i=0
@ds_indep.each do |row|
empty=[]
row.each do |k,v|
empty.push(k) if v.nil?
end
if empty.size==1
@ds_indep[empty[0]][i]=@ds[empty[0]].mean
end
i+=1
end
@ds_indep.update_valid_data
set_dep_columns
end
# Imputation: rows with exactly one missing predictor get that
# value predicted from a regression on the other predictors.
# NOTE(review): MultipleRegression is not defined in this file;
# this looks like a stale reference to a renamed class (probably
# Statsample::Regression.multiple) -- verify before relying on it.
def fix_with_regression
i=0
@ds_indep.each{|row|
empty=[]
row.each{|k,v|
empty.push(k) if v.nil?
}
if empty.size==1
field=empty[0]
lr=MultipleRegression.new(@ds_indep,field)
fields=[]
@ds_indep.fields.each{|f|
fields.push(row[f]) unless f==field
}
@ds_indep[field][i]=lr.process(fields)
end
i+=1
}
@ds_indep.update_valid_data
set_dep_columns
end
# Standard error for constant
def constant_se
estimated_variance_covariance_matrix[0,0]
end
end
end
end
end
================================================
FILE: lib/statsample/regression/multiple.rb
================================================
require 'statsample/regression/multiple/baseengine'
module Statsample
module Regression
# Module for OLS Multiple Regression Analysis.
#
# Use:.
#
# require 'statsample'
# a=1000.times.collect {rand}.to_scale
# b=1000.times.collect {rand}.to_scale
# c=1000.times.collect {rand}.to_scale
# ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
# ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()}
# lr=Statsample::Regression.multiple(ds,'y')
# puts lr.summary
# Summary for regression of a,b,c over y
# *************************************************************
# Engine: Statsample::Regression::Multiple::AlglibEngine
# Cases(listwise)=1000(1000)
# r=0.986
# r2=0.973
# Equation=0.504+5.011a + 2.995b + 1.988c
# ----------------------------
# ANOVA TABLE
# --------------------------------------------------------------
# | source | ss | df | ms | f | s |
# --------------------------------------------------------------
# | Regression | 2979.321 | 3 | 993.107 | 12040.067 | 0.000 |
# | Error | 82.154 | 996 | 0.082 | | |
# | Total | 3061.475 | 999 | | | |
# --------------------------------------------------------------
# Beta coefficientes
# -----------------------------------------------
# | coeff | b | beta | se | t |
# -----------------------------------------------
# | Constant | 0.504 | - | 0.030 | 16.968 |
# | a | 5.011 | 0.832 | 0.031 | 159.486 |
# | b | 2.995 | 0.492 | 0.032 | 94.367 |
# | c | 1.988 | 0.323 | 0.032 | 62.132 |
# -----------------------------------------------
#
module Multiple
# Squared multiple correlation (R^2) computed from the predictor
# correlation matrix +rxx+ and the predictor-criterion column
# matrix +rxy+, as rxy' * rxx^-1 * rxy.
def self.r2_from_matrices(rxx, rxy)
  (rxy.t * rxx.inverse * rxy)[0, 0]
end
# Regression with multiple dependent variables, built from a
# covariance/correlation matrix. +y_var+ is an Array of dependent
# variable names.
class MultipleDependent
def significance
0.0
end
def initialize(matrix,y_var, opts=Hash.new)
matrix.extend Statsample::CovariateMatrix
@matrix=matrix
# y_var is an Array, so this removes all dependent fields.
@fields=matrix.fields-y_var
@y_var=y_var
# q: number of dependent variables.
@q=@y_var.size
@matrix_cor=matrix.correlation
@matrix_cor_xx = @matrix_cor.submatrix(@fields)
@matrix_cor_yy = @matrix_cor.submatrix(y_var, y_var)
@sxx = @matrix.submatrix(@fields)
@syy = @matrix.submatrix(y_var, y_var)
@sxy = @matrix.submatrix(@fields, y_var)
@syx = @sxy.t
end
# Squared multiple correlation from determinants of the
# correlation submatrices.
def r2yx
1- (@matrix_cor.determinant.quo(@matrix_cor_yy.determinant * @matrix_cor_xx.determinant))
end
# Residual covariance of Y after accountin with lineal relation with x
def syyx
@syy-@syx*@sxx.inverse*@sxy
end
# R^2 based on generalized covariance ratio.
def r2yx_covariance
1-(syyx.determinant.quo(@syy.determinant))
end
# Explained generalized variance of Y given X.
def vxy
@q-(@syy.inverse*syyx).trace
end
# Proportion of generalized variance of Y explained by X.
def p2yx
vxy.quo(@q)
end
end
end
end
end
================================================
FILE: lib/statsample/regression/simple.rb
================================================
module Statsample
module Regression
# Class for calculation of linear regressions with form
# y = a+bx
# To create a Statsample::Regression::Simple object:
# * Statsample::Regression::Simple.new_from_dataset(ds,x,y)
# * Statsample::Regression::Simple.new_from_vectors(vx,vy)
# * Statsample::Regression::Simple.new_from_gsl(gsl)
#
class Simple
include Summarizable
# Regression coefficients and GSL fit values.
attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
attr_accessor :name
attr_accessor :digits
# Private: instances are created through the new_from_* factories,
# which dispatch to an init_* method.
def initialize(init_method, *argv)
self.send(init_method, *argv)
end
private_class_method :new
# Obtain y value given x value
# y=a+b*x
def y(val_x)
@a+@b*val_x
end
# Obtain x value given y value
# x=(y-a)/b
def x(val_y)
(val_y-@a) / @b.to_f
end
# Sum of square error
def sse
(0...@vx.size).inject(0) {|acum,i| acum+((@vy[i]-y(@vx[i]))**2)
}
end
# Standard error of the estimate (n-2 degrees of freedom).
def standard_error
Math::sqrt(sse / (@vx.size-2).to_f)
end
# Sum of square regression
def ssr
vy_mean=@vy.mean
(0...@vx.size).inject(0) {|a,i|
a+((y(@vx[i])-vy_mean)**2)
}
end
# Sum of square total
def sst
@vy.sum_of_squared_deviation
end
# Value of r
def r
@b * (@vx.sds / @vy.sds)
end
# Value of r^2
def r2
r**2
end
class << self
# Create a regression object giving an array with following parameters:
# a,b,cov00, cov01, covx1, chisq, status
# Useful to obtain x and y values with a and b values.
def new_from_gsl(ar)
new(:init_gsl, *ar)
end
# Create a simple regression using two vectors
def new_from_vectors(vx,vy, opts=Hash.new)
new(:init_vectors,vx,vy, opts)
end
# Create a simple regression using a dataset and two vector names.
def new_from_dataset(ds,x,y, opts=Hash.new)
new(:init_vectors,ds[x],ds[y], opts)
end
end
# Computes a and b by ordinary least squares over the valid
# (pairwise complete) cases of vx and vy.
def init_vectors(vx,vy, opts=Hash.new)
@vx,@vy=Statsample.only_valid_clone(vx,vy)
x_m=@vx.mean
y_m=@vy.mean
num=den=0
(0...@vx.size).each {|i|
num+=(@vx[i]-x_m)*(@vy[i]-y_m)
den+=(@vx[i]-x_m)**2
}
@b=num.to_f/den
@a=y_m - @b*x_m
opts_default={
:digits=>3,
:name=>_("Regression of %s over %s") % [@vx.name, @vy.name]
}
@opts=opts_default.merge opts
@opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
end
# Stores fit values produced by a GSL linear fit.
def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
@a=a
@b=b
@cov00=cov00
@cov01=cov01
@covx1=covx1
@chisq=chisq
@status=status
end
# Builds the summary table: r, r^2, a, b and standard error.
def report_building(gen)
f="%0.#{digits}f"
gen.section(:name=>name) do |s|
s.table(:header=>[_("Variable"), _("Value")]) do |t|
t.row [_("r"), f % r]
t.row [_("r^2"), f % r2]
t.row [_("a"), f % a]
t.row [_("b"), f % b]
t.row [_("s.e"), f % standard_error]
end
end
end
private :init_vectors, :init_gsl
end
end
end
================================================
FILE: lib/statsample/regression.rb
================================================
require 'statsample/regression/simple'
require 'statsample/regression/multiple'
require 'statsample/regression/multiple/matrixengine'
require 'statsample/regression/multiple/rubyengine'
require 'statsample/regression/multiple/gslengine'
module Statsample
# = Module for regression procedures.
# Use the method on this class to generate
# analysis.
# If you need more control, you can
# create and control directly the objects who computes
# the regressions.
#
# * Simple Regression : Statsample::Regression::Simple
# * Multiple Regression: Statsample::Regression::Multiple
# * Logit Regression: Statsample::Regression::Binomial::Logit
# * Probit Regression: Statsample::Regression::Binomial::Probit
module Regression
# Raised when regressors are linearly dependent.
LinearDependency=Class.new(Exception)
# Create a Statsample::Regression::Simple object, for simple regression
# * x: independent Vector
# * y: dependent Vector
# Usage:
# x=100.times.collect {|i| rand(100)}.to_scale
# y=100.times.collect {|i| 2+x[i]*2+rand()}.to_scale
# sr=Statsample::Regression.simple(x,y)
# sr.a
# => 2.51763295177808
# sr.b
# => 1.99973746599856
# sr.r
# => 0.999987881153254
def self.simple(x,y)
Statsample::Regression::Simple.new_from_vectors(x,y)
end
# Creates one of the Statsample::Regression::Multiple object,
# for OLS multiple regression.
# Parameters:
# * ds: Dataset.
# * y: Name of dependent variable.
# * opts: A hash with options
# * missing_data: Could be
# * :listwise: delete cases with one or more empty data (default).
# * :pairwise: uses correlation matrix. Use with caution.
#
# Usage:
# lr=Statsample::Regression::multiple(ds,'y')
def self.multiple(ds,y_var, opts=Hash.new)
missing_data= (opts[:missing_data].nil? ) ? :listwise : opts.delete(:missing_data)
if missing_data==:pairwise
Statsample::Regression::Multiple::RubyEngine.new(ds,y_var, opts)
else
# NOTE(review): "and false" unconditionally disables the GSL
# branch, so RubyEngine is always used for listwise too.
# Looks like a debugging leftover -- confirm before re-enabling
# GslEngine here.
if Statsample.has_gsl? and false
Statsample::Regression::Multiple::GslEngine.new(ds, y_var, opts)
else
ds2=ds.dup_only_valid
Statsample::Regression::Multiple::RubyEngine.new(ds2,y_var, opts)
end
end
end
end
end
================================================
FILE: lib/statsample/reliability/icc.rb
================================================
module Statsample
module Reliability
# = Intra-class correlation
# According to Shrout & Fleiss (1979, p.422): "ICC is the correlation
# between one measurement (either a single rating or a mean of
# several ratings) on a target and another measurement obtained on that target"
# == Usage
# require 'statsample'
# size=1000
# a = size.times.map {rand(10)}.to_scale
# b = a.recode{|i|i+rand(4)-2}
# c =a.recode{|i|i+rand(4)-2}
# d = a.recode{|i|i+rand(4)-2}
# ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
# # Use :type attribute to set type to summarize
# icc=Statsample::Reliability::ICC.new(ds, :type=>:icc_1_k)
# puts icc.summary
#
# == Reference
# * Shrout,P. & Fleiss, J. (1979). Intraclass Correlation: Uses in assessing rater reliability. Psychological Bulletin, 86(2), 420-428
# * McGraw, K. & Wong, S.P. (1996). Forming Inferences About Some Intraclass Correlation Coefficients. Psychological methods, 1(1), 30-46.
class ICC
include Summarizable
# The analysis uses only complete data (listwise deletion); every
# vector of the dataset is one measurement (rater/judge).
#
# Degrees of freedom: between targets, within targets, between
# judges and residual.
attr_reader :df_bt, :df_wt, :df_bj, :df_residual
# Mean squares: between targets, within targets, between judges
# and residual.
attr_reader :ms_bt, :ms_wt, :ms_bj, :ms_residual
# Shrout & Fleiss (1979) names for the mean squares.
alias :bms :ms_bt
alias :wms :ms_wt
alias :jms :ms_bj
alias :ems :ms_residual
# McGraw & Wong (1996) names for the mean squares.
alias :msr :ms_bt
alias :msw :ms_wt
alias :msc :ms_bj
alias :mse :ms_residual
# :section: Shrout and Fleiss ICC denominations
attr_reader :icc_1_1, :icc_2_1, :icc_3_1
attr_reader :icc_1_k, :icc_2_k, :icc_3_k
# :section: McGraw and Wong ICC denominations
# NOTE(review): the @icc_1, @icc_c_1, @icc_a_1, @icc_k, @icc_c_k and
# @icc_a_k instance variables do not appear to be assigned anywhere
# in this class, so these readers seem to always return nil; the
# equivalent values are exposed through +r+ after setting +type+.
attr_reader :icc_1, :icc_c_1, :icc_a_1
attr_reader :icc_k, :icc_c_k, :icc_a_k
# Number of targets (cases) and number of judges (vectors).
attr_reader :n, :k
# Grand mean of all measurements.
attr_reader :total_mean
# Type of analysis, for easy summarization.
# By default, set to :icc_1
# * Shrout & Fleiss(1979) denominations:
#   :icc_1_1, :icc_2_1, :icc_3_1, :icc_1_k, :icc_2_k, :icc_3_k
# * McGraw & Wong (1996) denominations:
#   :icc_1, :icc_k, :icc_c_1, :icc_c_k, :icc_a_1, :icc_a_k
attr_reader :type
# ICC value, set with :type
attr_reader :r
# F test associated with the selected ICC type.
attr_reader :f
# Confidence-interval bounds for the selected ICC type.
attr_reader :lbound, :ubound
# Null-hypothesis value of rho used by the F tests (default 0).
attr_accessor :g_rho
# Alpha level for the confidence intervals (default 0.05).
attr_accessor :alpha
attr_accessor :name
# Creates the ICC analysis over dataset +ds+ (listwise deletion).
# opts may include :name and :type (see type=); :type must be
# applied after compute, because it reads the computed ICCs.
def initialize(ds, opts=Hash.new)
@ds=ds.dup_only_valid
@vectors=@ds.vectors.values
# n: number of targets; k: number of judges (vectors).
@n=@ds.cases
@k=@ds.fields.size
compute
@g_rho=0
@alpha=0.05
@icc_name=nil
opts_default={:name=>"Intra-class correlation", :type=>:icc_1}
@opts=opts_default.merge(opts)
@opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k }
end
# Selects the ICC denomination to summarize: sets @r, @f and the
# confidence bounds from the already-computed ICC values.
# Raises on unknown types.
def type=(v)
case v
when :icc_1_1
@icc_name=_("Shrout & Fleiss ICC(1,1)")
@r=@icc_1_1
@f=icc_1_f
@lbound, @ubound=icc_1_1_ci(@alpha)
when :icc_2_1
@icc_name=_("Shrout & Fleiss ICC(2,1)")
@r=@icc_2_1
@f=icc_2_f
@lbound, @ubound=icc_2_1_ci(@alpha)
when :icc_3_1
@icc_name=_("Shrout & Fleiss ICC(3,1)")
@r=@icc_3_1
@f=icc_3_f
@lbound, @ubound=icc_3_1_ci(@alpha)
when :icc_1_k
@icc_name=_("Shrout & Fleiss ICC(1,k)")
@r=@icc_1_k
@f=icc_1_k_f
@lbound, @ubound=icc_1_k_ci(@alpha)
when :icc_2_k
@icc_name=_("Shrout & Fleiss ICC(2,k)")
@r=@icc_2_k
@f=icc_2_k_f
@lbound, @ubound=icc_2_k_ci(@alpha)
when :icc_3_k
@icc_name=_("Shrout & Fleiss ICC(3,k)")
@r=@icc_3_k
@f=icc_3_k_f
@lbound, @ubound=icc_3_k_ci(@alpha)
# McGraw & Wong types reuse the Shrout & Fleiss point estimates,
# with F tests parameterized by the null value @g_rho.
when :icc_1
@icc_name=_("McGraw & Wong ICC(1)")
@r=@icc_1_1
@f=icc_1_f(@g_rho)
@lbound, @ubound=icc_1_1_ci(@alpha)
when :icc_k
@icc_name=_("McGraw & Wong ICC(K)")
@r=@icc_1_k
@f=icc_1_k_f(@g_rho)
@lbound, @ubound=icc_1_k_ci(@alpha)
when :icc_c_1
@icc_name=_("McGraw & Wong ICC(C,1)")
@r=@icc_3_1
@f=icc_c_1_f(@g_rho)
@lbound, @ubound=icc_3_1_ci(@alpha)
when :icc_c_k
@icc_name=_("McGraw & Wong ICC(C,K)")
@r=@icc_3_k
@f=icc_c_k_f(@g_rho)
@lbound, @ubound=icc_c_k_ci(@alpha)
when :icc_a_1
@icc_name=_("McGraw & Wong ICC(A,1)")
@r=@icc_2_1
@f=icc_a_1_f(@g_rho)
@lbound,@ubound = icc_2_1_ci(@alpha)
when :icc_a_k
@icc_name=_("McGraw & Wong ICC(A,K)")
@r=@icc_2_k
@f=icc_a_k_f(@g_rho)
@lbound,@ubound=icc_2_k_ci(@alpha)
else
raise "Type #{v} doesn't exists"
end
end
# Computes the two-way ANOVA decomposition (targets x judges) and
# the six Shrout & Fleiss ICC point estimates.
def compute
@df_bt=n-1
@df_wt=n*(k-1)
@df_bj=k-1
@df_residual=(n-1)*(k-1)
# Grand mean over all n*k measurements.
@total_mean=@vectors.inject(0){|ac,v| ac+v.sum}.quo(n*k)
vm=@ds.vector_mean
# Between-targets sum of squares / mean square.
@ss_bt=k*vm.ss(@total_mean)
@ms_bt=@ss_bt.quo(@df_bt)
# Between-judges sum of squares / mean square.
@ss_bj=n*@vectors.inject(0){|ac,v| ac+(v.mean-@total_mean).square}
@ms_bj=@ss_bj.quo(@df_bj)
# Within-targets sum of squares / mean square.
@ss_wt=@vectors.inject(0){|ac,v| ac+(v-vm).ss(0)}
@ms_wt=@ss_wt.quo(@df_wt)
# Residual: within-targets minus between-judges.
@ss_residual=@ss_wt-@ss_bj
@ms_residual=@ss_residual.quo(@df_residual)
###
# Shrout and Fleiss denomination
###
# ICC(1,1) / ICC(1)
@icc_1_1=(bms-wms).quo(bms+(k-1)*wms)
# ICC(2,1) / ICC(A,1)
@icc_2_1=(bms-ems).quo(bms+(k-1)*ems+k*(jms - ems).quo(n))
# ICC(3,1) / ICC(C,1)
@icc_3_1=(bms-ems).quo(bms+(k-1)*ems)
# ICC(1,K) / ICC(K)
@icc_1_k=(bms-wms).quo(bms)
# ICC(2,K) / ICC(A,k)
@icc_2_k=(bms-ems).quo(bms+(jms-ems).quo(n))
# ICC(3,K) / ICC(C,k) = Cronbach's alpha
@icc_3_k=(bms-ems).quo(bms)
###
# McGraw and Wong
###
end
# One-way random F test (single measure) against H0: rho=+rho+.
# F = MSr(1-rho) / [MSw(1+(k-1)rho)], df = (n-1, n(k-1)).
def icc_1_f(rho=0.0)
num=msr*(1-rho)
den=msw*(1+(k-1)*rho)
Statsample::Test::F.new(num, den, @df_bt, @df_wt)
end
# One way random F, type k, against H0: rho=+rho+.
# F = MSr(1-rho) / MSw, df = (n-1, n(k-1)).
def icc_1_k_f(rho=0)
num=msr*(1-rho)
den=msw
Statsample::Test::F.new(num, den, @df_bt, @df_wt)
end
# F test for ICC(C,1) against H0: rho=+rho+.
# F = MSr(1-rho) / [MSe(1+(k-1)rho)], df = (n-1, (n-1)(k-1)).
def icc_c_1_f(rho=0)
num=msr*(1-rho)
den=mse*(1+(k-1)*rho)
Statsample::Test::F.new(num, den, @df_bt, @df_residual)
end
# F test for ICC(C,k) against H0: rho=+rho+.
# Since 1-icc_3_k equals EMS/BMS, the ratio (1-rho)/(1-icc_3_k)
# equals (BMS/EMS)*(1-rho); df = (n-1, (n-1)(k-1)).
def icc_c_k_f(rho=0)
num=(1-rho)
den=1-@icc_3_k
Statsample::Test::F.new(num, den, @df_bt, @df_residual)
end
# Satterthwaite degrees of freedom used by the ICC(A) F tests
# (McGraw & Wong, 1996):
#   v = (a*MSc + b*MSe)**2 /
#       [ (a*MSc)**2/(k-1) + (b*MSe)**2/((n-1)(k-1)) ]
# The squares in the denominator are parenthesized explicitly: the
# previous form "(a*msc)**2.quo(k-1)" parsed as
# "(a*msc)**(2.quo(k-1))" because the method call on the literal
# binds tighter than **.
def v(a,b)
((a*msc+b*mse)**2).quo( ((a*msc)**2).quo(k-1) + ((b*mse)**2).quo((n-1)*(k-1)) )
end
def a(rho)
(k*rho).quo(n*(1-rho))
end
# Coefficient b(rho) for the ICC(A,1) F* denominator (McGraw & Wong).
def b(rho)
  extra = (k * rho * (n - 1)).quo(n * (1 - rho))
  1 + extra
end
# Coefficient c(rho) for the ICC(A,k) F* denominator (McGraw & Wong).
def c(rho)
  denominator = n * (1 - rho)
  rho.quo(denominator)
end
# Coefficient d(rho) for the ICC(A,k) F* denominator (McGraw & Wong).
def d(rho)
  extra = (rho * (n - 1)).quo(n * (1 - rho))
  1 + extra
end
private :v, :a, :b, :c, :d
# F* test for McGraw & Wong ICC(A,1) against the null rho == +rho+.
# Denominator degrees of freedom follow the Satterthwaite-type
# approximation with the sample estimate pp = @icc_2_1 plugged in.
def icc_a_1_f(rho=0)
# Ratio of between-judges to residual mean squares.
fj=jms.quo(ems)
num=msr
den=a(rho)*msc+b(rho)*mse
pp = @icc_2_1
# Approximate denominator df (vn/vd).
vn=(k-1)*(n-1)*((k*pp*fj+n*(1+(k-1)*pp)-k*pp)**2)
vd=(n-1)*(k**2)*(pp**2)*(fj**2)+((n*(1+(k-1)*pp)-k*pp)**2)
v=vn.quo(vd)
Statsample::Test::F.new(num, den, @df_bt, v)
end
# F* test for McGraw & Wong ICC(A,k) against the null rho == +rho+.
# Same Satterthwaite-style df as icc_a_1_f, but with the average-rating
# estimate pp = @icc_2_k and the c/d denominator coefficients.
def icc_a_k_f(rho=0)
num=msr
den=c(rho)*msc+d(rho)*mse
# Ratio of between-judges to residual mean squares.
fj=jms.quo(ems)
pp = @icc_2_k
# Approximate denominator df (vn/vd).
vn=(k-1)*(n-1)*((k*pp*fj+n*(1+(k-1)*pp)-k*pp)**2)
vd=(n-1)*(k**2)*(pp**2)*(fj**2)+((n*(1+(k-1)*pp)-k*pp)**2)
v=vn.quo(vd)
Statsample::Test::F.new(num, den, @df_bt,v)
end
# F test for ICC Case 1 (Shrout & Fleiss): between-targets mean
# square over within-targets mean square.
def icc_1_f_shrout
  between = bms
  within = wms
  Statsample::Test::F.new(between, within, @df_bt, @df_wt)
end
# Interval of confidence for ICC (1,1), Shrout & Fleiss.
def icc_1_1_ci(alpha=0.05)
# Two-sided coverage: each tail gets alpha/2.
per=1-(0.5*alpha)
# Upper and lower F limits around the observed one-way F.
fu=icc_1_f.f*Distribution::F.p_value(per, @df_wt, @df_bt)
fl=icc_1_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_wt))
# Map F limits back to the ICC(1,1) scale.
[(fl-1).quo(fl+k-1), (fu-1).quo(fu+k-1)]
end
# Interval of confidence for ICC (1,k).
# Uses the same F limits as icc_1_1_ci (the rho=0 one-way F), mapped
# through the average-rating transform 1 - 1/F, as in Shrout & Fleiss.
def icc_1_k_ci(alpha=0.05)
per=1-(0.5*alpha)
fu=icc_1_f.f*Distribution::F.p_value(per, @df_wt, @df_bt)
fl=icc_1_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_wt))
[1-1.quo(fl), 1-1.quo(fu)]
end
# F test for ICC Case 2: between-targets mean square over residual
# mean square.
def icc_2_f
  between = bms
  residual = ems
  Statsample::Test::F.new(between, residual, @df_bt, @df_residual)
end
#
# F* for ICC(2,1) and ICC(2,k)
#
# Returns the pair of F quantiles [f1, f2] used as lower/upper limits
# for the McGraw & Wong ICC(A,*) confidence intervals. +pp+ is the
# sample ICC plugged into the Satterthwaite-style df approximation v.
def icc_2_1_fs(pp,alpha=0.05)
fj=jms.quo(ems)
per=1-(0.5*alpha)
# Approximate denominator df (vn/vd).
vn=(k-1)*(n-1)*((k*pp*fj+n*(1+(k-1)*pp)-k*pp)**2)
vd=(n-1)*(k**2)*(pp**2)*(fj**2)+((n*(1+(k-1)*pp)-k*pp)**2)
v=vn.quo(vd)
f1=Distribution::F.p_value(per, n-1,v)
f2=Distribution::F.p_value(per, v, n-1)
[f1,f2]
end
# Confidence interval for ICC(A,1) at significance +alpha+, using the
# McGraw & Wong formulation.
# Fix: the original discarded +alpha+ (calling icc_2_1_ci_mcgraw with
# no argument, so the 0.05 default was always used); it is now
# forwarded, matching the parallel icc_2_k_ci.
def icc_2_1_ci(alpha=0.05)
  icc_2_1_ci_mcgraw(alpha)
end
# Confidence interval for ICC(A,1), McGraw & Wong formulation.
def icc_2_1_ci_mcgraw(alpha=0.05)
# F* limits at the requested coverage; icc_2_1 is the sample estimate
# (reader presumably defined outside this view — TODO confirm).
fd,fu=icc_2_1_fs(icc_2_1,alpha)
cl=(n*(msr-fd*mse)).quo(fd*(k*msc+(k*n-k-n)*mse)+n*msr)
cu=(n*(fu*msr-mse)).quo(k*msc+(k*n-k-n)*mse+n*fu*msr)
[cl,cu]
end
# Confidence interval for ICC(A,k) at significance +alpha+,
# delegating to the McGraw & Wong formulation.
def icc_2_k_ci(alpha=0.05)
icc_2_k_ci_mcgraw(alpha)
end
# Confidence interval for ICC(A,k), McGraw & Wong formulation.
# Uses the same F* limits as ICC(A,1) but with the average-rating
# estimate icc_2_k plugged into the df approximation.
def icc_2_k_ci_mcgraw(alpha=0.05)
f1,f2=icc_2_1_fs(icc_2_k,alpha)
[
(n*(msr-f1*mse)).quo(f1*(msc-mse)+n*msr),
(n*(f2*msr-mse)).quo(msc-mse+n*f2*msr)
]
end
# Confidence interval for ICC(2,k), obtained by stepping up the
# ICC(2,1) interval with the Spearman-Brown transform
# (Shrout & Fleiss).
def icc_2_k_ci_shrout(alpha=0.05)
  single_ci = icc_2_1_ci(alpha)
  single_ci.map { |r| (r * k).quo(1 + (k - 1) * r) }
end
# F test for ICC Case 3: between-targets mean square over residual
# mean square (same statistic as icc_2_f).
def icc_3_f
  between = bms
  residual = ems
  Statsample::Test::F.new(between, residual, @df_bt, @df_residual)
end
# Confidence interval for ICC(3,1) / ICC(C,1).
def icc_3_1_ci(alpha=0.05)
# Two-sided coverage: each tail gets alpha/2.
per=1-(0.5*alpha)
# Lower and upper F limits around the observed case-3 F.
fl=icc_3_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_residual))
fu=icc_3_f.f*Distribution::F.p_value(per, @df_residual, @df_bt)
# Map F limits back to the ICC scale.
[(fl-1).quo(fl+k-1), (fu-1).quo(fu+k-1)]
end
# Confidence interval for ICC(3,k), using the case-3 F limits mapped
# through the average-rating transform 1 - 1/F.
def icc_3_k_ci(alpha=0.05)
per=1-(0.5*alpha)
fl=icc_3_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_residual))
fu=icc_3_f.f*Distribution::F.p_value(per, @df_residual, @df_bt)
[1-1.quo(fl),1-1.quo(fu)]
end
# Confidence interval for McGraw & Wong ICC(C,k).
# icc_c_k_f is called with its default rho=0, so the ratio is the
# observed case-3 F (see icc_c_k_f).
def icc_c_k_ci(alpha=0.05)
per=1-(0.5*alpha)
fl=icc_c_k_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_residual))
fu=icc_c_k_f.f*Distribution::F.p_value(per, @df_residual, @df_bt)
[1-1.quo(fl),1-1.quo(fu)]
end
# Builds the report section: ICC denomination, point estimate,
# F test and confidence interval (all precomputed on initialization).
def report_building(b)
b.section(:name=>name) do |s|
s.text @icc_name
s.text _("ICC: %0.4f") % @r
s.parse_element(@f)
s.text _("CI (%0.2f): [%0.4f - %0.4f]") % [(1-@alpha)*100, @lbound, @ubound]
end
end
end
end
end
================================================
FILE: lib/statsample/reliability/multiscaleanalysis.rb
================================================
module Statsample
module Reliability
# DSL for analysis of multiple scales analysis.
# Retrieves reliability analysis for each scale and
# provides fast accessors to correlations matrix,
# PCA and Factor Analysis.
#
# == Usage
# @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale)
# @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale)
# @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale)
# @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale)
# ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
# opts={:name=>"Scales", # Name of analysis
# :summary_correlation_matrix=>true, # Add correlation matrix
# :summary_pca=>true} # Add PCA between scales
# msa=Statsample::Reliability::MultiScaleAnalysis.new(opts) do |m|
# m.scale :s1, ds.clone(%w{x1 x2})
# m.scale :s2, ds.clone(%w{x3 x4}), {:name=>"Scale 2"}
# end
# # Retrieve summary
# puts msa.summary
class MultiScaleAnalysis
include Statsample::Summarizable
# Hash with scales
attr_reader :scales
# Name of analysis
attr_accessor :name
# Add a correlation matrix on summary
attr_accessor :summary_correlation_matrix
# Add PCA to summary
attr_accessor :summary_pca
# Add Principal Axis to summary
attr_accessor :summary_principal_axis
# Options for Factor::PCA object
attr_accessor :pca_options
# Options for Factor::PrincipalAxis
attr_accessor :principal_axis_options
# Add Parallel Analysis to summary
attr_accessor :summary_parallel_analysis
# Options for Parallel Analysis
attr_accessor :parallel_analysis_options
# Add MAP to summary
attr_accessor :summary_map
# Options for MAP
attr_accessor :map_options
# Generates a new MultiScaleAnalysis
# Opts could be any accessor of the class
# * :name,
# * :summary_correlation_matrix
# * :summary_pca
# * :summary_principal_axis
# * :summary_map
# * :pca_options
# * :principal_axis_options
# * :map_options
# If block given, all methods should be called
# inside object environment.
#
def initialize(opts=Hash.new, &block)
@scales=Hash.new
# Keeps insertion order of scale codes (used as field order
# for the dataset built from the scales).
@scales_keys=Array.new
opts_default={ :name=>_("Multiple Scale analysis"),
:summary_correlation_matrix=>false,
:summary_pca=>false,
:summary_principal_axis=>false,
:summary_parallel_analysis=>false,
:summary_map=>false,
:pca_options=>Hash.new,
:principal_axis_options=>Hash.new,
:parallel_analysis_options=>Hash.new,
:map_options=>Hash.new
}
@opts=opts_default.merge(opts)
# Copy each recognized option into its accessor.
@opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
# Zero-arity block: run in instance context (DSL style);
# otherwise yield self.
if block
block.arity<1 ? instance_eval(&block) : block.call(self)
end
end
# Add or retrieve a scale to analysis.
# If second parameters is a dataset, generates a ScaleAnalysis
# for ds, named code with options opts.
#
# If second parameters is empty, returns the ScaleAnalysis
# code.
def scale(code, ds=nil, opts=nil)
if ds.nil?
@scales[code]
else
opts={:name=>_("Scale %s") % code} if opts.nil?
@scales_keys.push(code)
@scales[code]=ScaleAnalysis.new(ds, opts)
end
end
# Delete ScaleAnalysis named code
def delete_scale(code)
@scales_keys.delete code
@scales.delete code
end
# Retrieves a Principal Component Analysis (Factor::PCA)
# using all scales, using opts a options.
def pca(opts=nil)
opts||=pca_options
Statsample::Factor::PCA.new(correlation_matrix, opts)
end
# Retrieve Velicer's MAP
# using all scales.
def map(opts=nil)
opts||=map_options
Statsample::Factor::MAP.new(correlation_matrix, opts)
end
# Retrieves a PrincipalAxis Analysis (Factor::PrincipalAxis)
# using all scales, using opts a options.
def principal_axis_analysis(opts=nil)
opts||=principal_axis_options
Statsample::Factor::PrincipalAxis.new(correlation_matrix, opts)
end
# Builds a dataset with one vector per scale: the sum score of the
# items in that scale, named after the scale.
def dataset_from_scales
ds=Dataset.new(@scales_keys)
@scales.each_pair do |code,scale|
ds[code.to_s]=scale.ds.vector_sum
ds[code.to_s].name=scale.name
end
ds.update_valid_data
ds
end
# Retrieves a Parallel Analysis (Factor::ParallelAnalysis)
# over the scale-sum dataset, using opts as options.
def parallel_analysis(opts=nil)
opts||=parallel_analysis_options
Statsample::Factor::ParallelAnalysis.new(dataset_from_scales, opts)
end
# Retrieves a Correlation Matrix between scales.
#
def correlation_matrix
Statsample::Bivariate.correlation_matrix(dataset_from_scales)
end
# Builds the report: per-scale reliability analyses, then the
# optional summaries enabled through the summary_* accessors.
def report_building(b) # :nodoc:
b.section(:name=>name) do |s|
s.section(:name=>_("Reliability analysis of scales")) do |s2|
@scales.each_pair do |k, scale|
s2.parse_element(scale)
end
end
if summary_correlation_matrix
s.section(:name=>_("Correlation matrix for %s") % name) do |s2|
s2.parse_element(correlation_matrix)
end
end
if summary_pca
s.section(:name=>_("PCA for %s") % name) do |s2|
s2.parse_element(pca)
end
end
if summary_principal_axis
s.section(:name=>_("Principal Axis for %s") % name) do |s2|
s2.parse_element(principal_axis_analysis)
end
end
if summary_parallel_analysis
s.section(:name=>_("Parallel Analysis for %s") % name) do |s2|
s2.parse_element(parallel_analysis)
end
end
if summary_map
s.section(:name=>_("MAP for %s") % name) do |s2|
s2.parse_element(map)
end
end
end
end
end
end
end
================================================
FILE: lib/statsample/reliability/scaleanalysis.rb
================================================
module Statsample
module Reliability
# Analysis of a Scale. Analog of the Scale Reliability analysis in SPSS.
# Returns several statistics for complete scale and each item
# == Usage
# @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale)
# @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale)
# @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale)
# @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale)
# ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
# ia=Statsample::Reliability::ScaleAnalysis.new(ds)
# puts ia.summary
class ScaleAnalysis
include Summarizable
attr_reader :ds,:mean, :sd,:valid_n, :alpha , :alpha_standarized, :variances_mean, :covariances_mean, :cov_m
attr_accessor :name
attr_accessor :summary_histogram
# +ds+:: dataset with one vector per item.
# Items with zero variance are dropped from the analysis (kept in
# @dumped and reported separately); only complete cases are used.
def initialize(ds, opts=Hash.new)
# Items with no variance cannot contribute to the scale.
@dumped=ds.fields.find_all {|f|
ds[f].variance==0
}
@ods=ds
@ds=ds.dup_only_valid(ds.fields - @dumped)
@ds.name=ds.name
@k=@ds.fields.size
# Sum score and per-case mean over retained items.
@total=@ds.vector_sum
@o_total=@dumped.size > 0 ? @ods.vector_sum : nil
@vector_mean=@ds.vector_mean
@item_mean=@vector_mean.mean
@item_sd=@vector_mean.sd
# Descriptives of the sum score.
@mean=@total.mean
@median=@total.median
@skew=@total.skew
@kurtosis=@total.kurtosis
@sd = @total.sd
@variance=@total.variance
@valid_n = @total.size
opts_default={
:name=>_("Reliability Analysis"),
:summary_histogram=>true
}
@opts=opts_default.merge(opts)
@opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k }
@cov_m=Statsample::Bivariate.covariance_matrix(@ds)
# Mean for covariances and variances
@variances=@k.times.map {|i| @cov_m[i,i]}.to_scale
@variances_mean=@variances.mean
# Var(sum) = sum of variances + sum of covariances, so the mean
# covariance is the remainder spread over the k^2-k off-diagonals.
@covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
#begin
@alpha = Statsample::Reliability.cronbach_alpha(@ds)
@alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(@ds)
#rescue => e
# raise DatasetException.new(@ds,e), "Error calculating alpha"
#end
end
# Returns a hash with structure
# item => { total_score => mean response of cases with that total }.
def item_characteristic_curve
i=0
out={}
total={}
@ds.each do |row|
# @total is aligned with @ds rows by construction.
tot=@total[i]
@ds.fields.each do |f|
out[f]||= {}
total[f]||={}
out[f][tot]||= 0
total[f][tot]||=0
out[f][tot]+= row[f]
total[f][tot]+=1
end
i+=1
end
# Convert accumulated sums into means.
total.each do |f,var|
var.each do |tot,v|
out[f][tot]=out[f][tot].quo(total[f][tot])
end
end
out
end
# =Adjusted R.P.B. for each item
# Adjusted RPB(Point biserial-correlation) for each item
#
# Correlates each item with the sum of the remaining items
# (the item itself is removed before summing). Memoized.
def item_total_correlation
@itc||=@ds.fields.inject({}) do |a,v|
vector=@ds[v].clone
ds2=@ds.clone
ds2.delete_vector(v)
total=ds2.vector_sum
a[v]=Statsample::Bivariate.pearson(vector,total)
a
end
end
# Mean of the item-total correlations.
def mean_rpb
item_total_correlation.values.to_scale.mean
end
# Per-item mean and standard deviation (sd from the covariance
# matrix diagonal). Memoized.
def item_statistics
@is||=@ds.fields.inject({}) do |a,v|
a[v]={:mean=>@ds[v].mean, :sds=>Math::sqrt(@cov_m.variance(v))}
a
end
end
# Returns a dataset with cases ordered by score
# and variables ordered by difficulty
def item_difficulty_analysis
dif={}
@ds.fields.each{|f| dif[f]=@ds[f].mean }
# Descending by item mean (easier items first).
dif_sort=dif.sort{|a,b| -(a[1]<=>b[1])}
scores_sort={}
scores=@ds.vector_mean
scores.each_index{|i| scores_sort[i]=scores[i] }
# Ascending by case score.
scores_sort=scores_sort.sort{|a,b| a[1]<=>b[1]}
ds_new=Statsample::Dataset.new(['case','score'] + dif_sort.collect{|a,b| a})
scores_sort.each do |i,score|
row=[i, score]
case_row=@ds.case_as_hash(i)
dif_sort.each{|variable,dif_value| row.push(case_row[variable]) }
ds_new.add_case_array(row)
end
ds_new.update_valid_data
ds_new
end
# Memoized accessor for stats_if_deleted_intern.
def stats_if_deleted
@sif||=stats_if_deleted_intern
end
# For each item: mean, variance, sd and Cronbach's alpha of the
# scale with that item removed (derived from the covariance
# submatrix, avoiding a dataset rebuild per item).
def stats_if_deleted_intern # :nodoc:
return Hash.new if @ds.fields.size==1
@ds.fields.inject({}) do |a,v|
cov_2=@cov_m.submatrix(@ds.fields-[v])
#ds2=@ds.clone
#ds2.delete_vector(v)
#total=ds2.vector_sum
a[v]={}
#a[v][:mean]=total.mean
a[v][:mean]=@mean-item_statistics[v][:mean]
a[v][:variance_sample]=cov_2.total_sum
a[v][:sds]=Math::sqrt(a[v][:variance_sample])
n=cov_2.row_size
a[v][:alpha] = (n>=2) ? Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov_2) : nil
a
end
end
# Builds the full report: summaries for dumped and retained items,
# alpha estimates, and a per-item table.
def report_building(builder) #:nodoc:
builder.section(:name=>@name) do |s|
if @dumped.size>0
s.section(:name=>"Items with variance=0") do |s1|
# NOTE(review): the tables below are added to the outer section
# +s+, leaving the block parameter +s1+ unused — confirm
# whether they were meant to nest under "Items with variance=0".
s.table(:name=>_("Summary for %s with all items") % @name) do |t|
t.row [_("Items"), @ods.fields.size]
t.row [_("Sum mean"), "%0.4f" % @o_total.mean]
t.row [_("S.d. mean"), "%0.4f" % @o_total.sd]
end
s.table(:name=>_("Deleted items"), :header=>['item','mean']) do |t|
@dumped.each do |f|
t.row(["#{@ods[f].name}(#{f})", "%0.5f" % @ods[f].mean])
end
end
s.parse_element(Statsample::Graph::Histogram.new(@o_total, :name=>"Histogram (complete data) for %s" % @name)) if @summary_histogram
end
end
s.table(:name=>_("Summary for %s") % @name) do |t|
t.row [_("Valid Items"), @ds.fields.size]
t.row [_("Valid cases"), @valid_n]
t.row [_("Sum mean"), "%0.4f" % @mean]
t.row [_("Sum sd"), "%0.4f" % @sd ]
# t.row [_("Sum variance"), "%0.4f" % @variance]
t.row [_("Sum median"), @median]
t.hr
t.row [_("Item mean"), "%0.4f" % @item_mean]
t.row [_("Item sd"), "%0.4f" % @item_sd]
t.hr
t.row [_("Skewness"), "%0.4f" % @skew]
t.row [_("Kurtosis"), "%0.4f" % @kurtosis]
t.hr
t.row [_("Cronbach's alpha"), @alpha ? ("%0.4f" % @alpha) : "--"]
t.row [_("Standarized Cronbach's alpha"), @alpha_standarized ? ("%0.4f" % @alpha_standarized) : "--" ]
t.row [_("Mean rpb"), "%0.4f" % mean_rpb]
t.row [_("Variances mean"), "%g" % @variances_mean]
t.row [_("Covariances mean") , "%g" % @covariances_mean]
end
if (@alpha)
s.text _("Items for obtain alpha(0.8) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.8, @ds.fields.size))
s.text _("Items for obtain alpha(0.9) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.9, @ds.fields.size))
end
sid=stats_if_deleted
is=item_statistics
itc=item_total_correlation
s.table(:name=>_("Items report for %s") % @name, :header=>["item","mean","sd", "mean if deleted", "var if deleted", "sd if deleted"," item-total correl.", "alpha if deleted"]) do |t|
@ds.fields.each do |f|
row=["#{@ds[f].name}(#{f})"]
if is[f]
row+=[sprintf("%0.5f",is[f][:mean]), sprintf("%0.5f", is[f][:sds])]
else
row+=["-","-"]
end
if sid[f]
row+= [sprintf("%0.5f",sid[f][:mean]), sprintf("%0.5f",sid[f][:variance_sample]), sprintf("%0.5f",sid[f][:sds])]
else
row+=%w{- - -}
end
if itc[f]
row+= [sprintf("%0.5f",itc[f])]
else
row+=['-']
end
if sid[f] and !sid[f][:alpha].nil?
row+=[sprintf("%0.5f",sid[f][:alpha])]
else
row+=["-"]
end
t.row row
end # end each
end # table
s.parse_element(Statsample::Graph::Histogram.new(@total, :name=>"Histogram (valid data) for %s" % @name)) if @summary_histogram
end # section
end # def
end # class
end # module
end # module
================================================
FILE: lib/statsample/reliability/skillscaleanalysis.rb
================================================
module Statsample
module Reliability
# Analysis of a Skill Scale
# Given a dataset with results and a correct answers hash,
# generates a ScaleAnalysis
# == Usage
# x1=%w{a b b c}.to_vector
# x2=%w{b a b c}.to_vector
# x3=%w{a c b a}.to_vector
# ds={'x1'=>x1,'x2'=>x2,'x3'=>x3}.to_dataset
# key={'x1'=>'a','x2'=>'b','x3'=>'a'}
# ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds,key)
# puts ssa.summary
class SkillScaleAnalysis
include Summarizable
attr_accessor :name
attr_accessor :summary_minimal_item_correlation
attr_accessor :summary_show_problematic_items
# +ds+:: dataset with raw answers.
# +key+:: hash mapping field name => correct answer.
def initialize(ds,key,opts=Hash.new)
opts_default={
:name=>_("Skill Scale Reliability Analysis (%s)") % ds.name,
:summary_minimal_item_correlation=>0.10,
:summary_show_problematic_items=>true
}
@ds=ds
@key=key
@opts=opts_default.merge(opts)
@opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k }
# Corrected dataset is built lazily by #corrected_dataset.
@cds=nil
end
# Dataset only corrected vectors
def corrected_dataset_minimal
cds=corrected_dataset
# Keep only the keyed (scored) vectors, preserving names/labels.
dsm=@key.keys.inject({}) {|ac,v| ac[v]=cds[v];ac}.to_dataset
@key.keys.each do |k|
dsm[k].name=_("%s(corrected)") % @ds[k].name
dsm[k].labels=@ds[k].labels
end
dsm.name=_("Corrected dataset from %s") % @ds.name
dsm
end
# Sum of correct answers per case.
def vector_sum
corrected_dataset_minimal.vector_sum
end
# Proportion of correct answers per case.
def vector_mean
corrected_dataset_minimal.vector_mean
end
# ScaleAnalysis over the 0/1 corrected items.
def scale_analysis
sa=ScaleAnalysis.new(corrected_dataset_minimal)
sa.name=_("%s (Scale Analysis)") % @name
sa
end
# Builds (once) a copy of the dataset where each keyed item is
# scored 1 (correct), 0 (incorrect) or nil (missing); unkeyed
# vectors are copied unchanged.
def corrected_dataset
if @cds.nil?
@cds=@ds.dup_empty
@key.keys.each {|k| @cds[k].type=:scale; @cds[k].name=@ds[k].name}
@ds.each do |row|
out={}
row.each do |k,v|
if @key.keys.include? k
if @ds[k].is_valid? v
out[k]= @key[k]==v ? 1 : 0
else
out[k]=nil
end
else
out[k]=v
end
end
@cds.add_case(out,false)
end
@cds.update_valid_data
end
@cds
end
# Report: full scale analysis plus, optionally, a section for items
# whose item-total correlation falls below the configured minimum.
def report_building(builder)
builder.section(:name=>@name) do |s|
sa=scale_analysis
s.parse_element(sa)
if summary_show_problematic_items
s.section(:name=>_("Problematic Items")) do |spi|
count=0
sa.item_total_correlation.each do |k,v|
if v < summary_minimal_item_correlation
count+=1
spi.section(:name=>_("Item: %s") % @ds[k].name) do |spii|
spii.text _("Correct answer: %s") % @key[k]
spii.text _("p: %0.3f") % corrected_dataset[k].mean
props=@ds[k].proportions.inject({}) {|ac,v| ac[v[0]] = v[1].to_f;ac}
# NOTE(review): this table is added to the outer section
# +spi+, not the per-item section +spii+ — confirm
# whether that nesting is intended.
spi.table(:name=>"Proportions",:header=>[_("Value"), _("%")]) do |table|
props.each do |k1,v|
table.row [ @ds[k].labeling(k1), "%0.3f" % v]
end
end
end
end
end
spi.text _("No problematic items") if count==0
end
end
end
end
end
end
================================================
FILE: lib/statsample/reliability.rb
================================================
module Statsample
module Reliability
class << self
# Calculate Cronbach's alpha for a given dataset.
# Only uses tuples without missing data.
# Cronbach's alpha for +ods+, computed on complete cases only.
# Returns nil when any item has zero variance or when there are
# fewer than two items.
def cronbach_alpha(ods)
  ds = ods.dup_only_valid
  return nil if ds.vectors.any? { |_, vec| vec.variance == 0 }
  n_items = ds.fields.size
  return nil if n_items <= 1
  # Sum of the individual item variances.
  item_variances = ds.vectors.inject(0) { |acc, pair| acc + pair[1].variance }
  total_variance = ds.vector_sum.variance
  (n_items.quo(n_items - 1)) * (1 - item_variances.quo(total_variance))
end
# Calculate Cronbach's alpha for a given dataset
# using standarized values for every vector.
# Only uses tuples without missing data.
# Returns nil if one or more vectors has 0 variance.
def cronbach_alpha_standarized(ods)
ds=ods.dup_only_valid
return nil if ds.vectors.any? {|k,v| v.variance==0}
# NOTE(review): the standarized vectors are taken from the original
# dataset (ods[f]) rather than the complete-cases copy (ds[f]);
# with missing data these differ — verify intent.
ds=ds.fields.inject({}){|a,f|
a[f]=ods[f].standarized;
a
}.to_dataset
cronbach_alpha(ds)
end
# Spearman-Brown prophecy: predicted reliability of a test whose
# length is multiplied by +n+, given current reliability +r+.
def spearman_brown_prophecy(r, n)
  stepped = n * r
  stepped.quo(1 + (n - 1) * r)
end
alias :sbp :spearman_brown_prophecy
# Number of items needed to reach the desired reliability +r_d+,
# given current reliability +r+ achieved with +n+ items.
# Returns nil when +r+ is unknown (nil).
def n_for_desired_reliability(r, r_d, n=1)
  return nil if r.nil?
  factor = (r_d * (1 - r)).quo(r * (1 - r_d))
  factor * n
end
# Cronbach's alpha from +n+ items, mean item variance +s2+ and mean
# item covariance +cov+.
def cronbach_alpha_from_n_s2_cov(n, s2, cov)
  correction = n.quo(n - 1)
  correction * (1 - s2.quo(s2 + (n - 1) * cov))
end
# Cronbach's alpha computed directly from a covariance matrix.
# Matrix#total_sum is a statsample extension on Matrix.
def cronbach_alpha_from_covariance_matrix(cov)
  n = cov.row_size
  raise "covariance matrix should have at least 2 variables" if n < 2
  diagonal_sum = n.times.inject(0) { |acc, i| acc + cov[i, i] }
  (n.quo(n - 1)) * (1 - diagonal_sum.quo(cov.total_sum))
end
# Returns n necessary to obtain specific alpha
# given variance and covariance mean of items.
# Bisection search between 2 and 1000 items, starting from 50 and
# stopping once alpha is within epsilon or n stops changing.
def n_for_desired_alpha(alpha,s2,cov)
# Start with a regular test : 50 items
min=2
max=1000
n=50
prev_n=0
epsilon=0.0001
dif=1000
c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov)
dif=c_a - alpha
while(dif.abs>epsilon and n!=prev_n)
prev_n=n
if dif<0
# Alpha too low: need more items, raise the lower bound.
min=n
n=(n+(max-min).quo(2)).to_i
else
# Alpha too high: fewer items suffice, lower the upper bound.
max=n
n=(n-(max-min).quo(2)).to_i
end
c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov)
dif=c_a - alpha
#puts "#{n} , #{c_a}"
end
n
end
# First derivative of Cronbach's alpha with respect to n.
# n:: number of items
# sx:: mean of variances
# sxy:: mean of covariances
def alpha_first_derivative(n, sx, sxy)
  numerator = sxy * (sx - sxy)
  numerator.quo((sxy * (n - 1) + sx)**2)
end
# Second derivative of Cronbach's alpha with respect to n.
# (Method name keeps the original "alfa" spelling for compatibility.)
# n:: number of items
# sx:: mean of variances
# sxy:: mean of covariances
def alfa_second_derivative(n, sx, sxy)
  numerator = 2 * (sxy**2) * (sxy - sx)
  numerator.quo((sxy * (n - 1) + sx)**3)
end
end
class ItemCharacteristicCurve
attr_reader :totals, :counts, :vector_total
# +ds+:: dataset of item responses.
# +vector_total+:: optional total-score vector, one value per case;
#                  defaults to the row sums of +ds+.
def initialize (ds, vector_total=nil)
vector_total||=ds.vector_sum
raise ArgumentError, "Total size != Dataset size" if vector_total.size!=ds.cases
@vector_total=vector_total
@ds=ds
@totals={}
# One empty tally hash per item field.
@counts=@ds.fields.inject({}) {|a,v| a[v]={};a}
process
end
# Tallies responses: fills @totals (frequency of each total score)
# and @counts (item => total score => response value => count).
def process
i=0
@ds.each do |row|
tot=@vector_total[i]
@totals[tot]||=0
@totals[tot]+=1
@ds.fields.each do |f|
item=row[f].to_s
@counts[f][tot]||={}
@counts[f][tot][item]||=0
@counts[f][tot][item] += 1
end
i+=1
end
end
# Return a hash with p for each different value on a vector:
# total score => proportion of cases with that score answering +item+.
def curve_field(field, item)
out={}
item=item.to_s
@totals.each do |value,n|
count_value= @counts[field][value][item].nil? ? 0 : @counts[field][value][item]
out[value]=count_value.quo(n)
end
out
end # def
end # class ItemCharacteristicCurve
end # Reliability
end # Statsample
require 'statsample/reliability/icc.rb'
require 'statsample/reliability/scaleanalysis.rb'
require 'statsample/reliability/skillscaleanalysis.rb'
require 'statsample/reliability/multiscaleanalysis.rb'
================================================
FILE: lib/statsample/resample.rb
================================================
module Statsample
  # Simple resampling helpers.
  module Resample
    class << self
      # Runs +action+ +times+ times, collecting each result in an array.
      def repeat_and_save(times, &action)
        Array.new(times) { action.call }
      end
      # Vector of +size+ random integers drawn uniformly from low..upper.
      def generate(size, low, upper)
        span = upper - low + 1
        values = Array.new(size) { rand(span) + low }
        Vector.new(values, :scale)
      end
    end
  end
end
================================================
FILE: lib/statsample/rserve_extension.rb
================================================
# Several additions to Statsample objects, to support
# rserve-client
module Statsample
class Vector
# Converts this vector to an R expression (REXP) using the
# rserve-client wrapper; missing values are kept as nils.
def to_REXP
Rserve::REXP::Wrapper.wrap(data_with_nils)
end
end
class Dataset
# Converts this dataset to an R data.frame REXP: each field becomes
# a wrapped column (nils preserved), assembled into an Rserve Rlist.
def to_REXP
names=@fields
data=@fields.map {|f|
Rserve::REXP::Wrapper.wrap(@vectors[f].data_with_nils)
}
l=Rserve::Rlist.new(data,names)
Rserve::REXP.create_data_frame(l)
end
end
end
================================================
FILE: lib/statsample/shorthand.rb
================================================
class Object
  # Shorthand for Statsample::Analysis.store(*args, &block):
  # registers an analysis suite from anywhere.
  def ss_analysis(*arguments, &definition)
    Statsample::Analysis.store(*arguments, &definition)
  end
end
module Statsample
  # Module which provides shorthands for many methods.
  module Shorthand
    ###
    # :section: R like methods
    ###
    # Reads +filename+ with +klass+.read, caching the parsed dataset on
    # disk as "<filename>.ds" and reusing that cache while it is newer
    # than the source file.
    def read_with_cache(klass, filename, opts=Hash.new, cache=true)
      file_ds=filename+".ds"
      if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
        ds=Statsample.load(file_ds)
      else
        ds=klass.read(filename)
        ds.save(file_ds) if cache
      end
      ds
    end
    # Import an Excel file. Cache result by default
    def read_excel(filename, opts=Hash.new, cache=true)
      read_with_cache(Statsample::Excel, filename, opts, cache)
    end
    # Import a CSV file. Cache result by default.
    # Fix: the original method took no parameters, so +filename+,
    # +opts+ and +cache+ were undefined locals and every call raised
    # NameError. It now mirrors read_excel's signature.
    def read_csv(filename, opts=Hash.new, cache=true)
      read_with_cache(Statsample::CSV, filename, opts, cache)
    end
    # Retrieve names (fields) from dataset
    def names(ds)
      ds.fields
    end
    # Create a correlation matrix from a dataset
    def cor(ds)
      Statsample::Bivariate.correlation_matrix(ds)
    end
    # Create a variance/covariance matrix from a dataset
    # NOTE(review): this calls Bivariate.covariate_matrix while other
    # files in this library call Bivariate.covariance_matrix — confirm
    # the former exists (or is an alias).
    def cov(ds)
      Statsample::Bivariate.covariate_matrix(ds)
    end
    # Create a Statsample::Vector
    # Analog to R's c
    def vector(*args)
      Statsample::Vector[*args]
    end
    # Random generation for the normal distribution
    def rnorm(n,mean=0,sd=1)
      rng=Distribution::Normal.rng(mean,sd)
      Statsample::Vector.new_scale(n) { rng.call}
    end
    # Creates a new Statsample::Dataset
    # Each key is transformed into string
    def dataset(vectors=Hash.new)
      vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac}
      Statsample::Dataset.new(vectors)
    end
    alias :data_frame :dataset
    # Returns a Statsample::Graph::Boxplot
    def boxplot(*args)
      Statsample::Graph::Boxplot.new(*args)
    end
    # Returns a Statsample::Graph::Histogram
    def histogram(*args)
      Statsample::Graph::Histogram.new(*args)
    end
    # Returns a Statsample::Graph::Scatterplot
    def scatterplot(*args)
      Statsample::Graph::Scatterplot.new(*args)
    end
    # Returns a Statsample::Test::Levene
    def levene(*args)
      Statsample::Test::Levene.new(*args)
    end
    # Returns a Statsample::Factor::PrincipalAxis
    def principal_axis(*args)
      Statsample::Factor::PrincipalAxis.new(*args)
    end
    # Returns a Statsample::Bivariate::Polychoric
    def polychoric(*args)
      Statsample::Bivariate::Polychoric.new(*args)
    end
    # Returns a Statsample::Bivariate::Tetrachoric
    def tetrachoric(*args)
      Statsample::Bivariate::Tetrachoric.new(*args)
    end
    ###
    # Other Shortcuts
    ###
    # Multiple linear regression
    def lr(*args)
      Statsample::Regression.multiple(*args)
    end
    # Principal Component Analysis over a dataset
    def pca(ds,opts=Hash.new)
      Statsample::Factor::PCA.new(ds,opts)
    end
    def dominance_analysis(*args)
      Statsample::DominanceAnalysis.new(*args)
    end
    def dominance_analysis_bootstrap(*args)
      Statsample::DominanceAnalysis::Bootstrap.new(*args)
    end
    def scale_analysis(*args)
      Statsample::Reliability::ScaleAnalysis.new(*args)
    end
    def skill_scale_analysis(*args)
      Statsample::Reliability::SkillScaleAnalysis.new(*args)
    end
    def multiscale_analysis(*args,&block)
      Statsample::Reliability::MultiScaleAnalysis.new(*args,&block)
    end
    # Mann-Whitney U test
    def test_u(*args)
      Statsample::Test::UMannWhitney.new(*args)
    end
    module_function :test_u, :rnorm
  end
end
================================================
FILE: lib/statsample/srs.rb
================================================
module Statsample
  # Several methods to estimate parameters for simple random sampling
  # == Reference:
  # * Cochran, W.(1972). Sampling Techniques [spanish edition].
  # * http://stattrek.com/Lesson6/SRS.aspx
  module SRS
    class << self
      ########################
      #
      # :SECTION: Proportion estimation
      #
      # Function for estimation of proportions
      ########################
      #
      # Finite population correction (over variance)
      # Source: Cochran(1972)
      def fpc_var(sam, pop)
        (pop - sam).quo(pop - 1)
      end
      # Finite population correction (over standard deviation)
      def fpc(sam, pop)
        Math::sqrt((pop - sam).quo(pop - 1))
      end
      # Non sample fraction.
      #
      # 1 - sample fraction
      def qf(sam, pop)
        1 - (sam.quo(pop))
      end
      # Sample size estimation for proportions, infinite population
      def estimation_n0(d, prop, margin=0.95)
        t = Distribution::Normal.p_value(1 - (1 - margin).quo(2))
        var = prop * (1 - prop)
        t**2 * var.quo(d**2)
      end
      # Sample size estimation for proportions, finite population.
      def estimation_n(d, prop, n_pobl, margin=0.95)
        n0 = estimation_n0(d, prop, margin)
        n0.quo(1 + ((n0 - 1).quo(n_pobl)))
      end
      # Proportion confidence interval with t values
      # Uses estimated proportion, sample without replacement.
      def proportion_confidence_interval_t(prop, n_sample, n_population, margin=0.95)
        t = Distribution::T.p_value(1 - ((1 - margin).quo(2)), n_sample - 1)
        proportion_confidence_interval(prop, n_sample, n_population, t)
      end
      # Proportion confidence interval with z values
      # Uses estimated proportion, sample without replacement.
      def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
        z = Distribution::Normal.p_value(1 - ((1 - margin).quo(2)))
        proportion_confidence_interval(p, n_sample, n_population, z)
      end
      # Proportion confidence interval with x value
      # Uses estimated proportion, sample without replacement.
      # The 1/(2*sam) term is the continuity correction.
      def proportion_confidence_interval(p, sam, pop, x)
        one_range = x * Math::sqrt((qf(sam, pop) * p * (1 - p)).quo(sam - 1)) + (1.quo(sam * 2.0))
        [p - one_range, p + one_range]
      end
      # Standard deviation for sample distribution of a proportion
      # Known proportion, sample with replacement.
      # Based on http://stattrek.com/Lesson6/SRS.aspx
      def proportion_sd_kp_wr(p, n_sample)
        Math::sqrt(p * (1 - p).quo(n_sample))
      end
      # Standard deviation for sample distribution of a proportion
      # Known proportion, sample without replacement.
      #
      # Sources:
      # * Cochran(1972)
      def proportion_sd_kp_wor(p, sam, pop)
        fpc(sam, pop) * Math::sqrt(p * (1 - p).quo(sam))
      end
      # Standard deviation for sample distribution of a proportion
      # Estimated proportion, sample with replacement
      # Based on http://stattrek.com/Lesson6/SRS.aspx.
      def proportion_sd_ep_wr(p, n_sample)
        Math::sqrt(p * (1 - p).quo(n_sample - 1))
      end
      # Standard deviation for sample distribution of a proportion.
      # Estimated proportion, sample without replacement.
      # Reference:
      # * Cochran, 1972, Técnicas de muestreo
      def proportion_sd_ep_wor(p, sam, pop)
        fsc = (pop - sam).quo((sam - 1) * pop)
        Math::sqrt(fsc * p * (1 - p))
      end
      # Total estimation sd based on sample.
      # Known proportion, sample without replacement
      # Reference:
      # * Cochran(1972)
      # Fix: the original multiplied the undefined locals +pob+ and +p+
      # (raising NameError on every call); it now uses the actual
      # parameters +pop+ and +prop+.
      def proportion_total_sd_kp_wor(prop, sam, pop)
        pop * proportion_sd_kp_wor(prop, sam, pop)
      end
      # Total estimation sd based on sample.
      # Estimated proportion, sample without replacement
      # Source: Cochran(1972)
      def proportion_total_sd_ep_wor(prop, sam, pop)
        fsc = ((pop - sam).to_f / (sam - 1))
        Math::sqrt(fsc * pop * prop * (1 - prop))
      end
      ########################
      #
      # :SECTION: Mean estimation
      #
      ########################
      # Standard error. Known variance, sample with replacement.
      def standard_error_ksd_wr(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt((pop - 1).quo(pop))
      end
      # Standard error of the mean. Known variance, sample w/o replacement
      def standard_error_ksd_wor(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam, pop))
      end
      alias_method :standard_error_esd_wr, :standard_error_ksd_wr
      # Standard error of the mean.
      # Estimated variance, without replacement
      # Cochran (1972) p.47
      def standard_error_esd_wor(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam, pop))
      end
      alias_method :standard_error, :standard_error_esd_wor
      alias_method :se, :standard_error_esd_wor
      # Standard error of total estimation
      def standard_error_total(s, sam, pop)
        pop * se(s, sam, pop)
      end
      # Confidence Interval using T-Student
      # Use with n < 60
      def mean_confidence_interval_t(mean, s, n_sample, n_population, margin=0.95)
        t = Distribution::T.p_value(1 - ((1 - margin) / 2), n_sample - 1)
        mean_confidence_interval(mean, s, n_sample, n_population, t)
      end
      # Confidence Interval using Z
      # Use with n > 60
      def mean_confidence_interval_z(mean, s, n_sample, n_population, margin=0.95)
        z = Distribution::Normal.p_value(1 - ((1 - margin) / 2))
        mean_confidence_interval(mean, s, n_sample, n_population, z)
      end
      # Confidence interval using X.
      #
      # Better use mean_confidence_interval_z or mean_confidence_interval_t
      def mean_confidence_interval(mean, s, n_sample, n_population, x)
        range = x * se(s, n_sample, n_population)
        [mean - range, mean + range]
      end
    end
  end
end
================================================
FILE: lib/statsample/test/bartlettsphericity.rb
================================================
module Statsample
module Test
# == Bartlett's test of Sphericity.
# Test the hyphotesis that the sample correlation matrix
# comes from a multivariate normal population where variables
# are independent. In other words, the population correlation
# matrix is the identity matrix.
# == Reference
# * Dziuban, C., & Shirkey E. (1974). When is a correlation matrix appropriate for factor analysis? Some decision rules. Psychological Bulletin, 81(6), 358-361.
class BartlettSphericity
include Statsample::Test
include Summarizable
attr_accessor :name
# Number of cases used to estimate the correlation matrix.
attr_reader :ncases
# Number of variables (order of the correlation matrix).
attr_reader :nvars
# The chi-square statistic.
attr_reader :value
# Degrees of freedom: nvars*(nvars-1)/2.
attr_reader :df
# Args
# * _matrix_: correlation matrix
# * _ncases_: number of cases
def initialize(matrix,ncases)
@matrix=matrix
@ncases=ncases
@nvars=@matrix.row_size
@name=_("Bartlett's test of sphericity")
compute
end
# Uses SPSS formula.
# On Dziuban & Shirkey, the minus between the first and second
# statement is a *!!!
#
# Chi-square statistic: -[(N-1) - (2p+5)/6] * ln|R|.
def compute
@value=-((@ncases-1)-(2*@nvars+5).quo(6))*Math::log(@matrix.determinant)
@df=(@nvars*(@nvars-1)).quo(2)
end
# Right-tail p-value from the chi-square distribution.
def probability
1-Distribution::ChiSquare.cdf(@value,@df)
end
def report_building(builder) # :nodoc:
builder.text "%s : X(%d) = %0.4f , p = %0.4f" % [@name, @df, @value, probability]
end
end
end
end
================================================
FILE: lib/statsample/test/chisquare.rb
================================================
module Statsample
  module Test
    # Pearson chi-square tests.
    module ChiSquare
      # Chi-square statistic over an observed (and optionally expected)
      # frequency Matrix.
      class WithMatrix
        # Degrees of freedom: (rows-1)*(cols-1)
        attr_reader :df
        # The chi-square statistic (possibly a Rational).
        attr_reader :value
        # +observed+:: Matrix of observed frequencies.
        # +expected+:: Matrix of expected frequencies; derived from the
        #              observed margins when omitted.
        def initialize(observed, expected=nil)
          @observed = observed
          # Fix: the original `@expected=expected or calculate_expected`
          # parsed as `(@expected=expected) or calculate_expected` and
          # only worked because calculate_expected assigns @expected as
          # a side effect; `||` states the intent directly.
          @expected = expected || calculate_expected
          raise "Observed size!=expected size" if @observed.row_size != @expected.row_size or @observed.column_size != @expected.column_size
          @df = (@observed.row_size - 1) * (@observed.column_size - 1)
          @value = compute_chi
        end
        # Expected counts under independence:
        # row_total * column_total / grand_total per cell.
        # Matrix#total_sum/#row_sum/#column_sum are statsample extensions.
        def calculate_expected
          sum = @observed.total_sum
          @expected = Matrix.rows(@observed.row_size.times.map { |i|
            @observed.column_size.times.map { |j|
              (@observed.row_sum[i].quo(sum) * @observed.column_sum[j].quo(sum)) * sum
            }
          })
        end
        # NOTE(review): returns the raw statistic (possibly Rational),
        # not a Float, preserving the original behavior.
        def to_f
          @value
        end
        def chi_square
          @value
        end
        # Right-tail p-value from the chi-square distribution.
        def probability
          1 - Distribution::ChiSquare.cdf(@value.to_f, @df)
        end
        # Sum over all cells of (observed - expected)^2 / expected.
        def compute_chi
          sum = 0
          (0...@observed.row_size).each { |i|
            (0...@observed.column_size).each { |j|
              sum += ((@observed[i, j] - @expected[i, j])**2).quo(@expected[i, j])
            }
          }
          sum
        end
      end
    end
  end
end
================================================
FILE: lib/statsample/test/f.rb
================================================
module Statsample
module Test
# From Wikipedia:
# An F-test is any statistical test in which the test statistic has an F-distribution under the null hypothesis. It is most often used when comparing statistical models that have been fit to a data set, in order to identify the model that best fits the population from which the data were sampled.
# F ratio test: f = var_num / var_den, with p-value from the F
# distribution with (df_num, df_den) degrees of freedom.
class F
include Statsample::Test
include Summarizable
attr_reader :var_num, :var_den, :df_num, :df_den, :var_total, :df_total
# Tails for probability (:left or :right).
# NOTE: despite the generic name, :both is rejected by the constructor.
attr_accessor :tails
# Name of F analysis
attr_accessor :name
# Parameters:
# * var_num: variance numerator
# * var_den: variance denominator
# * df_num: degrees of freedom numerator
# * df_den: degrees of freedom denominator
# * opts: :tails (default :right), :name
def initialize(var_num, var_den, df_num, df_den, opts=Hash.new)
@var_num=var_num
@var_den=var_den
@df_num=df_num
@df_den=df_den
@var_total=var_num+var_den
@df_total=df_num+df_den
opts_default={:tails=>:right, :name=>_("F Test")}
@opts=opts_default.merge(opts)
raise "Tails should be right or left, not both" if @opts[:tails]==:both
opts_default.keys.each {|k|
send("#{k}=", @opts[k])
}
end
# The F ratio
def f
@var_num.quo(@var_den)
end
def to_f
f
end
# p-value for the configured tail
def probability
p_using_cdf(Distribution::F.cdf(f, @df_num, @df_den), tails)
end
# Renders df with %d when integral, %0.2f otherwise
def report_building(builder) #:nodoc:
if @df_num.is_a? Integer and @df_den.is_a? Integer
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @df_num, @df_den, f, probability]
else
builder.text "%s : F(%0.2f, %0.2f) = %0.4f , p = %0.4f" % [@name, @df_num, @df_den, f, probability]
end
end
end
end
end
================================================
FILE: lib/statsample/test/kolmogorovsmirnov.rb
================================================
module Statsample
module Test
# == Kolmogorov-Smirnov's test of equality of distributions.
# Computes D = max |F1(x) - F2(x)|, evaluated at the data points of the
# first distribution.
class KolmogorovSmirnov
# The D statistic (largest absolute difference between the two cdfs)
attr_reader :d
include Statsample::Test
include Summarizable
# Creates a new Kolmogorov-Smirnov test
# d1 should have each method
# d2 could be a Distribution class, with a cdf method,
# a vector or a lambda
def initialize(d1,d2)
raise "First argument should have each method" unless d1.respond_to? :each
@d1=make_cdf(d1)
if d2.respond_to? :cdf or d2.is_a? Proc
@d2=d2
elsif d2.respond_to? :each
@d2=make_cdf(d2)
else
raise "Second argument should respond to cdf or each"
end
calculate
end
# Sets @d to the largest |F1(x)-F2(x)| over the data points of d1.
# NOTE(review): the supremum is only sampled at d1's points, never at
# d2's — confirm this is adequate for the two-sample case.
def calculate
d=0
@d1.each {|x|
v1=@d1.cdf(x);
v2=@d2.is_a?(Proc) ? @d2.call(x) : @d2.cdf(x)
d=(v1-v2).to_f.abs if (v1-v2).abs>d
}
@d=d
end
# Make a wrapper EmpiricDistribution to any method which implements
# each
# On Statsample::Vector, only uses #valid_data
def make_cdf(v)
v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v)
end
# Empirical cdf over a sorted copy of the supplied data
class EmpiricDistribution
def initialize(data)
@min=data.min
@max=data.max
@data=data.sort
@n=data.size
end
# Yields each value in sorted order
def each
@data.each {|x|
yield x
}
end
# Empirical cdf at x.
# NOTE(review): returns 1 for any x >= max and counts via the first
# index whose value >= x (+1 on an exact match) — verify the boundary
# convention matches the intended right-continuous step function.
def cdf(x)
return 0 if x<@min
return 1 if x>=@max
v=@data.index{|v1| v1>=x}
v.nil? ? 0 : (v+(x==@data[v]? 1 : 0)).quo(@n)
end
end # End EmpiricDistribution
end
end
end
================================================
FILE: lib/statsample/test/levene.rb
================================================
module Statsample
module Test
# = Levene Test for Equality of Variances
# From NIST/SEMATECH:
# Levene's test ( Levene, 1960) is used to test if k samples have equal variances. Equal variances across samples is called homogeneity of variance. Some statistical tests, for example the analysis of variance, assume that variances are equal across groups or samples. The Levene test can be used to verify that assumption.
# Use:
# require 'statsample'
# a=[1,2,3,4,5,6,7,8,100,10].to_scale
# b=[30,40,50,60,70,80,90,100,110,120].to_scale
#
# levene=Statsample::Test::Levene.new([a,b])
# puts levene.summary
#
# Output:
# Levene Test
# F: 0.778121319848449
# p: 0.389344552595791
#
# Reference:
# * NIST/SEMATECH e-Handbook of Statistical Methods. Available on http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
class Levene
include Statsample::Test
include Summarizable
# Degrees of freedom 1 (k-1)
attr_reader :d1
# Degrees of freedom 2 (n-k)
attr_reader :d2
# Name of test
attr_accessor :name
# Input could be an array of vectors or a dataset
def initialize(input, opts=Hash.new())
if input.is_a? Statsample::Dataset
@vectors=input.vectors.values
else
@vectors=input
end
@name=_("Levene Test")
# Assign any option that has a matching writer (currently :name)
opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
compute
end
# Value of the test (the W statistic, F-distributed under H0)
def f
@w
end
def report_building(builder) # :nodoc:
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability]
end
# W = ((n-k) * sum_i n_i*(z_i. - z..)^2) / ((k-1) * sum_ij (z_ij - z_i.)^2)
# where z_ij = |x_ij - mean_i| (absolute deviations from group means).
def compute
n=@vectors.inject(0) {|ac,v| ac+v.n_valid}
# z_ij: absolute deviations from each group's own mean
zi=@vectors.collect {|vector|
mean=vector.mean
vector.collect {|v| (v-mean).abs }.to_scale
}
# Grand mean of all absolute deviations
total_mean=zi.inject([]) {|ac,vector|
ac+vector.valid_data
}.to_scale.mean
k=@vectors.size
# NOTE(review): numerator weights use vector.size while n sums n_valid;
# these differ when missing data is present — confirm intended.
sum_num=zi.inject(0) {|ac,vector|
ac+(vector.size*(vector.mean-total_mean)**2)
}
sum_den=zi.inject(0) {|ac,vector|
z_mean=vector.mean
ac+vector.valid_data.inject(0) {|acp,zij|
acp+(zij-z_mean)**2
}
}
@w=((n-k)*sum_num).quo((k-1)*sum_den)
@d1=k-1
@d2=n-k
end
private :compute
# Probability.
# With H_0 = Sum(s2)=0, probability of getting a value of the test upper or equal to the obtained on the sample
def probability
p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right)
end
end
end
end
================================================
FILE: lib/statsample/test/t.rb
================================================
module Statsample
module Test
# A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported
class T
class << self
  include Math
  # t statistic for one sample (also the paired-samples form):
  # t = (x - u) * sqrt(n) / s
  # * x: sample/differences mean
  # * u: hypothesized population mean
  # * s: sample/differences standard deviation
  # * n: sample size
  def one_sample(x, u, s, n)
    (x - u) * sqrt(n).quo(s)
  end
  # t statistic for the difference between two independent sample means.
  # * x1, x2: sample means
  # * s1, s2: sample standard deviations
  # * n1, n2: sample sizes
  # * equal_variance: when true, pool the variances
  def two_sample_independent(x1, x2, s1, s2, n1, n2, equal_variance = false)
    difference = x1 - x2
    denominator =
      if equal_variance
        pooled_sd = sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2).quo(n1 + n2 - 2))
        pooled_sd * sqrt(1.quo(n1) + 1.quo(n2))
      else
        sqrt((s1**2).quo(n1) + (s2**2).quo(n2))
      end
    difference.quo(denominator)
  end
  # Degrees of freedom when equal variances are assumed: n1 + n2 - 2
  def df_equal_variance(n1, n2)
    n1 + n2 - 2
  end
  # Welch-Satterthwaite degrees of freedom for unequal variances.
  # * s1, s2: sample standard deviations
  # * n1, n2: sample sizes
  # == Reference
  # * http://en.wikipedia.org/wiki/Welch-Satterthwaite_equation
  def df_not_equal_variance(s1, s2, n1, n2)
    var1 = s1**2
    var2 = s2**2
    numerator = (var1.quo(n1) + var2.quo(n2))**2
    denominator = (var1.quo(n1)**2).quo(n1 - 1) + (var2.quo(n2)**2).quo(n2 - 1)
    numerator.quo(denominator)
  end
end
include Statsample::Test
include Summarizable
attr_reader :standard_error, :estimate, :df
# Tails for p-value (:both, :left or :right). Default :both
attr_accessor :tails
# Name of F analysis
attr_accessor :name
attr_accessor :confidence_level
attr_reader :t
attr_accessor :estimate_name, :standard_error_name
# Creates a generic t test. Use OneSample or TwoSamplesIndependent
# classes for better summaries.
# Parameters:
# * estimate: estimate
# * standard_error: standard error of estimate
# * df: degrees of freedom
# * estimate: estimate
# * standard_error: standard error of estimate
# * df: degrees of freedom
# * opts: :tails, :name, :estimate_name, :standard_error_name,
#   :confidence_level. Options with a matching writer are assigned.
def initialize(estimate, standard_error, df, opts=Hash.new)
@estimate=estimate
@standard_error=standard_error
@df=df
# t = estimate / standard error (float division)
@t = @estimate / @standard_error.to_f
opts_default={ :tails=>:both,
:name=>_("T Test"),
:estimate_name=>_("Estimate"),
:standard_error_name=>_("Std.Err.of Estimate"),
:confidence_level=>0.95}
@opts = opts_default.merge(opts)
@opts.keys.each {|k|
send("#{k}=", @opts[k]) if respond_to? k
}
end
alias :se :standard_error
# The t statistic itself
def to_f
t
end
# p-value of t with +df+ degrees of freedom for the configured tails
def probability
p_using_cdf(Distribution::T.cdf(t, df), tails)
end
# Confidence interval [estimate - se*t_crit, estimate + se*t_crit]
# at level +cl+ (defaults to #confidence_level).
def confidence_interval(cl=nil)
cl||=confidence_level
t_crit = t_critical(cl, df)
[estimate - se*t_crit, estimate + se*t_crit]
end
alias :ci :confidence_interval
# Writes a one-line estimate/se header, then delegates the t-specific
# lines to report_building_t.
def report_building(builder) #:nodoc:
builder.section(:name=>@name) do |section|
section.text _("%s: %0.4f | %s: %0.4f") % [@estimate_name, @estimate, @standard_error_name, se]
report_building_t(section)
end
end
# Writes the t statistic, its p-value and the confidence interval to +s+.
# FIX: df_f (the format chosen for the degrees of freedom) was computed
# but never used — df was always rendered with %d, silently truncating
# fractional df (e.g. Welch's approximation). The chosen format is now
# interpolated into the template.
def report_building_t(s)
df_f=@df.is_a?(Integer) ? "%d" : "%0.4f"
s.text _("t(#{df_f}) = %0.4f, p=%0.4f (%s tails)") % [df, t,probability, tails]
s.text _("CI(%d%%): %0.4f - %0.4f") % [confidence_level*100, ci[0],ci[1]]
end
# One Sample t-test
# == Usage
# a=1000.times.map {rand(100)}.to_scale
# t_1=Statsample::Test::T::OneSample.new(a, {:u=>50})
# t_1.summary
#
# === Output
#
# = One Sample T Test
# Sample mean: 48.954
# Population mean:50
# Tails: both
# t = -1.1573, p=0.2474, d.f=999
class OneSample
include Math
include Statsample::Test
include Summarizable
# Options
attr_accessor :opts
# Name of test
attr_accessor :name
# Population mean to contrast
attr_accessor :u
# Degrees of freedom (n_valid - 1)
attr_reader :df
# Tails for probability (:both, :left or :right)
attr_accessor :tails
# Create a One Sample T Test
# Options:
# * :u = Mean to compare. Default= 0
# * :name = Name of the analysis
# * :tails = Tail for probability. Could be :both, :left, :right
def initialize(vector, opts=Hash.new)
@vector=vector
default={:u=>0, :name=>"One Sample T Test", :tails=>:both}
@opts=default.merge(opts)
@name=@opts[:name]
@u=@opts[:u]
@tails=@opts[:tails]
@confidence_level=@opts[:confidence_level] || 0.95
@df= @vector.n_valid-1
@t=nil
end
# Underlying generic T object for (mean - u) with the vector's se.
# Note: a fresh T is built on every call; opts are forwarded (T only
# assigns keys it has writers for).
def t_object
T.new(@vector.mean-u, @vector.se, @vector.n_valid-1, opts)
end
# Delegates to the generic T object
def t
t_object.t
end
def probability
t_object.probability
end
def standard_error
t_object.standard_error
end
alias :se :standard_error
def confidence_interval(cl=nil)
t_object.confidence_interval(cl)
end
alias :ci :confidence_interval
def report_building(b) # :nodoc:
b.section(:name=>@name) {|s|
s.text _("Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f") % [@vector.mean, @vector.sd, se]
s.text _("Population mean: %0.4f") % u if u!=0
t_object.report_building_t(s)
}
end
end
# Two Sample t-test.
#
# == Usage
# a=1000.times.map {rand(100)}.to_scale
# b=1000.times.map {rand(100)}.to_scale
# t_2=Statsample::Test::T::TwoSamplesIndependent.new(a,b)
# t_2.summary
# === Output
# = Two Sample T Test
# Mean and standard deviation
# +----------+---------+---------+------+
# | Variable | m | sd | n |
# +----------+---------+---------+------+
# | 1 | 49.3310 | 29.3042 | 1000 |
# | 2 | 47.8180 | 28.8640 | 1000 |
# +----------+---------+---------+------+
#
# == Levene Test
# Levene Test
# F: 0.3596
# p: 0.5488
# T statistics
# +--------------------+--------+-----------+----------------+
# | Type | t | df | p (both tails) |
# +--------------------+--------+-----------+----------------+
# | Equal variance | 1.1632 | 1998 | 0.2449 |
# | Non equal variance | 1.1632 | 1997.5424 | 0.1362 |
# +--------------------+--------+-----------+----------------+
class TwoSamplesIndependent
include Math
include Statsample::Test
include DirtyMemoize
include Summarizable
# Options
attr_accessor :opts
# Name of test
attr_accessor :name
# Degrees of freedom (equal variance)
attr_reader :df_equal_variance
# Degrees of freedom (not equal variance, Welch-Satterthwaite)
attr_reader :df_not_equal_variance
# Value of t for equal_variance
attr_reader :t_equal_variance
# Value of t for non-equal_variance
attr_reader :t_not_equal_variance
# Probability(equal variance)
attr_reader :probability_equal_variance
# Probability(unequal variance)
attr_reader :probability_not_equal_variance
# Tails for probability (:both, :left or :right)
attr_accessor :tails
# Writing tails invalidates the memoized statistics below
dirty_writer :tails
dirty_memoize :t_equal_variance, :t_not_equal_variance, :probability_equal_variance, :probability_not_equal_variance, :df_equal_variance, :df_not_equal_variance
# Create a Two Independent T Test
# Options:
# * :name = Name of the analysis
# * :tails = Tail for probability. Could be :both, :left, :right
def initialize(v1, v2, opts=Hash.new)
@v1=v1
@v2=v2
default={:u=>0, :name=>"Two Sample T Test", :tails=>:both}
@opts=default.merge(opts)
@name=@opts[:name]
@tails=@opts[:tails]
end
# Compute t, df and p for both the pooled (equal variance) and
# Welch (unequal variance) forms.
def compute
@t_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid,true)
@t_not_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid, false)
@df_equal_variance=T.df_equal_variance(@v1.n_valid, @v2.n_valid)
@df_not_equal_variance=T.df_not_equal_variance(@v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid)
@probability_equal_variance = p_using_cdf(Distribution::T.cdf(@t_equal_variance, @df_equal_variance), tails)
@probability_not_equal_variance = p_using_cdf(Distribution::T.cdf(@t_not_equal_variance, @df_not_equal_variance), tails)
end
# Cohen's d effect size: mean difference divided by a pooled standard
# deviation.
# FIX: the pooled SD must pool *variances*:
#   sqrt(((n1-1)*s1^2 + (n2-1)*s2^2) / (n1+n2))
# The original summed raw standard deviations instead of their squares,
# which is not a valid pooling and misstates the effect size.
def d
n1=@v1.n_valid
n2=@v2.n_valid
num=@v1.mean-@v2.mean
den=Math::sqrt( ((n1-1)*@v1.sd**2+(n2-1)*@v2.sd**2).quo(n1+n2))
num.quo(den)
end
# Full report: descriptives, Levene homogeneity test, both t statistics
# and the effect size.
def report_building(b) # :nodoc:
b.section(:name=>@name) {|g|
g.table(:name=>_("Mean and standard deviation"), :header=>[_("Variable"), _("mean"), _("sd"),_("n")]) {|t|
t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd,@v1.n_valid])
t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.n_valid])
}
g.parse_element(Statsample::Test.levene([@v1,@v2],:name=>_("Levene test for equality of variances")))
g.table(:name=>_("T statistics"),:header=>["Type","t","df", "p (#{tails} tails)"].map{|v| _(v)}) {|t|
t.row([_("Equal variance"), "%0.4f" % t_equal_variance, df_equal_variance, "%0.4f" % probability_equal_variance])
t.row([_("Non equal variance"), "%0.4f" % t_not_equal_variance, "%0.4f" % df_not_equal_variance, "%0.4f" % probability_not_equal_variance])
}
g.table(:name=>_("Effect size")) do |t|
t.row ['x1-x2', "%0.4f" % (@v1.mean-@v2.mean)]
t.row ['d', "%0.4f" % d]
end
}
end
end
end
end
end
================================================
FILE: lib/statsample/test/umannwhitney.rb
================================================
module Statsample
module Test
#
# = U Mann-Whitney test
#
# Non-parametric test for assessing whether two independent samples
# of observations come from the same distribution.
#
# == Assumptions
#
# * The two samples under investigation in the test are independent of each other and the observations within each sample are independent.
# * The observations are comparable (i.e., for any two observations, one can assess whether they are equal or, if not, which one is greater).
# * The variances in the two groups are approximately equal.
#
# Higher differences of distributions correspond to
# to lower values of U.
#
class UMannWhitney
# Max for m*n allowed for exact calculation of probability
MAX_MN_EXACT=10000
# U sampling distribution, based on Dinneen & Blakesley (1973) algorithm.
# This is the algorithm used on SPSS.
#
# Parameters:
# * n1: group 1 size
# * n2: group 2 size
# == Reference:
# * Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann- Whitney U Statistic. Journal of the Royal Statistical Society, 22(2), 269-273
#
# U sampling distribution, based on Dinneen & Blakesley (1973) AS 62.
# This is the algorithm used on SPSS. Returns an array of P(U=i) for
# i in 0..floor(max_u/2); the distribution is symmetric, so off-center
# frequencies are doubled.
#
# Parameters:
# * n1: group 1 size
# * n2: group 2 size
# == Reference:
# * Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann-Whitney U Statistic. Journal of the Royal Statistical Society, 22(2), 269-273
#
# FIX: the extract had collapsed two lines into the invalid
# `minmn=n1n2 ? n1 : n2` (an HTML-stripping artifact that swallowed the
# `<`/`>` comparisons); the separate min/max computations are restored
# per the upstream statsample implementation.
def self.u_sampling_distribution_as62(n1,n2)
  freq=[]
  work=[]
  mn1=n1*n2+1
  max_u=n1*n2
  minmn=n1<n2 ? n1 : n2
  maxmn=n1>n2 ? n1 : n2
  n1=maxmn+1
  (1..n1).each{|i| freq[i]=1}
  n1+=1
  (n1..mn1).each{|i| freq[i]=0}
  work[1]=0
  xin=maxmn
  (2..minmn).each do |i|
    work[i]=0
    xin=xin+maxmn
    n1=xin+2
    l=1+xin.quo(2)
    k=i
    (1..l).each do |j|
      k=k+1
      n1=n1-1
      sum=freq[j]+work[j]
      freq[j]=sum
      work[k]=sum-freq[n1]
      freq[n1]=sum
    end
  end
  # Generate percentages for normal U
  dist=(1+max_u/2).to_i
  freq.shift
  total=freq.inject(0) {|a,v| a+v }
  (0...dist).collect {|i|
    if i!=max_u-i
      ues=freq[i]*2
    else
      ues=freq[i]
    end
    ues.quo(total)
  }
end
# Generate distribution for permutations.
# Very expensive, but useful for demostrations
# Builds the exact distribution of U by enumerating every permutation of
# group labels (via Statsample::Permutation). Exponential cost — only
# usable for small samples / demonstrations. Returns a Hash mapping each
# permutation to its U value.
def self.distribution_permutations(n1,n2)
base=[0]*n1+[1]*n2
po=Statsample::Permutation.new(base)
total=n1*n2
req={}
po.each do |perm|
# r0: rank sum of group 0; s0: its size in this permutation
r0,s0=0,0
perm.each_index {|c_i|
if perm[c_i]==0
r0+=c_i+1
s0+=1
end
}
# U for group 0 from its rank sum, then take the smaller of u1/u2
u1=r0-((s0*(s0+1)).quo(2))
u2=total-u1
temp_u= (u1 <= u2) ? u1 : u2
req[perm]=temp_u
end
req
end
# Sample 1 Rank sum
attr_reader :r1
# Sample 2 Rank sum
attr_reader :r2
# Sample 1 U (useful for demostration)
attr_reader :u1
# Sample 2 U (useful for demostration)
attr_reader :u2
# U Value
attr_reader :u
# Value of compensation for ties (useful for demostration)
attr_reader :t
# Name of test
attr_accessor :name
include Summarizable
#
# Create a new U Mann-Whitney test
# Params: Two Statsample::Vectors
#
# Create a new U Mann-Whitney test from two Statsample::Vectors.
# Computes both rank sums (r1, r2), both U statistics and the final
# U = min(u1, u2). A tie correction is accumulated when duplicate
# values exist.
#
# FIX: the extract had lost the lines between `@u=(u1` and
# `_("Mann-Whitney's U")}` (HTML-stripping artifact); restored
# `@u=(u1<u2) ? u1 : u2` and the opts_default hash per the upstream
# statsample implementation.
def initialize(v1,v2, opts=Hash.new)
  @v1=v1
  @v2=v2
  @n1=v1.valid_data.size
  @n2=v2.valid_data.size
  data=(v1.valid_data+v2.valid_data).to_scale
  groups=(([0]*@n1)+([1]*@n2)).to_vector
  ds={'g'=>groups, 'data'=>data}.to_dataset
  @t=nil
  @ties=data.data.size!=data.data.uniq.size
  if(@ties)
    adjust_for_ties(ds['data'])
  end
  ds['ranked']=ds['data'].ranked(:scale)
  @n=ds.cases
  @r1=ds.filter{|r| r['g']==0}['ranked'].sum
  @r2=((ds.cases*(ds.cases+1)).quo(2))-r1
  @u1=r1-((@n1*(@n1+1)).quo(2))
  @u2=r2-((@n2*(@n2+1)).quo(2))
  # U is the smaller of the two statistics
  @u=(u1<u2) ? u1 : u2
  opts_default={:name=>_("Mann-Whitney's U")}
  @opts=opts_default.merge(opts)
  opts_default.keys.each {|k|
    send("#{k}=", @opts[k])
  }
end
# Build the summary table: rank sums, U, the z approximation and — when
# n1*n2 < MAX_MN_EXACT — the exact probability.
#
# FIX: the extract had lost everything between `@n1*@n2` and `100000.`
# (HTML-stripping artifact), fusing report_building with the comment of
# probability_exact; both definitions are reconstructed following the
# upstream statsample implementation.
def report_building(generator) # :nodoc:
  generator.section(:name=>@name) do |s|
    s.table(:name=>_("%s results") % @name) do |t|
      t.row([_("Sum of ranks %s") % @v1.name, "%0.3f" % @r1])
      t.row([_("Sum of ranks %s") % @v2.name, "%0.3f" % @r2])
      t.row([_("U Value"), "%0.3f" % @u])
      t.row([_("Z"), "%0.3f (p: %0.3f)" % [z, probability_z]])
      if @n1*@n2 < MAX_MN_EXACT
        t.row([_("Exact probability"), "%0.3f" % probability_exact])
      end
    end
  end
end
# Exact probability of a U value lower than or equal to the sample's,
# from the exact sampling distribution.
# Use with caution for m*n > 100000.
# Uses u_sampling_distribution_as62
def probability_exact
  dist=UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
  sum=0
  (0..@u.to_i).each {|i|
    sum+=dist[i]
  }
  sum
end
# Adjunt for ties.
#
# == Reference:
# * http://europe.isixsigma.com/library/content/c080806a.asp
# Accumulates the tie-correction term into @t:
# sum over tied value groups of (t^3 - t)/12, later used by #z to
# shrink the variance of U.
def adjust_for_ties(data)
@t=data.frequencies.find_all{|k,v| v>1}.inject(0) {|a,v|
a+(v[1]**3-v[1]).quo(12)
}
end
private :adjust_for_ties
# Z value for U, with adjust for ties.
# For large samples, U is approximately normally distributed.
# In that case, you can use z to obtain probabily for U.
# == Reference:
# * SPSS Manual
# Normal (z) approximation for U, with tie correction when needed.
# E(U) = n1*n2/2; without ties SD(U) = sqrt(n1*n2*(n1+n2+1)/12); with
# ties the variance is reduced using the accumulated tie term @t.
# == Reference:
# * SPSS Manual
def z
  expected_u = (@n1*@n2).quo(2)
  if @ties
    n = @n1 + @n2
    ratio = (@n1*@n2).quo(n*(n-1))
    corrected = ((n**3 - n).quo(12)) - @t
    sd_u = Math::sqrt(ratio * corrected)
  else
    sd_u = Math::sqrt(((@n1*@n2)*(@n1+@n2+1)).quo(12))
  end
  (@u - expected_u).quo(sd_u)
end
# Assuming H_0, the proportion of cdf with values of U lower
# than the sample, using normal approximation.
# Use with more than 30 cases per group.
# Two-sided p-value from the normal approximation: 2*(1 - Phi(|z|))
def probability_z
(1-Distribution::Normal.cdf(z.abs()))*2
end
end
end
end
================================================
FILE: lib/statsample/test/wilcoxonsignedrank.rb
================================================
module Statsample
module Test
# From Wikipedia:
# The Wilcoxon signed-rank test is a non-parametric statistical hypothesis test used when comparing two related samples, matched samples, or repeated measurements on a single sample to assess whether their population mean ranks differ (i.e. it is a paired difference test). It can be used as an alternative to the paired Student's t-test, t-test for matched pairs, or the t-test for dependent samples when the population cannot be assumed to be normally distributed.
class WilcoxonSignedRank
include Statsample::Test
include Summarizable
# Name of the analysis
attr_accessor :name
# W statistic: sum of signed ranks of the non-zero differences
attr_reader :w
# Number of non-zero differences (effective sample size)
attr_reader :nr
# Tails for probability (:both, :left or :right)
attr_writer :tails
# Parameters:
# * v1, v2: paired vectors; differences are computed as v2 - v1
# * opts: :name, :tails
def initialize(v1,v2, opts=Hash.new)
@v1=v1
@v2=v2
opts_default={:name=>_("Wilcoxon Signed Rank Test"),:tails=>:both}
@opts=opts_default.merge(opts)
opts_default.keys.each {|k|
send("#{k}=", @opts[k])
}
calculate
end
# Computes W: rank |v2-v1| after discarding zero differences, then sum
# those ranks signed by the direction of each difference.
def calculate
# df here is a working Dataset, not degrees of freedom
df=Statsample::Dataset.new({'v1'=>@v1,'v2'=>@v2})
df["abs"]=df.collect {|row|
r=(row["v2"]-row["v1"]).abs
}
df["sgn"]=df.collect {|row|
r=row["v2"]-row["v1"]
r==0 ? 0 : r/r.abs
}
df=df.filter {|row| row["sgn"]!=0}
df["rank"]=df["abs"].ranked
@nr=df.cases
@w=df.collect {|row|
row["sgn"]*row["rank"]
}.sum
end
# Summary table: W, z approximation, and exact p when nr <= 10
def report_building(generator) # :nodoc:
generator.section(:name=>@name) do |s|
s.table(:name=>_("%s results") % @name) do |t|
t.row([_("W Value"), "%0.3f" % @w])
t.row([_("Z"), "%0.3f (p: %0.3f)" % [z, probability_z]])
if(nr<=10)
t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]])
end
end
end
end
# Normal approximation: sigma^2 = nr(nr+1)(2nr+1)/6 (the integer
# division is exact, since n(n+1)(2n+1) is always divisible by 6).
# NOTE(review): the continuity correction (w-0.5) assumes w > 0 —
# confirm behavior for negative W.
def z
sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6)
(w-0.5)/sigma
end
# Assuming normal distribution of W, this calculate
# the probability of samples with Z equal or higher than
# obtained on sample
# NOTE(review): uses z rather than |z|; for negative z with :both tails
# the result can exceed 1 — verify intended usage.
def probability_z
(1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1)
end
# Calculate exact probability by enumerating all 2^nr sign assignments
# (each bit decides the sign of one rank).
# Don't calculate for large Nr, please!
def probability_exact
str_format="%0#{nr}b"
combinations=2**nr
total_w=combinations.times.map {|i|
comb=sprintf(str_format,i)
w_local=comb.length.times.inject(0) {|ac,j|
sgn=comb[j]=="0" ? -1 : 1
ac+(j+1)*sgn
}
}.sort
total_w.find_all {|v|
if @tails==:both
v<=-w.abs or v>=w.abs
elsif @tails==:left
v<=w
elsif @tails==:right
v>=w
end
}.count/(combinations.to_f)
end
end
end
end
================================================
FILE: lib/statsample/test.rb
================================================
module Statsample
  # Module for several statistical tests and shared p-value helpers.
  module Test
    autoload(:UMannWhitney, 'statsample/test/umannwhitney')
    autoload(:Levene, 'statsample/test/levene')
    autoload(:T, 'statsample/test/t')
    autoload(:F, 'statsample/test/f')
    autoload(:ChiSquare, 'statsample/test/chisquare')
    autoload(:BartlettSphericity, 'statsample/test/bartlettsphericity')
    autoload(:KolmogorovSmirnov, 'statsample/test/kolmogorovsmirnov')
    autoload(:WilcoxonSignedRank, 'statsample/test/wilcoxonsignedrank')
    # Probability of a value at least as extreme as the sample's, given
    # the statistic's cdf and the tails of interest.
    #
    # * :left  : lower tail -> cdf itself
    # * :right : upper tail -> 1 - cdf
    # * :both  : two tails  -> twice the smaller tail
    # Aliases accepted: 2/:two (:both), 1/:positive (:right),
    # :negative (:left).
    def p_using_cdf(cdf, tails=:both)
      tails = :both  if tails == 2 || tails == :two
      tails = :right if tails == 1 || tails == :positive
      tails = :left  if tails == :negative
      case tails
      when :left  then cdf
      when :right then 1 - cdf
      when :both  then 2 * (cdf >= 0.5 ? 1 - cdf : cdf)
      end
    end
    # Critical t for a two-sided confidence interval at +confidence_level+
    def t_critical(confidence_level, df)
      -Distribution::T.p_value((1 - confidence_level) / 2.0, df)
    end
    # Critical z for a two-sided confidence interval at +confidence_level+
    def z_critical(confidence_level)
      -Distribution::Z.p_value((1 - confidence_level) / 2.0)
    end
    extend self
    class << self
      # Chi-square test, dispatching on the class of +observed+
      # (Statsample::Vector or Matrix).
      def chi_square(observed, expected=nil)
        if observed.is_a?(Vector)
          ChiSquare::WithVector.new(observed, expected)
        elsif observed.is_a?(Matrix)
          ChiSquare::WithMatrix.new(observed, expected)
        else
          raise "Not implemented for #{observed.class}"
        end
      end
      # Shorthand for Statsample::Test::UMannWhitney.new
      #
      # * v1 and v2 should be Statsample::Vector.
      def u_mannwhitney(v1, v2)
        UMannWhitney.new(v1, v2)
      end
      # Shorthand for Statsample::Test::T::OneSample.new
      def t_one_sample(vector, opts=Hash.new)
        T::OneSample.new(vector, opts)
      end
      # Shorthand for Statsample::Test::T::TwoSamplesIndependent.new
      def t_two_samples_independent(v1, v2, opts=Hash.new)
        T::TwoSamplesIndependent.new(v1, v2, opts)
      end
      # Shorthand for Statsample::Test::WilcoxonSignedRank.new
      def wilcoxon_signed_rank(v1, v2, opts=Hash.new)
        WilcoxonSignedRank.new(v1, v2, opts)
      end
      # Shorthand for Statsample::Test::Levene.new
      def levene(input, opts=Hash.new)
        Levene.new(input, opts)
      end
    end
  end
end
================================================
FILE: lib/statsample/vector/gsl.rb
================================================
module Statsample
class Vector
# GSL-backed overrides of Vector's statistical methods, mixed in when
# the rb-gsl bindings are available. A GSL::Vector of the valid scale
# data is memoized in @gsl and invalidated when the data changes.
module GSL_
# Drop the memoized GSL vector (called whenever data changes)
def clear_gsl
@gsl=nil
end
# Invalidate the GSL cache, then run the Ruby implementation
def set_valid_data
clear_gsl
set_valid_data_ruby
end
def push(v)
# If data is GSL::Vector, should be converted first to an Array
if @data.is_a? GSL::Vector
@data=@data.to_a
end
push_ruby(v)
end
# Memoized GSL::Vector over @scale_data (nil when there is no data)
def gsl
@gsl||=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
end
alias :to_gsl :gsl
# z-scores via GSL unless data is flawed (then fall back to Ruby)
def vector_standarized_compute(m,sd)
if flawed?
vector_standarized_compute_ruby(m,sd)
else
gsl.collect {|x| (x.to_f - m).quo(sd)}.to_scale
end
end
# Mean-centered values via GSL unless data is flawed
def vector_centered_compute(m)
if flawed?
vector_centered_compute_ruby(m)
else
gsl.collect {|x| (x.to_f - m)}.to_scale
end
end
# Bootstrap-style sampling with replacement (GSL RNG for :scale data)
def sample_with_replacement(sample=1)
if(@type!=:scale)
sample_with_replacement_ruby(sample)
else
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
Statsample::Vector.new(r.sample(gsl, sample).to_a,:scale)
end
end
# Sampling without replacement (GSL RNG for :scale data)
def sample_without_replacement(sample=1)
if(@type!=:scale)
sample_without_replacement_ruby(sample)
else
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
r.choose(gsl, sample).to_a
end
end
def median
if @type!=:scale
median_ruby
else
sorted=GSL::Vector.alloc(@scale_data.sort)
GSL::Stats::median_from_sorted_data(sorted)
end
end
def sum
check_type :scale
gsl.nil? ? nil : gsl.sum
end
def mean
check_type :scale
gsl.nil? ? nil : gsl.mean
end
def variance_sample(m=nil)
check_type :scale
m||=mean
# NOTE(review): m is computed but not passed to variance_m — confirm
# whether gsl.variance_m(m) was intended (cf. standard_deviation_sample).
gsl.nil? ? nil : gsl.variance_m
end
def standard_deviation_sample(m=nil)
check_type :scale
m||=mean
gsl.nil? ? nil : gsl.sd(m)
end
def variance_population(m=nil) # :nodoc:
check_type :scale
m||=mean
gsl.nil? ? nil : gsl.variance_with_fixed_mean(m)
end
def standard_deviation_population(m=nil) # :nodoc:
check_type :scale
m||=mean
gsl.nil? ? nil : gsl.sd_with_fixed_mean(m)
end
def skew # :nodoc:
check_type :scale
gsl.nil? ? nil : gsl.skew
end
def kurtosis # :nodoc:
check_type :scale
gsl.nil? ? nil : gsl.kurtosis
end
end
end
end
================================================
FILE: lib/statsample/vector.rb
================================================
require 'date'
require 'statsample/vector/gsl'
module Statsample::VectorShorthands
  # Build a Statsample::Vector from this collection.
  # Arguments are forwarded to Statsample::Vector.new.
  def to_vector(*args)
    Statsample::Vector.new(self, *args)
  end

  # Build a :scale (numeric) Statsample::Vector from this collection.
  def to_scale(*args)
    to_vector(:scale, *args)
  end
end
# Make #to_vector / #to_scale available on every Array
class Array
include Statsample::VectorShorthands
end
# When rb-gsl is available, GSL::Vector also gets the shorthands
if Statsample.has_gsl?
module GSL
class Vector
include Statsample::VectorShorthands
end
end
end
module Statsample
# Collection of values on one dimension. Works as a column on a Spreadsheet.
#
# == Usage
# The fast way to create a vector uses Array.to_vector or Array.to_scale.
#
# v=[1,2,3,4].to_vector(:scale)
# v=[1,2,3,4].to_scale
#
class Vector
include Enumerable
include Writable
include Summarizable
include Statsample::VectorShorthands
# Level of measurement. Could be :nominal, :ordinal or :scale
attr_reader :type
# Original data.
attr_reader :data
# Valid data. Equal to data, minus values assigned as missing values
attr_reader :valid_data
# Array of values considered as missing. Nil is a missing value, by default
attr_reader :missing_values
# Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
attr_reader :today_values
# Missing values array
attr_reader :missing_data
# Original data, with all missing values replaced by nils
attr_reader :data_with_nils
# Date date, with all missing values replaced by nils
attr_reader :date_data_with_nils
# Change label for specific values
attr_accessor :labels
# Name of vector. Should be used for output by many classes
attr_accessor :name
# Creates a new Vector object.
# * data Any data which can be converted on Array
# * type Level of meausurement. See Vector#type
# * opts Hash of options
# * :missing_values Array of missing values. See Vector#missing_values
# * :today_values Array of 'today' values. See Vector#today_values
# * :labels Labels for data values
# * :name Name of vector
# * data: anything convertible to an Array
# * type: measurement level (:nominal, :ordinal or :scale)
# * opts: :missing_values, :today_values, :labels, :name
def initialize(data=[], type=:nominal, opts=Hash.new)
@data=data.is_a?(Array) ? data : data.to_a
@type=type
opts_default={
:missing_values=>[],
:today_values=>['NOW','TODAY', :NOW, :TODAY],
:labels=>{},
:name=>nil
}
@opts=opts_default.merge(opts)
if @opts[:name].nil?
# Auto-name unnamed vectors ("Vector 1", "Vector 2", ...).
# NOTE(review): @@n_table is a class variable, shared process-wide.
@@n_table||=0
@@n_table+=1
@opts[:name]="Vector #{@@n_table}"
end
@missing_values=@opts[:missing_values]
@labels=@opts[:labels]
@today_values=@opts[:today_values]
@name=@opts[:name]
@valid_data=[]
@data_with_nils=[]
@date_data_with_nils=[]
@missing_data=[]
@has_missing_data=nil
@scale_data=nil
# Populate valid/missing partitions, then run the type= hook
set_valid_data
self.type=type
end
# Create a vector using (almost) any object
# * Array: flattened
# * Range: transformed using to_a
# * Statsample::Vector
# * Numeric and string values
# Builds a Vector from a mix of arrays (flattened), vectors, ranges and
# scalar values, promoting to :scale when all values are numeric.
def self.[](*args)
values=[]
args.each do |a|
case a
when Array
values.concat a.flatten
when Statsample::Vector
values.concat a.to_a
when Range
values.concat a.to_a
else
values << a
end
end
vector=new(values)
# Promote to :scale when the data allows it
vector.type=:scale if vector.can_be_scale?
vector
end
# Create a new scale type vector
# Parameters
# [n] Size
# [val] Value of each value
# [&block] If block provided, is used to set the values of vector
# Build a :scale vector of +n+ elements.
# [n] number of elements
# [val] value used for every element (ignored when a block is given)
# [&block] called with each index to produce that element
def self.new_scale(n, val=nil, &block)
  generator = block || proc { val }
  vector = n.times.map { |idx| generator.call(idx) }.to_scale
  vector.type = :scale
  vector
end
# Creates a duplicate of the Vector.
# Note: data, missing_values and labels are duplicated, so
# changes on original vector doesn't propages to copies.
def dup
# Note: data, missing_values and labels are copied; name is shared
Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name)
end
# Returns an empty duplicate of the vector. Maintains the type,
# missing values and labels.
def dup_empty
# Same type, missing values and labels, but no data
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
end
# Use the native (C) check_type when the extension provides it;
# otherwise fall back to the pure-Ruby _check_type.
if Statsample::STATSAMPLE__.respond_to?(:check_type)
# Raises an exception if type of vector is inferior to t type
def check_type(t)
Statsample::STATSAMPLE__.check_type(self,t)
end
else
def check_type(t) #:nodoc:
_check_type(t)
end
end
# Pure-Ruby type guard: raises NoMethodError when the vector's
# measurement level is below the requested one, or when :date is
# involved on either side.
def _check_type(t) #:nodoc:
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type)
end
# z-scores ((x-m)/sd), preserving nils; overridden by the GSL module
def vector_standarized_compute(m,sd) # :nodoc:
@data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:scale)
end
# Return a vector usign the standarized values for data
# with sd with denominator n-1. With variance=0 or mean nil,
# returns a vector of equal size full of nils
#
def vector_standarized(use_population=false)
check_type :scale
m=mean
# population (n) or sample (n-1) standard deviation
sd=use_population ? sdp : sds
# Undefined mean or zero variance: return an all-nil vector
return ([nil]*size).to_scale if mean.nil? or sd==0.0
vector=vector_standarized_compute(m,sd)
vector.name=_("%s(standarized)") % @name
vector
end
# Deviations from m, preserving nils; overridden by the GSL module
def vector_centered_compute(m) #:nodoc:
@data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_scale
end
# Return a centered vector
def vector_centered
check_type :scale
m=mean
# Undefined mean: return an all-nil vector of the same size
return ([nil]*size).to_scale if mean.nil?
vector=vector_centered_compute(m)
vector.name=_("%s(centered)") % @name
vector
end
alias_method :standarized, :vector_standarized
alias_method :centered, :vector_centered
# Return a vector with values replaced with the percentiles
# of each values
def vector_percentil
check_type :ordinal
c=@valid_data.size
# rank/n * 100, preserving nils
vector=ranked.map {|i| i.nil? ? nil : (i.quo(c)*100).to_f }.to_vector(@type)
vector.name=_("%s(percentil)") % @name
vector
end
# Box-Cox power transformation: ln(x) when lambda is 0, otherwise
# (x^lambda - 1)/lambda. Nils are preserved. Requires a :scale vector.
def box_cox_transformation(lambda) # :nodoc:
  raise "Should be a scale" unless @type==:scale
  @data_with_nils.collect { |value|
    next nil if value.nil?
    lambda == 0 ? Math.log(value) : (value**lambda - 1).quo(lambda)
  }.to_vector(:scale)
end
# Vector equality.
# Two vectors will be the same if their data, missing values, type
# and labels are equal. Anything that is not a Statsample::Vector
# compares unequal.
def ==(v2)
  return false unless v2.instance_of? Statsample::Vector
  @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
end
# Marshal support: serializes the vector's defining state as a hash.
def _dump(i) # :nodoc:
  Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type,'name'=>@name})
end
# Marshal support: rebuilds a Vector from a payload written by _dump.
def self._load(data) # :nodoc:
  h=Marshal.load(data)
  Vector.new(h['data'], h['type'], :missing_values=> h['missing_values'], :labels=>h['labels'], :name=>h['name'])
end
# Returns a new vector, with data modified by block.
# Equivalent to creating a Vector after #collect on data.
# +type+ defaults to the receiver's type.
def recode(type=nil)
  type||=@type
  @data.collect{|x|
    yield x
  }.to_vector(type)
end
# Modifies current vector, with data modified by block.
# Equivalent to #collect! on @data; the valid/missing caches are
# recomputed afterwards.
def recode!
  @data.collect!{|x|
    yield x
  }
  set_valid_data
end
# Appends +v+ and immediately refreshes the caches. For bulk
# insertion prefer #add(v, false) plus one final #set_valid_data.
def push(v)
  @data.push(v)
  set_valid_data
end
# Dichotomize the vector with 0 and 1, based on lowest value.
# If parameter +low+ is defined, that value and lower
# will be 0 and higher values 1; nils are preserved.
def dichotomize(low=nil)
  fs=factors
  # Reuse the already-computed factors instead of calling #factors
  # a second time (each call performs a uniq + sort pass).
  low||=fs.min
  @data_with_nils.collect{|x|
    if x.nil?
      nil
    elsif x>low
      1
    else
      0
    end
  }.to_scale
end
# Yields every element of the vector, in order.
# Equivalent to @data.each.
def each
  @data.each { |item| yield(item) }
end
# Yields each index of the vector, from 0 up to size-1.
def each_index
  @data.size.times { |idx| yield(idx) }
end
# Add a value at the end of the vector.
# If second argument set to false, you should update the Vector using
# Vector#set_valid_data at the end of your insertion cycle.
#
def add(v,update_valid=true)
  @data.push(v)
  set_valid_data if update_valid
end
# Update valid_data, missing_data, data_with_nils and gsl
# at the end of an insertion.
#
# Use after Vector#add(v,false)
# Usage:
#   v=Statsample::Vector.new
#   v.add(2,false)
#   v.add(4,false)
#   v.data
#   => [2,4]
#   v.valid_data
#   => []
#   v.set_valid_data
#   v.valid_data
#   => [2,4]
def set_valid_data
  # Clear every derived cache, then rebuild them from @data.
  @valid_data.clear
  @missing_data.clear
  @data_with_nils.clear
  @date_data_with_nils.clear
  set_valid_data_intern
  set_scale_data if(@type==:scale)
  set_date_data if(@type==:date)
end
# Choose, at class-load time, the native-extension implementation of
# the valid-data scan when available, or the pure-Ruby fallback.
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
  def set_valid_data_intern #:nodoc:
    Statsample::STATSAMPLE__.set_valid_data_intern(self)
  end
else
  def set_valid_data_intern #:nodoc:
    _set_valid_data_intern
  end
end
# Pure-Ruby scan of @data: valid values go to @valid_data, invalid
# ones to @missing_data, and @data_with_nils mirrors the original
# order with invalid entries replaced by nil. Also refreshes the
# @has_missing_data flag.
def _set_valid_data_intern #:nodoc:
  @data.each do |value|
    if is_valid? value
      @valid_data.push(value)
      @data_with_nils.push(value)
    else
      @missing_data.push(value)
      @data_with_nils.push(nil)
    end
  end
  @has_missing_data = !@missing_data.empty?
end
# Retrieves true if data has one or more missing values.
def has_missing_data?
  @has_missing_data
end
alias :flawed? :has_missing_data?
# Retrieves the label registered for value +x+; falls back to
# x.to_s when no label is defined.
def labeling(x)
  if @labels.key?(x)
    @labels[x].to_s
  else
    x.to_s
  end
end
alias :label :labeling
# Returns a new Vector where every value that has a registered
# label is replaced by that label; unlabeled values pass through.
def vector_labeled
  relabeled = @data.map { |value| @labels.fetch(value, value) }
  Vector.new(relabeled, @type)
end
# Size of total data (valid and missing values alike).
def size
  @data.size
end
alias_method :n, :size
# Retrieves the i-th element of data.
def [](i)
  @data[i]
end
# Set the i-th element of data.
# Note: Use set_valid_data afterwards if you include missing values,
# since the caches are not refreshed here.
def []=(i,v)
  @data[i]=v
end
# Return true if a value is valid: neither nil nor registered in
# the missing values list.
def is_valid?(x)
  !x.nil? && !@missing_values.include?(x)
end
# Set missing_values.
# set_valid_data is called after changes, so all caches stay in sync.
def missing_values=(vals)
  @missing_values = vals
  set_valid_data
end
# Set data considered as "today" on date vectors.
# set_valid_data is called after changes.
def today_values=(vals)
  @today_values = vals
  set_valid_data
end
# Set level of measurement (:nominal, :ordinal, :scale or :date),
# rebuilding the type-specific caches as needed.
def type=(t)
  @type=t
  set_scale_data if(t==:scale)
  set_date_data if (t==:date)
end
# Array copy of the data. When the backing store already is an
# Array, a defensive #dup is returned so callers can't mutate the
# vector through it; otherwise the store's #to_a is used.
def to_a
  @data.kind_of?(Array) ? @data.dup : @data.to_a
end
alias_method :to_ary, :to_a
# Vector sum.
# - If v is a scalar, add this value to all elements.
# - If v is an Array or a Vector, it should be of the same size of
#   this vector; every item of this vector will be added to the
#   value of the item at the same position on the other vector.
def +(v)
  _vector_ari("+",v)
end
# Vector subtraction.
# - If v is a scalar, subtract this value from all elements.
# - If v is an Array or a Vector, it should be of the same size of
#   this vector; every item of the other vector will be subtracted
#   from the value of the item at the same position on this vector.
def -(v)
  _vector_ari("-",v)
end
# Vector product; same scalar/Array/Vector rules as #+ and #-.
def *(v)
  _vector_ari("*",v)
end
# Reports all values that don't comply with the given predicate
# block. Returns a hash mapping each offending index to its value.
def verify
  offenders = {}
  @data.each_with_index do |value, idx|
    offenders[idx] = value unless yield(value)
  end
  offenders
end
# Generic elementwise arithmetic helper backing #+, #- and #*.
# - Vector/Array operand: applies +method+ pairwise; the result is
#   nil wherever either operand is missing (invalid Vector entry or
#   nil Array entry). Raises ArgumentError on size mismatch.
# - Scalar operand responding to +method+: applies it to every
#   non-nil element.
# Always returns a new :scale Vector; raises TypeError otherwise.
def _vector_ari(method,v) # :nodoc:
  if(v.is_a? Vector or v.is_a? Array)
    raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
    sum=[]
    v.size.times {|i|
      if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
        sum.push(@data[i].send(method,v[i]))
      else
        sum.push(nil)
      end
    }
    Statsample::Vector.new(sum, :scale)
  elsif(v.respond_to? method )
    Statsample::Vector.new(
      @data.collect {|x|
        if(!x.nil?)
          x.send(method,v)
        else
          nil
        end
      } , :scale)
  else
    raise TypeError,"You should pass a scalar or a array/vector"
  end
end
# Return an array with the data split by a separator.
#   a=Vector.new(["a,b","c,d","a,b","d"])
#   a.splitted
#   # => [["a","b"],["c","d"],["a","b"],["d"]]
# nils stay nil; values without #split are wrapped in an Array.
def splitted(sep=Statsample::SPLIT_TOKEN)
  @data.map do |item|
    if item.nil?
      nil
    elsif item.respond_to?(:split)
      item.split(sep)
    else
      [item]
    end
  end
end
# Returns a hash of Vectors, one per distinct token found after
# splitting the data. Each Vector is a 0/1 indicator of whether the
# token appears in the corresponding datum (nil rows stay nil).
# Example:
#
#   a=Vector.new(["a,b","c,d","a,b"])
#   a.split_by_separator
#   # => {"a"=>Vector[1,0,1], "b"=>Vector[1,0,1],
#   #     "c"=>Vector[0,1,0], "d"=>Vector[0,1,0]}
#
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
  split_data=splitted(sep)
  factors=split_data.flatten.uniq.compact
  out=factors.inject({}) {|a,x|
    a[x]=[]
    a
  }
  split_data.each do |r|
    if r.nil?
      # A nil datum yields nil for every factor (missing, not 0).
      factors.each do |f|
        out[f].push(nil)
      end
    else
      factors.each do |f|
        out[f].push(r.include?(f) ? 1:0)
      end
    end
  end
  out.inject({}){|s,v|
    s[v[0]]=Vector.new(v[1],:nominal)
    s
  }
end
# Frequency of each token across the whole vector: the sum of each
# indicator Vector produced by #split_by_separator.
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
  split_by_separator(sep).inject({}) {|a,v|
    a[v[0]]=v[1].inject {|s,x| s+x.to_i}
    a
  }
end
# == Bootstrap
# Generate +nr+ resamples (with replacement) of size +s+
# from vector, computing each estimate from +estimators+
# over each resample.
# +estimators+ could be
# a) Hash with variable names as keys and lambdas as values
#      a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
# b) Array with names of method to bootstrap
#      a.bootstrap([:mean, :sd],1000)
# c) A single method to bootstrap
#      a.bootstrap(:mean, 1000)
# If s is nil, it is set to the vector size by default.
#
# Returns a dataset where each vector is a vector
# of length +nr+ containing the computed resample estimates.
def bootstrap(estimators, nr, s=nil)
  s||=n
  h_est, es, bss= prepare_bootstrap(estimators)
  nr.times do |i|
    # NOTE(review): bs is the plain Array returned by
    # sample_with_replacement; estimators must therefore accept
    # Arrays — verify against upstream, which may convert to_scale.
    bs=sample_with_replacement(s)
    es.each do |estimator|
      # Add bootstrap
      bss[estimator].push(h_est[estimator].call(bs))
    end
  end
  es.each do |est|
    bss[est]=bss[est].to_scale
    bss[est].type=:scale
  end
  bss.to_dataset
end
# == Jacknife
# Returns a dataset with jacknife delete-+k+ +estimators+.
# +estimators+ could be:
# a) Hash with variable names as keys and lambdas as values
#      a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
# b) Array with method names to jacknife
#      a.jacknife([:mean, :sd])
# c) A single method to jacknife
#      a.jacknife(:mean)
# +k+ represents the block size for block jacknife. By default
# it is set to 1, for classic delete-one jacknife.
#
# Returns a dataset where each vector is a vector
# of length +cases+/+k+ containing the computed jacknife estimates
# (pseudovalues).
#
# == Reference:
# * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
def jacknife(estimators, k=1)
  raise "n should be divisible by k:#{k}" unless n%k==0
  nb=(n / k).to_i
  h_est, es, ps= prepare_bootstrap(estimators)
  # Full-sample estimate for each estimator (used in pseudovalues).
  est_n=es.inject({}) {|h,v|
    h[v]=h_est[v].call(self)
    h
  }
  nb.times do |i|
    # Delete the i-th block of size k and re-estimate on the rest.
    other=@data_with_nils.dup
    other.slice!(i*k,k)
    other=other.to_scale
    es.each do |estimator|
      # Add pseudovalue
      ps[estimator].push( nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other))
    end
  end
  es.each do |est|
    ps[est]=ps[est].to_scale
    ps[est].type=:scale
  end
  ps.to_dataset
end
# For an array, hash or single symbol of estimator methods, returns
# an array with three elements:
# 1. A hash with estimator names as keys and lambdas as values
# 2. An array with estimator names
# 3. A hash with estimator names as keys and empty arrays as values
#    (accumulators for resample estimates)
def prepare_bootstrap(estimators)
  h_est=estimators
  # Normalize a single method name to a one-element array.
  h_est=[h_est] unless h_est.is_a? Array or h_est.is_a? Hash
  if h_est.is_a? Array
    # Method names become lambdas that send the method to a vector.
    h_est=h_est.inject({}) {|h,est|
      h[est]=lambda {|v| v.send(est)}
      h
    }
  end
  bss=h_est.keys.inject({}) {|h,v| h[v]=[];h}
  [h_est,h_est.keys, bss]
end
private :prepare_bootstrap
# Returns a random sample of size +sample+, drawn with replacement
# from the valid data only. On every draw each valid item has the
# same probability of being selected.
def sample_with_replacement(sample=1)
  pool_size = @valid_data.size
  Array.new(sample) { @valid_data[rand(pool_size)] }
end
# Returns a random sample of size +sample+, without replacement,
# only with valid data.
#
# Every element could only be selected once.
#
# A sample of the same size of the vector is the vector itself.
#
# NOTE(review): the loop body was corrupted in extraction (merged
# with the following comment); restored from upstream statsample —
# verify against the original repository.
def sample_without_replacement(sample=1)
  raise ArgumentError, "Sample size couldn't be greater than n" if sample > @valid_data.size
  out  = []
  size = @valid_data.size
  # Draw distinct indices until the requested sample size is reached.
  while out.size < sample
    value = rand(size)
    out.push(value) unless out.include?(value)
  end
  out.collect { |i| @data[i] }
end
# Retrieves the vector as a 1xn or nx1 Matrix.
# +dir+ could be :horizontal (row matrix) or :vertical (column
# matrix); any other value returns nil.
def to_matrix(dir=:horizontal)
  case dir
  when :horizontal
    Matrix[@data]
  when :vertical
    Matrix.columns([@data])
  end
end
# Human-readable representation; delegates to #to_s.
def inspect
  self.to_s
end
# Retrieves unique values for data, sorted ascending. The source
# collection depends on the type: @scale_data for :scale,
# @date_data_with_nils for :date, @valid_data otherwise.
def factors
  pool = case @type
         when :scale then @scale_data
         when :date  then @date_data_with_nils
         else @valid_data
         end
  pool.uniq.sort
end
# Choose, at class-load time, the native-extension frequency count
# when available, or the pure-Ruby fallback.
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
  # Returns a hash with the distribution of frequencies for
  # the sample
  def frequencies
    Statsample::STATSAMPLE__.frequencies(@valid_data)
  end
else
  def frequencies #:nodoc:
    _frequencies
  end
end
# Pure-Ruby frequency count over valid data:
# {value => number of occurrences}.
def _frequencies #:nodoc:
  @valid_data.each_with_object({}) do |value, counts|
    counts[value] = (counts[value] || 0) + 1
  end
end
# Returns the most frequent item among the valid data (on ties,
# the first value encountered in the frequency table wins).
def mode
  frequencies.max_by { |_value, count| count }.first
end
# The number of items with valid (non-missing) data.
def n_valid
  @valid_data.size
end
# Returns a hash with the distribution of proportions of
# the sample: {value => frequency / n_valid} (Rational via #quo).
def proportions
  frequencies.inject({}){|a,v|
    a[v[0]] = v[1].quo(n_valid)
    a
  }
end
# Proportion of a given value over the valid data.
# Raises NoMethodError (nil.quo) when +v+ never occurs.
def proportion(v=1)
  frequencies[v].quo(@valid_data.size)
end
# ReportBuilder hook: renders a summary section for this vector —
# counts, and per-type statistics (distribution table for :nominal,
# median for :ordinal/:scale, moments for :scale).
def report_building(b)
  b.section(:name=>name) do |s|
    s.text _("n :%d") % n
    s.text _("n valid:%d") % n_valid
    if @type==:nominal
      s.text _("factors:%s") % factors.join(",")
      s.text _("mode: %s") % mode
      s.table(:name=>_("Distribution")) do |t|
        frequencies.sort.each do |k,v|
          # Show the label when one is defined for the value.
          key=labels.has_key?(k) ? labels[k]:k
          t.row [key, v , ("%0.2f%%" % (v.quo(n_valid)*100))]
        end
      end
    end
    s.text _("median: %s") % median.to_s if(@type==:ordinal or @type==:scale)
    if(@type==:scale)
      s.text _("mean: %0.4f") % mean
      # sd is nil/false-y when it cannot be computed; skip moments then.
      if sd
        s.text _("std.dev.: %0.4f") % sd
        s.text _("std.err.: %0.4f") % se
        s.text _("skew: %0.4f") % skew
        s.text _("kurtosis: %0.4f") % kurtosis
      end
    end
  end
end
# Variance of the proportion of value +v+, according to population
# size (simple random sampling; delegates to Statsample::SRS helpers).
def variance_proportion(n_poblation, v=1)
  Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
end
# Variance of the estimated total of value +v+, according to
# population size.
def variance_total(n_poblation, v=1)
  Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
end
# Confidence interval for the proportion of +v+, using the t
# distribution, at confidence level +margin+.
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
  Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
end
# Confidence interval for the proportion of +v+, using the normal
# (z) approximation, at confidence level +margin+.
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
  Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
end
# For every *_slow method (pure-Ruby fallback), alias the base name
# to it unless a (presumably faster) base implementation already
# exists — e.g. #mean_slow becomes #mean when no #mean is defined.
self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
  met_or=met.gsub("_slow","")
  if !self.method_defined?(met_or)
    alias_method met_or, met
  end
end
######
### Ordinal Methods
######
# == Percentil
# Returns the value of the percentile q
#
# Accepts an optional second argument specifying the strategy to interpolate
# when the requested percentile lies between two data points a and b.
# Valid strategies are:
# * :midpoint (Default): (a + b) / 2
# * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
#   This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
#
def percentil(q, strategy = :midpoint)
  check_type :ordinal
  sorted=@valid_data.sort
  case strategy
  when :midpoint
    # v is the (rational) rank n*q/100; fractional rank picks the
    # element at its floor, integral rank averages the two values
    # around position v.
    v = (n_valid * q).quo(100)
    if(v.to_i!=v)
      sorted[v.to_i]
    else
      (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
    end
  when :linear
    # NIST rank (q/100)*(n+1); k is its integer part, d its
    # fractional part used for linear interpolation. Ranks outside
    # [1, n] are clamped to the extreme values.
    index = (q / 100.0) * (n_valid + 1)
    k = index.truncate
    d = index % 1
    if k == 0
      sorted[0]
    elsif k >= sorted.size
      sorted[-1]
    else
      sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
    end
  else
    raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
  end
end
# Returns a ranked vector: each value is replaced by its midrank
# (tied values share the average of the ranks they span). Missing
# data maps to nil, since it has no entry in the rank table.
def ranked(type=:ordinal)
  check_type :ordinal
  i=0
  # Build {value => midrank} walking values in ascending order;
  # a run of v[1] ties starting after rank i gets (i+1 + i+v[1])/2.
  r=frequencies.sort.inject({}){|a,v|
    a[v[0]]=(i+1 + i+v[1]).quo(2)
    i+=v[1]
    a
  }
  @data.collect {|c| r[c] }.to_vector(type)
end
# Return the median (percentile 50).
def median
  check_type :ordinal
  percentil(50)
end
# Minimum valid value.
def min
  check_type :ordinal
  @valid_data.min
end
# Maximum valid value.
def max
  check_type :ordinal
  @valid_data.max
end
# Rebuilds @date_data_with_nils from @data, coercing each entry to
# a Date: Dates pass through, Times are truncated to their day,
# "YYYY-MM-DD" / "YYYY/MM/DD" strings are parsed, today-values map
# to Date.today and missing values / nils map to nil.
# NOTE(review): a value matching none of these branches also falls
# through to nil (no else clause) — confirm that is intended.
def set_date_data
  @date_data_with_nils=@data.collect do|x|
    if x.is_a? Date
      x
    elsif x.is_a? Time
      Date.new(x.year, x.month, x.day)
    elsif x.is_a? String and x=~/(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/
      Date.new($1.to_i,$2.to_i,$3.to_i)
    elsif @today_values.include? x
      Date.today()
    elsif @missing_values.include? x or x.nil?
      nil
    end
  end
end
# Rebuilds @scale_data from the valid data, coercing to numbers:
# Numerics pass through; strings become Integers when the integer
# and float readings agree (e.g. "3", "3.0"), Floats otherwise.
def set_scale_data
  @scale_data=@valid_data.collect do|x|
    if x.is_a? Numeric
      x
    elsif x.is_a? String and x.to_i==x.to_f
      x.to_i
    else
      x.to_f
    end
  end
end
private :set_date_data, :set_scale_data
# The range of the data (max - min).
def range;
  check_type :scale
  @scale_data.max - @scale_data.min
end
# The sum of values for the valid data.
def sum
  check_type :scale
  @scale_data.inject(0){|a,x|x+a} ;
end
# The arithmetical mean of data (sum / n_valid, as Float).
def mean
  check_type :scale
  sum.to_f.quo(n_valid)
end
# Sum of squares for the data around a value.
# By default, this value is the mean.
#   ss = sum{(xi-m)^2}
#
def sum_of_squares(m=nil)
  check_type :scale
  m||=mean
  @scale_data.inject(0){|a,x| a+(x-m).square}
end
# Sum of squared deviations, computed as sum(x^2) - (sum(x))^2/n.
def sum_of_squared_deviation
  check_type :scale
  @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
end
# Population variance (denominator N), via E[x^2] - m^2.
def variance_population(m=nil)
  check_type :scale
  m||=mean
  squares=@scale_data.inject(0){|a,x| x.square+a}
  squares.quo(n_valid) - m.square
end
# Population standard deviation (denominator N).
def standard_deviation_population(m=nil)
  check_type :scale
  Math::sqrt( variance_population(m) )
end
# Population average (mean absolute) deviation (denominator N).
# author: Al Chou
def average_deviation_population( m = nil )
  check_type :scale
  m ||= mean
  ( @scale_data.inject( 0 ) { |a, x| ( x - m ).abs + a } ).quo( n_valid )
end
# Median of the absolute deviations from the median (MAD).
def median_absolute_deviation
  med=median
  recode {|x| (x-med).abs}.median
end
alias :mad :median_absolute_deviation
# Sample variance (denominator n-1).
def variance_sample(m=nil)
  check_type :scale
  m||=mean
  sum_of_squares(m).quo(n_valid - 1)
end
# Sample standard deviation (denominator n-1).
def standard_deviation_sample(m=nil)
  check_type :scale
  m||=mean
  Math::sqrt(variance_sample(m))
end
# Skewness of the sample: third central moment over sd^3
# (sd computed with denominator n-1 here).
def skew(m=nil)
  check_type :scale
  m||=mean
  th=@scale_data.inject(0){|a,x| a+((x-m)**3)}
  th.quo((@scale_data.size)*sd(m)**3)
end
# Excess kurtosis of the sample: fourth central moment over sd^4,
# minus 3.
def kurtosis(m=nil)
  check_type :scale
  m||=mean
  fo=@scale_data.inject(0){|a,x| a+((x-m)**4)}
  fo.quo((@scale_data.size)*sd(m)**4)-3
end
# Product of all (valid, numeric) values on the sample.
def product
  check_type :scale
  @scale_data.inject(1, :*)
end
# With a fixnum, creates X bins within the range of data.
# With an Array, each value will be a cut point.
def histogram(bins=10)
  check_type :scale
  if bins.is_a? Array
    #h=Statsample::Histogram.new(self, bins)
    h=Statsample::Histogram.alloc(bins)
  else
    # ugly patch. The upper limit for a bin has the form
    # x < range
    #h=Statsample::Histogram.new(self, bins)
    min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
    # Nudge the upper limit so the maximum datum falls inside the
    # last (half-open) bin instead of outside it.
    if max==@valid_data.max
      max+=1e-10
    end
    h=Statsample::Histogram.alloc(bins,[min,max])
    # Fix last bin
  end
  h.increment(@valid_data)
  h
end
# Coefficient of variation.
# Calculated with the sample standard deviation over the mean.
def coefficient_of_variation
  check_type :scale
  standard_deviation_sample.quo(mean)
end
# Standard error of the distribution mean.
# Calculated using sd/sqrt(n).
def standard_error
  standard_deviation_sample.quo(Math.sqrt(valid_data.size))
end
alias :se :standard_error
# Short names for the common statistics.
alias_method :sdp, :standard_deviation_population
alias_method :sds, :standard_deviation_sample
alias_method :adp, :average_deviation_population
alias_method :cov, :coefficient_of_variation
alias_method :variance, :variance_sample
alias_method :sd, :standard_deviation_sample
alias_method :ss, :sum_of_squares
# When GSL is present, swap in its optimized implementations,
# keeping the pure-Ruby ones reachable under a _ruby suffix.
include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
end
end
================================================
FILE: lib/statsample/version.rb
================================================
module Statsample
  # Gem version string.
  VERSION = '1.4.0'
end
================================================
FILE: lib/statsample.rb
================================================
# = statsample.rb -
# Statsample - Statistic package for Ruby
# Copyright (C) 2008-2014 Claudio Bustos
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#$:.unshift(File.dirname(__FILE__))
require 'matrix'
require 'extendmatrix'
require 'distribution'
require 'dirty-memoize'
require 'reportbuilder'
class Numeric
  # The square of the receiver (self * self).
  def square
    self * self
  end
end
class String
  # True when the WHOLE string is a number: optional sign, digits,
  # optional decimal separator (',' or '.'), optional decimals and
  # optional e-exponent.
  # Anchored with \A/\z instead of ^/$: in Ruby ^ and $ match per
  # line, so strings like "abc\n123" used to pass validation.
  def is_number?
    if self =~ /\A-?\d+[,.]?\d*(e-?\d+)?\z/
      true
    else
      false
    end
  end
end
class Module
  # Includes module +m+ while preserving any methods it would
  # override: each colliding instance method is first re-aliased
  # with +suffix+ (e.g. #mean becomes #mean_ruby) and removed, so
  # m's version takes over and the original stays reachable.
  def include_aliasing(m, suffix="ruby")
    m.instance_methods.each do |f|
      if instance_methods.include? f
        alias_method("#{f}_#{suffix}",f)
        remove_method f
      end
    end
    include m
  end
end
class Array
  # Recode repeated values on an array, appending the occurrence
  # number to each duplicated value.
  # Example:
  #   %w{a b c c d d d e}.recode_repeated
  #   # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
  # Returns self unchanged when there are no duplicates.
  def recode_repeated
    return self if size == uniq.size
    # Count every value, then keep the ones appearing 2+ times.
    tallies = Hash.new(0)
    each { |item| tallies[item] += 1 }
    duplicated = tallies.keys.select { |key| tallies[key] > 1 }
    # Second pass: suffix each duplicate with its running counter.
    seen = Hash.new(0)
    map do |item|
      if duplicated.include?(item)
        seen[item] += 1
        sprintf("%s_%d", item, seen[item])
      else
        item
      end
    end
  end
end
# Builds a [description, fields, block] triple for test helpers:
# the first argument is the description, the rest are field names,
# and the attached block is returned as a Proc.
def create_test(*args,&proc)
  description=args.shift
  fields=args
  # Return the captured block parameter directly: relying on a
  # bare Proc.new to pick up the method's block is deprecated and
  # removed in modern Ruby.
  [description, fields, proc]
end
#--
# Test extensions
# Use gettext for i18n when available; otherwise install minimal
# no-op stand-ins so the rest of the library keeps working.
begin
  require 'gettext'
rescue LoadError
  # No-op: returns the domain unchanged.
  def bindtextdomain(d) #:nodoc:
    d
  end
  # Minimal GetText stand-in: _() returns the key untranslated.
  module GetText #:nodoc:
    def _(t)
      t
    end
  end
end
# Library for statistical analysis on Ruby
#
# * Classes for manipulation and storage of data:
# * Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
# * Multiple types of regression on Statsample::Regression
# * Factorial Analysis algorithms on Statsample::Factor module.
# * Dominance Analysis. Based on Budescu and Azen papers.link[http://psycnet.apa.org/journals/met/8/2/129/].
# * Module Statsample::Codification, to help to codify open questions
# * Converters to import and export data from databases, csv and excel files.
# * Module Statsample::Crosstab provides function to create crosstab for categorical data
# * Reliability analysis provides functions to analyze scales.
# * Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several type of samples
# * Interfaces to gdchart, gnuplot and SVG::Graph
#
module Statsample
# Defines Statsample.has_<library>?, which lazily attempts to
# require the library once, memoizes the outcome in a class
# variable (@@<library>) and returns true/false on later calls.
def self.create_has_library(library)
  define_singleton_method("has_#{library}?") do
    cv="@@#{library}"
    if !class_variable_defined? cv
      begin
        require library.to_s
        class_variable_set(cv,true)
      rescue LoadError
        class_variable_set(cv,false)
      end
    end
    class_variable_get(cv)
  end
end
create_has_library :gsl
SPLIT_TOKEN = ","
autoload(:Analysis, 'statsample/analysis')
autoload(:Database, 'statsample/converters')
autoload(:Anova, 'statsample/anova')
autoload(:CSV, 'statsample/converters')
autoload(:PlainText, 'statsample/converters')
autoload(:Excel, 'statsample/converters')
autoload(:GGobi, 'statsample/converters')
autoload(:SPSS, 'statsample/converter/spss')
autoload(:Histogram, 'statsample/histogram')
autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
autoload(:HtmlReport, 'statsample/htmlreport')
autoload(:Mx, 'statsample/converters')
autoload(:Resample, 'statsample/resample')
autoload(:SRS, 'statsample/srs')
autoload(:Codification, 'statsample/codification')
autoload(:Reliability, 'statsample/reliability')
autoload(:Bivariate, 'statsample/bivariate')
autoload(:Multivariate, 'statsample/multivariate')
autoload(:Multiset, 'statsample/multiset')
autoload(:StratifiedSample, 'statsample/multiset')
autoload(:MLE, 'statsample/mle')
autoload(:Regression, 'statsample/regression')
autoload(:Test, 'statsample/test')
autoload(:Factor, 'statsample/factor')
autoload(:Graph, 'statsample/graph')
class << self
# Load an object saved on a file (Marshal format); returns false
# when the file does not exist.
# NOTE(review): Marshal.load can instantiate arbitrary objects —
# only load files from trusted sources.
def load(filename)
  if File.exist? filename
    o=false
    File.open(filename,"r") {|fp| o=Marshal.load(fp) }
    o
  else
    false
  end
end
# Create a matrix using vectors as columns.
# Use:
#
#   matrix=Statsample.vector_cols_matrix(v1,v2)
#
# Raises ArgumentError unless every argument is a
# Statsample::Vector of identical size.
def vector_cols_matrix(*vs)
  # test
  size=vs[0].size
  vs.each{|v|
    raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
    raise ArgumentError,"Vectors size should be the same" if v.size!=size
  }
  Matrix.rows((0...size).to_a.collect() {|i|
    vs.collect{|v| v[i]}
  })
end
# Returns a duplicate of the input vectors, without missing data
# for any of the vectors: only the rows where every vector is
# valid are kept.
#
#   a=[1,2,3,6,7,nil,3,5].to_scale
#   b=[nil,nil,5,6,4,5,10,2].to_scale
#   c=[2,4,6,7,4,5,6,7].to_scale
#   a2,b2,c2=Statsample.only_valid(a,b,c)
#   # => three vectors keeping only the rows valid in a, b AND c
#
def only_valid(*vs)
  i=1
  # Wrap the vectors in a temporary Dataset ("v1","v2",...) and use
  # its listwise-deletion duplicate.
  h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
  ds=Statsample::Dataset.new(h).dup_only_valid
  ds.vectors.values
end
# Cheap version of #only_valid.
# If any vector has missing values, return only-valid duplicates;
# if not, return the vectors themselves (no copying).
def only_valid_clone(*vs)
  if vs.any? {|v| v.flawed?}
    only_valid(*vs)
  else
    vs
  end
end
end
module Util
# Uniform order statistic medians for position i of n (used to
# build normal probability plots): the extremes are handled
# specially and interior positions use the (i-0.3175)/(n+0.365)
# approximation.
# Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm
def normal_order_statistic_medians(i,n)
  case i
  when 1
    1.0 - normal_order_statistic_medians(n, n)
  when n
    0.5**(1 / n.to_f)
  else
    (i - 0.3175) / (n + 0.365)
  end
end
# NOTE(review): the body of this method is corrupted/truncated in
# this extract (the assignment below is not valid statsample code).
# Restore it from the upstream repository before relying on it —
# it is expected to return a "nice" [min,max] pair for axis limits
# (see its use in Vector#histogram).
def self.nice(s,e) # :nodoc:
  reverse = etrue).add(self).send(method)
end
end
# Hook namespace where the optional native extension installs
# optimized implementations; pure-Ruby fallbacks check
# STATSAMPLE__.respond_to?(...) at load time.
module STATSAMPLE__ #:nodoc:
end
end
#--
# Try to load the optional native extension; when it is missing,
# flag the library as running in pure-Ruby (non-optimized) mode.
begin
  require 'statsamplert'
rescue LoadError
  module Statsample
    OPTIMIZED=false
  end
end
require 'statsample/vector'
require 'statsample/dataset'
require 'statsample/crosstab'
require 'statsample/matrix'
require 'statsample/shorthand'
require 'statsample/version'
================================================
FILE: po/es/statsample.po
================================================
msgid ""
msgstr ""
"Project-Id-Version: statsample 1.0.1\n"
"POT-Creation-Date: 2011-03-03 12:03-0300\n"
"PO-Revision-Date: 2011-03-03 12:05-0300\n"
"Last-Translator: Claudio Bustos \n"
"Language-Team: Desarrollador\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Poedit-Language: Spanish\n"
"X-Poedit-SourceCharset: utf-8\n"
#: lib/statsample/test/f.rb:26
msgid "F Test"
msgstr "Prueba F"
#: lib/statsample/test/t.rb:82
msgid "T Test"
msgstr "Prueba T"
#: lib/statsample/test/t.rb:83
msgid "Estimate"
msgstr "Estimado"
#: lib/statsample/test/t.rb:84
msgid "Std.Err.of Estimate"
msgstr "Err.Est. del Estimado"
#: lib/statsample/test/t.rb:114
msgid "%s: %0.4f | %s: %0.4f"
msgstr "%s: %0.4f | %s: %0.4f"
#: lib/statsample/test/t.rb:120
msgid "t(%d) = %0.4f, p=%0.4f (%s tails)"
msgstr "t(%d) = %0.4f, p=%0.4f (%s colas)"
#: lib/statsample/test/t.rb:121
msgid "CI(%d%%): %0.4f - %0.4f"
msgstr "IC(%d%%): %0.4f - %0.4f"
#: lib/statsample/test/t.rb:190
msgid "Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f"
msgstr "Media de la muestra: %0.4f | DE de la muestra: %0.4f | EE : %0.4f"
#: lib/statsample/test/t.rb:191
msgid "Population mean: %0.4f"
msgstr "Promedio población: %0.4f"
#: lib/statsample/test/t.rb:292
msgid "Mean and standard deviation"
msgstr "Promedio y desviación estándar"
#: lib/statsample/test/t.rb:292
#: lib/statsample/regression/simple.rb:109
#: lib/statsample/factor/pca.rb:216
#: lib/statsample/factor/principalaxis.rb:202
msgid "Variable"
msgstr "Variable"
#: lib/statsample/test/t.rb:292
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "mean"
msgstr "promedio"
#: lib/statsample/test/t.rb:292
msgid "sd"
msgstr "de"
#: lib/statsample/test/t.rb:292
#: lib/statsample/factor/parallelanalysis.rb:103
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "n"
msgstr "n"
#: lib/statsample/test/t.rb:296
msgid "Levene test for equality of variances"
msgstr "Test de Levene para igualdad de varianzas"
#: lib/statsample/test/t.rb:298
msgid "T statistics"
msgstr "Estadístico T"
#: lib/statsample/test/t.rb:299
msgid "Equal variance"
msgstr "Varianza Igual"
#: lib/statsample/test/t.rb:300
msgid "Non equal variance"
msgstr "Varianza Desigual"
#: lib/statsample/test/t.rb:302
msgid "Effect size"
msgstr "Tamaño del efecto"
#: lib/statsample/test/umannwhitney.rb:140
msgid "Mann-Whitney's U"
msgstr "U de Mann-Whitney"
#: lib/statsample/test/umannwhitney.rb:149
msgid "%s results"
msgstr "resultados de %s"
#: lib/statsample/test/umannwhitney.rb:150
#: lib/statsample/test/umannwhitney.rb:151
msgid "Sum of ranks %s"
msgstr "Suma de rangos %s"
#: lib/statsample/test/umannwhitney.rb:152
msgid "U Value"
msgstr "Valor de U"
#: lib/statsample/test/umannwhitney.rb:153
msgid "Z"
msgstr "Z"
#: lib/statsample/test/umannwhitney.rb:155
msgid "Exact p (Dinneen & Blakesley, 1973):"
msgstr "p exacto (Dinneen & Blakesley, 1973):"
#: lib/statsample/test/levene.rb:37
msgid "Levene Test"
msgstr "Test de Levene"
#: lib/statsample/test/bartlettsphericity.rb:25
msgid "Bartlett's test of sphericity"
msgstr "Test de esfericidad de Bartlett"
#: lib/statsample/regression/multiple/baseengine.rb:27
msgid "Multiple Regression: %s over %s"
msgstr "Regresión Múltiple: %s sobre %s"
#: lib/statsample/regression/multiple/baseengine.rb:40
msgid "Regression"
msgstr "Regresión"
#: lib/statsample/regression/multiple/baseengine.rb:40
msgid "Error"
msgstr "Error"
#: lib/statsample/regression/multiple/baseengine.rb:184
msgid "Engine: %s"
msgstr "Motor: %s"
#: lib/statsample/regression/multiple/baseengine.rb:185
msgid "Cases(listwise)=%d(%d)"
msgstr "Casos (sólo válidos)=%d(%d)"
#: lib/statsample/regression/multiple/baseengine.rb:186
msgid "R="
msgstr "R="
#: lib/statsample/regression/multiple/baseengine.rb:187
msgid "R^2="
msgstr "R^2="
#: lib/statsample/regression/multiple/baseengine.rb:188
msgid "R^2 Adj="
msgstr "R^2 Adj="
#: lib/statsample/regression/multiple/baseengine.rb:189
msgid "Std.Error R="
msgstr "Error estándar R="
#: lib/statsample/regression/multiple/baseengine.rb:191
msgid "Equation"
msgstr "Ecuación"
#: lib/statsample/regression/multiple/baseengine.rb:197
msgid "Beta coefficients"
msgstr "Coeficientes beta"
#: lib/statsample/regression/multiple/baseengine.rb:198
msgid "Constant"
msgstr "Constante"
#: lib/statsample/regression/multiple/matrixengine.rb:78
msgid "Multiple reggresion of %s on %s"
msgstr "Regresión Múltiple de %s en %s"
#: lib/statsample/regression/simple.rb:88
msgid "Regression of %s over %s"
msgstr "Regresión de %s sobre %s"
#: lib/statsample/regression/simple.rb:109
#: lib/statsample/factor/map.rb:105
#: lib/statsample/reliability/skillscaleanalysis.rb:92
msgid "Value"
msgstr "Valor"
#: lib/statsample/regression/simple.rb:110
msgid "r"
msgstr "r"
#: lib/statsample/regression/simple.rb:111
msgid "r^2"
msgstr "r^2"
#: lib/statsample/regression/simple.rb:112
msgid "a"
msgstr "a"
#: lib/statsample/regression/simple.rb:113
msgid "b"
msgstr "b"
#: lib/statsample/regression/simple.rb:114
msgid "s.e"
msgstr "e.e."
#: lib/statsample/dominanceanalysis/bootstrap.rb:115
msgid "Bootstrap dominance Analysis: %s over %s"
msgstr "Resultados del Análisis de Dominancia Bootstrap: %s en %s"
#: lib/statsample/dominanceanalysis/bootstrap.rb:138
msgid "Bootstrap %d of %d"
msgstr "Bootstrap: %d de %d"
#: lib/statsample/dominanceanalysis/bootstrap.rb:177
msgid "Sample size: %d\n"
msgstr "Tamaño de muestra: %d\n"
#: lib/statsample/dominanceanalysis/bootstrap.rb:179
msgid "Linear Regression Engine: %s"
msgstr "Motor de Regresión Linear: %s"
#: lib/statsample/dominanceanalysis/bootstrap.rb:181
msgid "pairs"
msgstr "pares"
#: lib/statsample/dominanceanalysis/bootstrap.rb:181
msgid "SE(Dij)"
msgstr "EE(Dij)"
#: lib/statsample/dominanceanalysis/bootstrap.rb:181
msgid "Reproducibility"
msgstr "Reproducibilidad"
#: lib/statsample/dominanceanalysis/bootstrap.rb:182
msgid "Complete dominance"
msgstr "Dominancia Completa"
#: lib/statsample/dominanceanalysis/bootstrap.rb:190
msgid "Conditional dominance"
msgstr "Dominancia Condicional"
#: lib/statsample/dominanceanalysis/bootstrap.rb:199
msgid "General Dominance"
msgstr "Dominancia General"
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "General averages"
msgstr "Promedios generales"
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "var"
msgstr "var"
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "se"
msgstr "de"
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "p.5"
msgstr "p.5"
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "p.95"
msgstr "p.95"
#: lib/statsample/anova/twoway.rb:59
msgid "ANOVA Two-Way"
msgstr "Anova de dos vías"
#: lib/statsample/anova/twoway.rb:60
msgid "A"
msgstr "A"
#: lib/statsample/anova/twoway.rb:61
msgid "B"
msgstr "B"
#: lib/statsample/anova/twoway.rb:62
msgid "Within"
msgstr "Dentro"
#: lib/statsample/anova/twoway.rb:98
#: lib/statsample/anova/oneway.rb:57
msgid "%s Table"
msgstr "Tabla %s"
#: lib/statsample/anova/twoway.rb:103
#: lib/statsample/anova/oneway.rb:60
#: lib/statsample/crosstab.rb:101
#: lib/statsample/crosstab.rb:116
#: lib/statsample/crosstab.rb:151
#: lib/statsample/crosstab.rb:173
#: lib/statsample/dominanceanalysis.rb:354
msgid "Total"
msgstr "Total"
#: lib/statsample/anova/twoway.rb:172
msgid "Anova Two-Way on %s"
msgstr "Anova de dos vías en %s"
#: lib/statsample/anova/twoway.rb:184
#: lib/statsample/anova/oneway.rb:127
msgid "Test of Homogeneity of variances (Levene)"
msgstr "Test de homogeneidad de varianza (Levene)"
#: lib/statsample/anova/twoway.rb:189
#: lib/statsample/anova/twoway.rb:193
msgid "%s Mean"
msgstr "Promedio %s"
#: lib/statsample/anova/oneway.rb:35
msgid "Explained variance"
msgstr "Varianza explicada"
#: lib/statsample/anova/oneway.rb:36
msgid "Unexplained variance"
msgstr "Varianza sin explicar"
#: lib/statsample/anova/oneway.rb:97
msgid "Anova One-Way"
msgstr "Anova de una vía"
#: lib/statsample/anova/oneway.rb:98
msgid "Between Groups"
msgstr "Entre grupos"
#: lib/statsample/anova/oneway.rb:99
msgid "Within Groups"
msgstr "Dentro de grupos"
#: lib/statsample/anova/oneway.rb:119
msgid "Contrast for %s"
msgstr "Contraste para %s"
#: lib/statsample/anova/oneway.rb:163
msgid "Descriptives"
msgstr "Descriptivos"
#: lib/statsample/anova/contrast.rb:13
msgid "Psi estimate"
msgstr "Psi Estimado"
#: lib/statsample/anova/contrast.rb:14
msgid "Contrast"
msgstr "Contraste"
#: lib/statsample/anova/contrast.rb:73
msgid "Contrast:%s"
msgstr "Contraste: %s"
#: lib/statsample/graph/scatterplot.rb:72
msgid "Scatterplot (%s - %s)"
msgstr "Diagrama de dispersión (%s - %s)"
#: lib/statsample/graph/histogram.rb:50
msgid "Histograma (%s)"
msgstr "Histograma (%s)"
#: lib/statsample/graph/boxplot.rb:63
msgid "Boxplot"
msgstr "Diagrama de caja"
#: lib/statsample/bivariate/pearson.rb:32
msgid "Correlation (%s - %s)"
msgstr "Correlación (%s - %s)"
#: lib/statsample/bivariate/pearson.rb:50
msgid "%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s tails)"
msgstr "%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s colas)"
#: lib/statsample/factor/parallelanalysis.rb:68
msgid "Parallel Analysis"
msgstr "Análisis Paralelo"
#: lib/statsample/factor/parallelanalysis.rb:96
msgid "Bootstrap Method: %s"
msgstr "Método de Remuestreo: %s"
#: lib/statsample/factor/parallelanalysis.rb:97
msgid "Uses SMC: %s"
msgstr "Usa SMC: %s"
#: lib/statsample/factor/parallelanalysis.rb:97
msgid "Yes"
msgstr "Sí"
#: lib/statsample/factor/parallelanalysis.rb:97
msgid "No"
msgstr "No"
#: lib/statsample/factor/parallelanalysis.rb:98
msgid "Correlation Matrix type : %s"
msgstr "Tipo de matriz de correlación : %s"
#: lib/statsample/factor/parallelanalysis.rb:99
msgid "Number of variables: %d"
msgstr "Número de variables: %d"
#: lib/statsample/factor/parallelanalysis.rb:100
msgid "Number of cases: %d"
msgstr "Número de casos: %d"
#: lib/statsample/factor/parallelanalysis.rb:101
msgid "Number of iterations: %d"
msgstr "Número de iteraciones: %d"
#: lib/statsample/factor/parallelanalysis.rb:103
#: lib/statsample/factor/parallelanalysis.rb:111
#: lib/statsample/factor/map.rb:105
msgid "Eigenvalues"
msgstr "Eigenvalues"
#: lib/statsample/factor/parallelanalysis.rb:103
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "generated eigenvalue"
msgstr "eigenvalue generado"
#: lib/statsample/factor/parallelanalysis.rb:110
msgid "Number or factors to preserve: %d"
msgstr "Número de factores a preservar: %d"
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "data eigenvalue"
msgstr "eigenvalue de los datos"
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "preserve?"
msgstr "¿preservar?"
#: lib/statsample/factor/map.rb:60
msgid "Velicer's MAP"
msgstr "PPM de Velicer"
#: lib/statsample/factor/map.rb:110
msgid "Velicer's Average Squared Correlations"
msgstr "Correlaciones Cuadradas Promedio de Velicer"
#: lib/statsample/factor/map.rb:110
msgid "number of components"
msgstr "número de componentes"
#: lib/statsample/factor/map.rb:110
msgid "average square correlation"
msgstr "correlación cuadrada promedio"
#: lib/statsample/factor/map.rb:115
msgid "The smallest average squared correlation is : %0.6f"
msgstr "La correlación cuadrada promedio más pequeña es: %0.6f"
#: lib/statsample/factor/map.rb:116
msgid "The number of components is : %d"
msgstr "El número de componentes es: %d"
#: lib/statsample/factor/pca.rb:56
msgid "Principal Component Analysis"
msgstr "Análisis de componentes principales"
#: lib/statsample/factor/pca.rb:59
#: lib/statsample/matrix.rb:14
#: lib/statsample/matrix.rb:81
msgid "VAR_%d"
msgstr "VAR_%d"
#: lib/statsample/factor/pca.rb:160
msgid "Component matrix (from covariance)"
msgstr "Matriz de componentes (desde covarianza)"
#: lib/statsample/factor/pca.rb:181
msgid "Component matrix"
msgstr "Matriz de componentes"
#: lib/statsample/factor/pca.rb:215
#: lib/statsample/factor/principalaxis.rb:200
msgid "Number of factors: %d"
msgstr "Número de factores: %d"
#: lib/statsample/factor/pca.rb:216
#: lib/statsample/factor/principalaxis.rb:202
msgid "Communalities"
msgstr "Comunalidades"
#: lib/statsample/factor/pca.rb:216
#: lib/statsample/factor/principalaxis.rb:202
msgid "Initial"
msgstr "Inicial"
#: lib/statsample/factor/pca.rb:216
#: lib/statsample/factor/principalaxis.rb:202
msgid "Extraction"
msgstr "Extracción"
#: lib/statsample/factor/pca.rb:216
#: lib/statsample/factor/pca.rb:223
#: lib/statsample/reliability/skillscaleanalysis.rb:92
msgid "%"
msgstr "%"
#: lib/statsample/factor/pca.rb:223
msgid "Total Variance Explained"
msgstr "Varianza Total Explicada"
#: lib/statsample/factor/pca.rb:223
msgid "Component"
msgstr "Componente"
#: lib/statsample/factor/pca.rb:223
msgid "E.Total"
msgstr "E. Total"
#: lib/statsample/factor/pca.rb:223
msgid "Cum. %"
msgstr "% Acum."
#: lib/statsample/factor/pca.rb:227
msgid "Component %d"
msgstr "Componente %d"
#: lib/statsample/factor/principalaxis.rb:70
msgid "Variable %d"
msgstr "Variable %d"
#: lib/statsample/factor/principalaxis.rb:147
msgid "Factor Matrix"
msgstr "Matriz de Factores"
#: lib/statsample/factor/principalaxis.rb:201
msgid "Iterations: %d"
msgstr "Iteraciones: %d"
#: lib/statsample/factor/principalaxis.rb:207
msgid "Total Variance"
msgstr "Varianza Total"
#: lib/statsample/factor/principalaxis.rb:207
msgid "Factor"
msgstr "Factor"
#: lib/statsample/factor/principalaxis.rb:207
msgid "I.E.Total"
msgstr "E.I. Total"
#: lib/statsample/factor/principalaxis.rb:207
msgid "I.E. %"
msgstr "E.I. %"
#: lib/statsample/factor/principalaxis.rb:207
msgid "I.E.Cum. %"
msgstr "E.I. Acum. %"
#: lib/statsample/factor/principalaxis.rb:208
msgid "S.L.Total"
msgstr "C.C. Total"
#: lib/statsample/factor/principalaxis.rb:208
msgid "S.L. %"
msgstr "C.C. %"
#: lib/statsample/factor/principalaxis.rb:208
msgid "S.L.Cum. %"
msgstr "C.C. Acum. %"
#: lib/statsample/factor/principalaxis.rb:215
msgid "Factor %d"
msgstr "Factor %d"
#: lib/statsample/factor/rotation.rb:35
msgid "%s rotation"
msgstr "rotación %s"
#: lib/statsample/factor/rotation.rb:132
msgid "Rotated Component matrix"
msgstr "Matriz de componentes rotada"
#: lib/statsample/factor/rotation.rb:149
msgid "Component transformation matrix"
msgstr "Matriz de transformación de componentes"
#: lib/statsample/reliability/multiscaleanalysis.rb:67
msgid "Multiple Scale analysis"
msgstr "Análisis de múltiples escalas"
#: lib/statsample/reliability/multiscaleanalysis.rb:97
msgid "Scale %s"
msgstr "Escala %s"
#: lib/statsample/reliability/multiscaleanalysis.rb:145
msgid "Reliability analysis of scales"
msgstr "Análisis de confiabilidad de escalas"
#: lib/statsample/reliability/multiscaleanalysis.rb:151
msgid "Correlation matrix for %s"
msgstr "Matriz de correlaciones para %s"
#: lib/statsample/reliability/multiscaleanalysis.rb:156
msgid "PCA for %s"
msgstr "ACP para %s"
#: lib/statsample/reliability/multiscaleanalysis.rb:161
msgid "Principal Axis for %s"
msgstr "Ejes principales para %s"
#: lib/statsample/reliability/multiscaleanalysis.rb:167
msgid "Parallel Analysis for %s"
msgstr "Análisis Paralelo para %s"
#: lib/statsample/reliability/multiscaleanalysis.rb:172
msgid "MAP for %s"
msgstr "MAP para %s"
#: lib/statsample/reliability/skillscaleanalysis.rb:21
msgid "Skill Scale Reliability Analysis (%s)"
msgstr "Análisis de confiabilidad de escalas de habilidad (%s)"
#: lib/statsample/reliability/skillscaleanalysis.rb:36
msgid "%s(corrected)"
msgstr "%s(corregido)"
#: lib/statsample/reliability/skillscaleanalysis.rb:40
msgid "Corrected dataset from %s"
msgstr "Grupo de datos corregido desde %s"
#: lib/statsample/reliability/skillscaleanalysis.rb:51
msgid "%s (Scale Analysis)"
msgstr "%s (Análisis de Escala)"
#: lib/statsample/reliability/skillscaleanalysis.rb:82
msgid "Problematic Items"
msgstr "Ítems problemáticos"
#: lib/statsample/reliability/skillscaleanalysis.rb:87
msgid "Item: %s"
msgstr "Ítem: %s"
#: lib/statsample/reliability/skillscaleanalysis.rb:88
msgid "Correct answer: %s"
msgstr "Respuesta correcta: %s"
#: lib/statsample/reliability/skillscaleanalysis.rb:89
msgid "p: %0.3f"
msgstr "p: %0.3f"
#: lib/statsample/reliability/skillscaleanalysis.rb:101
msgid "No problematic items"
msgstr "Sin ítems problemáticos"
#: lib/statsample/reliability/scaleanalysis.rb:44
msgid "Reliability Analisis"
msgstr "Análisis de confiabilidad"
#: lib/statsample/reliability/scaleanalysis.rb:157
msgid "Summary for %s with all items"
msgstr "Sumario para %s con todos los ítems"
#: lib/statsample/reliability/scaleanalysis.rb:158
msgid "Items"
msgstr "Ítems"
#: lib/statsample/reliability/scaleanalysis.rb:159
#: lib/statsample/reliability/scaleanalysis.rb:176
msgid "Sum mean"
msgstr "Promedio de suma"
#: lib/statsample/reliability/scaleanalysis.rb:160
msgid "S.d. mean"
msgstr "Promedio de d.e."
#: lib/statsample/reliability/scaleanalysis.rb:162
msgid "Deleted items"
msgstr "Ítems eliminados"
#: lib/statsample/reliability/scaleanalysis.rb:172
msgid "Summary for %s"
msgstr "Sumario para %s"
#: lib/statsample/reliability/scaleanalysis.rb:173
msgid "Valid Items"
msgstr "Ítems Válidos"
#: lib/statsample/reliability/scaleanalysis.rb:175
msgid "Valid cases"
msgstr "casos válidos"
#: lib/statsample/reliability/scaleanalysis.rb:177
msgid "Sum sd"
msgstr "d.e. de suma"
#: lib/statsample/reliability/scaleanalysis.rb:179
msgid "Sum median"
msgstr "Mediana de suma"
#: lib/statsample/reliability/scaleanalysis.rb:181
msgid "Item mean"
msgstr "Promedio de los ítemes"
#: lib/statsample/reliability/scaleanalysis.rb:182
msgid "Item sd"
msgstr "DE de Items"
#: lib/statsample/reliability/scaleanalysis.rb:184
msgid "Skewness"
msgstr "Sesgo"
#: lib/statsample/reliability/scaleanalysis.rb:185
msgid "Kurtosis"
msgstr "Curtosis"
#: lib/statsample/reliability/scaleanalysis.rb:187
msgid "Cronbach's alpha"
msgstr "Alfa de Cronbach"
#: lib/statsample/reliability/scaleanalysis.rb:188
msgid "Standarized Cronbach's alpha"
msgstr "Alfa de Cronbach estandarizado"
#: lib/statsample/reliability/scaleanalysis.rb:189
msgid "Mean rpb"
msgstr "rbp medio"
#: lib/statsample/reliability/scaleanalysis.rb:191
msgid "Variances mean"
msgstr "Promedio de las varianzas"
#: lib/statsample/reliability/scaleanalysis.rb:192
msgid "Covariances mean"
msgstr "Promedio de las covarianzas"
#: lib/statsample/reliability/scaleanalysis.rb:196
msgid "Items for obtain alpha(0.8) : %d"
msgstr "Ítems para obtener alfa(0,8): %d"
#: lib/statsample/reliability/scaleanalysis.rb:197
msgid "Items for obtain alpha(0.9) : %d"
msgstr "Ítems para obtener alfa(0,9): %d"
#: lib/statsample/reliability/scaleanalysis.rb:205
msgid "Items report for %s"
msgstr "Reporte de ítems para %s"
#: lib/statsample/reliability/icc.rb:114
msgid "Shrout & Fleiss ICC(1,1)"
msgstr "Shrout & Fleiss ICC(1,1)"
#: lib/statsample/reliability/icc.rb:119
msgid "Shrout & Fleiss ICC(2,1)"
msgstr "Shrout & Fleiss ICC(2,1)"
#: lib/statsample/reliability/icc.rb:125
msgid "Shrout & Fleiss ICC(3,1)"
msgstr "Shrout & Fleiss ICC(3,1)"
#: lib/statsample/reliability/icc.rb:132
msgid "Shrout & Fleiss ICC(1,k)"
msgstr "Shrout & Fleiss ICC(1,k)"
#: lib/statsample/reliability/icc.rb:138
msgid "Shrout & Fleiss ICC(2,k)"
msgstr "Shrout & Fleiss ICC(2,k)"
#: lib/statsample/reliability/icc.rb:145
msgid "Shrout & Fleiss ICC(3,k)"
msgstr "Shrout & Fleiss ICC(3,k)"
#: lib/statsample/reliability/icc.rb:153
msgid "McGraw & Wong ICC(1)"
msgstr "McGraw & Wong ICC(1)"
#: lib/statsample/reliability/icc.rb:159
msgid "McGraw & Wong ICC(K)"
msgstr "McGraw & Wong ICC(K)"
#: lib/statsample/reliability/icc.rb:165
msgid "McGraw & Wong ICC(C,1)"
msgstr "McGraw & Wong ICC(C,1)"
#: lib/statsample/reliability/icc.rb:172
msgid "McGraw & Wong ICC(C,K)"
msgstr "McGraw & Wong ICC(C,K)"
#: lib/statsample/reliability/icc.rb:179
msgid "McGraw & Wong ICC(A,1)"
msgstr "McGraw & Wong ICC(A,1)"
#: lib/statsample/reliability/icc.rb:186
msgid "McGraw & Wong ICC(A,K)"
msgstr "McGraw & Wong ICC(A,K)"
#: lib/statsample/reliability/icc.rb:408
msgid "ICC: %0.4f"
msgstr "CIC: %0.4f"
#: lib/statsample/reliability/icc.rb:410
msgid "CI (%0.2f): [%0.4f - %0.4f]"
msgstr "IC (%0.2f): [%0.4f - %0.4f]"
#: lib/statsample/crosstab.rb:22
msgid "Crosstab %s - %s"
msgstr "Tabulación cruzada %s - %s"
#: lib/statsample/crosstab.rb:98
msgid "Rows: %s"
msgstr "Filas: %s"
#: lib/statsample/crosstab.rb:99
msgid "Columns: %s"
msgstr "Columnas: %s"
#: lib/statsample/crosstab.rb:101
msgid "Raw"
msgstr "En Bruto"
#: lib/statsample/crosstab.rb:146
msgid "% Row"
msgstr "% Fila"
#: lib/statsample/crosstab.rb:147
msgid "% Column"
msgstr "% Columna"
#: lib/statsample/crosstab.rb:148
msgid "% Total"
msgstr "% Total"
#: lib/statsample/dominanceanalysis.rb:121
msgid "Dominance Analysis: %s over %s"
msgstr "Análisis de dominancia: %s en %s"
#: lib/statsample/dominanceanalysis.rb:315
msgid "sign"
msgstr "signo"
#: lib/statsample/dominanceanalysis.rb:317
msgid "Dominance Analysis result"
msgstr "Resultados del análisis de dominancia"
#: lib/statsample/dominanceanalysis.rb:318
msgid "Model 0"
msgstr "Modelo 0"
#: lib/statsample/dominanceanalysis.rb:333
msgid "k=%d Average"
msgstr "k=%d Promedio"
#: lib/statsample/dominanceanalysis.rb:345
msgid "Overall averages"
msgstr "Promedios generales"
#: lib/statsample/dominanceanalysis.rb:354
msgid "Pairwise dominance"
msgstr "Dominancia en pares"
#: lib/statsample/dominanceanalysis.rb:354
msgid "Pairs"
msgstr "Pares"
#: lib/statsample/dominanceanalysis.rb:354
msgid "Conditional"
msgstr "Condicional"
#: lib/statsample/dominanceanalysis.rb:354
msgid "General"
msgstr "General"
#: lib/statsample/matrix.rb:181
msgid "X%d"
msgstr "X%d"
#: lib/statsample/matrix.rb:184
msgid "Y%d"
msgstr "Y%d"
#: lib/statsample/matrix.rb:196
msgid "Matrix %d"
msgstr "Matriz %d"
#: lib/statsample/matrix.rb:255
msgid "Covariate matrix %d"
msgstr "Matriz de Covarianza %d"
#: lib/statsample/matrix.rb:303
msgid "Correlation"
msgstr "Correlación"
#: lib/statsample/matrix.rb:303
msgid "Covariance"
msgstr "Covarianza"
#: lib/statsample/matrix.rb:303
msgid " Matrix"
msgstr " Matriz"
#: lib/statsample/vector.rb:177
msgid "%s(standarized)"
msgstr "%s(estandarizado)"
#: lib/statsample/vector.rb:189
msgid "%s(centered)"
msgstr "%s(centrado)"
#: lib/statsample/vector.rb:201
msgid "%s(percentil)"
msgstr "%s(percentil)"
#: lib/statsample/vector.rb:778
msgid "n :%d"
msgstr "n: %d"
#: lib/statsample/vector.rb:779
msgid "n valid:%d"
msgstr "n válido: %d"
#: lib/statsample/vector.rb:780
msgid "factors:%s"
msgstr "factores:%s"
#: lib/statsample/vector.rb:781
msgid "mode: %s"
msgstr "modo: %s"
#: lib/statsample/vector.rb:782
msgid "Distribution"
msgstr "Distribución"
#: lib/statsample/vector.rb:788
msgid "median: %s"
msgstr "Mediana: %s"
#: lib/statsample/vector.rb:790
msgid "mean: %0.4f"
msgstr "promedio: %0.4f"
#: lib/statsample/vector.rb:791
msgid "sd: %0.4f"
msgstr "d.e.: %0.4f"
#: lib/statsample/dataset.rb:161
msgid "Dataset %d"
msgstr "Dataset %d"
#: lib/statsample/dataset.rb:457
msgid "Sum from %s"
msgstr "Suma para %s"
#: lib/statsample/dataset.rb:510
msgid "Means from %s"
msgstr "Media desde %s"
#: lib/statsample/dataset.rb:734
msgid "%s(filtered)"
msgstr "%s(filtrado)"
#: lib/statsample/dataset.rb:956
msgid "Cases: %d"
msgstr "Casos: %d"
================================================
FILE: po/statsample.pot
================================================
# Statsample po template.
# Copyright (C) 2009-2009 Claudio Bustos
# This file is distributed under the same license as the Statsample package.
# Claudio Bustos
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: statsample 1.0.1\n"
"POT-Creation-Date: 2011-03-03 12:03-0300\n"
"PO-Revision-Date: 2009-08-04 15:36-0400\n"
"Last-Translator: FULL NAME \n"
"Language-Team: LANGUAGE \n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n"
#: lib/statsample/test/f.rb:26
msgid "F Test"
msgstr ""
#: lib/statsample/test/t.rb:82
msgid "T Test"
msgstr ""
#: lib/statsample/test/t.rb:83
msgid "Estimate"
msgstr ""
#: lib/statsample/test/t.rb:84
msgid "Std.Err.of Estimate"
msgstr ""
#: lib/statsample/test/t.rb:114
msgid "%s: %0.4f | %s: %0.4f"
msgstr ""
#: lib/statsample/test/t.rb:120
msgid "t(%d) = %0.4f, p=%0.4f (%s tails)"
msgstr ""
#: lib/statsample/test/t.rb:121
msgid "CI(%d%%): %0.4f - %0.4f"
msgstr ""
#: lib/statsample/test/t.rb:190
msgid "Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f"
msgstr ""
#: lib/statsample/test/t.rb:191
msgid "Population mean: %0.4f"
msgstr ""
#: lib/statsample/test/t.rb:292
msgid "Mean and standard deviation"
msgstr ""
#: lib/statsample/test/t.rb:292 lib/statsample/regression/simple.rb:109
#: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202
msgid "Variable"
msgstr ""
#: lib/statsample/test/t.rb:292
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "mean"
msgstr ""
#: lib/statsample/test/t.rb:292
msgid "sd"
msgstr ""
#: lib/statsample/test/t.rb:292 lib/statsample/factor/parallelanalysis.rb:103
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "n"
msgstr ""
#: lib/statsample/test/t.rb:296
msgid "Levene test for equality of variances"
msgstr ""
#: lib/statsample/test/t.rb:298
msgid "T statistics"
msgstr ""
#: lib/statsample/test/t.rb:299
msgid "Equal variance"
msgstr ""
#: lib/statsample/test/t.rb:300
msgid "Non equal variance"
msgstr ""
#: lib/statsample/test/t.rb:302
msgid "Effect size"
msgstr ""
#: lib/statsample/test/umannwhitney.rb:140
msgid "Mann-Whitney's U"
msgstr ""
#: lib/statsample/test/umannwhitney.rb:149
msgid "%s results"
msgstr ""
#: lib/statsample/test/umannwhitney.rb:150
#: lib/statsample/test/umannwhitney.rb:151
msgid "Sum of ranks %s"
msgstr ""
#: lib/statsample/test/umannwhitney.rb:152
msgid "U Value"
msgstr ""
#: lib/statsample/test/umannwhitney.rb:153
msgid "Z"
msgstr ""
#: lib/statsample/test/umannwhitney.rb:155
msgid "Exact p (Dinneen & Blakesley, 1973):"
msgstr ""
#: lib/statsample/test/levene.rb:37
msgid "Levene Test"
msgstr ""
#: lib/statsample/test/bartlettsphericity.rb:25
msgid "Bartlett's test of sphericity"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:27
msgid "Multiple Regression: %s over %s"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:40
msgid "Regression"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:40
msgid "Error"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:184
msgid "Engine: %s"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:185
msgid "Cases(listwise)=%d(%d)"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:186
msgid "R="
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:187
msgid "R^2="
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:188
msgid "R^2 Adj="
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:189
msgid "Std.Error R="
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:191
msgid "Equation"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:197
msgid "Beta coefficients"
msgstr ""
#: lib/statsample/regression/multiple/baseengine.rb:198
msgid "Constant"
msgstr ""
#: lib/statsample/regression/multiple/matrixengine.rb:78
msgid "Multiple reggresion of %s on %s"
msgstr ""
#: lib/statsample/regression/simple.rb:88
msgid "Regression of %s over %s"
msgstr ""
#: lib/statsample/regression/simple.rb:109 lib/statsample/factor/map.rb:105
#: lib/statsample/reliability/skillscaleanalysis.rb:92
msgid "Value"
msgstr ""
#: lib/statsample/regression/simple.rb:110
msgid "r"
msgstr ""
#: lib/statsample/regression/simple.rb:111
msgid "r^2"
msgstr ""
#: lib/statsample/regression/simple.rb:112
msgid "a"
msgstr ""
#: lib/statsample/regression/simple.rb:113
msgid "b"
msgstr ""
#: lib/statsample/regression/simple.rb:114
msgid "s.e"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:115
msgid "Bootstrap dominance Analysis: %s over %s"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:138
msgid "Bootstrap %d of %d"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:177
msgid "Sample size: %d\n"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:179
msgid "Linear Regression Engine: %s"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:181
msgid "pairs"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:181
msgid "SE(Dij)"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:181
msgid "Reproducibility"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:182
msgid "Complete dominance"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:190
msgid "Conditional dominance"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:199
msgid "General Dominance"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "General averages"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "var"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "se"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "p.5"
msgstr ""
#: lib/statsample/dominanceanalysis/bootstrap.rb:208
msgid "p.95"
msgstr ""
#: lib/statsample/anova/twoway.rb:59
msgid "ANOVA Two-Way"
msgstr ""
#: lib/statsample/anova/twoway.rb:60
msgid "A"
msgstr ""
#: lib/statsample/anova/twoway.rb:61
msgid "B"
msgstr ""
#: lib/statsample/anova/twoway.rb:62
msgid "Within"
msgstr ""
#: lib/statsample/anova/twoway.rb:98 lib/statsample/anova/oneway.rb:57
msgid "%s Table"
msgstr ""
#: lib/statsample/anova/twoway.rb:103 lib/statsample/anova/oneway.rb:60
#: lib/statsample/crosstab.rb:101 lib/statsample/crosstab.rb:116
#: lib/statsample/crosstab.rb:151 lib/statsample/crosstab.rb:173
#: lib/statsample/dominanceanalysis.rb:354
msgid "Total"
msgstr ""
#: lib/statsample/anova/twoway.rb:172
msgid "Anova Two-Way on %s"
msgstr ""
#: lib/statsample/anova/twoway.rb:184 lib/statsample/anova/oneway.rb:127
msgid "Test of Homogeneity of variances (Levene)"
msgstr ""
#: lib/statsample/anova/twoway.rb:189 lib/statsample/anova/twoway.rb:193
msgid "%s Mean"
msgstr ""
#: lib/statsample/anova/oneway.rb:35
msgid "Explained variance"
msgstr ""
#: lib/statsample/anova/oneway.rb:36
msgid "Unexplained variance"
msgstr ""
#: lib/statsample/anova/oneway.rb:97
msgid "Anova One-Way"
msgstr ""
#: lib/statsample/anova/oneway.rb:98
msgid "Between Groups"
msgstr ""
#: lib/statsample/anova/oneway.rb:99
msgid "Within Groups"
msgstr ""
#: lib/statsample/anova/oneway.rb:119
msgid "Contrast for %s"
msgstr ""
#: lib/statsample/anova/oneway.rb:163
msgid "Descriptives"
msgstr ""
#: lib/statsample/anova/contrast.rb:13
msgid "Psi estimate"
msgstr ""
#: lib/statsample/anova/contrast.rb:14
msgid "Contrast"
msgstr ""
#: lib/statsample/anova/contrast.rb:73
msgid "Contrast:%s"
msgstr ""
#: lib/statsample/graph/scatterplot.rb:72
msgid "Scatterplot (%s - %s)"
msgstr ""
#: lib/statsample/graph/histogram.rb:50
msgid "Histograma (%s)"
msgstr ""
#: lib/statsample/graph/boxplot.rb:63
msgid "Boxplot"
msgstr ""
#: lib/statsample/bivariate/pearson.rb:32
msgid "Correlation (%s - %s)"
msgstr ""
#: lib/statsample/bivariate/pearson.rb:50
msgid "%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s tails)"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:68
msgid "Parallel Analysis"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:96
msgid "Bootstrap Method: %s"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:97
msgid "Uses SMC: %s"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:97
msgid "Yes"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:97
msgid "No"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:98
msgid "Correlation Matrix type : %s"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:99
msgid "Number of variables: %d"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:100
msgid "Number of cases: %d"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:101
msgid "Number of iterations: %d"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:103
#: lib/statsample/factor/parallelanalysis.rb:111
#: lib/statsample/factor/map.rb:105
msgid "Eigenvalues"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:103
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "generated eigenvalue"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:110
msgid "Number or factors to preserve: %d"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "data eigenvalue"
msgstr ""
#: lib/statsample/factor/parallelanalysis.rb:111
msgid "preserve?"
msgstr ""
#: lib/statsample/factor/map.rb:60
msgid "Velicer's MAP"
msgstr ""
#: lib/statsample/factor/map.rb:110
msgid "Velicer's Average Squared Correlations"
msgstr ""
#: lib/statsample/factor/map.rb:110
msgid "number of components"
msgstr ""
#: lib/statsample/factor/map.rb:110
msgid "average square correlation"
msgstr ""
#: lib/statsample/factor/map.rb:115
msgid "The smallest average squared correlation is : %0.6f"
msgstr ""
#: lib/statsample/factor/map.rb:116
msgid "The number of components is : %d"
msgstr ""
#: lib/statsample/factor/pca.rb:56
msgid "Principal Component Analysis"
msgstr ""
#: lib/statsample/factor/pca.rb:59 lib/statsample/matrix.rb:14
#: lib/statsample/matrix.rb:81
msgid "VAR_%d"
msgstr ""
#: lib/statsample/factor/pca.rb:160
msgid "Component matrix (from covariance)"
msgstr ""
#: lib/statsample/factor/pca.rb:181
msgid "Component matrix"
msgstr ""
#: lib/statsample/factor/pca.rb:215 lib/statsample/factor/principalaxis.rb:200
msgid "Number of factors: %d"
msgstr ""
#: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202
msgid "Communalities"
msgstr ""
#: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202
msgid "Initial"
msgstr ""
#: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202
msgid "Extraction"
msgstr ""
#: lib/statsample/factor/pca.rb:216 lib/statsample/factor/pca.rb:223
#: lib/statsample/reliability/skillscaleanalysis.rb:92
msgid "%"
msgstr ""
#: lib/statsample/factor/pca.rb:223
msgid "Total Variance Explained"
msgstr ""
#: lib/statsample/factor/pca.rb:223
msgid "Component"
msgstr ""
#: lib/statsample/factor/pca.rb:223
msgid "E.Total"
msgstr ""
#: lib/statsample/factor/pca.rb:223
msgid "Cum. %"
msgstr ""
#: lib/statsample/factor/pca.rb:227
msgid "Component %d"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:70
msgid "Variable %d"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:147
msgid "Factor Matrix"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:201
msgid "Iterations: %d"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:207
msgid "Total Variance"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:207
msgid "Factor"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:207
msgid "I.E.Total"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:207
msgid "I.E. %"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:207
msgid "I.E.Cum. %"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:208
msgid "S.L.Total"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:208
msgid "S.L. %"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:208
msgid "S.L.Cum. %"
msgstr ""
#: lib/statsample/factor/principalaxis.rb:215
msgid "Factor %d"
msgstr ""
#: lib/statsample/factor/rotation.rb:35
msgid "%s rotation"
msgstr ""
#: lib/statsample/factor/rotation.rb:132
msgid "Rotated Component matrix"
msgstr ""
#: lib/statsample/factor/rotation.rb:149
msgid "Component transformation matrix"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:67
msgid "Multiple Scale analysis"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:97
msgid "Scale %s"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:145
msgid "Reliability analysis of scales"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:151
msgid "Correlation matrix for %s"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:156
msgid "PCA for %s"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:161
msgid "Principal Axis for %s"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:167
msgid "Parallel Analysis for %s"
msgstr ""
#: lib/statsample/reliability/multiscaleanalysis.rb:172
msgid "MAP for %s"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:21
msgid "Skill Scale Reliability Analysis (%s)"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:36
msgid "%s(corrected)"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:40
msgid "Corrected dataset from %s"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:51
msgid "%s (Scale Analysis)"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:82
msgid "Problematic Items"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:87
msgid "Item: %s"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:88
msgid "Correct answer: %s"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:89
msgid "p: %0.3f"
msgstr ""
#: lib/statsample/reliability/skillscaleanalysis.rb:101
msgid "No problematic items"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:44
msgid "Reliability Analisis"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:157
msgid "Summary for %s with all items"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:158
msgid "Items"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:159
#: lib/statsample/reliability/scaleanalysis.rb:176
msgid "Sum mean"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:160
msgid "S.d. mean"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:162
msgid "Deleted items"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:172
msgid "Summary for %s"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:173
msgid "Valid Items"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:175
msgid "Valid cases"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:177
msgid "Sum sd"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:179
msgid "Sum median"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:181
msgid "Item mean"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:182
msgid "Item sd"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:184
msgid "Skewness"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:185
msgid "Kurtosis"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:187
msgid "Cronbach's alpha"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:188
msgid "Standarized Cronbach's alpha"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:189
msgid "Mean rpb"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:191
msgid "Variances mean"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:192
msgid "Covariances mean"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:196
msgid "Items for obtain alpha(0.8) : %d"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:197
msgid "Items for obtain alpha(0.9) : %d"
msgstr ""
#: lib/statsample/reliability/scaleanalysis.rb:205
msgid "Items report for %s"
msgstr ""
#: lib/statsample/reliability/icc.rb:114
msgid "Shrout & Fleiss ICC(1,1)"
msgstr ""
#: lib/statsample/reliability/icc.rb:119
msgid "Shrout & Fleiss ICC(2,1)"
msgstr ""
#: lib/statsample/reliability/icc.rb:125
msgid "Shrout & Fleiss ICC(3,1)"
msgstr ""
#: lib/statsample/reliability/icc.rb:132
msgid "Shrout & Fleiss ICC(1,k)"
msgstr ""
#: lib/statsample/reliability/icc.rb:138
msgid "Shrout & Fleiss ICC(2,k)"
msgstr ""
#: lib/statsample/reliability/icc.rb:145
msgid "Shrout & Fleiss ICC(3,k)"
msgstr ""
#: lib/statsample/reliability/icc.rb:153
msgid "McGraw & Wong ICC(1)"
msgstr ""
#: lib/statsample/reliability/icc.rb:159
msgid "McGraw & Wong ICC(K)"
msgstr ""
#: lib/statsample/reliability/icc.rb:165
msgid "McGraw & Wong ICC(C,1)"
msgstr ""
#: lib/statsample/reliability/icc.rb:172
msgid "McGraw & Wong ICC(C,K)"
msgstr ""
#: lib/statsample/reliability/icc.rb:179
msgid "McGraw & Wong ICC(A,1)"
msgstr ""
#: lib/statsample/reliability/icc.rb:186
msgid "McGraw & Wong ICC(A,K)"
msgstr ""
#: lib/statsample/reliability/icc.rb:408
msgid "ICC: %0.4f"
msgstr ""
#: lib/statsample/reliability/icc.rb:410
msgid "CI (%0.2f): [%0.4f - %0.4f]"
msgstr ""
#: lib/statsample/crosstab.rb:22
msgid "Crosstab %s - %s"
msgstr ""
#: lib/statsample/crosstab.rb:98
msgid "Rows: %s"
msgstr ""
#: lib/statsample/crosstab.rb:99
msgid "Columns: %s"
msgstr ""
#: lib/statsample/crosstab.rb:101
msgid "Raw"
msgstr ""
#: lib/statsample/crosstab.rb:146
msgid "% Row"
msgstr ""
#: lib/statsample/crosstab.rb:147
msgid "% Column"
msgstr ""
#: lib/statsample/crosstab.rb:148
msgid "% Total"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:121
msgid "Dominance Analysis: %s over %s"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:315
msgid "sign"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:317
msgid "Dominance Analysis result"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:318
msgid "Model 0"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:333
msgid "k=%d Average"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:345
msgid "Overall averages"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:354
msgid "Pairwise dominance"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:354
msgid "Pairs"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:354
msgid "Conditional"
msgstr ""
#: lib/statsample/dominanceanalysis.rb:354
msgid "General"
msgstr ""
#: lib/statsample/matrix.rb:181
msgid "X%d"
msgstr ""
#: lib/statsample/matrix.rb:184
msgid "Y%d"
msgstr ""
#: lib/statsample/matrix.rb:196
msgid "Matrix %d"
msgstr ""
#: lib/statsample/matrix.rb:255
msgid "Covariate matrix %d"
msgstr ""
#: lib/statsample/matrix.rb:303
msgid "Correlation"
msgstr ""
#: lib/statsample/matrix.rb:303
msgid "Covariance"
msgstr ""
#: lib/statsample/matrix.rb:303
msgid " Matrix"
msgstr ""
#: lib/statsample/vector.rb:177
msgid "%s(standarized)"
msgstr ""
#: lib/statsample/vector.rb:189
msgid "%s(centered)"
msgstr ""
#: lib/statsample/vector.rb:201
msgid "%s(percentil)"
msgstr ""
#: lib/statsample/vector.rb:778
msgid "n :%d"
msgstr ""
#: lib/statsample/vector.rb:779
msgid "n valid:%d"
msgstr ""
#: lib/statsample/vector.rb:780
msgid "factors:%s"
msgstr ""
#: lib/statsample/vector.rb:781
msgid "mode: %s"
msgstr ""
#: lib/statsample/vector.rb:782
msgid "Distribution"
msgstr ""
#: lib/statsample/vector.rb:788
msgid "median: %s"
msgstr ""
#: lib/statsample/vector.rb:790
msgid "mean: %0.4f"
msgstr ""
#: lib/statsample/vector.rb:791
msgid "sd: %0.4f"
msgstr ""
#: lib/statsample/dataset.rb:161
msgid "Dataset %d"
msgstr ""
#: lib/statsample/dataset.rb:457
msgid "Sum from %s"
msgstr ""
#: lib/statsample/dataset.rb:510
msgid "Means from %s"
msgstr ""
#: lib/statsample/dataset.rb:734
msgid "%s(filtered)"
msgstr ""
#: lib/statsample/dataset.rb:956
msgid "Cases: %d"
msgstr ""
================================================
FILE: references.txt
================================================
References
* Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. Psychological Methods, 8(2), 129-148.
* Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. Journal of Educational and Behavioral Statistics, 31(2), 157-180.
* Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. Psychological Bulletin, 114, 542-551.
* Cochran, W.(1972). Sampling Techniques [spanish edition].
* Cohen et al. (2003). Applied Multiple Regression / Correlation Analysis for the Behavioral Sciences
* Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann-Whitney U Statistic. Journal of the Royal Statistical Society, 22(2), 269-273
* Dziuban, C., & Shirkey E. (1974). When is a correlation matrix appropriate for factor analysis? Some decision rules. Psychological Bulletin, 81(6), 358-361.
* Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. Organizational Research Methods, 7 (2), 191-205.
* Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer
* Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
* Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro]
* Liu, O., & Rijmen, F. (2008). A modified procedure for parallel analysis of ordered categorical data. Behavior Research Methods, 40(2), 556-562.
* McGraw, K. & Wong, S.P. (1996). Forming Inferences About Some Intraclass Correlation Coefficients. Psychological methods, 1(1), 30-46.
* O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402.
* SPSS Manual
* Sawyer, S. (2005). Resampling Data: Using a Statistical Jackknife.
* Shrout,P. & Fleiss, J. (1979). Intraclass Correlation: Uses in assessing rater reliability. Psychological Bulletin, 86(2), 420-428
* Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
* http://en.wikipedia.org/wiki/Welch-Satterthwaite_equation
* http://europe.isixsigma.com/library/content/c080806a.asp
* http://stattrek.com/Lesson6/SRS.aspx
* http://talkstats.com/showthread.php?t=5056
* http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html
================================================
FILE: setup.rb
================================================
#
# setup.rb
#
# Copyright (c) 2000-2005 Minero Aoki
#
# This program is free software.
# You can distribute/modify this program under the terms of
# the GNU LGPL, Lesser General Public License version 2.1.
#
# Compatibility shim for Ruby 1.4.6, where Enumerable#map did not exist
# yet: define it as an alias of the original #collect so the rest of
# this script can use #map unconditionally.
unless Enumerable.method_defined?(:map)   # Ruby 1.4.6
  module Enumerable
    alias map collect
  end
end
# Compatibility shim for Ruby 1.6, which lacks File.read: emulate it by
# opening the file and slurping its whole content.
unless File.respond_to?(:read)   # Ruby 1.6
  def File.read(fname)
    open(fname) {|f|
      return f.read
    }
  end
end
# Platform shim: some builds (Windows) do not define Errno::ENOTEMPTY.
# Provide an empty stand-in class so `rescue Errno::ENOTEMPTY` (used by
# remove_tree0 later in this file) stays valid everywhere.
unless Errno.const_defined?(:ENOTEMPTY)   # Windows?
  module Errno
    class ENOTEMPTY
      # We do not raise this exception, implementation is not needed.
    end
  end
end
# File.binread was added to core in Ruby 1.9. The original definition
# here unconditionally clobbered the builtin; guard it like the other
# compatibility shims above so the (faster, C-level) builtin is kept
# when present. File.open is used instead of Kernel#open, which would
# treat a leading '|' in the file name as a command to execute.
unless File.respond_to?(:binread)
  def File.binread(fname)
    File.open(fname, 'rb') {|f|
      return f.read
    }
  end
end
# Directory predicate that works around a corrupted Windows stat(2):
# forcing a trailing slash onto the path makes the OS treat it as a
# directory before the test is made.
def File.dir?(path)
  probe = path[-1, 1] == '/' ? path : "#{path}/"
  File.directory?(probe)
end
# Holds every configurable item of the installation (paths, programs,
# flags). Provides name lookup with $variable expansion, command-line
# option parsing support, and persistence in the '.config' save file.
#
# Fixes applied in this revision:
# * ALIASES contained the key 'bin-dir' twice (dead duplicate entry
#   that raises a "duplicated key" warning); the duplicate is removed.
# * BoolItem accepted the abbreviation 't' for true but not 'f' for
#   false (/\Af(alse)\z/ lacked the '?'); now symmetric.
class ConfigTable

  include Enumerable

  # rbconfig:: the RbConfig::CONFIG-style hash describing the target ruby.
  def initialize(rbconfig)
    @rbconfig = rbconfig
    @items = []
    @table = {}
    # options
    @install_prefix = nil
    @config_opt = nil
    @verbose = true
    @no_harm = false
  end

  attr_accessor :install_prefix
  attr_accessor :config_opt

  attr_writer :verbose

  def verbose?
    @verbose
  end

  attr_writer :no_harm

  def no_harm?
    @no_harm
  end

  # Resolved value of the item named +key+ ($name references expanded).
  def [](key)
    lookup(key).resolve(self)
  end

  # Sets the raw value of the item named +key+ (validated by the item).
  def []=(key, val)
    lookup(key).set val
  end

  def names
    @items.map {|i| i.name }
  end

  def each(&block)
    @items.each(&block)
  end

  def key?(name)
    @table.key?(name)
  end

  def lookup(name)
    @table[name] or setup_rb_error "no such config item: #{name}"
  end

  def add(item)
    @items.push item
    @table[item.name] = item
  end

  def remove(name)
    item = lookup(name)
    @items.delete_if {|i| i.name == name }
    @table.delete_if {|name, i| i.name == name }
    item
  end

  # Evaluates a metaconfig script in the hook environment, if present.
  def load_script(path, inst = nil)
    if File.file?(path)
      MetaConfigEnvironment.new(self, inst).instance_eval File.read(path), path
    end
  end

  def savefile
    '.config'
  end

  # Restores values written by a previous `config` run; it is an error
  # if config was never run.
  def load_savefile
    begin
      File.foreach(savefile()) do |line|
        k, v = *line.split(/=/, 2)
        self[k] = v.strip
      end
    rescue Errno::ENOENT
      setup_rb_error $!.message + "\n#{File.basename($0)} config first"
    end
  end

  def save
    @items.each {|i| i.value }
    File.open(savefile(), 'w') {|f|
      @items.each do |i|
        f.printf "%s=%s\n", i.name, i.value if i.value? and i.value
      end
    }
  end

  def load_standard_entries
    standard_entries(@rbconfig).each do |ent|
      add ent
    end
  end

  # Builds the default item list from the rbconfig hash, computing the
  # library directory layout for the detected ruby version.
  def standard_entries(rbconfig)
    c = rbconfig

    rubypath = File.join(c['bindir'], c['ruby_install_name'] + c['EXEEXT'])

    major = c['MAJOR'].to_i
    minor = c['MINOR'].to_i
    teeny = c['TEENY'].to_i
    version = "#{major}.#{minor}"

    # ruby ver. >= 1.4.4?
    newpath_p = ((major >= 2) or
                 ((major == 1) and
                  ((minor >= 5) or
                   ((minor == 4) and (teeny >= 4)))))

    if c['rubylibdir']
      # V > 1.6.3
      libruby         = "#{c['prefix']}/lib/ruby"
      librubyver      = c['rubylibdir']
      librubyverarch  = c['archdir']
      siteruby        = c['sitedir']
      siterubyver     = c['sitelibdir']
      siterubyverarch = c['sitearchdir']
    elsif newpath_p
      # 1.4.4 <= V <= 1.6.3
      libruby         = "#{c['prefix']}/lib/ruby"
      librubyver      = "#{c['prefix']}/lib/ruby/#{version}"
      librubyverarch  = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}"
      siteruby        = c['sitedir']
      siterubyver     = "$siteruby/#{version}"
      siterubyverarch = "$siterubyver/#{c['arch']}"
    else
      # V < 1.4.4
      libruby         = "#{c['prefix']}/lib/ruby"
      librubyver      = "#{c['prefix']}/lib/ruby/#{version}"
      librubyverarch  = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}"
      siteruby        = "#{c['prefix']}/lib/ruby/#{version}/site_ruby"
      siterubyver     = siteruby
      siterubyverarch = "$siterubyver/#{c['arch']}"
    end

    # Rewrites an absolute path under the ruby prefix as '$prefix/...'.
    parameterize = lambda {|path|
      path.sub(/\A#{Regexp.quote(c['prefix'])}/, '$prefix')
    }

    if arg = c['configure_args'].split.detect {|arg| /--with-make-prog=/ =~ arg }
      makeprog = arg.sub(/'/, '').split(/=/, 2)[1]
    else
      makeprog = 'make'
    end

    [
      ExecItem.new('installdirs', 'std/site/home',
                   'std: install under libruby; site: install under site_ruby; home: install under $HOME')\
          {|val, table|
            case val
            when 'std'
              table['rbdir'] = '$librubyver'
              table['sodir'] = '$librubyverarch'
            when 'site'
              table['rbdir'] = '$siterubyver'
              table['sodir'] = '$siterubyverarch'
            when 'home'
              setup_rb_error '$HOME was not set' unless ENV['HOME']
              table['prefix'] = ENV['HOME']
              table['rbdir'] = '$libdir/ruby'
              table['sodir'] = '$libdir/ruby'
            end
          },
      PathItem.new('prefix', 'path', c['prefix'],
                   'path prefix of target environment'),
      PathItem.new('bindir', 'path', parameterize.call(c['bindir']),
                   'the directory for commands'),
      PathItem.new('libdir', 'path', parameterize.call(c['libdir']),
                   'the directory for libraries'),
      PathItem.new('datadir', 'path', parameterize.call(c['datadir']),
                   'the directory for shared data'),
      PathItem.new('mandir', 'path', parameterize.call(c['mandir']),
                   'the directory for man pages'),
      PathItem.new('sysconfdir', 'path', parameterize.call(c['sysconfdir']),
                   'the directory for system configuration files'),
      PathItem.new('localstatedir', 'path', parameterize.call(c['localstatedir']),
                   'the directory for local state data'),
      PathItem.new('libruby', 'path', libruby,
                   'the directory for ruby libraries'),
      PathItem.new('librubyver', 'path', librubyver,
                   'the directory for standard ruby libraries'),
      PathItem.new('librubyverarch', 'path', librubyverarch,
                   'the directory for standard ruby extensions'),
      PathItem.new('siteruby', 'path', siteruby,
                   'the directory for version-independent aux ruby libraries'),
      PathItem.new('siterubyver', 'path', siterubyver,
                   'the directory for aux ruby libraries'),
      PathItem.new('siterubyverarch', 'path', siterubyverarch,
                   'the directory for aux ruby binaries'),
      PathItem.new('rbdir', 'path', '$siterubyver',
                   'the directory for ruby scripts'),
      PathItem.new('sodir', 'path', '$siterubyverarch',
                   'the directory for ruby extentions'),
      PathItem.new('rubypath', 'path', rubypath,
                   'the path to set to #! line'),
      ProgramItem.new('rubyprog', 'name', rubypath,
                      'the ruby program using for installation'),
      ProgramItem.new('makeprog', 'name', makeprog,
                      'the make program to compile ruby extentions'),
      SelectItem.new('shebang', 'all/ruby/never', 'ruby',
                     'shebang line (#!) editing mode'),
      BoolItem.new('without-ext', 'yes/no', 'no',
                   'does not compile/install ruby extentions')
    ]
  end
  private :standard_entries

  def load_multipackage_entries
    multipackage_entries().each do |ent|
      add ent
    end
  end

  def multipackage_entries
    [
      PackageSelectionItem.new('with', 'name,name...', '', 'ALL',
                               'package names that you want to install'),
      PackageSelectionItem.new('without', 'name,name...', '', 'NONE',
                               'package names that you do not want to install')
    ]
  end
  private :multipackage_entries

  # Legacy/alternate option spellings mapped onto canonical item names.
  # FIX: 'bin-dir' was listed twice; the duplicate entry is removed.
  ALIASES = {
    'std-ruby'         => 'librubyver',
    'stdruby'          => 'librubyver',
    'rubylibdir'       => 'librubyver',
    'archdir'          => 'librubyverarch',
    'site-ruby-common' => 'siteruby',     # For backward compatibility
    'site-ruby'        => 'siterubyver',  # For backward compatibility
    'bin-dir'          => 'bindir',
    'rb-dir'           => 'rbdir',
    'so-dir'           => 'sodir',
    'data-dir'         => 'datadir',
    'ruby-path'        => 'rubypath',
    'ruby-prog'        => 'rubyprog',
    'ruby'             => 'rubyprog',
    'make-prog'        => 'makeprog',
    'make'             => 'makeprog'
  }

  # Installs the aliases, freezes the table and precompiles the option
  # regexp; must be called once after all items are added.
  def fixup
    ALIASES.each do |ali, name|
      @table[ali] = @table[name]
    end
    @items.freeze
    @table.freeze
    @options_re = /\A--(#{@table.keys.join('|')})(?:=(.*))?\z/
  end

  # Splits '--name=value' into [name, value] (value may be nil).
  def parse_opt(opt)
    m = @options_re.match(opt) or setup_rb_error "config: unknown option #{opt}"
    m.to_a[1,2]
  end

  def dllext
    @rbconfig['DLEXT']
  end

  def value_config?(name)
    lookup(name).value?
  end

  # Base class for a single configuration entry.
  class Item
    def initialize(name, template, default, desc)
      @name = name.freeze
      @template = template
      @value = default
      @default = default
      @description = desc
    end

    attr_reader :name
    attr_reader :description

    attr_accessor :default
    alias help_default default

    def help_opt
      "--#{@name}=#{@template}"
    end

    def value?
      true
    end

    def value
      @value
    end

    # Expands embedded $name references against +table+.
    def resolve(table)
      @value.gsub(%r<\$([^/]+)>) { table[$1] }
    end

    def set(val)
      @value = check(val)
    end

    private

    def check(val)
      setup_rb_error "config: --#{name} requires argument" unless val
      val
    end
  end

  class BoolItem < Item
    def config_type
      'bool'
    end

    def help_opt
      "--#{@name}"
    end

    private

    # Normalizes y/yes/t/true and n/no/f/false (case-insensitive).
    # FIX: 'f' alone is now accepted, mirroring the accepted 't'.
    def check(val)
      return 'yes' unless val
      case val
      when /\Ay(es)?\z/i, /\At(rue)?\z/i then 'yes'
      when /\An(o)?\z/i, /\Af(alse)?\z/i then 'no'
      else
        setup_rb_error "config: --#{@name} accepts only yes/no for argument"
      end
    end
  end

  class PathItem < Item
    def config_type
      'path'
    end

    private

    # Keeps $variable-relative paths verbatim, expands everything else.
    def check(path)
      setup_rb_error "config: --#{@name} requires argument" unless path
      path[0,1] == '$' ? path : File.expand_path(path)
    end
  end

  class ProgramItem < Item
    def config_type
      'program'
    end
  end

  # An item restricted to a fixed set of values ('all/ruby/never' etc.).
  class SelectItem < Item
    def initialize(name, selection, default, desc)
      super
      @ok = selection.split('/')
    end

    def config_type
      'select'
    end

    private

    def check(val)
      unless @ok.include?(val.strip)
        setup_rb_error "config: use --#{@name}=#{@template} (#{val})"
      end
      val.strip
    end
  end

  # An item whose "value" is an action run against the table (e.g.
  # --installdirs rewrites rbdir/sodir); it stores no value itself.
  class ExecItem < Item
    def initialize(name, selection, desc, &block)
      super name, selection, nil, desc
      @ok = selection.split('/')
      @action = block
    end

    def config_type
      'exec'
    end

    def value?
      false
    end

    def resolve(table)
      setup_rb_error "$#{name()} wrongly used as option value"
    end

    undef set

    def evaluate(val, table)
      v = val.strip.downcase
      unless @ok.include?(v)
        setup_rb_error "invalid option --#{@name}=#{val} (use #{@template})"
      end
      @action.call v, table
    end
  end

  # --with/--without package lists for multi-package archives.
  class PackageSelectionItem < Item
    def initialize(name, template, default, help_default, desc)
      super name, template, default, desc
      @help_default = help_default
    end

    attr_reader :help_default

    def config_type
      'package'
    end

    private

    def check(val)
      unless File.dir?("packages/#{val}")
        setup_rb_error "config: no such package: #{val}"
      end
      val
    end
  end

  # The receiver for metaconfig scripts: a restricted API over the
  # config table (and, for multi-package archives, the installer).
  class MetaConfigEnvironment
    def initialize(config, installer)
      @config = config
      @installer = installer
    end

    def config_names
      @config.names
    end

    def config?(name)
      @config.key?(name)
    end

    def bool_config?(name)
      @config.lookup(name).config_type == 'bool'
    end

    def path_config?(name)
      @config.lookup(name).config_type == 'path'
    end

    def value_config?(name)
      @config.lookup(name).config_type != 'exec'
    end

    def add_config(item)
      @config.add item
    end

    def add_bool_config(name, default, desc)
      @config.add BoolItem.new(name, 'yes/no', default ? 'yes' : 'no', desc)
    end

    def add_path_config(name, default, desc)
      @config.add PathItem.new(name, 'path', default, desc)
    end

    def set_config_default(name, default)
      @config.lookup(name).default = default
    end

    def remove_config(name)
      @config.remove(name)
    end

    # For only multipackage
    def packages
      raise '[setup.rb fatal] multi-package metaconfig API packages() called for single-package; contact application package vendor' unless @installer
      @installer.packages
    end

    # For only multipackage
    def declare_packages(list)
      raise '[setup.rb fatal] multi-package metaconfig API declare_packages() called for single-package; contact application package vendor' unless @installer
      @installer.packages = list
    end
  end

end   # class ConfigTable
# This module requires: #verbose?, #no_harm?
#
# Low-level filesystem primitives shared by the installer classes.
# Every destructive operation prints its shell equivalent when the
# including object is verbose? and becomes a dry run under no_harm?.
module FileOperations

  # mkdir -p: creates +dirname+ and all missing parents. +prefix+ (an
  # install prefix for staged installs) is prepended when given.
  def mkdir_p(dirname, prefix = nil)
    dirname = prefix + File.expand_path(dirname) if prefix
    $stderr.puts "mkdir -p #{dirname}" if verbose?
    return if no_harm?

    # Does not check '/', it's too abnormal.
    dirs = File.expand_path(dirname).split(%r<(?=/)>)
    if /\A[a-z]:\z/i =~ dirs[0]
      # DOS drive letter: re-attach it to the first path component.
      disk = dirs.shift
      dirs[0] = disk + dirs[0]
    end
    dirs.each_index do |idx|
      path = dirs[0..idx].join('')
      Dir.mkdir path unless File.dir?(path)
    end
  end

  # rm -f: removes a file, ignoring every error.
  def rm_f(path)
    $stderr.puts "rm -f #{path}" if verbose?
    return if no_harm?
    force_remove_file path
  end

  # rm -rf: removes a whole tree.
  def rm_rf(path)
    $stderr.puts "rm -rf #{path}" if verbose?
    return if no_harm?
    remove_tree path
  end

  # Removes +path+ whatever it is: symlink, directory tree or file.
  def remove_tree(path)
    if File.symlink?(path)
      remove_file path
    elsif File.dir?(path)
      remove_tree0 path
    else
      force_remove_file path
    end
  end

  # Recursive worker for remove_tree; assumes +path+ is a directory.
  def remove_tree0(path)
    Dir.foreach(path) do |ent|
      next if ent == '.'
      next if ent == '..'
      entpath = "#{path}/#{ent}"
      if File.symlink?(entpath)
        remove_file entpath
      elsif File.dir?(entpath)
        remove_tree0 entpath
      else
        force_remove_file entpath
      end
    end
    begin
      Dir.rmdir path
    rescue Errno::ENOTEMPTY
      # directory may not be empty
    end
  end

  # mv: rename, falling back to copy+chmod+unlink (e.g. when src and
  # dest are on different devices), preserving the permission bits.
  def move_file(src, dest)
    force_remove_file dest
    begin
      File.rename src, dest
    rescue
      File.open(dest, 'wb') {|f|
        f.write File.binread(src)
      }
      File.chmod File.stat(src).mode, dest
      File.unlink src
    end
  end

  # Removes a file, swallowing any error (missing file, permissions...).
  def force_remove_file(path)
    begin
      remove_file path
    rescue
    end
  end

  # Unconditionally unlinks +path+, loosening its permissions first so
  # read-only files can be removed.
  def remove_file(path)
    File.chmod 0777, path
    File.unlink path
  end

  # install(1): copies +from+ into +dest+ with +mode+, but only when
  # the content differs; every installed target is appended to the
  # InstalledFiles manifest in the objdir root.
  def install(from, dest, mode, prefix = nil)
    $stderr.puts "install #{from} #{dest}" if verbose?
    return if no_harm?

    realdest = prefix ? prefix + File.expand_path(dest) : dest
    realdest = File.join(realdest, File.basename(from)) if File.dir?(realdest)
    str = File.binread(from)
    if diff?(str, realdest)
      verbose_off {
        rm_f realdest if File.exist?(realdest)
      }
      File.open(realdest, 'wb') {|f|
        f.write str
      }
      File.chmod mode, realdest

      File.open("#{objdir_root()}/InstalledFiles", 'a') {|f|
        if prefix
          # Record the path as it will exist at run time, without the
          # staging prefix.
          f.puts realdest.sub(prefix, '')
        else
          f.puts realdest
        end
      }
    end
  end

  # True when +path+ is missing or its content differs from +new_content+.
  def diff?(new_content, path)
    return true unless File.exist?(path)
    new_content != File.binread(path)
  end

  # Runs an external command, raising when it exits non-zero.
  def command(*args)
    $stderr.puts args.join(' ') if verbose?
    system(*args) or raise RuntimeError,
        "system(#{args.map{|a| a.inspect }.join(' ')}) failed"
  end

  # Runs the configured ruby interpreter with +args+.
  def ruby(*args)
    command config('rubyprog'), *args
  end

  # Runs the configured make program with an optional target.
  def make(task = nil)
    command(*[config('makeprog'), task].compact)
  end

  # True when +dir+ looks like a C extension directory.
  def extdir?(dir)
    File.exist?("#{dir}/MANIFEST") or File.exist?("#{dir}/extconf.rb")
  end

  # Regular files directly under +dir+ (bare names, not paths).
  def files_of(dir)
    Dir.open(dir) {|d|
      return d.select {|ent| File.file?("#{dir}/#{ent}") }
    }
  end

  # Directory entries never treated as packages or data.
  DIR_REJECT = %w( . .. CVS SCCS RCS CVS.adm .svn )

  # Subdirectories of +dir+, minus version-control litter.
  def directories_of(dir)
    Dir.open(dir) {|d|
      return d.select {|ent| File.dir?("#{dir}/#{ent}") } - DIR_REJECT
    }
  end

end
# This module requires: #srcdir_root, #objdir_root, #relpath
#
# Helper API exposed to user-supplied hook scripts (pre-*/post-*) and
# metaconfig files: config access plus srcdir/objdir path helpers.
module HookScriptAPI

  def get_config(key)
    @config[key]
  end

  alias config get_config

  # obsolete: use metaconfig to change configuration
  def set_config(key, val)
    @config[key] = val
  end

  #
  # srcdir/objdir (works only in the package directory)
  #

  # Current directory inside the source tree.
  def curr_srcdir
    "#{srcdir_root()}/#{relpath()}"
  end

  # Current directory inside the build (object) tree.
  def curr_objdir
    "#{objdir_root()}/#{relpath()}"
  end

  # Path of +path+ relative to the current source directory.
  def srcfile(path)
    "#{curr_srcdir()}/#{path}"
  end

  def srcexist?(path)
    File.exist?(srcfile(path))
  end

  def srcdirectory?(path)
    File.dir?(srcfile(path))
  end

  def srcfile?(path)
    File.file?(srcfile(path))
  end

  # All entries of a source directory except '.' and '..'.
  def srcentries(path = '.')
    Dir.open("#{curr_srcdir()}/#{path}") {|d|
      return d.to_a - %w(. ..)
    }
  end

  def srcfiles(path = '.')
    srcentries(path).select {|fname|
      File.file?(File.join(curr_srcdir(), path, fname))
    }
  end

  def srcdirectories(path = '.')
    srcentries(path).select {|fname|
      File.dir?(File.join(curr_srcdir(), path, fname))
    }
  end

end
# Drives the whole installation: parses the command line, loads or
# saves the configuration, and dispatches the requested task
# (config/setup/install/...) to the per-package Installer.
#
# Fix applied in this revision: load_rbconfig returned ::Config::CONFIG,
# but the top-level Config alias was deprecated in Ruby 1.9 and removed
# in 2.2; RbConfig is now preferred with ::Config kept as a fallback
# for ancient interpreters.
class ToplevelInstaller

  Version   = '3.4.1'
  Copyright = 'Copyright (c) 2000-2005 Minero Aoki'

  TASKS = [
    [ 'all',       'do config, setup, then install' ],
    [ 'config',    'saves your configurations' ],
    [ 'show',      'shows current configuration' ],
    [ 'setup',     'compiles ruby extentions and others' ],
    [ 'install',   'installs files' ],
    [ 'test',      'run all tests in test/' ],
    [ 'clean',     "does `make clean' for each extention" ],
    [ 'distclean', "does `make distclean' for each extention" ]
  ]

  # Entry point: builds the ConfigTable and delegates to the single- or
  # multi-package installer depending on the archive layout.
  def ToplevelInstaller.invoke
    config = ConfigTable.new(load_rbconfig())
    config.load_standard_entries
    config.load_multipackage_entries if multipackage?
    config.fixup
    klass = (multipackage?() ? ToplevelInstallerMulti : ToplevelInstaller)
    klass.new(File.dirname($0), config).invoke
  end

  # A packages/ directory beside setup.rb marks a multi-package archive.
  def ToplevelInstaller.multipackage?
    File.dir?(File.dirname($0) + '/packages')
  end

  # Returns the rbconfig hash, honoring an explicit --rbconfig=PATH
  # option (useful to install for a different ruby).
  def ToplevelInstaller.load_rbconfig
    if arg = ARGV.detect {|arg| /\A--rbconfig=/ =~ arg }
      ARGV.delete(arg)
      load File.expand_path(arg.split(/=/, 2)[1])
      $".push 'rbconfig.rb'
    else
      require 'rbconfig'
    end
    # FIX: prefer RbConfig (Ruby >= 1.9); ::Config was removed in 2.2.
    defined?(::RbConfig) ? ::RbConfig::CONFIG : ::Config::CONFIG
  end

  def initialize(ardir_root, config)
    @ardir = File.expand_path(ardir_root)
    @config = config
    # cache
    @valid_task_re = nil
  end

  def config(key)
    @config[key]
  end

  def inspect
    "#<#{self.class} #{__id__()}>"
  end

  # Parses global options, then runs the requested task; with no task
  # (or 'all') runs config, setup and install in sequence.
  def invoke
    run_metaconfigs
    case task = parsearg_global()
    when nil, 'all'
      parsearg_config
      init_installers
      exec_config
      exec_setup
      exec_install
    else
      case task
      when 'config', 'test'
        ;
      when 'clean', 'distclean'
        @config.load_savefile if File.exist?(@config.savefile)
      else
        @config.load_savefile
      end
      __send__ "parsearg_#{task}"
      init_installers
      __send__ "exec_#{task}"
    end
  end

  def run_metaconfigs
    @config.load_script "#{@ardir}/metaconfig"
  end

  def init_installers
    @installer = Installer.new(@config, @ardir, File.expand_path('.'))
  end

  #
  # Hook Script API bases
  #

  def srcdir_root
    @ardir
  end

  def objdir_root
    '.'
  end

  def relpath
    '.'
  end

  #
  # Option Parsing
  #

  # Consumes leading global options from ARGV; returns the task name,
  # or nil when none was given.
  def parsearg_global
    while arg = ARGV.shift
      case arg
      when /\A\w+\z/
        setup_rb_error "invalid task: #{arg}" unless valid_task?(arg)
        return arg
      when '-q', '--quiet'
        @config.verbose = false
      when '--verbose'
        @config.verbose = true
      when '--help'
        print_usage $stdout
        exit 0
      when '--version'
        puts "#{File.basename($0)} version #{Version}"
        exit 0
      when '--copyright'
        puts Copyright
        exit 0
      else
        setup_rb_error "unknown global option '#{arg}'"
      end
    end
    nil
  end

  def valid_task?(t)
    valid_task_re() =~ t
  end

  def valid_task_re
    @valid_task_re ||= /\A(?:#{TASKS.map {|task,desc| task }.join('|')})\z/
  end

  # Shared parser for tasks that accept no options; the task name for
  # the error message is dug out of the caller's frame.
  def parsearg_no_options
    unless ARGV.empty?
      task = caller(0).first.slice(%r<`parsearg_(\w+)'>, 1)
      setup_rb_error "#{task}: unknown options: #{ARGV.join(' ')}"
    end
  end

  alias parsearg_show      parsearg_no_options
  alias parsearg_setup     parsearg_no_options
  alias parsearg_test      parsearg_no_options
  alias parsearg_clean     parsearg_no_options
  alias parsearg_distclean parsearg_no_options

  # Parses --name=value configuration options. Exec-type items are
  # evaluated only after every plain value has been set.
  def parsearg_config
    evalopt = []
    set = []
    @config.config_opt = []
    while i = ARGV.shift
      if /\A--?\z/ =~ i
        # Bare '-' or '--': everything after is passed to extconf.rb.
        @config.config_opt = ARGV.dup
        break
      end
      name, value = *@config.parse_opt(i)
      if @config.value_config?(name)
        @config[name] = value
      else
        evalopt.push [name, value]
      end
      set.push name
    end
    evalopt.each do |name, value|
      @config.lookup(name).evaluate value, @config
    end
    # Check if configuration is valid
    set.each do |n|
      @config[n] if @config.value_config?(n)
    end
  end

  def parsearg_install
    @config.no_harm = false
    @config.install_prefix = ''
    while a = ARGV.shift
      case a
      when '--no-harm'
        @config.no_harm = true
      when /\A--prefix=/
        path = a.split(/=/, 2)[1]
        path = File.expand_path(path) unless path[0,1] == '/'
        @config.install_prefix = path
      else
        setup_rb_error "install: unknown option #{a}"
      end
    end
  end

  def print_usage(out)
    out.puts 'Typical Installation Procedure:'
    out.puts " $ ruby #{File.basename $0} config"
    out.puts " $ ruby #{File.basename $0} setup"
    out.puts " # ruby #{File.basename $0} install (may require root privilege)"
    out.puts
    out.puts 'Detailed Usage:'
    # NOTE: the <...> placeholders were lost in a previous revision of
    # this text; restored per setup.rb 3.4.1 upstream.
    out.puts " ruby #{File.basename $0} <global option>"
    out.puts " ruby #{File.basename $0} [<global options>] <task> [<task options>]"

    fmt = " %-24s %s\n"
    out.puts
    out.puts 'Global options:'
    out.printf fmt, '-q,--quiet', 'suppress message outputs'
    out.printf fmt, ' --verbose', 'output messages verbosely'
    out.printf fmt, ' --help', 'print this message'
    out.printf fmt, ' --version', 'print version and quit'
    out.printf fmt, ' --copyright', 'print copyright and quit'
    out.puts
    out.puts 'Tasks:'
    TASKS.each do |name, desc|
      out.printf fmt, name, desc
    end

    fmt = " %-24s %s [%s]\n"
    out.puts
    out.puts 'Options for CONFIG or ALL:'
    @config.each do |item|
      out.printf fmt, item.help_opt, item.description, item.help_default
    end
    out.printf fmt, '--rbconfig=path', 'rbconfig.rb to load', "running ruby's"
    out.puts
    out.puts 'Options for INSTALL:'
    out.printf fmt, '--no-harm', 'only display what to do if given', 'off'
    out.printf fmt, '--prefix=path', 'install path prefix', ''
    out.puts
  end

  #
  # Task Handlers
  #

  def exec_config
    @installer.exec_config
    @config.save   # must be final
  end

  def exec_setup
    @installer.exec_setup
  end

  def exec_install
    @installer.exec_install
  end

  def exec_test
    @installer.exec_test
  end

  def exec_show
    @config.each do |i|
      printf "%-20s %s\n", i.name, i.value if i.value?
    end
  end

  def exec_clean
    @installer.exec_clean
  end

  def exec_distclean
    @installer.exec_distclean
  end

end   # class ToplevelInstaller
# Variant of ToplevelInstaller for archives bundling several packages
# under packages/: runs every selected package's Installer inside its
# own build directory and the shared pre-/post- hook scripts around
# each task.
#
# Fix applied in this revision: the usage text printed the misspelled
# heading 'Inluded packages:'; corrected to 'Included packages:'.
class ToplevelInstallerMulti < ToplevelInstaller

  include FileOperations

  def initialize(ardir_root, config)
    super
    @packages = directories_of("#{@ardir}/packages")
    raise 'no package exists' if @packages.empty?
    @root_installer = Installer.new(@config, @ardir, File.expand_path('.'))
  end

  # Loads the top-level metaconfig plus each package's own.
  def run_metaconfigs
    @config.load_script "#{@ardir}/metaconfig", self
    @packages.each do |name|
      @config.load_script "#{@ardir}/packages/#{name}/metaconfig"
    end
  end

  attr_reader :packages

  # Restricts the package list (called from metaconfig scripts).
  def packages=(list)
    raise 'package list is empty' if list.empty?
    list.each do |name|
      raise "directory packages/#{name} does not exist"\
              unless File.dir?("#{@ardir}/packages/#{name}")
    end
    @packages = list
  end

  # Builds one Installer per package and computes the selection from
  # the --with/--without configuration.
  def init_installers
    @installers = {}
    @packages.each do |pack|
      @installers[pack] = Installer.new(@config,
                                        "#{@ardir}/packages/#{pack}",
                                        "packages/#{pack}")
    end
    with    = extract_selection(config('with'))
    without = extract_selection(config('without'))
    @selected = @installers.keys.select {|name|
                  (with.empty? or with.include?(name)) \
                      and not without.include?(name)
                }
  end

  # Parses a comma-separated package list, validating each name.
  def extract_selection(list)
    a = list.split(/,/)
    a.each do |name|
      setup_rb_error "no such package: #{name}" unless @installers.key?(name)
    end
    a
  end

  def print_usage(f)
    super
    f.puts 'Included packages:'   # FIX: was misspelled 'Inluded'
    f.puts ' ' + @packages.sort.join(' ')
    f.puts
  end

  #
  # Task Handlers
  #

  def exec_config
    run_hook 'pre-config'
    each_selected_installers {|inst| inst.exec_config }
    run_hook 'post-config'
    @config.save   # must be final
  end

  def exec_setup
    run_hook 'pre-setup'
    each_selected_installers {|inst| inst.exec_setup }
    run_hook 'post-setup'
  end

  def exec_install
    run_hook 'pre-install'
    each_selected_installers {|inst| inst.exec_install }
    run_hook 'post-install'
  end

  def exec_test
    run_hook 'pre-test'
    each_selected_installers {|inst| inst.exec_test }
    run_hook 'post-test'
  end

  def exec_clean
    rm_f @config.savefile
    run_hook 'pre-clean'
    each_selected_installers {|inst| inst.exec_clean }
    run_hook 'post-clean'
  end

  def exec_distclean
    rm_f @config.savefile
    run_hook 'pre-distclean'
    each_selected_installers {|inst| inst.exec_distclean }
    run_hook 'post-distclean'
  end

  #
  # lib
  #

  # Yields each selected package's Installer inside its build dir
  # (created on demand), restoring the cwd afterwards.
  def each_selected_installers
    Dir.mkdir 'packages' unless File.dir?('packages')
    @selected.each do |pack|
      $stderr.puts "Processing the package `#{pack}' ..." if verbose?
      Dir.mkdir "packages/#{pack}" unless File.dir?("packages/#{pack}")
      Dir.chdir "packages/#{pack}"
      yield @installers[pack]
      Dir.chdir '../..'
    end
  end

  def run_hook(id)
    @root_installer.run_hook id
  end

  # module FileOperations requires this
  def verbose?
    @config.verbose?
  end

  # module FileOperations requires this
  def no_harm?
    @config.no_harm?
  end

end   # class ToplevelInstallerMulti
class Installer
# Directory categories recognized under a package tree; each has
# matching config_dir_*/setup_dir_*/install_dir_*/clean_dir_* hooks.
FILETYPES = %w( bin lib ext data conf man )

include FileOperations
include HookScriptAPI

# config  :: shared ConfigTable holding the saved configuration.
# srcroot :: root directory of the package sources.
# objroot :: root directory for build products (usually '.').
def initialize(config, srcroot, objroot)
  @config = config
  @srcdir = File.expand_path(srcroot)
  @objdir = File.expand_path(objroot)
  @currdir = '.'
end
# Short srcdir-based description for debugging output.
def inspect
  "#<#{self.class} #{File.basename(@srcdir)}>"
end

# Shared no-op used as the default handler for directory categories
# that need no work in a given task (see the alias lists below).
def noop(rel)
end
#
# Hook Script API base methods
#

def srcdir_root
  @srcdir
end

def objdir_root
  @objdir
end

# Current position inside the package tree, relative to the roots.
def relpath
  @currdir
end

#
# Config Access
#

# module FileOperations requires this
def verbose?
  @config.verbose?
end

# module FileOperations requires this
def no_harm?
  @config.no_harm?
end
# Runs the given block with verbose output suppressed, restoring the
# previous verbosity even if the block raises.
def verbose_off
  begin
    save, @config.verbose = @config.verbose?, false
    yield
  ensure
    @config.verbose = save
  end
end
#
# TASK config
#

# Walks the package tree running the config-phase handler for each
# directory category.
def exec_config
  exec_task_traverse 'config'
end

alias config_dir_bin noop
alias config_dir_lib noop

# Only extension directories need configuring: run extconf.rb there.
def config_dir_ext(rel)
  extconf if extdir?(curr_srcdir())
end

alias config_dir_data noop
alias config_dir_conf noop
alias config_dir_man noop

# Invokes the extension's extconf.rb with any extra options collected
# after '--' on the `config` command line.
def extconf
  ruby "#{curr_srcdir()}/extconf.rb", *@config.config_opt
end
#
# TASK setup
#

def exec_setup
  exec_task_traverse 'setup'
end

# bin/ files get their shebang lines rewritten to the configured ruby.
def setup_dir_bin(rel)
  files_of(curr_srcdir()).each do |fname|
    update_shebang_line "#{curr_srcdir()}/#{fname}"
  end
end

alias setup_dir_lib noop

# Extension directories are compiled with make.
def setup_dir_ext(rel)
  make if extdir?(curr_srcdir())
end

alias setup_dir_data noop
alias setup_dir_conf noop
alias setup_dir_man noop
# Rewrites the '#!' line of +path+ according to the 'shebang' config:
#   never : leave files untouched
#   ruby  : rewrite only existing ruby shebangs
#   all   : force a ruby shebang onto every bin file
# The file is replaced via a temp file (see open_atomic_writer).
def update_shebang_line(path)
  return if no_harm?
  return if config('shebang') == 'never'
  old = Shebang.load(path)
  if old
    $stderr.puts "warning: #{path}: Shebang line includes too many args. It is not portable and your program may not work." if old.args.size > 1
    new = new_shebang(old)
    # Nothing to do when the rewritten line is unchanged.
    return if new.to_s == old.to_s
  else
    return unless config('shebang') == 'all'
    new = Shebang.new(config('rubypath'))
  end
  $stderr.puts "updating shebang: #{File.basename(path)}" if verbose?
  open_atomic_writer(path) {|output|
    File.open(path, 'rb') {|f|
      f.gets if old   # discard the original shebang line
      output.puts new.to_s
      output.print f.read
    }
  }
end
# Decides the replacement Shebang for an existing one +old+:
# * "#!...ruby*"        -> configured ruby path, args kept
# * "#!...env ruby ..." -> configured ruby path, leading 'ruby' dropped
# * anything else       -> returned untouched unless mode is 'all'
def new_shebang(old)
  interpreter = File.basename(old.cmd)
  if /\Aruby/ =~ interpreter
    Shebang.new(config('rubypath'), old.args)
  elsif interpreter == 'env' and old.args.first == 'ruby'
    Shebang.new(config('rubypath'), old.args[1..-1])
  else
    return old unless config('shebang') == 'all'
    Shebang.new(config('rubypath'))
  end
end
# Writes a file "atomically": the block writes into a temp file which
# is then renamed over the target; the temp file is cleaned up on
# failure. NOTE(review): both the temp name and the rename target use
# File.basename(path), i.e. this assumes the current working directory
# is the file's directory — the task traversal chdirs into each
# package directory before calling this.
def open_atomic_writer(path, &block)
  tmpfile = File.basename(path) + '.tmp'
  begin
    File.open(tmpfile, 'wb', &block)
    File.rename tmpfile, File.basename(path)
  ensure
    File.unlink tmpfile if File.exist?(tmpfile)
  end
end
# Parses and regenerates '#!' (shebang) lines.
class Shebang
  # Reads the first line of +path+; returns a Shebang when that line
  # is a shebang, nil otherwise.
  def self.load(path)
    first_line = File.open(path) {|f| f.gets }
    /\A#!/ =~ first_line ? parse(first_line) : nil
  end

  # Builds a Shebang from a raw "#!cmd arg..." line.
  def self.parse(line)
    words = line.strip.sub(/\A\#!/, '').split(' ')
    command = words.shift
    new(command, words)
  end

  def initialize(cmd, args = [])
    @cmd = cmd
    @args = args
  end

  attr_reader :cmd
  attr_reader :args

  # Renders back to a "#! cmd args..." line (no trailing newline).
  def to_s
    rendered = "#! #{@cmd}"
    rendered += " #{@args.join(' ')}" unless @args.empty?
    rendered
  end
end
#
# TASK install
#

# Resets the install manifest, then installs every directory category.
def exec_install
  rm_f 'InstalledFiles'
  exec_task_traverse 'install'
end

# Executables: mode 0755 into bindir.
def install_dir_bin(rel)
  install_files targetfiles(), "#{config('bindir')}/#{rel}", 0755
end

# Ruby libraries: mode 0644 into the configured rbdir.
def install_dir_lib(rel)
  install_files libfiles(), "#{config('rbdir')}/#{rel}", 0644
end

# Compiled extensions: mode 0555 into sodir, keeping the sub-path.
def install_dir_ext(rel)
  return unless extdir?(curr_srcdir())
  install_files rubyextentions('.'),
                "#{config('sodir')}/#{File.dirname(rel)}",
                0555
end

def install_dir_data(rel)
  install_files targetfiles(), "#{config('datadir')}/#{rel}", 0644
end

def install_dir_conf(rel)
  # FIXME: should not remove current config files
  # (rename previous file to .old/.org)
  install_files targetfiles(), "#{config('sysconfdir')}/#{rel}", 0644
end

def install_dir_man(rel)
  install_files targetfiles(), "#{config('mandir')}/#{rel}", 0644
end
def install_files(list, dest, mode)
mkdir_p dest, @config.install_prefix
list.each do |fname|
install fname, dest, mode, @config.install_prefix
end
end
def libfiles
glob_reject(%w(*.y *.output), targetfiles())
end
# Return compiled extension binaries (*.<dllext>) among the target
# files, aborting with a hint to run `setup' first when none exist.
# NOTE(review): the +dir+ parameter is unused; the method name keeps its
# historical misspelling ("extention") because callers depend on it.
def rubyextentions(dir)
ents = glob_select("*.#{@config.dllext}", targetfiles())
if ents.empty?
setup_rb_error "no ruby extention exists: 'ruby #{$0} setup' first"
end
ents
end
# All files this directory contributes: files present in source or
# object directory, minus hook scripts, mapped to their on-disk path.
def targetfiles
mapdir(existfiles() - hookfiles())
end
# Map each entry to its real location: use the name as-is when it exists
# in the build (obj) directory, otherwise point into the source dir.
def mapdir(ents)
  ents.map do |entry|
    File.exist?(entry) ? entry : "#{curr_srcdir()}/#{entry}"
  end
end
# picked up many entries from cvs-1.11.1/src/ignore.c
# Glob patterns for editor backups, VCS droppings and other junk that
# must never be installed or packaged.
JUNK_FILES = %w(
core RCSLOG tags TAGS .make.state
.nse_depinfo #* .#* cvslog.* ,* .del-* *.olb
*~ *.old *.bak *.BAK *.orig *.rej _$* *$
*.org *.in .*
)
# Union of files in the source and object directories, with junk
# patterns (JUNK_FILES) filtered out.
def existfiles
glob_reject(JUNK_FILES, (files_of(curr_srcdir()) | files_of('.')))
end
# Names of all recognized task-hook scripts: pre-/post- variants, with
# and without the .rb extension, for each of the four tasks.
def hookfiles
  %w( pre-%s post-%s pre-%s.rb post-%s.rb ).flat_map do |pattern|
    %w( config setup install clean ).map {|task| pattern % task }
  end
end
# Return the entries of +ents+ matching the single glob pattern +pat+.
def glob_select(pat, ents)
  ents.grep(globs2re([pat]))
end

# Return the entries of +ents+ matching none of the glob patterns +pats+.
def glob_reject(pats, ents)
  regexp = globs2re(pats)
  ents.reject {|name| name =~ regexp }
end

# Translation table from glob metacharacters to regexp fragments:
# escape '.', '$' and '#' literally; turn '*' into '.*'.
GLOB2REGEX = {
  '.' => '\.',
  '$' => '\$',
  '#' => '\#',
  '*' => '.*'
}

# Build one anchored regexp that matches any of the glob patterns +pats+.
def globs2re(pats)
  branches = pats.map do |pattern|
    pattern.gsub(/[\.\$\#\*]/) {|ch| GLOB2REGEX[ch] }
  end
  /\A(?:#{branches.join('|')})\z/
end
#
# TASK test
#
TESTDIR = 'test'

# Run the package's test suite (files under TESTDIR) via test/unit.
# Prints a note and returns when the package ships no tests; aborts
# with a SetupError when test/unit cannot be required.
def exec_test
  unless File.directory?(TESTDIR)  # was a hard-coded 'test'; keep in sync with TESTDIR
    $stderr.puts 'no test in this package' if verbose?
    return
  end
  $stderr.puts 'Running tests...' if verbose?
  begin
    require 'test/unit'
  rescue LoadError
    # Grammar fix: message previously read "cannot loaded".
    setup_rb_error 'test/unit cannot be loaded. You need Ruby 1.8 or later to invoke this task.'
  end
  runner = Test::Unit::AutoRunner.new(true)
  runner.to_run << TESTDIR
  runner.run
end
#
# TASK clean
#
# Remove build products plus the saved configuration and install log.
def exec_clean
exec_task_traverse 'clean'
rm_f @config.savefile
rm_f 'InstalledFiles'
end
# Only ext/ needs per-directory cleaning; every other type is a no-op.
alias clean_dir_bin noop
alias clean_dir_lib noop
alias clean_dir_data noop
alias clean_dir_conf noop
alias clean_dir_man noop
# Run `make clean' inside each extension directory that has a Makefile.
def clean_dir_ext(rel)
return unless extdir?(curr_srcdir())
make 'clean' if File.file?('Makefile')
end
#
# TASK distclean
#
# Like clean, but delegates `make distclean' to extension Makefiles,
# and also removes the saved configuration and install log.
def exec_distclean
exec_task_traverse 'distclean'
rm_f @config.savefile
rm_f 'InstalledFiles'
end
# Only ext/ needs per-directory distclean work; other types are no-ops.
alias distclean_dir_bin noop
alias distclean_dir_lib noop
# Run `make distclean' inside each extension directory with a Makefile.
def distclean_dir_ext(rel)
return unless extdir?(curr_srcdir())
make 'distclean' if File.file?('Makefile')
end
alias distclean_dir_data noop
alias distclean_dir_conf noop
alias distclean_dir_man noop
#
# Traversing
#
# Run +task+ over every file-type directory (FILETYPES), wrapped in
# top-level pre-/post- hook scripts. ext/ is skipped entirely when the
# user configured --without-ext.
def exec_task_traverse(task)
run_hook "pre-#{task}"
FILETYPES.each do |type|
if type == 'ext' and config('without-ext') == 'yes'
$stderr.puts 'skipping ext/* by user option' if verbose?
next
end
traverse task, type, "#{task}_dir_#{type}"
end
run_hook "post-#{task}"
end
# Recursively process directory +rel+ for +task+: enter it, run the
# pre hook, invoke the per-type handler +mid+ (e.g. install_dir_lib)
# with the path relative to the type root, recurse into source
# subdirectories, then run the post hook.
def traverse(task, rel, mid)
dive_into(rel) {
run_hook "pre-#{task}"
# Strip the leading "type/" component so handlers see a relative path.
__send__ mid, rel.sub(%r[\A.*?(?:/|\z)], '')
directories_of(curr_srcdir()).each do |d|
traverse task, "#{rel}/#{d}", mid
end
run_hook "post-#{task}"
}
end
# Enter the build directory corresponding to +rel+ (creating it when
# missing), run the block there, and restore the previous working
# directory and @currdir afterwards -- now even when the block raises.
def dive_into(rel)
  return unless File.dir?("#{@srcdir}/#{rel}")
  dir = File.basename(rel)
  Dir.mkdir dir unless File.dir?(dir)
  prevdir = Dir.pwd
  Dir.chdir dir
  $stderr.puts '---> ' + rel if verbose?
  @currdir = rel
  begin
    yield
  ensure
    # Fix: originally the chdir back was skipped when the block raised,
    # leaving the process stranded inside the subdirectory.
    Dir.chdir prevdir
    @currdir = File.dirname(rel)
  end
  $stderr.puts '<--- ' + rel if verbose?
end
# Execute a task hook script ("pre-install", "post-setup.rb", ...) if
# one exists in the current source directory. The script is run through
# instance_eval so it can call Installer methods directly; any error it
# raises becomes a SetupError (unless $DEBUG, which re-raises so the
# full backtrace is visible).
def run_hook(id)
path = [ "#{curr_srcdir()}/#{id}",
"#{curr_srcdir()}/#{id}.rb" ].detect {|cand| File.file?(cand) }
return unless path
begin
instance_eval File.read(path), path, 1
rescue
raise if $DEBUG
setup_rb_error "hook #{path} failed:\n" + $!.message
end
end
end # class Installer
# Error raised for any user-visible setup failure; caught at top level.
class SetupError < StandardError; end
# Abort the current task with +msg+ by raising SetupError.
def setup_rb_error(msg)
raise SetupError, msg
end
# When run directly, dispatch to the requested task. SetupError is
# reported as a short message plus a usage hint instead of a backtrace
# (unless $DEBUG is set, which re-raises).
if $0 == __FILE__
begin
ToplevelInstaller.invoke
rescue SetupError
raise if $DEBUG
$stderr.puts $!.message
$stderr.puts "Try 'ruby #{$0} --help' for detailed usage."
exit 1
end
end
================================================
FILE: test/fixtures/correlation_matrix.rb
================================================
# Retrieve correlation matrix for eight variables
module Statsample
module Fixtures
# 8x8 symmetric correlation matrix fixture (presumably Harman's
# eight-variable example -- TODO confirm the exact source), extended
# with Statsample::CovariateMatrix so it behaves as a covariate matrix.
def harman_817
Matrix[
[1.0, 0.84, 0.62, -0.53, 0.03, 0.57, -0.33, -0.63],
[0.84, 1.00, 0.84, -0.68, -0.05, 0.76, -0.35, -0.73],
[0.62, 0.84, 1.00, -0.76, 0.08, 0.81, -0.51, -0.81],
[-0.53, -0.68, -0.76, 1.00, -0.25, -0.80, 0.62, 0.88],
[0.03, -0.05, 0.08, -0.25, 1.00, 0.25, -0.72, -0.36],
[0.57, 0.76, 0.81, -0.80, 0.25, 1.00, -0.58, -0.84],
[-0.33, -0.35, -0.51, 0.62, -0.72, -0.58, 1.00, 0.68],
[-0.63, -0.73, -0.81, 0.88, -0.36, -0.84, 0.68, 1.00]
].extend(Statsample::CovariateMatrix)
end
end
end
================================================
FILE: test/fixtures/hartman_23.matrix
================================================
"height" "arm.span" "forearm" "lower.leg" "weight" "bitro.diameter" "chest.girth" "chest.width"
"height" 1 0.846 0.805 0.859 0.473 0.398 0.301 0.382
"arm.span" 0.846 1 0.881 0.826 0.376 0.326 0.277 0.415
"forearm" 0.805 0.881 1 0.801 0.38 0.319 0.237 0.345
"lower.leg" 0.859 0.826 0.801 1 0.436 0.329 0.327 0.365
"weight" 0.473 0.376 0.38 0.436 1 0.762 0.73 0.629
"bitro.diameter" 0.398 0.326 0.319 0.329 0.762 1 0.583 0.577
"chest.girth" 0.301 0.277 0.237 0.327 0.73 0.583 1 0.539
"chest.width" 0.382 0.415 0.345 0.365 0.629 0.577 0.539 1
================================================
FILE: test/fixtures/repeated_fields.csv
================================================
"id","name","age","city","a1","name","age"
1,"Alex",20,"New York","a,b","a",3
2,"Claude",23,"London","b,c","b",4
3,"Peter",25,"London","a","c",5
4,"Franz",27,"Paris",,"d",6
5,"George","5,5","Tome","a,b,c","f",
6,"Fernand",20,"London","c,b","f",8
================================================
FILE: test/fixtures/stock_data.csv
================================================
17.66
17.65
17.68
17.66
17.68
17.67
17.68
17.68
17.67
17.67
17.68
17.71
17.74
17.72
17.73
17.76
17.74
17.69
17.69
17.67
17.66
17.67
17.69
17.69
17.68
17.65
17.65
17.64
17.63
17.64
17.67
17.68
17.7
17.68
17.69
17.69
17.72
17.71
17.71
17.71
17.69
17.69
17.71
17.72
17.71
17.68
17.68
17.68
17.69
17.68
17.68
17.69
17.67
17.69
17.71
17.7
17.7
17.71
17.73
17.74
17.74
17.74
17.76
17.77
17.55
17.55
17.5
17.46
17.49
17.54
17.51
17.54
17.57
17.54
17.52
17.53
17.56
17.55
17.55
17.54
17.55
17.55
17.55
17.54
17.52
17.53
17.51
17.52
17.5
17.5
17.5
17.49
17.46
17.47
17.48
17.45
17.41
17.39
17.38
17.43
17.44
17.43
17.43
17.46
17.46
17.47
17.47
17.45
17.48
17.49
17.5
17.49
17.48
17.49
17.47
17.47
17.44
17.44
17.43
17.45
17.42
17.43
17.43
17.44
17.44
17.43
17.41
17.41
17.38
17.38
17.37
17.37
17.37
17.3
17.28
17.27
17.19
16.41
16.44
16.48
16.53
16.51
16.57
16.54
16.59
16.64
16.6
16.65
16.69
16.69
16.68
16.64
16.65
16.66
16.64
16.61
16.65
16.67
16.66
16.65
16.61
16.59
16.57
16.55
16.55
16.57
16.54
16.6
16.62
16.6
16.59
16.61
16.66
16.69
16.67
16.65
16.66
16.65
16.65
16.68
16.68
16.67
16.64
16.73
16.76
16.75
16.79
16.8
16.77
16.74
16.76
16.83
16.84
16.82
16.89
16.93
16.94
16.9
16.92
16.88
16.85
16.87
16.8
16.79
16.85
16.85
16.8
16.82
16.85
16.9
16.86
16.79
16.75
16.78
17.06
17.05
17.04
17.02
17.01
17.02
17.05
17.07
17.08
17.09
17.1
17.11
17.09
17.1
17.1
17.12
17.17
17.16
17.17
17.18
17.18
17.18
17.17
17.15
17.14
17.13
17.14
17.13
17.12
17.12
17.09
17.09
17.11
17.06
17.07
17.06
17.07
17.06
17.09
17.05
17.04
17.04
16.99
17
17.03
17
16.97
16.96
16.98
16.98
16.98
17.03
17
17
17
17.02
17
17.02
17.01
17.02
17.03
17.03
17.01
17.03
17.03
17.03
17.01
17.03
17.05
17.05
17.08
17.04
17.01
17.03
17.02
17.03
17.04
17.05
17.37
17.35
17.34
17.32
17.29
17.29
17.22
17.26
17.3
17.34
17.33
17.39
17.4
17.39
17.48
17.5
17.47
17.43
17.4
17.42
17.46
17.48
17.48
17.46
17.46
17.45
17.43
17.44
17.48
17.43
17.45
17.47
17.46
17.46
17.48
17.48
17.48
17.46
17.5
17.55
17.58
17.57
17.56
17.59
17.61
17.62
17.63
17.62
17.61
17.61
17.62
17.64
17.65
17.61
17.62
17.66
17.65
17.64
17.63
17.64
17.64
17.64
17.63
17.61
17.61
17.62
17.63
17.64
17.65
17.66
17.68
17.69
17.69
17.69
17.66
17.69
17.69
17.62
17.68
17.64
17.65
17.61
17.52
17.56
17.55
17.55
17.48
17.45
17.46
17.46
17.44
17.47
17.5
17.49
17.5
17.53
17.53
17.54
17.51
17.51
17.53
17.53
17.53
17.55
17.55
17.54
17.56
17.59
17.57
17.58
17.58
17.57
17.59
17.57
17.55
17.51
17.51
17.52
17.52
17.53
17.55
17.59
17.61
17.61
17.6
17.6
17.62
17.65
17.62
17.6
17.6
17.62
17.61
17.62
17.63
17.64
17.65
17.61
17.62
17.64
17.63
17.62
17.6
17.57
17.57
17.6
17.59
17.6
17.61
17.61
17.63
17.63
17.59
17.58
17.76
17.79
17.76
17.73
17.74
17.73
17.67
17.66
17.66
17.64
17.63
17.62
17.61
17.6
17.61
17.61
17.6
17.6
17.64
17.65
17.65
17.63
17.61
17.6
17.63
17.63
17.62
17.63
17.64
17.62
17.63
17.65
17.64
17.6
17.59
17.59
17.58
17.58
17.6
17.6
17.6
17.6
17.6
17.58
17.59
17.6
17.6
17.6
17.59
17.59
17.58
17.58
17.65
17.65
================================================
FILE: test/fixtures/test_csv.csv
================================================
"id","name","age","city","a1"
1,"Alex",20,"New York","a,b"
2,"Claude",23,"London","b,c"
3,"Peter",25,"London","a"
4,"Franz",27,"Paris",
5,"George","5,5","Tome","a,b,c"
6,"Fernand",,,
================================================
FILE: test/fixtures/tetmat_matrix.txt
================================================
1.0000000 0.1703164 0.2275128 0.1071861 0.0665047
0.1703164 1.0000000 0.1890911 0.1111471 0.1724219
0.2275128 0.1890911 1.0000000 0.1866805 0.1055028
0.1071861 0.1111471 0.1866805 1.0000000 0.2009241
0.0665047 0.1724219 0.1055028 0.2009241 1.0000000
================================================
FILE: test/fixtures/tetmat_test.txt
================================================
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 2
1 1 1 1 2
1 1 1 1 2
1 1 1 1 2
1 1 1 1 2
1 1 1 1 2
1 1 1 2 1
1 1 1 2 1
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 1 2 2
1 1 2 1 1
1 1 2 1 2
1 1 2 2 1
1 1 2 2 1
1 1 2 2 1
1 1 2 2 2
1 1 2 2 2
1 1 2 2 2
1 1 2 2 2
1 2 1 1 1
1 2 1 1 2
1 2 1 1 2
1 2 1 1 2
1 2 1 1 2
1 2 1 1 2
1 2 1 1 2
1 2 1 1 2
1 2 1 1 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 1 2 2
1 2 2 1 2
1 2 2 1 2
1 2 2 1 2
1 2 2 2 1
1 2 2 2 1
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
1 2 2 2 2
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 1
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 1 2
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 1
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 1 2 2
2 1 2 1 1
2 1 2 1 1
2 1 2 1 1
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 1 2
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 1
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 1 2 2 2
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 1
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 1 2
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 1
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 1 2 2
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 1
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 1 2
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 1
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
================================================
FILE: test/helpers_tests.rb
================================================
$:.unshift(File.expand_path(File.dirname(__FILE__)+'/../lib/'))
$:.unshift(File.expand_path(File.dirname(__FILE__)+'/'))
require 'minitest'
require 'minitest/unit'
require 'mocha/setup'
require 'tempfile'
require 'tmpdir'
require 'shoulda'
require 'shoulda-context'
require 'fixtures/correlation_matrix'
require 'statsample'
# Compatibility shims so Statsample's shoulda-style tests run under
# modern MiniTest: mixes Shoulda::Context into MiniTest::Test and
# restores several test-unit assertion names.
module MiniTest
class Test
include Shoulda::Context::Assertions
include Shoulda::Context::InstanceMethods
extend Shoulda::Context::ClassMethods
# Define a `should` block that is skipped unless the GSL extension is
# available (Statsample.has_gsl?).
def self.should_with_gsl(name,&block)
should(name) do
if Statsample.has_gsl?
instance_eval(&block)
else
skip("Requires GSL")
end
end
end
end
module Assertions
# Assert two vectors have equal size and element-wise equal values
# within +delta+, iterating expected values including nils.
def assert_similar_vector(exp, obs, delta=1e-10,msg=nil)
msg||="Different vectors #{exp} - #{obs}"
assert_equal(exp.size, obs.size)
exp.data_with_nils.each_with_index {|v,i|
assert_in_delta(v,obs[i],delta)
}
end
# Assert two indexable collections are equal element by element
# within +delta+.
def assert_equal_vector(exp,obs,delta=1e-10,msg=nil)
assert_equal(exp.size, obs.size, "Different size.#{msg}")
exp.size.times {|i|
assert_in_delta(exp[i],obs[i],delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
}
end
# Assert two matrices have identical dimensions and cell values
# within +delta+.
def assert_equal_matrix(exp,obs,delta=1e-10,msg=nil)
assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}")
assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}")
exp.row_size.times {|i|
exp.column_size.times {|j|
assert_in_delta(exp[i,j],obs[i,j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
}
}
end
# test-unit style aliases, defined only when MiniTest lacks them.
alias :assert_raise :assert_raises unless method_defined? :assert_raise
alias :assert_not_equal :refute_equal unless method_defined? :assert_not_equal
alias :assert_not_same :refute_same unless method_defined? :assert_not_same
unless method_defined? :assert_nothing_raised
# test-unit compatibility: fail when the block raises any exception.
def assert_nothing_raised(msg=nil)
msg||="Nothing should be raised, but raised %s"
begin
yield
not_raised=true
rescue Exception => e
not_raised=false
msg=sprintf(msg,e)
end
assert(not_raised,msg)
end
end
end
end
MiniTest.autorun
================================================
FILE: test/test_analysis.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for the Statsample::Analysis DSL: storing named analysis suites,
# running them, attaching datasets, and rendering through ReportBuilder.
class StatsampleAnalysisTestCase < MiniTest::Unit::TestCase
context(Statsample::Analysis) do
setup do
Statsample::Analysis.clear_analysis
end
# Registration and retrieval of named suites.
should "store() should create and store Statsample::Analysis::Suite" do
Statsample::Analysis.store(:first) do
a=1
end
assert(Statsample::Analysis.stored_analysis[:first])
assert(Statsample::Analysis.stored_analysis[:first].is_a? Statsample::Analysis::Suite)
end
should "ss_analysis should create an Statsample::Analysis" do
ss_analysis(:first) {a=1}
end
should "store last created analysis" do
an=Statsample::Analysis.store(:first) do
a=1
end
assert_equal(an,Statsample::Analysis.last)
end
# ReportBuilder integration: each stored analysis becomes a Section.
should "add_to_reportbuilder() add sections to reportbuilder object" do
rb=mock()
rb.expects(:add).with {|value| value.is_a? ReportBuilder::Section and value.name==:first}
rb.expects(:add).with {|value| value.is_a? ReportBuilder::Section and value.name==:second}
Statsample::Analysis.store(:first) do
echo "first","second"
end
Statsample::Analysis.store(:second) do
echo "third"
end
Statsample::Analysis.add_to_reportbuilder(rb,:first,:second)
end
should "to_text returns the same as a normal ReportBuilder object" do
rb=ReportBuilder.new(:name=>:test)
section=ReportBuilder::Section.new(:name=>"first")
a=[1,2,3].to_scale
section.add("first")
section.add(a)
rb.add(section)
exp=rb.to_text
an=ss_analysis(:first) {
echo 'first'
summary(a)
}
obs=Statsample::Analysis.to_text(:first)
assert_equal(exp.split("\n")[1,exp.size], obs.split("\n")[1,obs.size])
end
should "run() execute all analysis by default" do
m1=mock()
m1.expects(:run).once
m1.expects(:hide).once
Statsample::Analysis.store(:first) do
m1.run
end
Statsample::Analysis.store(:second) do
m1.hide
end
# Should run all test
Statsample::Analysis.run
end
should "run() execute blocks specificed on parameters" do
m1=mock()
m1.expects(:run).once
m1.expects(:hide).never
Statsample::Analysis.store(:first) do
m1.run
end
Statsample::Analysis.store(:second) do
m1.hide
end
# Should run all test
Statsample::Analysis.run(:first)
end
# Suite behavior: output redirection, summary delegation, and the
# attach/detach mechanism that resolves bare names against datasets.
context(Statsample::Analysis::Suite) do
should "echo() uses output#puts with same arguments" do
an=Statsample::Analysis::Suite.new(:output)
obj=mock()
obj.expects(:puts).with(:first,:second).once
an.output=obj
an.echo(:first,:second)
end
should "summary() should call object.summary" do
an=Statsample::Analysis::Suite.new(:summary)
obj=stub('summarizable',:summary=>'summary')
assert_equal(obj.summary,an.summary(obj))
end
should "attach() allows to call objects on objects which respond to fields" do
an=Statsample::Analysis::Suite.new(:summary)
ds={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)}
ds.expects(:fields).returns(%w{x y}).at_least_once
an.attach(ds)
assert_equal(10,an.x.mean)
assert_equal(12,an.y.mean)
assert_raise(RuntimeError) {
an.z
}
end
should "attached objects should be called LIFO" do
an=Statsample::Analysis::Suite.new(:summary)
ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)}
ds1.expects(:fields).returns(%w{x y z}).at_least_once
ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)}
ds2.expects(:fields).returns(%w{x y}).at_least_once
an.attach(ds1)
an.attach(ds2)
assert_equal(10,an.x.mean)
assert_equal(12,an.y.mean)
assert_equal(13,an.z.mean)
end
should "detach() without arguments drop latest object" do
an=Statsample::Analysis::Suite.new(:summary)
ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)}
ds1.expects(:fields).returns(%w{x y z}).at_least_once
ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)}
ds2.expects(:fields).returns(%w{x y}).at_least_once
an.attach(ds1)
an.attach(ds2)
assert_equal(10,an.x.mean)
an.detach
assert_equal(100, an.x.mean)
end
should "detach() with argument drop select object" do
an=Statsample::Analysis::Suite.new(:summary)
ds1={'x'=>1}
ds1.expects(:fields).returns(%w{x}).at_least_once
ds2={'x'=>2,'y'=>3}
ds2.expects(:fields).returns(%w{x y}).at_least_once
ds3={'y'=>4}
ds3.expects(:fields).returns(%w{y}).at_least_once
an.attach(ds3)
an.attach(ds2)
an.attach(ds1)
assert_equal(1,an.x)
assert_equal(3,an.y)
an.detach(ds2)
assert_equal(4,an.y)
end
should "perform a simple analysis" do
output=mock()
output.expects(:puts).with(5.5)
an=Statsample::Analysis.store(:simple, :output=>output) do
ds=data_frame(:x=>vector(1..10),:y=>vector(1..10))
attach(ds)
echo x.mean
end
an.run
end
end
# SuiteReportBuilder routes echo/summary through a ReportBuilder.
context(Statsample::Analysis::SuiteReportBuilder) do
should "echo() use add on rb object" do
an=Statsample::Analysis::SuiteReportBuilder.new(:puts_to_add)
an.rb.expects(:add).with(:first).twice
an.echo(:first, :first)
end
should "summary() uses add on rb object" do
an=Statsample::Analysis::SuiteReportBuilder.new(:summary_to_add)
an.rb.expects(:add).with(:first).once
an.summary(:first)
end
end
end
end
================================================
FILE: test/test_anova_contrast.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Anova::Contrast: planned comparisons between
# group means, specified as full coefficient vectors (c) or by group
# indices (c_by_index), checking psi, SE, t and confidence intervals.
class StatsampleAnovaContrastTestCase < MiniTest::Unit::TestCase
context(Statsample::Anova::Contrast) do
setup do
constant=[12,13,11,12,12].to_scale
frequent=[9,10,9,13,14].to_scale
infrequent=[15,16,17,16,16].to_scale
never=[17,18,12,18,20].to_scale
@vectors=[constant, frequent, infrequent, never]
@c=Statsample::Anova::Contrast.new(:vectors=>@vectors)
end
should "return correct value using c" do
@c.c([1,-1.quo(3),-1.quo(3),-1.quo(3)])
#@c.c([1,-0.333,-0.333,-0.333])
assert_in_delta(-2.6667, @c.psi, 0.0001)
assert_in_delta(1.0165, @c.se, 0.0001)
assert_in_delta(-2.623, @c.t, 0.001)
assert_in_delta(-4.82, @c.confidence_interval[0],0.01)
assert_in_delta(-0.51, @c.confidence_interval[1],0.01)
assert(@c.summary.size>0)
end
should "return correct values using c_by_index" do
@c.c_by_index([0],[1,2,3])
assert_in_delta(-2.6667, @c.psi, 0.0001)
assert_in_delta(1.0165, @c.se, 0.0001)
assert_in_delta(-2.623, @c.t, 0.001)
end
should "return correct values using incomplete c_by_index" do
c1=Statsample::Anova::Contrast.new(:vectors=>@vectors, :c=>[0.5,0.5,-1,0])
c2=Statsample::Anova::Contrast.new(:vectors=>@vectors, :c1=>[0,1],:c2=>[2])
assert_equal(c1.psi,c2.psi)
assert_equal(c1.se,c2.se)
assert_equal(c1.t,c2.t)
end
end
end
================================================
FILE: test/test_anovaoneway.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Anova::OneWay constructed directly from sums of
# squares and degrees of freedom: mean squares, F statistic and summary.
class StatsampleAnovaOneWayTestCase < MiniTest::Unit::TestCase
context(Statsample::Anova::OneWay) do
setup do
@ss_num=30.08
@ss_den=87.88
@df_num=2
@df_den=21
@anova=Statsample::Anova::OneWay.new(:ss_num=>@ss_num, :ss_den=>@ss_den, :df_num=>@df_num, :df_den=>@df_den)
end
should "Statsample::Anova.oneway respond to #oneway" do
assert(Statsample::Anova.respond_to? :oneway)
end
should "return correct value for ms_num and ms_den" do
assert_in_delta(15.04, @anova.ms_num, 0.01)
assert_in_delta(4.18, @anova.ms_den, 0.01)
end
should "return correct value for f" do
assert_in_delta(3.59, @anova.f, 0.01)
end
should "respond to summary" do
assert(@anova.respond_to? :summary)
assert(@anova.summary.size>0)
end
end
end
================================================
FILE: test/test_anovatwoway.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Anova::TwoWay constructed from precomputed sums
# of squares: mean squares, F statistics, their probabilities, summary.
class StatsampleAnovaTwoWayTestCase < MiniTest::Unit::TestCase
context(Statsample::Anova::TwoWay) do
setup do
@ss_a=192.2
@ss_b=57.8
@ss_axb=168.2
@ss_within=75.6
@df_a=@df_b=1
@df_within=16
@anova=Statsample::Anova::TwoWay.new(:ss_a=>@ss_a, :ss_b=>@ss_b, :ss_axb=>@ss_axb, :ss_within=>@ss_within , :df_a=>@df_a, :df_b=>@df_b, :df_within=>@df_within)
end
should "Statsample::Anova.twoway respond to #twoway" do
assert(Statsample::Anova.respond_to? :twoway)
end
should "return correct value for ms_a, ms_b and ms_axb" do
assert_in_delta(192.2, @anova.ms_a, 0.01)
assert_in_delta(57.8, @anova.ms_b, 0.01)
assert_in_delta(168.2, @anova.ms_axb, 0.01)
end
should "return correct value for f " do
assert_in_delta(40.68, @anova.f_a, 0.01)
assert_in_delta(12.23, @anova.f_b, 0.01)
assert_in_delta(35.60, @anova.f_axb, 0.01)
end
should "return correct value for probability for f " do
assert(@anova.f_a_probability < 0.05)
assert(@anova.f_b_probability < 0.05)
assert(@anova.f_axb_probability < 0.05)
end
should "respond to summary" do
assert(@anova.respond_to? :summary)
assert(@anova.summary.size>0)
end
end
end
================================================
FILE: test/test_anovatwowaywithdataset.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Reference:
# * http://www.uwsp.edu/psych/Stat/13/anova-2w.htm#III
# Tests for Statsample::Anova::TwoWayWithVectors built from raw data
# vectors (2x2 diet/age design); expected values match the reference
# cited above and mirror the precomputed-SS TwoWay test case.
class StatsampleAnovaTwoWayWithVectorsTestCase < MiniTest::Unit::TestCase
context(Statsample::Anova::TwoWayWithVectors) do
setup do
@pa=[5,4,3,4,2,18,19,14,12,15,6,7,5,8,4,6,9,5,9,3].to_scale
@pa.name="Passive Avoidance"
@a=[0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,1,1].to_vector
@a.labels={0=>'0%',1=>'35%'}
@a.name='Diet'
@b=[0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1].to_vector
@b.labels={0=>'Young',1=>'Older'}
@b.name="Age"
@anova=Statsample::Anova::TwoWayWithVectors.new(:a=>@a,:b=>@b, :dependent=>@pa)
end
should "Statsample::Anova respond to #twoway_with_vectors" do
assert(Statsample::Anova.respond_to? :twoway_with_vectors)
end
should "#new returns the same as Statsample::Anova.twoway_with_vectors" do
@anova2=Statsample::Anova.twoway_with_vectors(:a=>@a,:b=>@b, :dependent=>@pa)
assert_equal(@anova.summary, @anova2.summary)
end
should "return correct value for ms_a, ms_b and ms_axb" do
assert_in_delta(192.2, @anova.ms_a, 0.01)
assert_in_delta(57.8, @anova.ms_b, 0.01)
assert_in_delta(168.2, @anova.ms_axb, 0.01)
end
should "return correct value for f " do
assert_in_delta(40.68, @anova.f_a, 0.01)
assert_in_delta(12.23, @anova.f_b, 0.01)
assert_in_delta(35.60, @anova.f_axb, 0.01)
end
should "return correct value for probability for f " do
assert(@anova.f_a_probability < 0.05)
assert(@anova.f_b_probability < 0.05)
assert(@anova.f_axb_probability < 0.05)
end
should "respond to summary" do
@anova.summary_descriptives=true
@anova.summary_levene=true
assert(@anova.respond_to? :summary)
assert(@anova.summary.size>0)
end
end
end
================================================
FILE: test/test_anovawithvectors.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# One-way ANOVA built from a list of scale vectors.
# Inherits from MiniTest::Test for consistency with the other test cases
# in this suite (MiniTest::Unit::TestCase is deprecated in minitest 5).
class StatsampleAnovaOneWayWithVectorsTestCase < MiniTest::Test
  context(Statsample::Anova::OneWayWithVectors) do
    context("when initializing") do
      setup do
        @v1 = 10.times.map { rand(100) }.to_scale
        @v2 = 10.times.map { rand(100) }.to_scale
        @v3 = 10.times.map { rand(100) }.to_scale
      end
      should "be the same using [] or args*" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3)
        a2 = Statsample::Anova::OneWayWithVectors.new([@v1, @v2, @v3])
        assert_equal(a1.f, a2.f)
      end
      should "be the same using module method or object instantiation" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3)
        a2 = Statsample::Anova.oneway_with_vectors(@v1, @v2, @v3)
        assert_equal(a1.f, a2.f)
      end
      # A trailing hash is treated as options, not as another vector.
      should "detect optional hash" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, { :name => 'aaa' })
        assert_equal('aaa', a1.name)
      end
      # nil arguments should be silently discarded.
      should "omit incorrect arguments" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, { :name => 'aaa' })
        a2 = Statsample::Anova::OneWayWithVectors.new(@v1, nil, nil, @v2, @v3, { :name => 'aaa' })
        assert_equal(a1.f, a2.f)
      end
    end
    setup do
      @v1 = [3, 3, 2, 3, 6].to_vector(:scale)
      @v2 = [7, 6, 5, 6, 7].to_vector(:scale)
      @v3 = [9, 8, 9, 7, 8].to_vector(:scale)
      @name = "Anova testing"
      @anova = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, :name => @name)
    end
    should "store correctly contrasts" do
      c1 = Statsample::Anova::Contrast.new(:vectors => [@v1, @v2, @v3], :c => [1, -0.5, -0.5])
      c2 = @anova.contrast(:c => [1, -0.5, -0.5])
      assert_equal(c1.t, c2.t)
    end
    should "respond to #summary" do
      assert(@anova.respond_to? :summary)
    end
    should "have correct name of analysis on #summary" do
      assert_match(/#{@name}/, @anova.summary)
    end
    should "returns same levene values as direct Levene creation" do
      assert_equal(@anova.levene.f, Statsample::Test.levene([@v1, @v2, @v3]).f)
    end
    should "have correct value for levene" do
      assert_in_delta(0.604, @anova.levene.f, 0.001)
      assert_in_delta(0.562, @anova.levene.probability, 0.001)
    end
    # Sums of squares: total, within-groups and between-groups.
    should "have correct value for sst" do
      assert_in_delta(72.933, @anova.sst, 0.001)
    end
    should "have correct value for sswg" do
      assert_in_delta(14.8, @anova.sswg, 0.001)
    end
    should "have correct value for ssb" do
      assert_in_delta(58.133, @anova.ssbg, 0.001)
    end
    should "sst=sswg+ssbg" do
      assert_in_delta(@anova.sst, @anova.sswg + @anova.ssbg, 0.00001)
    end
    should "df total equal to number of n-1" do
      assert_equal(@v1.n + @v2.n + @v3.n - 1, @anova.df_total)
    end
    should "df wg equal to number of n-k" do
      assert_equal(@v1.n + @v2.n + @v3.n - 3, @anova.df_wg)
    end
    should "df bg equal to number of k-1" do
      assert_equal(2, @anova.df_bg)
    end
    should "f=(ssbg/df_bg)/(sswt/df_wt)" do
      assert_in_delta((@anova.ssbg.quo(@anova.df_bg)).quo(@anova.sswg.quo(@anova.df_wg)), @anova.f, 0.001)
    end
    should "p be correct" do
      assert(@anova.probability < 0.01)
    end
    should "be correct using different test values" do
      anova2 = Statsample::Anova::OneWayWithVectors.new([@v1, @v1, @v1, @v1, @v2])
      assert_in_delta(3.960, anova2.f, 0.001)
      assert_in_delta(0.016, anova2.probability, 0.001)
    end
    context "with extra information on summary" do
      setup do
        @anova.summary_descriptives = true
        @anova.summary_levene = true
        @summary = @anova.summary
      end
      should "have section with levene statistics" do
        assert_match(/Levene/, @summary)
      end
      should "have section with descriptives" do
        assert_match(/Min/, @summary)
      end
    end
  end
end
================================================
FILE: test/test_awesome_print_bug.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Regression test: rendering a Statsample vector through awesome_print
# must not raise an exception.
class StatsampleAwesomePrintBug < MiniTest::Test
  context("Awesome Print integration") do
    setup do
      require "awesome_print"
    end
    should "should be flawless" do
      vector = [1, 2, 3].to_scale
      # A scale vector is not equal to a plain Array with the same contents.
      assert(vector != [1, 2, 3])
      assert_nothing_raised { ap vector }
    end
  end
end
================================================
FILE: test/test_bartlettsphericity.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Bartlett's test of sphericity applied to the correlation matrix
# of a three-variable dataset with 14 cases.
class StatsampleBartlettSphericityTestCase < MiniTest::Test
  include Statsample::Test
  context Statsample::Test::BartlettSphericity do
    setup do
      @v1 = [1, 2, 3, 4, 7, 8, 9, 10, 14, 15, 20, 50, 60, 70].to_scale
      @v2 = [5, 6, 11, 12, 13, 16, 17, 18, 19, 20, 30, 0, 0, 0].to_scale
      @v3 = [10, 3, 20, 30, 40, 50, 80, 10, 20, 30, 40, 2, 3, 4].to_scale
      # KMO for this dataset: 0.490
      dataset = { 'v1' => @v1, 'v2' => @v2, 'v3' => @v3 }.to_dataset
      corr = Statsample::Bivariate.correlation_matrix(dataset)
      @bs = Statsample::Test::BartlettSphericity.new(corr, 14)
    end
    should "have correct value for chi" do
      assert_in_delta(9.477, @bs.value, 0.001)
    end
    should "have correct value for df" do
      assert_equal(3, @bs.df)
    end
    should "have correct value for probability" do
      assert_in_delta(0.024, @bs.probability, 0.001)
    end
  end
end
================================================
FILE: test/test_bivariate.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Bivariate: sum of squares, covariance, Pearson and
# Spearman correlation, correlation matrices (with and without missing data),
# point-biserial, Kendall's tau and Goodman-Kruskal gamma.
class StatsampleBivariateTestCase < MiniTest::Test
  should "method sum of squares should be correct" do
    v1 = [1, 2, 3, 4, 5, 6].to_vector(:scale)
    v2 = [6, 2, 4, 10, 12, 8].to_vector(:scale)
    assert_equal(23.0, Statsample::Bivariate.sum_of_squares(v1, v2))
  end
  should_with_gsl "return same covariance with ruby and gls implementation" do
    v1 = Array.new(20) { rand }.to_scale
    v2 = Array.new(20) { rand }.to_scale
    assert_in_delta(Statsample::Bivariate.covariance(v1, v2), Statsample::Bivariate.covariance_slow(v1, v2), 0.001)
  end
  should_with_gsl "return same correlation with ruby and gls implementation" do
    v1 = Array.new(20) { rand }.to_scale
    v2 = Array.new(20) { rand }.to_scale
    assert_in_delta(GSL::Stats::correlation(v1.gsl, v2.gsl), Statsample::Bivariate.pearson_slow(v1, v2), 1e-10)
  end
  should "return correct pearson correlation" do
    v1 = [6, 5, 4, 7, 8, 4, 3, 2].to_vector(:scale)
    v2 = [2, 3, 7, 8, 6, 4, 3, 2].to_vector(:scale)
    assert_in_delta(0.525, Statsample::Bivariate.pearson(v1, v2), 0.001)
    assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v1, v2), 0.001)
    # With nils present, only pairwise-complete cases should be used.
    v3 = [6, 2, 1000, 1000, 5, 4, 7, 8, 4, 3, 2, nil].to_vector(:scale)
    v4 = [2, nil, nil, nil, 3, 7, 8, 6, 4, 3, 2, 500].to_vector(:scale)
    assert_in_delta(0.525, Statsample::Bivariate.pearson(v3, v4), 0.001)
    # Test ruby method
    v3a, v4a = Statsample.only_valid v3, v4
    assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v3a, v4a), 0.001)
  end
  should "return correct values for t_pearson and prop_pearson" do
    v1 = [6, 5, 4, 7, 8, 4, 3, 2].to_vector(:scale)
    v2 = [2, 3, 7, 8, 6, 4, 3, 2].to_vector(:scale)
    r = Statsample::Bivariate::Pearson.new(v1, v2)
    assert_in_delta(0.525, r.r, 0.001)
    assert_in_delta(Statsample::Bivariate.t_pearson(v1, v2), r.t, 0.001)
    assert_in_delta(Statsample::Bivariate.prop_pearson(r.t, 8, :both), r.probability, 0.001)
    assert(r.summary.size > 0)
  end
  should "return correct correlation_matrix with nils values" do
    v1 = [6, 5, 4, 7, 8, 4, 3, 2].to_vector(:scale)
    v2 = [2, 3, 7, 8, 6, 4, 3, 2].to_vector(:scale)
    v3 = [6, 2, 1000, 1000, 5, 4, 7, 8].to_vector(:scale)
    v4 = [2, nil, nil, nil, 3, 7, 8, 6].to_vector(:scale)
    ds = { 'v1' => v1, 'v2' => v2, 'v3' => v3, 'v4' => v4 }.to_dataset
    # Expected matrix: pairwise Pearson r for every combination of vectors.
    pearson = lambda { |a, b| Statsample::Bivariate.pearson(a, b) }
    vectors = [v1, v2, v3, v4]
    expected = Matrix.rows(vectors.map { |a| vectors.map { |b| pearson.call(a, b) } })
    obt = Statsample::Bivariate.correlation_matrix(ds)
    expected.row_size.times do |i|
      expected.column_size.times do |j|
        assert_in_delta(expected[i, j], obt[i, j], 0.0001, "#{expected[i, j].class}!=#{obt[i, j].class} ")
      end
    end
  end
  should_with_gsl "return same values for optimized and pairwise covariance matrix" do
    cases = 100
    v1 = Statsample::Vector.new_scale(cases) { rand }
    v2 = Statsample::Vector.new_scale(cases) { rand }
    v3 = Statsample::Vector.new_scale(cases) { rand }
    v4 = Statsample::Vector.new_scale(cases) { rand }
    v5 = Statsample::Vector.new_scale(cases) { rand }
    ds = { 'v1' => v1, 'v2' => v2, 'v3' => v3, 'v4' => v4, 'v5' => v5 }.to_dataset
    cov_opt = Statsample::Bivariate.covariance_matrix_optimized(ds)
    cov_pw = Statsample::Bivariate.covariance_matrix_pairwise(ds)
    assert_equal_matrix(cov_opt, cov_pw, 1e-15)
  end
  should_with_gsl "return same values for optimized and pairwise correlation matrix" do
    cases = 100
    v1 = Statsample::Vector.new_scale(cases) { rand }
    v2 = Statsample::Vector.new_scale(cases) { rand }
    v3 = Statsample::Vector.new_scale(cases) { rand }
    v4 = Statsample::Vector.new_scale(cases) { rand }
    v5 = Statsample::Vector.new_scale(cases) { rand }
    ds = { 'v1' => v1, 'v2' => v2, 'v3' => v3, 'v4' => v4, 'v5' => v5 }.to_dataset
    cor_opt = Statsample::Bivariate.correlation_matrix_optimized(ds)
    cor_pw = Statsample::Bivariate.correlation_matrix_pairwise(ds)
    assert_equal_matrix(cor_opt, cor_pw, 1e-15)
  end
  should "return correct correlation_matrix without nils values" do
    v1 = [6, 5, 4, 7, 8, 4, 3, 2].to_vector(:scale)
    v2 = [2, 3, 7, 8, 6, 4, 3, 2].to_vector(:scale)
    v3 = [6, 2, 1000, 1000, 5, 4, 7, 8].to_vector(:scale)
    v4 = [2, 4, 6, 7, 3, 7, 8, 6].to_vector(:scale)
    ds = { 'v1' => v1, 'v2' => v2, 'v3' => v3, 'v4' => v4 }.to_dataset
    pearson = lambda { |a, b| Statsample::Bivariate.pearson(a, b) }
    vectors = [v1, v2, v3, v4]
    expected = Matrix.rows(vectors.map { |a| vectors.map { |b| pearson.call(a, b) } })
    obt = Statsample::Bivariate.correlation_matrix(ds)
    expected.row_size.times do |i|
      expected.column_size.times do |j|
        assert_in_delta(expected[i, j], obt[i, j], 0.0001, "#{expected[i, j].class}!=#{obt[i, j].class} ")
      end
    end
  end
  should "return correct value for prop pearson" do
    assert_in_delta(0.42, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.084, 94), 94), 0.01)
    assert_in_delta(0.65, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.046, 95), 95), 0.01)
    # Positive r: significant on the right tail, not on the left.
    r = 0.9
    n = 100
    t = Statsample::Bivariate.t_r(r, n)
    assert(Statsample::Bivariate.prop_pearson(t, n, :both) < 0.05)
    assert(Statsample::Bivariate.prop_pearson(t, n, :right) < 0.05)
    assert(Statsample::Bivariate.prop_pearson(t, n, :left) > 0.05)
    # Negative r: the tails swap.
    r = -0.9
    n = 100
    t = Statsample::Bivariate.t_r(r, n)
    assert(Statsample::Bivariate.prop_pearson(t, n, :both) < 0.05)
    assert(Statsample::Bivariate.prop_pearson(t, n, :right) > 0.05)
    assert(Statsample::Bivariate.prop_pearson(t, n, :left) < 0.05)
  end
  should "return correct value for Spearman's rho" do
    v1 = [86, 97, 99, 100, 101, 103, 106, 110, 112, 113].to_vector(:scale)
    v2 = [0, 20, 28, 27, 50, 29, 7, 17, 6, 12].to_vector(:scale)
    assert_in_delta(-0.175758, Statsample::Bivariate.spearman(v1, v2), 0.0001)
  end
  should "return correct value for point_biserial correlation" do
    c = [1, 3, 5, 6, 7, 100, 200, 300, 400, 300].to_vector(:scale)
    d = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0].to_vector(:scale)
    # First argument must be the dichotomous vector.
    assert_raises TypeError do
      Statsample::Bivariate.point_biserial(c, d)
    end
    assert_in_delta(Statsample::Bivariate.point_biserial(d, c), Statsample::Bivariate.pearson(d, c), 0.0001)
  end
  should "return correct value for tau_a and tau_b" do
    v1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11].to_vector(:ordinal)
    v2 = [1, 3, 4, 5, 7, 8, 2, 9, 10, 6, 11].to_vector(:ordinal)
    assert_in_delta(0.6727, Statsample::Bivariate.tau_a(v1, v2), 0.001)
    assert_in_delta(0.6727, Statsample::Bivariate.tau_b((Statsample::Crosstab.new(v1, v2).to_matrix)), 0.001)
    v1 = [12, 14, 14, 17, 19, 19, 19, 19, 19, 20, 21, 21, 21, 21, 21, 22, 23, 24, 24, 24, 26, 26, 27].to_vector(:ordinal)
    v2 = [11, 4, 4, 2, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0].to_vector(:ordinal)
    assert_in_delta(-0.376201540231705, Statsample::Bivariate.tau_b(Statsample::Crosstab.new(v1, v2).to_matrix), 0.001)
  end
  should "return correct value for gamma correlation" do
    m = Matrix[[10, 5, 2], [10, 15, 20]]
    assert_in_delta(0.636, Statsample::Bivariate.gamma(m), 0.001)
    m2 = Matrix[[15, 12, 6, 5], [12, 8, 10, 8], [4, 6, 9, 10]]
    assert_in_delta(0.349, Statsample::Bivariate.gamma(m2), 0.001)
  end
end
================================================
FILE: test/test_codification.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Codification: building recode dictionaries
# (hash / Excel / YAML) and applying them to vectors and datasets.
# Inherits from MiniTest::Test for consistency with the other test cases
# in this suite (MiniTest::Unit::TestCase is deprecated in minitest 5).
class StatsampleCodificationTestCase < MiniTest::Test
  def initialize(*args)
    # Fixture: comma-separated multi-valued answers plus the dictionary
    # mapping each raw token to its one-letter code.
    v1 = %w{run walk,run walking running sleep sleeping,dreaming sleep,dream}.to_vector
    @dict = { 'run' => 'r', 'walk' => 'w', 'walking' => 'w', 'running' => 'r',
              'sleep' => 's', 'sleeping' => 's', 'dream' => 'd', 'dreaming' => 'd' }
    @ds = { "v1" => v1 }.to_dataset
    super
  end
  # The generated hash maps each distinct token to itself (identity recode).
  def test_create_hash
    expected_keys_v1 = %w{run walk walking running sleep sleeping dream dreaming}.sort
    hash = Statsample::Codification.create_hash(@ds, ['v1'])
    assert_equal(['v1'], hash.keys)
    assert_equal(expected_keys_v1, hash['v1'].keys.sort)
    assert_equal(expected_keys_v1, hash['v1'].values.sort)
  end
  # Round-trip: write the codification template to Excel, read it back.
  def test_create_excel
    filename = Dir::tmpdir + "/test_excel" + Time.now().to_s + ".xls"
    Statsample::Codification.create_excel(@ds, ['v1'], filename)
    field = (["v1"] * 8).to_vector
    keys = %w{dream dreaming run running sleep sleeping walk walking}.to_vector
    ds = Statsample::Excel.read(filename)
    assert_equal(field, ds['field'])
    assert_equal(keys, ds['original'])
    assert_equal(keys, ds['recoded'])
    hash = Statsample::Codification.excel_to_recoded_hash(filename)
    assert_equal(keys.data, hash['v1'].keys.sort)
    assert_equal(keys.data, hash['v1'].values.sort)
  end
  # YAML template: in-memory string and file-based variants.
  def test_create_yaml
    assert_raise ArgumentError do
      Statsample::Codification.create_yaml(@ds, [])
    end
    expected_keys_v1 = %w{run walk walking running sleep sleeping dream dreaming}.sort
    yaml_hash = Statsample::Codification.create_yaml(@ds, ['v1'])
    h = YAML::load(yaml_hash)
    assert_equal(['v1'], h.keys)
    assert_equal(expected_keys_v1, h['v1'].keys.sort)
    tf = Tempfile.new("test_codification")
    yaml_hash = Statsample::Codification.create_yaml(@ds, ['v1'], tf, Statsample::SPLIT_TOKEN)
    tf.close
    tf.open
    h = YAML::load(tf)
    assert_equal(['v1'], h.keys)
    assert_equal(expected_keys_v1, h['v1'].keys.sort)
    tf.close(true)
  end
  # Each multi-valued entry becomes an array of codes; nil stays nil.
  def test_recodification
    expected = [['r'], ['w', 'r'], ['w'], ['r'], ['s'], ['s', 'd'], ['s', 'd']]
    assert_equal(expected, Statsample::Codification.recode_vector(@ds['v1'], @dict))
    v2 = ['run', 'walk,dreaming', nil, 'walk,dream,dreaming,walking'].to_vector
    expected = [['r'], ['w', 'd'], nil, ['w', 'd']]
    assert_equal(expected, Statsample::Codification.recode_vector(v2, @dict))
  end
  # Simple recode adds a "<field>_recoded" vector, leaving the original intact.
  def test_recode_dataset_simple
    Statsample::Codification.recode_dataset_simple!(@ds, { 'v1' => @dict })
    expected_vector = ['r', 'w,r', 'w', 'r', 's', 's,d', 's,d'].to_vector
    assert_not_equal(expected_vector, @ds['v1'])
    assert_equal(expected_vector, @ds['v1_recoded'])
  end
  # Split recode adds one binary vector per code ("v1_r", "v1_w", ...).
  def test_recode_dataset_split
    Statsample::Codification.recode_dataset_split!(@ds, { 'v1' => @dict })
    e = {}
    e['r'] = [1, 1, 0, 1, 0, 0, 0].to_vector
    e['w'] = [0, 1, 1, 0, 0, 0, 0].to_vector
    e['s'] = [0, 0, 0, 0, 1, 1, 1].to_vector
    e['d'] = [0, 0, 0, 0, 0, 1, 1].to_vector
    e.each { |k, expected|
      assert_equal(expected, @ds['v1_' + k], "Error on key #{k}")
    }
  end
end
================================================
FILE: test/test_crosstab.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Crosstab: constructor validation, marginals,
# frequency tables, expected-value matrix and summary output.
# Inherits from MiniTest::Test for consistency with the other test cases
# in this suite (MiniTest::Unit::TestCase is deprecated in minitest 5).
class StatsampleCrosstabTestCase < MiniTest::Test
  def initialize(*args)
    # Fixture: 13 paired observations (hair color x sex).
    @v1 = %w{black blonde black black red black brown black blonde black red black blonde}.to_vector
    @v2 = %w{woman man man woman man man man woman man woman woman man man}.to_vector
    @ct = Statsample::Crosstab.new(@v1, @v2)
    super
  end
  # Raw arrays and size-mismatched vectors are both rejected.
  def test_crosstab_errors
    e1 = %w{black blonde black black red black brown black blonde black}
    assert_raise ArgumentError do
      Statsample::Crosstab.new(e1, @v2)
    end
    e2 = %w{black blonde black black red black brown black blonde black black}.to_vector
    assert_raise ArgumentError do
      Statsample::Crosstab.new(e2, @v2)
    end
    assert_nothing_raised do
      Statsample::Crosstab.new(@v1, @v2)
    end
  end
  def test_crosstab_basic
    assert_equal(%w{black blonde brown red}, @ct.rows_names)
    assert_equal(%w{man woman}, @ct.cols_names)
    assert_equal({ 'black' => 7, 'blonde' => 3, 'red' => 2, 'brown' => 1 }, @ct.rows_total)
    assert_equal({ 'man' => 8, 'woman' => 5 }, @ct.cols_total)
  end
  def test_crosstab_frequencies
    # Cell frequencies sum to the total number of cases (13).
    fq = @ct.frequencies
    assert_equal(8, fq.size)
    sum = fq.inject(0) { |s, x| s + x[1] }
    assert_equal(13, sum)
    fr = @ct.frequencies_by_row
    assert_equal(4, fr.size)
    assert_equal(%w{black blonde brown red}, fr.keys.sort)
    fc = @ct.frequencies_by_col
    assert_equal(2, fc.size)
    assert_equal(%w{man woman}, fc.keys.sort)
    assert_equal(Matrix.rows([[3, 4], [3, 0], [1, 0], [1, 1]]), @ct.to_matrix)
  end
  def test_summary
    @ct.percentage_row = true
    @ct.percentage_column = true
    @ct.percentage_total = true
    assert(@ct.summary.size > 0)
  end
  # Perfectly anti-correlated dichotomies yield uniform expected counts.
  def test_expected
    v1 = %w{1 1 1 1 1 0 0 0 0 0}.to_vector
    v2 = %w{0 0 0 0 0 1 1 1 1 1}.to_vector
    ct = Statsample::Crosstab.new(v1, v2)
    assert_equal(Matrix[[2.5, 2.5], [2.5, 2.5]], ct.matrix_expected)
  end
  def test_crosstab_with_scale
    v1 = %w{1 1 1 1 1 0 0 0 0 0}.to_scale
    v2 = %w{0 0 0 0 0 1 1 1 1 1}.to_scale
    ct = Statsample::Crosstab.new(v1, v2)
    assert_equal(Matrix[[0, 5], [5, 0]], ct.to_matrix)
    assert_nothing_raised { ct.summary }
  end
end
================================================
FILE: test/test_csv.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Round-trip tests for the Statsample::CSV reader/writer, using the
# fixture files under test/fixtures.
# Inherits from MiniTest::Test for consistency with the other test cases
# in this suite (MiniTest::Unit::TestCase is deprecated in minitest 5).
class StatsampleCSVTestCase < MiniTest::Test
  def setup
    @ds = Statsample::CSV.read(File.dirname(__FILE__) + "/fixtures/test_csv.csv")
  end
  # The reader should infer vector types (scale vs nominal) and keep order.
  def test_read
    assert_equal(6, @ds.cases)
    assert_equal(%w{id name age city a1}, @ds.fields)
    id = [1, 2, 3, 4, 5, 6].to_vector(:scale)
    name = ["Alex", "Claude", "Peter", "Franz", "George", "Fernand"].to_vector(:nominal)
    age = [20, 23, 25, 27, 5.5, nil].to_vector(:scale)
    city = ["New York", "London", "London", "Paris", "Tome", nil].to_vector(:nominal)
    a1 = ["a,b", "b,c", "a", nil, "a,b,c", nil].to_vector(:nominal)
    ds_exp = Statsample::Dataset.new({ 'id' => id, 'name' => name, 'age' => age, 'city' => city, 'a1' => a1 }, %w{id name age city a1})
    ds_exp.fields.each { |f|
      assert_equal(ds_exp[f], @ds[f])
    }
    assert_equal(ds_exp, @ds)
  end
  # Empty cells become nil.
  def test_nil
    assert_equal(nil, @ds['age'][5])
  end
  # Duplicated header names are disambiguated with numeric suffixes.
  def test_repeated
    ds = Statsample::CSV.read(File.dirname(__FILE__) + "/fixtures/repeated_fields.csv")
    assert_equal(%w{id name_1 age_1 city a1 name_2 age_2}, ds.fields)
    age = [3, 4, 5, 6, nil, 8].to_vector(:scale)
    assert_equal(age, ds['age_2'])
  end
  # write + read should reproduce every row of the original dataset.
  def test_write
    filename = Tempfile.new("afile")
    Statsample::CSV.write(@ds, filename.path)
    ds2 = Statsample::CSV.read(filename.path)
    i = 0
    ds2.each_array { |row|
      assert_equal(@ds.case_as_array(i), row)
      i += 1
    }
  end
end
=begin
class StatsampleCSVTestCase2 < MiniTest::Unit::TestCase
def setup
@ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/test_csv.csv")
end
def test_read
assert_equal(6,@ds.cases)
assert_equal(%w{id name age city a1}, @ds.fields)
id=[1,2,3,4,5,6].to_vector(:scale)
name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal)
age=[20,23,25,27,5.5,nil].to_vector(:scale)
city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal)
a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal)
ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1})
ds_exp.fields.each{|f|
assert_equal(ds_exp[f],@ds[f])
}
assert_equal(ds_exp,@ds)
end
def test_nil
assert_equal(nil,@ds['age'][5])
end
def test_repeated
ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv")
assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields)
age=[3,4,5,6,nil,8].to_vector(:scale)
assert_equal(age,ds['age_2'])
end
def test_write
filename=Tempfile.new("afile")
# filename=Dir::tmpdir+"/test_write.csv"
Statsample::CSV.write(@ds, filename.path)
ds2=Statsample::CSV.read19(filename.path)
i=0
ds2.each_array{|row|
assert_equal(@ds.case_as_array(i),row)
i+=1
}
end
end
=end
================================================
FILE: test/test_dataset.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleDatasetTestCase < MiniTest::Unit::TestCase
def setup
@ds=Statsample::Dataset.new({'id' => Statsample::Vector.new([1,2,3,4,5]), 'name'=>Statsample::Vector.new(%w{Alex Claude Peter Franz George}), 'age'=>Statsample::Vector.new([20,23,25,27,5]),
'city'=>Statsample::Vector.new(['New York','London','London','Paris','Tome']),
'a1'=>Statsample::Vector.new(['a,b','b,c','a',nil,'a,b,c'])}, ['id','name','age','city','a1'])
end
def test_nest
ds={
'a'=>%w{a a a b b b}.to_vector,
'b'=>%w{c c d d e e}.to_vector,
'c'=>%w{f g h i j k}.to_vector
}.to_dataset
nest=ds.nest('a','b')
assert_equal([{'c'=>'f'},{'c'=>'g'}], nest['a']['c'])
assert_equal([{'c'=>'h'}], nest['a']['d'])
assert_equal([{'c'=>'j'},{'c'=>'k'}], nest['b']['e'])
end
def test_should_have_summary
assert(@ds.summary.size>0)
end
def test_basic
assert_equal(5,@ds.cases)
assert_equal(%w{id name age city a1}, @ds.fields)
end
def test_saveload
outfile=Tempfile.new("dataset.ds")
@ds.save(outfile.path)
a=Statsample.load(outfile.path)
assert_equal(@ds,a)
end
def test_gsl
if Statsample.has_gsl?
matrix=GSL::Matrix[[1,2],[3,4],[5,6]]
ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
assert_equal(matrix,ds.to_gsl)
else
skip("Gsl needed")
end
end
def test_matrix
matrix=Matrix[[1,2],[3,4],[5,6]]
ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
assert_equal(matrix,ds.to_matrix)
end
def test_fields
@ds.fields=%w{name a1 id age city}
assert_equal(%w{name a1 id age city}, @ds.fields)
@ds.fields=%w{id name age}
assert_equal(%w{id name age a1 city}, @ds.fields)
end
def test_merge
a=[1,2,3].to_scale
b=[3,4,5].to_vector
c=[4,5,6].to_scale
d=[7,8,9].to_vector
e=[10,20,30].to_vector
ds1={'a'=>a,'b'=>b}.to_dataset
ds2={'c'=>c,'d'=>d}.to_dataset
exp={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
assert_equal(exp,ds1.merge(ds2))
exp.fields=%w{c d a b}
assert_equal(exp,ds2.merge(ds1))
ds3={'a'=>e}.to_dataset
exp={'a_1'=>a,'b'=>b,'a_2'=>e}.to_dataset
exp.fields=%w{a_1 b a_2}
assert_equal(exp,ds1.merge(ds3))
end
def test_each_vector
a=[1,2,3].to_vector
b=[3,4,5].to_vector
fields=["a","b"]
ds=Statsample::Dataset.new({'a'=>a,'b'=>b},fields)
res=[]
ds.each_vector{|k,v|
res.push([k,v])
}
assert_equal([["a",a],["b",b]],res)
ds.fields=["b","a"]
res=[]
ds.each_vector{|k,v|
res.push([k,v])
}
assert_equal([["b",b],["a",a]],res)
end
def test_equality
v1=[1,2,3,4].to_vector
v2=[5,6,7,8].to_vector
ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1})
v3=[1,2,3,4].to_vector
v4=[5,6,7,8].to_vector
ds2=Statsample::Dataset.new({'v1'=>v3,'v2'=>v4}, %w{v2 v1})
assert_equal(ds1,ds2)
ds2.fields=%w{v1 v2}
assert_not_equal(ds1,ds2)
end
def test_add_vector
v=Statsample::Vector.new(%w{a b c d e})
@ds.add_vector('new',v)
assert_equal(%w{id name age city a1 new},@ds.fields)
x=Statsample::Vector.new(%w{a b c d e f g})
assert_raise ArgumentError do
@ds.add_vector('new2',x)
end
end
def test_vector_by_calculation
a1=[1,2,3,4,5,6,7].to_vector(:scale)
a2=[10,20,30,40,50,60,70].to_vector(:scale)
a3=[100,200,300,400,500,600,700].to_vector(:scale)
ds={'a1'=>a1,'a2'=>a2,'a3'=>a3}.to_dataset
total=ds.vector_by_calculation() {|row|
row['a1']+row['a2']+row['a3']
}
expected=[111,222,333,444,555,666,777].to_vector(:scale)
assert_equal(expected,total)
end
def test_vector_sum
a1=[1 ,2 ,3 ,4 , 5,nil].to_vector(:scale)
a2=[10 ,10,20,20 ,20,30].to_vector(:scale)
b1=[nil,1 ,1 ,1 ,1 ,2].to_vector(:scale)
b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2}.to_dataset
total=ds.vector_sum
a=ds.vector_sum(['a1','a2'])
b=ds.vector_sum(['b1','b2'])
expected_a=[11,12,23,24,25,nil].to_vector(:scale)
expected_b=[nil,3,3,nil,3,5].to_vector(:scale)
expected_total=[nil,15,26,nil,28,nil].to_vector(:scale)
assert_equal(expected_a, a)
assert_equal(expected_b, b)
assert_equal(expected_total, total)
end
def test_vector_missing_values
a1=[1 ,nil ,3 ,4 , 5,nil].to_vector(:scale)
a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale)
b1=[nil,nil ,1 ,1 ,1 ,2].to_vector(:scale)
b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
c= [nil,2 , 4,2 ,2 ,2].to_vector(:scale)
ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
mva=[2,3,0,1,0,1].to_vector(:scale)
assert_equal(mva,ds.vector_missing_values)
end
def test_has_missing_values
a1=[1 ,nil ,3 ,4 , 5,nil].to_vector(:scale)
a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale)
b1=[nil,nil ,1 ,1 ,1 ,2].to_vector(:scale)
b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
c= [nil,2 , 4,2 ,2 ,2].to_vector(:scale)
ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
assert(ds.has_missing_data?)
clean=ds.dup_only_valid
assert(!clean.has_missing_data?)
end
def test_vector_count_characters
a1=[1 ,"abcde" ,3 ,4 , 5,nil].to_vector(:scale)
a2=[10 ,20.3 ,20 ,20 ,20,30].to_vector(:scale)
b1=[nil,"343434" ,1 ,1 ,1 ,2].to_vector(:scale)
b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
c= [nil,2 ,"This is a nice example",2 ,2 ,2].to_vector(:scale)
ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
exp=[4,17,27,5,6,5].to_vector(:scale)
assert_equal(exp,ds.vector_count_characters)
end
def test_vector_mean
a1=[1 ,2 ,3 ,4 , 5,nil].to_vector(:scale)
a2=[10 ,10,20,20 ,20,30].to_vector(:scale)
b1=[nil,1 ,1 ,1 ,1 ,2].to_vector(:scale)
b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
c= [nil,2, 4,2 ,2 ,2].to_vector(:scale)
ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
total=ds.vector_mean
a=ds.vector_mean(['a1','a2'],1)
b=ds.vector_mean(['b1','b2'],1)
c=ds.vector_mean(['b1','b2','c'],1)
expected_a=[5.5,6,11.5,12,12.5,30].to_vector(:scale)
expected_b=[2,1.5,1.5,1,1.5,2.5].to_vector(:scale)
expected_c=[nil, 5.0/3,7.0/3,1.5,5.0/3,7.0/3].to_vector(:scale)
expected_total=[nil,3.4,6,nil,6.0,nil].to_vector(:scale)
assert_equal(expected_a, a)
assert_equal(expected_b, b)
assert_equal(expected_c, c)
assert_equal(expected_total, total)
end
def test_each_array
expected=[[1,'Alex',20,'New York','a,b'], [2,'Claude',23,'London','b,c'], [3,'Peter',25,'London','a'],[4,'Franz', 27,'Paris',nil],[5,'George',5,'Tome','a,b,c']]
out=[]
@ds.each_array{ |a|
out.push(a)
}
assert_equal(expected,out)
end
def test_recode
@ds['age'].type=:scale
@ds.recode!("age") {|c| c['id']*2}
expected=[2,4,6,8,10].to_vector(:scale)
assert_equal(expected,@ds['age'])
end
def test_case_as
assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds.case_as_hash(0))
assert_equal([5,'George',5,'Tome','a,b,c'],@ds.case_as_array(4))
# Native methods
assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds._case_as_hash(0))
assert_equal([5,'George',5,'Tome','a,b,c'],@ds._case_as_array(4))
end
def test_delete_vector
@ds.delete_vector('name')
assert_equal(%w{id age city a1},@ds.fields)
assert_equal(%w{a1 age city id},@ds.vectors.keys.sort)
end
def test_change_type
@ds.col('age').type=:scale
assert_equal(:scale,@ds.col('age').type)
end
def test_split_by_separator_recode
@ds.add_vectors_by_split_recode("a1","_")
assert_equal(%w{id name age city a1 a1_1 a1_2 a1_3},@ds.fields)
assert_equal([1,0,1,nil,1],@ds.col('a1_1').to_a)
assert_equal([1,1,0,nil,1],@ds.col('a1_2').to_a)
assert_equal([0,1,0,nil,1],@ds.col('a1_3').to_a)
{'a1_1'=>'a1:a', 'a1_2'=>'a1:b', 'a1_3'=>'a1:c'}.each do |k,v|
assert_equal(v, @ds[k].name)
end
end
def test_split_by_separator
@ds.add_vectors_by_split("a1","_")
assert_equal(%w{id name age city a1 a1_a a1_b a1_c},@ds.fields)
assert_equal([1,0,1,nil,1],@ds.col('a1_a').to_a)
assert_equal([1,1,0,nil,1],@ds.col('a1_b').to_a)
assert_equal([0,1,0,nil,1],@ds.col('a1_c').to_a)
end
def test_percentiles
v1=(1..100).to_a.to_scale
assert_equal(50.5,v1.median)
assert_equal(25.5, v1.percentil(25))
v2=(1..99).to_a.to_scale
assert_equal(50,v2.median)
assert_equal(25,v2.percentil(25))
v3=(1..50).to_a.to_scale
assert_equal(25.5, v3.median)
assert_equal(13, v3.percentil(25))
end
def test_add_case
ds=Statsample::Dataset.new({'a'=>[].to_vector, 'b'=>[].to_vector, 'c'=>[].to_vector})
ds.add_case([1,2,3])
ds.add_case({'a'=>4,'b'=>5,'c'=>6})
ds.add_case([[7,8,9],%w{a b c}])
assert_equal({'a'=>1,'b'=>2,'c'=>3},ds.case_as_hash(0))
assert_equal([4,5,6],ds.case_as_array(1))
assert_equal([7,8,9],ds.case_as_array(2))
assert_equal(['a','b','c'],ds.case_as_array(3))
ds.add_case_array([6,7,1])
ds.update_valid_data
assert_equal([6,7,1],ds.case_as_array(4))
end
def test_marshaling
ds_marshal=Marshal.load(Marshal.dump(@ds))
assert_equal(ds_marshal,@ds)
end
def test_range
v1=[1,2,3,4].to_vector
v2=[5,6,7,8].to_vector
v3=[9,10,11,12].to_vector
ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3}, %w{v3 v2 v1})
assert_same(v1,ds1['v1'])
ds2=ds1["v2".."v1"]
assert_equal(%w{v2 v1},ds2.fields)
assert_same(ds1['v1'],ds2['v1'])
assert_same(ds1['v2'],ds2['v2'])
end
def test_clone
v1=[1,2,3,4].to_vector
v2=[5,6,7,8].to_vector
ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1})
ds2=ds1.clone
assert_equal(ds1,ds2)
assert_not_same(ds1,ds2)
assert_equal(ds1['v1'],ds2['v1'])
assert_same(ds1['v1'], ds2['v1'])
assert_equal(ds1.fields,ds2.fields)
assert_not_same(ds1.fields,ds2.fields)
assert_equal(ds1.cases,ds2.cases)
# partial clone
ds3=ds1.clone('v1')
ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1})
assert_equal(ds_exp,ds3)
assert_not_same(ds_exp,ds3)
assert_equal(ds3['v1'],ds_exp['v1'])
assert_same(ds3['v1'],ds_exp['v1'])
assert_equal(ds3.fields,ds_exp.fields)
assert_equal(ds3.cases,ds_exp.cases)
assert_not_same(ds3.fields,ds_exp.fields)
end
def test_dup
v1=[1,2,3,4].to_vector
v2=[5,6,7,8].to_vector
ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1})
ds2=ds1.dup
assert_equal(ds1,ds2)
assert_not_same(ds1,ds2)
assert_equal(ds1['v1'],ds2['v1'])
assert_not_same(ds1['v1'],ds2['v1'])
assert_equal(ds1.cases,ds2.cases)
assert_equal(ds1.fields,ds2.fields)
assert_not_same(ds1.fields,ds2.fields)
ds1['v1'].type=:scale
# dup partial
ds3=ds1.dup('v1')
ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1})
assert_equal(ds_exp,ds3)
assert_not_same(ds_exp,ds3)
assert_equal(ds3['v1'],ds_exp['v1'])
assert_not_same(ds3['v1'],ds_exp['v1'])
assert_equal(ds3.fields,ds_exp.fields)
assert_equal(ds3.cases,ds_exp.cases)
assert_not_same(ds3.fields,ds_exp.fields)
# empty
ds3=ds1.dup_empty
assert_not_equal(ds1,ds3)
assert_not_equal(ds1['v1'],ds3['v1'])
assert_equal([],ds3['v1'].data)
assert_equal([],ds3['v2'].data)
assert_equal(:scale,ds3['v1'].type)
assert_equal(ds1.fields,ds2.fields)
assert_not_same(ds1.fields,ds2.fields)
end
def test_from_to
assert_equal(%w{name age city}, @ds.from_to("name","city"))
assert_raise ArgumentError do
@ds.from_to("name","a2")
end
end
def test_each_array_with_nils
v1=[1,-99,3,4,"na"].to_vector(:scale,:missing_values=>[-99,"na"])
v2=[5,6,-99,8,20].to_vector(:scale,:missing_values=>[-99])
v3=[9,10,11,12,20].to_vector(:scale,:missing_values=>[-99])
ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3})
ds2=ds1.dup_empty
ds1.each_array_with_nils {|row|
ds2.add_case_array(row)
}
ds2.update_valid_data
assert_equal([1,nil,3,4,nil],ds2['v1'].data)
assert_equal([5,6,nil,8,20],ds2['v2'].data)
end
def test_dup_only_valid
v1=[1,nil,3,4].to_vector(:scale)
v2=[5,6,nil,8].to_vector(:scale)
v3=[9,10,11,12].to_vector(:scale)
ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3})
ds2=ds1.dup_only_valid
expected=Statsample::Dataset.new({'v1'=>[1,4].to_vector(:scale), 'v2'=> [5,8].to_vector(:scale), 'v3'=>[9, 12].to_vector(:scale)})
assert_equal(expected,ds2)
assert_equal(expected.vectors.values,Statsample::only_valid(v1,v2,v3))
expected_partial=Statsample::Dataset.new({'v1'=>[1,3,4].to_vector(:scale), 'v3'=>[9, 11,12].to_vector(:scale)})
assert_equal(expected_partial, ds1.dup_only_valid(%w{v1 v3}))
end
def test_filter
@ds['age'].type=:scale
filtered=@ds.filter{|c| c['id']==2 or c['id']==4}
expected=Statsample::Dataset.new({'id' => Statsample::Vector.new([2,4]), 'name'=>Statsample::Vector.new(%w{Claude Franz}), 'age'=>Statsample::Vector.new([23,27],:scale),
'city'=>Statsample::Vector.new(['London','Paris']),
'a1'=>Statsample::Vector.new(['b,c',nil,])}, ['id','name','age','city','a1'])
assert_equal(expected,filtered)
end
def test_filter_field
@ds['age'].type=:scale
filtered=@ds.filter_field('id') {|c| c['id']==2 or c['id']==4}
expected=[2,4].to_vector
assert_equal(expected,filtered)
end
# Dataset#verify runs each rule (built with create_test, defined in the test
# helpers) against every case and reports the failures. Without an id field
# the report uses the case number; with one, it uses that field's value.
def test_verify
name=%w{r1 r2 r3 r4}.to_vector(:nominal)
v1=[1,2,3,4].to_vector(:scale)
v2=[4,3,2,1].to_vector(:scale)
v3=[10,20,30,40].to_vector(:scale)
v4=%w{a b a b}.to_vector(:nominal)
ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'id'=>name}.to_dataset
ds.fields=%w{v1 v2 v3 v4 id}
#Correct
t1=create_test("If v4=a, v1 odd") {|r| r['v4']=='b' or (r['v4']=='a' and r['v1']%2==1)}
t2=create_test("v3=v1*10") {|r| r['v3']==r['v1']*10}
# Fail!
t3=create_test("v4='b'") {|r| r['v4']=='b'}
# exp1 references cases by number, exp2 by the 'id' field value.
exp1=["1 [1]: v4='b'", "3 [3]: v4='b'"]
exp2=["1 [r1]: v4='b'", "3 [r3]: v4='b'"]
res=ds.verify(t3,t1,t2)
assert_equal(exp1,res)
res=ds.verify('id',t1,t2,t3)
assert_equal(exp2,res)
end
# Dataset#compute should evaluate a string expression over the dataset's
# fields, returning a new :scale vector (division yields rationals, sums of
# scales yield floats).
def test_compute_operation
v1=[1,2,3,4].to_vector(:scale)
v2=[4,3,2,1].to_vector(:scale)
v3=[10,20,30,40].to_vector(:scale)
vscale=[1.quo(2),1,3.quo(2),2].to_vector(:scale)
vsum=[1+4+10.0,2+3+20.0,3+2+30.0,4+1+40.0].to_vector(:scale)
vmult=[1*4,2*3,3*2,4*1].to_vector(:scale)
ds={'v1'=>v1,'v2'=>v2,'v3'=>v3}.to_dataset
assert_equal(vscale,ds.compute("v1/2"))
assert_equal(vsum,ds.compute("v1+v2+v3"))
assert_equal(vmult,ds.compute("v1*v2"))
end
# Dataset.crosstab_by_asignation pivots (row, column, value) triplets into a
# dataset: one '_id' column for the row labels and one :scale column per
# distinct column label.
def test_crosstab_with_asignation
v1=%w{a a a b b b c c c}.to_vector
v2=%w{a b c a b c a b c}.to_vector
v3=%w{0 1 0 0 1 1 0 0 1}.to_scale
ds=Statsample::Dataset.crosstab_by_asignation(v1,v2,v3)
assert_equal(:nominal, ds['_id'].type)
assert_equal(:scale, ds['a'].type)
assert_equal(:scale, ds['b'].type)
ev_id=%w{a b c}.to_vector
ev_a =%w{0 0 0}.to_scale
ev_b =%w{1 1 0}.to_scale
ev_c =%w{0 1 1}.to_scale
ds2={'_id'=>ev_id, 'a'=>ev_a, 'b'=>ev_b, 'c'=>ev_c}.to_dataset
assert_equal(ds, ds2)
end
# Dataset#one_to_many should unroll repeated column groups (car_color1,
# car_value1, car_color2, ...) into a long-format dataset with one row per
# non-empty group, tagged by '_col_id'. The pattern "car_%v%n" names the
# value (%v) and occurrence number (%n) parts of each source field.
def test_one_to_many
cases=[
['1','george','red',10,'blue',20,nil,nil],
['2','fred','green',15,'orange',30,'white',20],
['3','alfred',nil,nil,nil,nil,nil,nil]
]
ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
cases.each {|c| ds.add_case_array c }
ds.update_valid_data
# 'alfred' has no cars at all, so id 3 does not appear in the long format.
ids=%w{1 1 2 2 2}.to_vector
colors=%w{red blue green orange white}.to_vector
values=[10,20,15,30,20].to_vector
col_ids=[1,2,1,2,3].to_scale
ds_expected={'id'=>ids, '_col_id'=>col_ids, 'color'=>colors, 'value'=>values}.to_dataset(['id','_col_id', 'color','value'])
assert_equal(ds_expected, ds.one_to_many(%w{id}, "car_%v%n"))
end
end
================================================
FILE: test/test_dominance_analysis.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::DominanceAnalysis against published worked examples.
# Fix: removed an unused local (m2, a submatrix that was computed and never
# referenced) from test_dominance_multivariate.
class StatsampleDominanceAnalysisTestCase < MiniTest::Unit::TestCase
# Single criterion ('y') with predictors x1..x4.
def test_dominance_univariate
# Example from Budescu (1993)
m=Matrix[[1, 0.683, 0.154, 0.460, 0.618],[0.683, 1, -0.050, 0.297, 0.461], [0.154, -0.050, 1, 0.006, 0.262],[0.460, 0.297, 0.006, 1, 0.507],[0.618, 0.461, 0.262, 0.507, 1]]
m.extend Statsample::CovariateMatrix
m.fields=%w{x1 x2 x3 x4 y}
da=Statsample::DominanceAnalysis.new(m,'y')
# Additional contribution of each predictor to the model containing only x1.
contr_x1={'x2'=>0.003, 'x3'=>0.028, 'x4'=>0.063}
contr_x1.each do |k,v|
assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.001)
end
assert_in_delta(0.052, da.models_data[['x2','x3','x4']].contributions['x1'], 0.001)
# Pairwise dominance: 1 = first dominates, 0.5 = undetermined, 0 = second dominates.
expected_dominances=[1, 1, 0.5, 0.5, 0,0]
expected_g_dominances=[1, 1, 1, 1, 0,0]
da.pairs.each_with_index do |a,i|
assert_equal(expected_dominances[i], da.total_dominance_pairwise(a[0],a[1]))
assert_equal(expected_dominances[i], da.conditional_dominance_pairwise(a[0],a[1]))
assert_equal(expected_g_dominances[i], da.general_dominance_pairwise(a[0],a[1]))
end
assert(da.summary.size>0)
end
# Multivariate criterion (y1 and y2 jointly), using the p2yx association
# measure and an explicit number of cases.
def test_dominance_multivariate
m=Matrix[[1.0, -0.19, -0.358, -0.343, 0.359, 0.257], [-0.19, 1.0, 0.26, 0.29, -0.11, -0.11], [-0.358, 0.26, 1.0, 0.54, -0.49, -0.23], [-0.343, 0.29, 0.54, 1.0, -0.22, -0.41], [0.359, -0.11, -0.49, -0.22, 1.0, 0.62], [0.257, -0.11, -0.23, -0.41, 0.62, 1]]
m.extend Statsample::CovariateMatrix
m.fields=%w{y1 y2 x1 x2 x3 x4}
da=Statsample::DominanceAnalysis.new(m, ['y1','y2'], :cases=>683, :method_association=>:p2yx)
contr_x1={'x2'=>0.027, 'x3'=>0.024, 'x4'=>0.017}
contr_x1.each do |k,v|
assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.003)
end
end
end
================================================
FILE: test/test_factor.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
#require 'rserve'
#require 'statsample/rserve_extension'
# Tests for Statsample::Factor: PCA (Ruby and GSL back-ends), anti-image
# covariance, KMO sampling adequacy, principal-axis extraction and varimax
# rotation. Expected values were checked against R and SPSS where noted.
# Fix: removed a dead `if false ... end` debug block (it only printed
# correlations) from test_principalcomponents_ruby_gsl.
class StatsampleFactorTestCase < MiniTest::Unit::TestCase
include Statsample::Fixtures
# Based on Hardle and Simar
def setup
@fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures")
end
# Based on Hurdle example
def test_covariance_matrix
ds=Statsample::PlainText.read(@fixtures_dir+"/bank2.dat", %w{v1 v2 v3 v4 v5 v6})
ds.fields.each {|f|
ds[f]=ds[f].centered
}
cm=ds.covariance_matrix
pca =Statsample::Factor::PCA.new( cm, :m=>6)
#puts pca.summary
#puts pca.feature_matrix
exp_eig=[2.985, 0.931,0.242, 0.194, 0.085, 0.035].to_scale
assert_similar_vector(exp_eig, pca.eigenvalues.to_scale, 0.1)
pcs=pca.principal_components(ds)
k=6
comp_matrix=pca.component_matrix()
# Each component-matrix entry should equal the correlation between the
# centered variable and the corresponding principal component.
k.times {|i|
pc_id="PC_#{i+1}"
k.times {|j| # variable
ds_id="v#{j+1}"
r= Statsample::Bivariate.correlation(ds[ds_id], pcs[pc_id])
assert_in_delta( r, comp_matrix[j,i])
}
}
end
# The pure-Ruby and GSL PCA implementations should produce the same
# eigenvalues and principal-component scores, up to a sign flip of the
# eigenvectors (both signs are valid).
def test_principalcomponents_ruby_gsl
ran=Distribution::Normal.rng
# @r=::Rserve::Connection.new
samples=20
[3,5,7].each {|k|
v={}
v["x0"]=samples.times.map { ran.call()}.to_scale.centered
(1...k).each {|i|
v["x#{i}"]=samples.times.map {|ii| ran.call()*0.5+v["x#{i-1}"][ii]*0.5}.to_scale.centered
}
ds=v.to_dataset
cm=ds.covariance_matrix
# @r.assign('ds',ds)
# @r.eval('cm<-cor(ds);sm<-eigen(cm, sym=TRUE);v<-sm$vectors')
# puts "eigenvalues"
# puts @r.eval('v').to_ruby.to_s
pca_ruby=Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>false )
pca_gsl =Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>true )
pc_ruby = pca_ruby.principal_components(ds)
pc_gsl = pca_gsl.principal_components(ds)
# Test component matrix correlation!
cm_ruby=pca_ruby.component_matrix
#puts cm_ruby.summary
k.times {|i|
pc_id="PC_#{i+1}"
assert_in_delta(pca_ruby.eigenvalues[i], pca_gsl.eigenvalues[i],1e-10)
# Revert gsl component values
pc_gsl_data= (pc_gsl[pc_id][0]-pc_ruby[pc_id][0]).abs>1e-6 ? pc_gsl[pc_id].recode {|v| -v} : pc_gsl[pc_id]
assert_similar_vector(pc_gsl_data, pc_ruby[pc_id], 1e-6,"PC for #{k} variables")
}
}
#@r.close
end
def test_principalcomponents()
principalcomponents(true)
principalcomponents(false)
end
# For two correlated standardized variables the correlation matrix has
# eigenvalues 1+r and 1-r, with eigenvectors (1,1)/sqrt(2) and (1,-1)/sqrt(2).
def principalcomponents(gsl)
ran=Distribution::Normal.rng
samples=50
x1=samples.times.map { ran.call()}.to_scale
x2=samples.times.map {|i| ran.call()*0.5+x1[i]*0.5}.to_scale
ds={'x1'=>x1,'x2'=>x2}.to_dataset
cm=ds.correlation_matrix
r=cm[0,1]
pca=Statsample::Factor::PCA.new(cm,:m=>2,:use_gsl=>gsl)
assert_in_delta(1+r,pca.eigenvalues[0],1e-10)
assert_in_delta(1-r,pca.eigenvalues[1],1e-10)
hs=1.0 / Math.sqrt(2)
assert_equal_vector(Vector[1, 1]*hs, pca.eigenvectors[0])
# GSL returns the second eigenvector with the opposite sign to the Ruby
# implementation; both are valid.
m_1=gsl ? Vector[-1,1] : Vector[1,-1]
assert_equal_vector(hs*m_1, pca.eigenvectors[1])
pcs=pca.principal_components(ds)
exp_pc_1=ds.collect_with_index {|row,i|
hs*(row['x1']+row['x2'])
}
exp_pc_2=ds.collect_with_index {|row,i|
gsl ? hs*(row['x2']-row['x1']) : hs*(row['x1']-row['x2'])
}
assert_similar_vector(exp_pc_1, pcs["PC_1"])
assert_similar_vector(exp_pc_2, pcs["PC_2"])
end
# Anti-image covariance matrix of a 3x3 correlation matrix, checked to 0.01.
def test_antiimage
cor=Matrix[[1,0.964, 0.312],[0.964,1,0.411],[0.312,0.411,1]]
expected=Matrix[[0.062,-0.057, 0.074],[-0.057, 0.057, -0.089], [0.074, -0.089, 0.729]]
ai=Statsample::Factor.anti_image_covariance_matrix(cor)
assert(Matrix.equal_in_delta?(expected, ai, 0.01), "#{expected.to_s} not equal to #{ai.to_s}")
end
# Kaiser-Meyer-Olkin measure for a small dataset and for the Harman 8x8
# fixture matrix (harman_817 comes from Statsample::Fixtures).
def test_kmo
@v1=[1 ,2 ,3 ,4 ,7 ,8 ,9 ,10,14,15,20,50,60,70].to_scale
@v2=[5 ,6 ,11,12,13,16,17,18,19,20,30,0,0,0].to_scale
@v3=[10,3 ,20,30,40,50,80,10,20,30,40,2,3,4].to_scale
# KMO: 0.490
ds={'v1'=>@v1,'v2'=>@v2,'v3'=>@v3}.to_dataset
cor=Statsample::Bivariate.correlation_matrix(ds)
kmo=Statsample::Factor.kmo(cor)
assert_in_delta(0.667, kmo,0.001)
assert_in_delta(0.81, Statsample::Factor.kmo(harman_817),0.01)
end
# Per-variable (univariate) KMO values on the Harman fixture.
def test_kmo_univariate
m=harman_817
expected=[0.73,0.76,0.84,0.87,0.53,0.93,0.78,0.86]
m.row_size.times.map {|i|
assert_in_delta(expected[i], Statsample::Factor.kmo_univariate(m,i),0.01)
}
end
# Tested with SPSS and R
def test_pca
a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
b=[2.4, 0.7, 2.9, 2.2, 3.0, 2.7, 1.6, 1.1, 1.6, 0.9].to_scale
a.recode! {|c| c-a.mean}
b.recode! {|c| c-b.mean}
ds={'a'=>a,'b'=>b}.to_dataset
cov_matrix=Statsample::Bivariate.covariance_matrix(ds)
if Statsample.has_gsl?
pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>true)
pca_set(pca,"gsl")
else
skip("Eigenvalues could be calculated with GSL (requires gsl)")
end
pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>false)
pca_set(pca,"ruby")
end
# Shared assertions for test_pca, run against both back-ends.
def pca_set(pca,type)
expected_eigenvalues=[1.284, 0.0490]
expected_eigenvalues.each_with_index{|ev,i|
assert_in_delta(ev,pca.eigenvalues[i],0.001)
}
expected_communality=[0.590, 0.694]
expected_communality.each_with_index{|ev,i|
assert_in_delta(ev,pca.communalities[i],0.001)
}
expected_cm=[0.768, 0.833]
obs=pca.component_matrix_correlation(1).column(0).to_a
expected_cm.each_with_index{|ev,i|
assert_in_delta(ev,obs[i],0.001)
}
assert(pca.summary)
end
# Tested with R
def test_principalaxis
matrix=::Matrix[
[1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]]
fa=Statsample::Factor::PrincipalAxis.new(matrix,:m=>1, :max_iterations=>50)
cm=::Matrix[[0.923],[0.912],[0.507],[0.483]]
assert_equal_matrix(cm,fa.component_matrix,0.001)
h2=[0.852,0.832,0.257,0.233]
h2.each_with_index{|ev,i|
assert_in_delta(ev,fa.communalities[i],0.001)
}
eigen1=2.175
assert_in_delta(eigen1, fa.eigenvalues[0],0.001)
assert(fa.summary.size>0)
# Without squared-multiple-correlation priors this matrix does not
# converge, so iterate must raise.
fa=Statsample::Factor::PrincipalAxis.new(matrix,:smc=>false)
assert_raise RuntimeError do
fa.iterate
end
end
# Varimax rotation of a 4x3 loading matrix; expected values to 1e-6.
def test_rotation_varimax
a = Matrix[ [ 0.4320, 0.8129, 0.3872] ,
[0.7950, -0.5416, 0.2565] ,
[0.5944, 0.7234, -0.3441],
[0.8945, -0.3921, -0.1863] ]
expected= Matrix[[-0.0204423, 0.938674, -0.340334],
[0.983662, 0.0730206, 0.134997],
[0.0826106, 0.435975, -0.893379],
[0.939901, -0.0965213, -0.309596]]
varimax=Statsample::Factor::Varimax.new(a)
assert(!varimax.rotated.nil?, "Rotated shouldn't be empty")
assert(!varimax.component_transformation_matrix.nil?, "Component matrix shouldn't be empty")
assert(!varimax.h2.nil?, "H2 shouldn't be empty")
assert_equal_matrix(expected,varimax.rotated,1e-6)
assert(varimax.summary.size>0)
end
end
================================================
FILE: test/test_factor_map.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
#require 'rserve'
#require 'statsample/rserve_extension'
# Tests for Statsample::Factor::MAP (Velicer's Minimum Average Partial test
# for the number of factors), run with both the pure-Ruby and GSL back-ends.
# NOTE(review): class name looks like a typo of "...FactorMapTestCase";
# left unchanged because minitest discovers it regardless of name.
class StatsampleFactorMpaTestCase < MiniTest::Unit::TestCase
context Statsample::Factor::MAP do
setup do
# 8x8 correlation matrix with two clear blocks of correlated variables.
m=Matrix[
[ 1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382],
[ 0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415],
[ 0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345],
[ 0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365],
[ 0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629],
[ 0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577],
[ 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539],
[ 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1]
]
@map=Statsample::Factor::MAP.new(m)
end
should "return correct values with pure ruby" do
@map.use_gsl=false
map_assertions(@map)
end
should_with_gsl "return correct values with gsl" do
#require 'ruby-prof'
@map.use_gsl=true
map_assertions(@map)
end
end
# Shared expectations: minimum average partial, number of factors and the
# first two average partial correlations.
def map_assertions(map)
assert_in_delta(map.minfm, 0.066445,0.00001)
assert_equal(map.number_of_factors, 2)
assert_in_delta(map.fm[0], 0.312475,0.00001)
assert_in_delta(map.fm[1], 0.245121,0.00001)
end
end
================================================
FILE: test/test_factor_pa.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
#require 'rserve'
#require 'statsample/rserve_extension'
# Tests for Statsample::Factor::ParallelAnalysis (Horn's parallel analysis
# for choosing the number of factors to retain).
class StatsampleFactorTestCase < MiniTest::Unit::TestCase
include Statsample::Fixtures
# Based on Hardle and Simar
def setup
@fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures")
end
# Bootstrapping from real data should give mean eigenvalues close to those
# obtained from purely random data of the same size.
def test_parallelanalysis_with_data
if Statsample.has_gsl?
samples=100
variables=10
iterations=50
rng = Distribution::Normal.rng
# Two latent factors; v0-v4 load mainly on f1, v5-v9 mainly on f2.
f1=samples.times.collect {rng.call}.to_scale
f2=samples.times.collect {rng.call}.to_scale
vectors={}
variables.times do |i|
if i<5
vectors["v#{i}"]=samples.times.collect {|nv|
f1[nv]*5+f2[nv]*2+rng.call
}.to_scale
else
vectors["v#{i}"]=samples.times.collect {|nv|
f2[nv]*5+f1[nv]*2+rng.call
}.to_scale
end
end
ds=vectors.to_dataset
pa1=Statsample::Factor::ParallelAnalysis.new(ds, :bootstrap_method=>:data, :iterations=>iterations)
pa2=Statsample::Factor::ParallelAnalysis.with_random_data(samples,variables,:iterations=>iterations,:percentil=>95)
3.times do |n|
var="ev_0000#{n+1}"
assert_in_delta(pa1.ds_eigenvalues[var].mean, pa2.ds_eigenvalues[var].mean,0.05)
end
else
skip("Too slow without GSL")
end
end
# Mean random eigenvalues for 305 cases x 8 variables; reference values
# obtained from standard parallel-analysis tables.
def test_parallelanalysis
pa=Statsample::Factor::ParallelAnalysis.with_random_data(305,8,:iterations=>100,:percentil=>95)
assert_in_delta(1.2454, pa.ds_eigenvalues['ev_00001'].mean, 0.01)
assert_in_delta(1.1542, pa.ds_eigenvalues['ev_00002'].mean, 0.01)
assert_in_delta(1.0836, pa.ds_eigenvalues['ev_00003'].mean, 0.01)
assert(pa.summary.size>0)
end
end
================================================
FILE: test/test_ggobi.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
require 'ostruct'
# Tests for Statsample::GGobi export helpers (GGobi XML data description).
class StatsampleGGobiTestCase < MiniTest::Unit::TestCase
def setup
v1=([10.2,20.3,10,20,30,40,30,20,30,40]*10).to_vector(:scale)
@v2=(%w{a b c a a a b b c d}*10).to_vector(:nominal)
# Labels should be used in place of raw values when defined.
@v2.labels={"a"=>"letter a","d"=>"letter d"}
v3=([1,2,3,4,5,4,3,2,1,2]*10).to_vector(:ordinal)
@ds={'v1'=>v1,'v2'=>@v2,'v3'=>v3}.to_dataset
end
# nil values are rendered with the supplied missing marker ("NA" here).
def test_values_definition
a=[1.0,2,"a",nil]
assert_equal("1.0 2 a NA", Statsample::GGobi.values_definition(a,"NA"))
end
# variable_definition should register the categorical variable on the
# carrier (categorials + value-to-code conversions) and emit its levels,
# using labels where available.
def test_variable_definition
carrier=OpenStruct.new
carrier.categorials=[]
carrier.conversions={}
real_var_definition=Statsample::GGobi.variable_definition(carrier,@v2,'variable 2',"v2")
expected=<<-EOS
letter a
b
c
letter d
EOS
# Whitespace is normalized before comparison, so only token order matters.
assert_equal(expected.gsub(/\s/," "),real_var_definition.gsub(/\s/," "))
assert_equal({'variable 2'=>{'a'=>1,'b'=>2,'c'=>3,'d'=>4}},carrier.conversions)
assert_equal(['variable 2'],carrier.categorials)
end
end
================================================
FILE: test/test_gsl.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Round-trip conversion between a dataset matrix and a GSL matrix: the GSL
# view reports the same dimensions, and converting back preserves them.
class StatsampleGSLTestCase < MiniTest::Unit::TestCase
should_with_gsl "matrix with gsl" do
col_a = [1, 2, 3, 4, 20].to_vector(:scale)
col_b = [3, 2, 3, 4, 50].to_vector(:scale)
col_c = [6, 2, 3, 4, 3].to_vector(:scale)
dataset = { 'a' => col_a, 'b' => col_b, 'c' => col_c }.to_dataset
gsl_matrix = dataset.to_matrix.to_gsl
# GSL calls rows size1 and columns size2.
assert_equal(5, gsl_matrix.size1)
assert_equal(3, gsl_matrix.size2)
round_tripped = gsl_matrix.to_matrix
assert_equal(5, round_tripped.row_size)
assert_equal(3, round_tripped.column_size)
end
end
================================================
FILE: test/test_histogram.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleHistogramTestCase < MiniTest::Unit::TestCase
context Statsample::Histogram do
# Histogram.alloc(n) creates n empty bins over an unspecified range.
should "alloc correctly with integer" do
h = Statsample::Histogram.alloc(4)
assert_equal([0.0]*4, h.bin)
assert_equal([0.0]*5, h.range)
end
# Histogram.alloc(array) uses the array as explicit bin edges (n+1 edges -> n bins).
should "alloc correctly with array" do
h = Statsample::Histogram.alloc([1, 3, 7, 9, 20])
assert_equal([0.0]*4, h.bin)
assert_equal([1,3,7,9,20], h.range)
end
# Histogram.alloc(n, [min, max]) spaces n bins uniformly over [min, max].
should "alloc correctly with integer and min, max array" do
h = Statsample::Histogram.alloc(5, [0, 5])
assert_equal([0.0,1.0,2.0,3.0,4.0,5.0], h.range)
assert_equal([0.0]*5,h.bin)
end
should "bin() method return correct number of bins" do
h = Statsample::Histogram.alloc(4)
assert_equal(4,h.bins)
end
# increment accepts a single value or an array; the lower bound of a bin is
# inclusive and the histogram's upper bound is exclusive (5 is not counted).
should "increment correctly" do
h = Statsample::Histogram.alloc(5, [0, 5])
h.increment 2.5
assert_equal([0.0,0.0,1.0,0.0,0.0], h.bin)
h.increment [0.5,0.5,3.5,3.5]
assert_equal([2.0,0.0,1.0,2.0,0.0], h.bin)
h.increment 0
assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin)
h.increment 5
assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin)
end
should "alloc_uniform correctly with n, min,max" do
h = Statsample::Histogram.alloc_uniform(5,0,10)
assert_equal(5,h.bins)
assert_equal([0.0]*5,h.bin)
assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range)
end
# Same as above, but min/max given as a two-element array.
should "alloc_uniform correctly with n, [min,max]" do
h = Statsample::Histogram.alloc_uniform(5, [0, 10])
assert_equal(5,h.bins)
assert_equal([0.0]*5,h.bin)
assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range)
end
# get_range(i) returns [lower, upper] edges for bin i.
should "get_range()" do
h = Statsample::Histogram.alloc_uniform(5,2,12)
5.times {|i|
assert_equal([2+i*2, 4+i*2], h.get_range(i))
}
end
should "min() and max()" do
h=Statsample::Histogram.alloc_uniform(5,2,12)
assert_equal(2,h.min)
assert_equal(12,h.max)
end
# max_val is the largest bin count, checked against a manual scan.
should "max_val()" do
h = Statsample::Histogram.alloc(5, [0, 5])
100.times {h.increment(rand*5)}
max=h.bin[0]
(1..4).each {|i|
max = h.bin[i] if h.bin[i] > max
}
assert_equal(max,h.max_val)
end
should "min_val()" do
h = Statsample::Histogram.alloc(5, [0, 5])
100.times {h.increment(rand*5)}
min=h.bin[0]
(1..4).each {|i|
min = h.bin[i] if h.bin[i]x1,'x2'=>x2}.to_dataset
ds.name="test"
obs=m.to_dataset
assert_equal(ds['x1'],obs['x1'])
assert_equal(ds['x2'],obs['x2'])
assert_equal(ds['x1'].mean,obs['x1'].mean)
end
# CovariateMatrix extension: _type detection (unit diagonal => :correlation,
# otherwise :covariance), labeled submatrices, and agreement between a
# correlation matrix and the correlation derived from a covariance matrix.
def test_covariate
a=Matrix[[1.0, 0.3, 0.2], [0.3, 1.0, 0.5], [0.2, 0.5, 1.0]]
a.extend Statsample::CovariateMatrix
a.fields=%w{a b c}
assert_equal(:correlation, a._type)
# Submatrices are addressed by field name; row order follows the request.
assert_equal(Matrix[[0.5],[0.3]], a.submatrix(%w{c a}, %w{b}))
assert_equal(Matrix[[1.0, 0.2] , [0.2, 1.0]], a.submatrix(%w{c a}))
assert_equal(:correlation, a.submatrix(%w{c a})._type)
# Non-unit diagonal is classified as a covariance matrix.
a=Matrix[[20,30,10], [30,60,50], [10,50,50]]
a.extend Statsample::CovariateMatrix
assert_equal(:covariance, a._type)
a=50.times.collect {rand()}.to_scale
b=50.times.collect {rand()}.to_scale
c=50.times.collect {rand()}.to_scale
ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
corr=Statsample::Bivariate.correlation_matrix(ds)
real=Statsample::Bivariate.covariance_matrix(ds).correlation
corr.row_size.times do |i|
corr.column_size.times do |j|
assert_in_delta(corr[i,j], real[i,j],1e-15)
end
end
end
end
================================================
FILE: test/test_multiset.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Multiset (a keyed collection of datasets sharing the
# same fields) and Statsample::StratifiedSample built on top of it.
class StatsampleMultisetTestCase < MiniTest::Unit::TestCase
def setup
@x=%w{a a a a b b b b}.to_vector
@y=[1,2,3,4,5,6,7,8].to_scale
@z=[10,11,12,13,14,15,16,17].to_scale
@ds={'x'=>@x,'y'=>@y,'z'=>@z}.to_dataset
# Split by 'x' yields two datasets keyed 'a' and 'b'.
@ms=@ds.to_multiset_by_split('x')
end
# Datasets are stored and retrieved by key; vectors are kept per dataset.
def test_creation
v1a=[1,2,3,4,5].to_vector
v2b=[11,21,31,41,51].to_vector
v3c=[21,23,34,45,56].to_vector
ds1={'v1'=>v1a,'v2'=>v2b,'v3'=>v3c}.to_dataset
v1b=[15,25,35,45,55].to_vector
v2b=[11,21,31,41,51].to_vector
v3b=[21,23,34,45,56].to_vector
ds2={'v1'=>v1b,'v2'=>v2b,'v3'=>v3b}.to_dataset
ms=Statsample::Multiset.new(['v1','v2','v3'])
ms.add_dataset('ds1',ds1)
ms.add_dataset('ds2',ds2)
assert_equal(ds1,ms['ds1'])
assert_equal(ds2,ms['ds2'])
assert_equal(v1a,ms['ds1']['v1'])
assert_not_equal(v1b,ms['ds1']['v1'])
ds3={'v1'=>v1b,'v2'=>v2b}.to_dataset
# NOTE(review): called with a single argument, so the ArgumentError comes
# from method arity; presumably meant to test a field mismatch — confirm.
assert_raise ArgumentError do
ms.add_dataset(ds3)
end
end
# new_empty_vectors builds a multiset of empty datasets, one per key.
def test_creation_empty
ms=Statsample::Multiset.new_empty_vectors(%w{id age name},%w{male female})
ds_male={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name})
ds_female={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name})
ms2=Statsample::Multiset.new(%w{id age name})
ms2.add_dataset('male',ds_male)
ms2.add_dataset('female',ds_female)
assert_equal(ms2.fields,ms.fields)
assert_equal(ms2['male'],ms['male'])
assert_equal(ms2['female'],ms['female'])
end
# Splitting on one field partitions the cases by that field's values.
def test_to_multiset_by_split_one
sex=%w{m m m m m f f f f m}.to_vector(:nominal)
city=%w{London Paris NY London Paris NY London Paris NY Tome}.to_vector(:nominal)
age=[10,10,20,30,34,34,33,35,36,40].to_vector(:scale)
ds={'sex'=>sex,'city'=>city,'age'=>age}.to_dataset
ms=ds.to_multiset_by_split('sex')
assert_equal(2,ms.n_datasets)
assert_equal(%w{f m},ms.datasets.keys.sort)
assert_equal(6,ms['m'].cases)
assert_equal(4,ms['f'].cases)
assert_equal(%w{London Paris NY London Paris Tome},ms['m']['city'].to_a)
assert_equal([34,33,35,36],ms['f']['age'].to_a)
end
# Splitting on several fields keys datasets by the value combination.
def test_to_multiset_by_split_multiple
sex=%w{m m m m m m m m m m f f f f f f f f f f}.to_vector(:nominal)
city=%w{London London London Paris Paris London London London Paris Paris London London London Paris Paris London London London Paris Paris}.to_vector(:nominal)
hair=%w{blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black}.to_vector(:nominal)
age=[10,10,20,30,34,34,33,35,36,40, 10,10,20,30,34,34,33,35,36,40].to_vector(:scale)
ds={'sex'=>sex,'city'=>city,'hair'=>hair,'age'=>age}.to_dataset(%w{sex city hair age})
ms=ds.to_multiset_by_split('sex','city','hair')
assert_equal(8,ms.n_datasets)
assert_equal(3,ms[%w{m London blonde}].cases)
assert_equal(3,ms[%w{m London blonde}].cases)
assert_equal(1,ms[%w{m Paris black}].cases)
end
# StratifiedSample#proportion weights each stratum's proportion by its
# declared population size.
def test_stratum_proportion
ds1={'q1'=>[1,1,1,1,1,0,0,0,0,0,0,0].to_vector}.to_dataset
ds2={'q1'=>[1,1,1,1,1,1,1,0,0].to_vector}.to_dataset
assert_equal(5.0/12, ds1['q1'].proportion )
assert_equal(7.0/9, ds2['q1'].proportion )
ms=Statsample::Multiset.new(['q1'])
ms.add_dataset('d1',ds1)
ms.add_dataset('d2',ds2)
ss=Statsample::StratifiedSample.new(ms,{'d1'=>50,'d2'=>100})
assert_in_delta(0.655, ss.proportion('q1'),0.01)
assert_in_delta(0.345, ss.proportion('q1',0),0.01)
end
# Stratified mean and standard error (without replacement) on a scale
# variable, with two equally sized strata.
def test_stratum_scale
boys={'test'=>[50, 55, 60, 62, 62, 65, 67, 67, 70, 70, 73, 73, 75, 78, 78, 80, 85, 90].to_vector(:scale)}.to_dataset
girls={'test'=>[70, 70, 72, 72, 75, 75, 78, 78, 80, 80, 82, 82, 85, 85, 88, 88, 90, 90].to_vector(:scale)}.to_dataset
ms=Statsample::Multiset.new(['test'])
ms.add_dataset('boys',boys)
ms.add_dataset('girls',girls)
ss=Statsample::StratifiedSample.new(ms,{'boys'=>10000,'girls'=>10000})
assert_equal(2,ss.strata_number)
assert_equal(20000,ss.population_size)
assert_equal(10000,ss.stratum_size('boys'))
assert_equal(10000,ss.stratum_size('girls'))
assert_equal(36,ss.sample_size)
assert_equal(75,ss.mean('test'))
assert_in_delta(1.45,ss.standard_error_wor('test'),0.01)
assert_in_delta(ss.standard_error_wor('test'), ss.standard_error_wor_2('test'),0.00001)
end
# Multiset#each yields (key, dataset) pairs for every stored dataset.
def test_each
xpe={
'a'=>%w{a a a a}.to_vector,
'b'=>%w{b b b b}.to_vector
}
ype={
'a'=>[1,2,3,4].to_scale,
'b'=>[5,6,7,8].to_scale,
}
zpe={
'a'=>[10,11,12,13].to_scale,
'b'=>[14,15,16,17].to_scale,
}
xp,yp,zp=Hash.new(),Hash.new(),Hash.new()
@ms.each {|k,ds|
xp[k]=ds['x']
yp[k]=ds['y']
zp[k]=ds['z']
}
assert_equal(xpe,xp)
assert_equal(ype,yp)
assert_equal(zpe,zp)
end
# Multiset#union with a block lets each dataset be transformed (here the
# vectors are mutated in place) before concatenation.
def test_multiset_union_with_block
r1=rand()
r2=rand()
ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale
ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale
ds2=@ms.union {|k,ds|
ds['y'].recode!{|v|
k=='a' ? v*r1 : v*r2}
ds['z'].recode!{|v|
k=='a' ? v*r1 : v*r2}
}
assert_equal(ye,ds2['y'])
assert_equal(ze,ds2['z'])
end
# Union without a block simply concatenates the (already mutated) datasets.
def test_multiset_union
r1=rand()
r2=rand()
ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale
ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale
@ms.each {|k,ds|
ds['y'].recode!{|v|
k=='a' ? v*r1 : v*r2}
ds['z'].recode!{|v|
k=='a' ? v*r1 : v*r2}
}
ds2=@ms.union
assert_equal(ye,ds2['y'])
assert_equal(ze,ds2['z'])
end
end
================================================
FILE: test/test_regression.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Regression: simple regression, multiple regression
# via matrix/Ruby/GSL engines, listwise and pairwise missing-data handling,
# and multivariate-dependent models. Reference values verified externally
# (deltas of 0.001 throughout).
class StatsampleRegressionTestCase < MiniTest::Unit::TestCase
context "Example with missing data" do
setup do
# RubyEngine should drop cases where y is nil (listwise on the pair).
@x=[0.285714285714286, 0.114285714285714, 0.314285714285714, 0.2, 0.2, 0.228571428571429, 0.2, 0.4, 0.714285714285714, 0.285714285714286, 0.285714285714286, 0.228571428571429, 0.485714285714286, 0.457142857142857, 0.257142857142857, 0.228571428571429, 0.285714285714286, 0.285714285714286, 0.285714285714286, 0.142857142857143, 0.285714285714286, 0.514285714285714, 0.485714285714286, 0.228571428571429, 0.285714285714286, 0.342857142857143, 0.285714285714286, 0.0857142857142857].to_scale
@y=[nil, 0.233333333333333, nil, 0.266666666666667, 0.366666666666667, nil, 0.333333333333333, 0.3, 0.666666666666667, 0.0333333333333333, 0.333333333333333, nil, nil, 0.533333333333333, 0.433333333333333, 0.4, 0.4, 0.5, 0.4, 0.266666666666667, 0.166666666666667, 0.666666666666667, 0.433333333333333, 0.166666666666667, nil, 0.4, 0.366666666666667, nil].to_scale
@ds={'x'=>@x,'y'=>@y}.to_dataset
@lr=Statsample::Regression::Multiple::RubyEngine.new(@ds,'y')
end
should "have correct values" do
assert_in_delta(0.455,@lr.r2,0.001)
assert_in_delta(0.427,@lr.r2_adjusted, 0.001)
assert_in_delta(0.1165,@lr.se_estimate,0.001)
assert_in_delta(15.925,@lr.f,0.0001)
assert_in_delta(0.675, @lr.standarized_coeffs['x'],0.001)
assert_in_delta(0.778, @lr.coeffs['x'],0.001, "coeff x")
assert_in_delta(0.132, @lr.constant,0.001,"constant")
assert_in_delta(0.195, @lr.coeffs_se['x'],0.001,"coeff x se")
assert_in_delta(0.064, @lr.constant_se,0.001,"constant se")
end
end
# x3 is an exact linear combination of x1 and x2, so the normal equations
# are singular and the engine must raise LinearDependency.
should "return an error if data is linearly dependent" do
samples=100
a,b=rand,rand
x1=samples.times.map { rand}.to_scale
x2=samples.times.map {rand}.to_scale
x3=samples.times.map {|i| x1[i]*(1+a)+x2[i]*(1+b)}.to_scale
y=samples.times.map {|i| x1[i]+x2[i]+x3[i]+rand}.to_scale
ds={'x1'=>x1,'x2'=>x2,'x3'=>x3,'y'=>y}.to_dataset
assert_raise(Statsample::Regression::LinearDependency) {
Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
}
end
# Simple regression should give identical parameters regardless of how it
# is constructed (from vectors, from a dataset, or via the shortcut).
def test_parameters
@x=[13,20,10,33,15].to_vector(:scale)
@y=[23,18,35,10,27 ].to_vector(:scale)
reg=Statsample::Regression::Simple.new_from_vectors(@x,@y)
_test_simple_regression(reg)
ds={'x'=>@x,'y'=>@y}.to_dataset
reg=Statsample::Regression::Simple.new_from_dataset(ds,'x','y')
_test_simple_regression(reg)
reg=Statsample::Regression.simple(@x,@y)
_test_simple_regression(reg)
end
# Shared assertions: intercept (a), slope (b) and standard error.
def _test_simple_regression(reg)
assert_in_delta(40.009, reg.a,0.001)
assert_in_delta(-0.957, reg.b,0.001)
assert_in_delta(4.248,reg.standard_error,0.002)
assert(reg.summary)
end
# Smoke test: summary generation on random data should not fail.
def test_summaries
a=10.times.map{rand(100)}.to_scale
b=10.times.map{rand(100)}.to_scale
y=10.times.map{rand(100)}.to_scale
ds={'a'=>a,'b'=>b,'y'=>y}.to_dataset
lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
assert(lr.summary.size>0)
end
# MultipleDependent: several criteria (adhd, cd, odd) regressed jointly on
# the remaining fields of a correlation matrix.
def test_multiple_dependent
complete=Matrix[
[1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08],
[0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15],
[0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12],
[0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02],
[-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02],
[0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36],
[0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05],
[-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03],
[0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]]
complete.extend Statsample::CovariateMatrix
complete.fields=%w{adhd cd odd sex age monly mwork mage poverty}
lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd})
assert_in_delta(0.197, lr.r2yx,0.001)
assert_in_delta(0.197, lr.r2yx_covariance,0.001)
assert_in_delta(0.07, lr.p2yx,0.001)
end
# With scattered nils the engine falls back to pairwise-complete
# correlations; values below were computed for that approach.
def test_multiple_regression_pairwise_2
@a=[1,3,2,4,3,5,4,6,5,7,3,nil,3,nil,3].to_vector(:scale)
@b=[3,3,4,4,5,5,6,6,4,4,2,2,nil,6,2].to_vector(:scale)
@c=[11,22,30,40,50,65,78,79,99,100,nil,3,7,nil,7].to_vector(:scale)
@y=[3,4,5,6,7,8,9,10,20,30,30,40,nil,50,nil].to_vector(:scale)
ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
assert_in_delta(2407.436,lr.sst,0.001)
assert_in_delta(0.752,lr.r,0.001, "pairwise r")
assert_in_delta(0.565,lr.r2,0.001)
assert_in_delta(1361.130,lr.ssr,0.001)
assert_in_delta(1046.306,lr.sse,0.001)
assert_in_delta(3.035,lr.f,0.001)
end
# GslEngine should match the shared model expectations plus the predicted
# and residual vectors; skipped entirely when GSL is absent.
def test_multiple_regression_gsl
if Statsample.has_gsl?
@a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
@b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
@c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
@y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
assert(lr.summary.size>0)
model_test(lr,'gsl')
predicted=[1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
c_predicted=lr.predicted
predicted.each_index{|i|
assert_in_delta(predicted[i],c_predicted[i],0.001)
}
residuals=[1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
c_residuals=lr.residuals
residuals.each_index{|i|
assert_in_delta(residuals[i],c_residuals[i],0.001)
}
else
skip "Regression::Multiple::GslEngine not tested (no Gsl)"
end
end
# Assertions available to matrix-based engines (no raw data, so no
# constant_se / constant_t / process here — see model_test).
def model_test_matrix(lr,name='undefined')
stan_coeffs={'a'=>0.151,'b'=>-0.547,'c'=>0.997}
unstan_coeffs={'a'=>0.695, 'b'=>-4.286, 'c'=>0.266}
unstan_coeffs.each_key{|k|
assert_in_delta(unstan_coeffs[k], lr.coeffs[k],0.001,"b coeffs - #{name}")
}
stan_coeffs.each_key{|k|
assert_in_delta(stan_coeffs[k], lr.standarized_coeffs[k],0.001, "beta coeffs - #{name}")
}
assert_in_delta(11.027,lr.constant,0.001)
assert_in_delta(0.955,lr.r,0.001)
assert_in_delta(0.913,lr.r2,0.001)
assert_in_delta(20.908, lr.f,0.001)
assert_in_delta(0.001, lr.probability, 0.001)
assert_in_delta(0.226,lr.tolerance("a"),0.001)
coeffs_se={"a"=>1.171,"b"=>1.129,"c"=>0.072}
ccoeffs_se=lr.coeffs_se
coeffs_se.each_key{|k|
assert_in_delta(coeffs_se[k],ccoeffs_se[k],0.001)
}
coeffs_t={"a"=>0.594,"b"=>-3.796,"c"=>3.703}
ccoeffs_t=lr.coeffs_t
coeffs_t.each_key{|k|
assert_in_delta(coeffs_t[k], ccoeffs_t[k],0.001)
}
assert_in_delta(639.6,lr.sst,0.001)
assert_in_delta(583.76,lr.ssr,0.001)
assert_in_delta(55.840,lr.sse,0.001)
assert(lr.summary.size>0, "#{name} without summary")
end
# Full model assertions, including constant standard error/t and prediction
# for a new case.
def model_test(lr,name='undefined')
model_test_matrix(lr,name)
assert_in_delta(4.559, lr.constant_se,0.001)
assert_in_delta(2.419, lr.constant_t,0.001)
assert_in_delta(1.785,lr.process([1,3,11]),0.001)
end
# MatrixEngine from a correlation matrix (needs means/sds, cannot give
# constant errors) and from a covariance matrix (full model).
def test_regression_matrix
@a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
@b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
@c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
@y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
cor=Statsample::Bivariate.correlation_matrix(ds)
lr=Statsample::Regression::Multiple::MatrixEngine.new(cor,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size, :y_sd=>@y.sd , :x_sd=>{'a' => @a.sd, 'b' => @b.sd, 'c' => @c.sd})
assert_nil(lr.constant_se)
assert_nil(lr.constant_t)
model_test_matrix(lr, "correlation matrix")
covariance=Statsample::Bivariate.covariance_matrix(ds)
lr=Statsample::Regression::Multiple::MatrixEngine.new(covariance,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size)
assert(lr.summary.size>0)
model_test(lr , "covariance matrix")
end
# RubyEngine with a leading all-nil case: total_cases counts it,
# valid_cases does not, and predicted/residuals keep nil in its position.
def test_regression_rubyengine
@a=[nil,1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
@b=[nil,3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
@c=[nil,11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
@y=[nil,3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
assert_equal(11, lr.total_cases)
assert_equal(10, lr.valid_cases)
model_test(lr, 'rubyengine with missing data')
predicted=[nil,1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
c_predicted = lr.predicted
predicted.each_index do |i|
if c_predicted[i].nil?
assert(predicted[i].nil?, "Actual #{i} is nil, but expected #{predicted[i]}")
else
assert_in_delta(predicted[i], c_predicted[i], 0.001)
end
end
residuals=[nil,1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
c_residuals=lr.residuals
residuals.each_index do |i|
if c_residuals[i].nil?
assert(residuals[i].nil?)
else
assert_in_delta(residuals[i],c_residuals[i],0.001)
end
end
end
end
================================================
FILE: test/test_reliability.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Reliability: Spearman-Brown prophecy, Cronbach's
# alpha (raw and standarized), item characteristic curves, multi-scale
# analysis and single-scale item analysis. Much of the data is randomly
# generated per run, so assertions compare against values recomputed from
# the same data rather than fixed constants.
class StatsampleReliabilityTestCase < MiniTest::Unit::TestCase
context Statsample::Reliability do
should "return correct r according to Spearman-Brown prophecy" do
r=0.6849
n=62.quo(15)
assert_in_delta(0.9, Statsample::Reliability.sbp(r,n), 0.001)
end
should "return correct n for desired realiability" do
r=0.6849
r_d=0.9
assert_in_delta(62, Statsample::Reliability.n_for_desired_reliability(r, r_d, 15),0.5)
end
context "Cronbach's alpha" do
setup do
# Random dataset: 2..11 items, each equal to a shared base vector plus
# noise, which guarantees positively correlated items.
@samples=40
@n_variables=rand(10)+2
@ds=Statsample::Dataset.new()
base=@samples.times.collect {|a| rand()}.to_scale
@n_variables.times do |i|
@ds[i]=base.collect {|v| v+rand()}.to_scale
end
@ds.update_valid_data
@k=@ds.fields.size
@cm=Statsample::Bivariate.covariance_matrix(@ds)
# @dse holds the same items standarized (z-scores), used to check the
# standarized-alpha variant against the same closed formula.
@dse=@ds.dup
@dse.fields.each do |f|
@dse[f]=@dse[f].standarized
end
@cme=Statsample::Bivariate.covariance_matrix(@dse)
@a=Statsample::Reliability.cronbach_alpha(@ds)
@as=Statsample::Reliability.cronbach_alpha_standarized(@ds)
end
# alpha = k/(k-1) * (1 - sum(item variances) / total covariance-matrix sum)
should "alpha will be equal to sum of matrix covariance less the individual variances" do
total_sum=@cm.total_sum
ind_var=@ds.fields.inject(0) {|ac,v| ac+@ds[v].variance}
expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum)))
assert_in_delta(expected, @a,1e-10)
end
should "method cronbach_alpha_from_n_s2_cov return correct values" do
sa=Statsample::Reliability::ScaleAnalysis.new(@ds)
vm, cm = sa.variances_mean, sa.covariances_mean
assert_in_delta(sa.alpha, Statsample::Reliability.cronbach_alpha_from_n_s2_cov(@n_variables, vm,cm), 1e-10)
end
should "method cronbach_alpha_from_covariance_matrix returns correct value" do
cov=Statsample::Bivariate.covariance_matrix(@ds)
assert_in_delta(@a, Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov),0.0000001)
end
# Round-trip check: the n returned for the current alpha should
# reproduce that alpha when fed back into the alpha formula.
should "return correct n for desired alpha, covariance and variance" do
sa=Statsample::Reliability::ScaleAnalysis.new(@ds)
vm, cm = sa.variances_mean, sa.covariances_mean
n_obtained=Statsample::Reliability.n_for_desired_alpha(@a, vm,cm)
#p n_obtained
assert_in_delta(Statsample::Reliability.cronbach_alpha_from_n_s2_cov(n_obtained, vm,cm) ,@a,0.001)
end
should "standarized alpha will be equal to sum of matrix covariance less the individual variances on standarized values" do
total_sum=@cme.total_sum
ind_var=@dse.fields.inject(0) {|ac,v| ac+@dse[v].variance}
expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum)))
assert_in_delta(expected, @as, 1e-10)
end
end
context Statsample::Reliability::ItemCharacteristicCurve do
setup do
# Three random items on the same 0..(points-1) scale; @max_point is the
# largest possible total score across the three items.
@samples=100
@points=rand(10)+3
@max_point=(@points-1)*3
@x1=@samples.times.map{rand(@points)}.to_scale
@x2=@samples.times.map{rand(@points)}.to_scale
@x3=@samples.times.map{rand(@points)}.to_scale
@ds={'a'=>@x1,'b'=>@x2,'c'=>@x3}.to_dataset
@icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds)
end
should "have a correct automatic vector_total" do
assert_equal(@ds.vector_sum, @icc.vector_total)
end
should "have a correct different vector_total" do
x2=@samples.times.map{rand(10)}.to_scale
@icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,x2)
assert_equal(x2, @icc.vector_total)
# A totals vector whose size differs from the dataset must be rejected.
assert_raises(ArgumentError) do
inc=(@samples+10).times.map{rand(10)}.to_scale
@icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,inc)
end
end
should "have 0% for 0 points on maximum value values" do
max=@icc.curve_field('a',0)[@max_point.to_f]
max||=0
assert_in_delta(0, max)
end
should "have 0 for max value on minimum value" do
max=@icc.curve_field('a',@max_point)[0.0]
max||=0
assert_in_delta(0, max)
end
should "have correct values of % for any value" do
# Recompute by hand, for one random item value, the proportion of
# cases at each total score whose item 'a' equals that value.
sum=@icc.vector_total
total={}
total_g=sum.frequencies
index=rand(@points)
@x1.each_with_index do |v,i|
total[sum[i]]||=0
total[sum[i]]+=1 if v==index
end
expected=total.each {|k,v|
total[k]=v.quo(total_g[k])
}
assert_equal(expected, @icc.curve_field('a',index))
end
end
context Statsample::Reliability::MultiScaleAnalysis do
setup do
# Three scales of 10 items each; items of scale s center on s*2, so
# the scales are clearly separated.
size=100
@scales=3
@items_per_scale=10
h={}
@scales.times {|s|
@items_per_scale.times {|i|
h["#{s}_#{i}"] = (size.times.map {(s*2)+rand}).to_scale
}
}
@ds=h.to_dataset
@msa=Statsample::Reliability::MultiScaleAnalysis.new(:name=>'Multiple Analysis') do |m|
m.scale "complete", @ds
@scales.times {|s|
m.scale "scale_#{s}", @ds.clone(@items_per_scale.times.map {|i| "#{s}_#{i}"}), {:name=>"Scale #{s}"}
}
end
end
should "Retrieve correct ScaleAnalysis for whole scale" do
sa=Statsample::Reliability::ScaleAnalysis.new(@ds, :name=>"Scale complete")
assert_equal(sa.variances_mean, @msa.scale("complete").variances_mean)
end
should "Retrieve correct ScaleAnalysis for each scale" do
@scales.times {|s|
sa=Statsample::Reliability::ScaleAnalysis.new(@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}), :name=>"Scale #{s}")
assert_equal(sa.variances_mean,@msa.scale("scale_#{s}").variances_mean)
}
end
should "retrieve correct correlation matrix for each scale" do
# Expected matrix correlates the per-scale sum vectors.
vectors={'complete' => @ds.vector_sum}
@scales.times {|s|
vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum
}
ds2=vectors.to_dataset
assert_equal(Statsample::Bivariate.correlation_matrix(ds2), @msa.correlation_matrix)
end
should "delete scale using delete_scale" do
@msa.delete_scale("complete")
assert_equal(@msa.scales.keys.sort, @scales.times.map {|s| "scale_#{s}"})
end
should "retrieve pca for scales" do
@msa.delete_scale("complete")
vectors=Hash.new
@scales.times {|s|
vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum
}
ds2=vectors.to_dataset
cor_matrix=Statsample::Bivariate.correlation_matrix(ds2)
m=3
pca=Statsample::Factor::PCA.new(cor_matrix, :m=>m)
assert_equal(pca.component_matrix, @msa.pca(:m=>m).component_matrix)
end
should "retrieve acceptable summary" do
@msa.delete_scale("scale_0")
@msa.delete_scale("scale_1")
@msa.delete_scale("scale_2")
#@msa.summary_correlation_matrix=true
#@msa.summary_pca=true
assert(@msa.summary.size>0)
end
end
context Statsample::Reliability::ScaleAnalysis do
setup do
# Fixed data with one extreme case per item, giving known alpha values.
@x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_scale
@x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_scale
@x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_scale
@x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_scale
@ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
@ia=Statsample::Reliability::ScaleAnalysis.new(@ds)
@cov_matrix=@ia.cov_m
end
should "return correct values for item analysis" do
assert_in_delta(0.980,@ia.alpha,0.001)
assert_in_delta(0.999,@ia.alpha_standarized,0.001)
# variances_mean must equal the mean of the covariance-matrix diagonal.
var_mean=4.times.map{|m| @cov_matrix[m,m]}.to_scale.mean
assert_in_delta(var_mean, @ia.variances_mean)
assert_equal(@x1.mean, @ia.item_statistics['x1'][:mean])
assert_equal(@x4.mean, @ia.item_statistics['x4'][:mean])
assert_in_delta(@x1.sds, @ia.item_statistics['x1'][:sds],1e-14)
assert_in_delta(@x4.sds, @ia.item_statistics['x4'][:sds],1e-14)
# stats_if_deleted['x1'] must match statistics of the scale rebuilt
# without item x1.
ds2=@ds.clone
ds2.delete_vector('x1')
vector_sum=ds2.vector_sum
assert_equal(vector_sum.mean, @ia.stats_if_deleted['x1'][:mean])
assert_equal(vector_sum.sds, @ia.stats_if_deleted['x1'][:sds])
assert_in_delta(vector_sum.variance, @ia.stats_if_deleted['x1'][:variance_sample],1e-10)
assert_equal(Statsample::Reliability.cronbach_alpha(ds2), @ia.stats_if_deleted['x1'][:alpha])
# covariances_mean must equal the mean of the off-diagonal entries.
covariances=[]
4.times.each {|i|
4.times.each {|j|
if i!=j
covariances.push(@cov_matrix[i,j])
end
}
}
assert_in_delta(covariances.to_scale.mean, @ia.covariances_mean)
assert_in_delta(0.999,@ia.item_total_correlation()['x1'],0.001)
assert_in_delta(1050.455,@ia.stats_if_deleted()['x1'][:variance_sample],0.001)
end
should "return a summary" do
assert(@ia.summary.size>0)
end
end
end
end
================================================
FILE: test/test_reliability_icc.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
$reliability_icc=nil
# Tests for Statsample::Reliability::ICC (intraclass correlation) against
# the classic Shrout & Fleiss worked example (6 targets, 4 judges), plus a
# randomized cross-check against R's irr::icc through Rserve when available.
#
# Fixes in this revision:
# * assert_raise -> assert_raises: assert_raise is Test::Unit API and is
#   not defined under MiniTest (this class inherits MiniTest::Test; the
#   sibling reliability tests already use assert_raises).
# * bare `rescue` -> `rescue LoadError`: a bare rescue only catches
#   StandardError, but LoadError descends from ScriptError, so the
#   missing-rserve guard never actually worked (test_rserve_extension.rb
#   already rescues LoadError explicitly).
class StatsampleReliabilityIccTestCase < MiniTest::Test
context Statsample::Reliability::ICC do
setup do
# 6 targets rated by 4 judges.
a=[9,6,8,7,10,6].to_scale
b=[2,1,4,1,5,2].to_scale
c=[5,3,6,2,6,4].to_scale
d=[8,2,8,6,9,7].to_scale
@ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
@icc=Statsample::Reliability::ICC.new(@ds)
end
should "basic method be correct" do
assert_equal(6,@icc.n)
assert_equal(4,@icc.k)
end
should "total mean be correct" do
assert_in_delta(5.291, @icc.total_mean, 0.001)
end
should "df methods be correct" do
assert_equal(5, @icc.df_bt)
assert_equal(18, @icc.df_wt)
assert_equal(3, @icc.df_bj)
assert_equal(15, @icc.df_residual)
end
should "ms between targets be correct" do
assert_in_delta(11.24, @icc.ms_bt, 0.01)
end
should "ms within targets be correct" do
assert_in_delta(6.26, @icc.ms_wt, 0.01)
end
should "ms between judges be correct" do
assert_in_delta(32.49, @icc.ms_bj, 0.01)
end
should "ms residual be correct" do
assert_in_delta(1.02, @icc.ms_residual, 0.01)
end
context "with McGraw and Wong denominations," do
end
context "with Shrout & Fleiss denominations, " do
should "icc(1,1) method be correct" do
assert_in_delta(0.17, @icc.icc_1_1, 0.01)
end
# Verified on SPSS and R
should "icc(2,1) method be correct" do
assert_in_delta(0.29, @icc.icc_2_1, 0.01)
end
should "icc(3,1) method be correct" do
assert_in_delta(0.71, @icc.icc_3_1, 0.01)
end
should "icc(1,k) method be correct" do
assert_in_delta(0.44, @icc.icc_1_k, 0.01)
end
# Verified on SPSS and R
should "icc(2,k) method be correct" do
assert_in_delta(0.62, @icc.icc_2_k, 0.01)
end
should "icc(3,k) method be correct" do
assert_in_delta(0.91, @icc.icc_3_k, 0.01)
end
should "icc(1,1) F be correct" do
assert_in_delta(1.795, @icc.icc_1_f.f)
end
should "icc(1,1) confidence interval should be correct" do
assert_in_delta(-0.133, @icc.icc_1_1_ci[0], 0.001)
assert_in_delta(0.723, @icc.icc_1_1_ci[1], 0.001)
end
should "icc(1,k) confidence interval should be correct" do
assert_in_delta(-0.884, @icc.icc_1_k_ci[0], 0.001)
assert_in_delta(0.912, @icc.icc_1_k_ci[1], 0.001)
end
should "icc(2,1) F be correct" do
assert_in_delta(11.027, @icc.icc_2_f.f)
end
should "icc(2,1) confidence interval should be correct" do
#skip("Not yet operational")
assert_in_delta(0.019, @icc.icc_2_1_ci[0], 0.001)
assert_in_delta(0.761, @icc.icc_2_1_ci[1], 0.001)
end
# Verified on SPSS and R
should "icc(2,k) confidence interval should be correct" do
#skip("Not yet operational")
#p @icc.icc_2_k_ci
assert_in_delta(0.039, @icc.icc_2_k_ci[0], 0.001)
assert_in_delta(0.929, @icc.icc_2_k_ci[1], 0.001)
end
#should "Shrout icc(2,k) and McGraw icc(a,k) ci be equal" do
# assert_in_delta(@icc.icc_2_k_ci_shrout[0], @icc.icc_2_k_ci_mcgraw[0], 10e-5)
#end
should "icc(3,1) F be correct" do
assert_in_delta(11.027, @icc.icc_3_f.f)
end
should "icc(3,1) confidence interval should be correct" do
assert_in_delta(0.342, @icc.icc_3_1_ci[0], 0.001)
assert_in_delta(0.946, @icc.icc_3_1_ci[1], 0.001)
end
should "icc(3,k) confidence interval should be correct" do
assert_in_delta(0.676, @icc.icc_3_k_ci[0], 0.001)
assert_in_delta(0.986, @icc.icc_3_k_ci[1], 0.001)
end
should "incorrect type raises an error" do
# FIX: assert_raises is the MiniTest spelling (assert_raise is Test::Unit).
assert_raises(::RuntimeError) do
@icc.type=:nonexistant_type
end
end
end
begin
require 'rserve'
require 'statsample/rserve_extension'
context "McGraw and Wong" do
teardown do
# NOTE(review): this closes the connection cached in $reliability_icc
# after every test; later setups reuse the cached results and never
# touch @r again, so this appears safe — confirm Rserve tolerates
# repeated close calls.
@r=$reliability_icc[:r].close unless $reliability_icc[:r].nil?
end
setup do
# Expensive R round-trip is computed once and memoized in the global
# $reliability_icc for all tests in this context.
if($reliability_icc.nil?)
size=100
a=size.times.map {rand(10)}.to_scale
b=a.recode{|i|i+rand(4)-2}
c=a.recode{|i|i+rand(4)-2}
d=a.recode{|i|i+rand(4)-2}
@ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
@icc=Statsample::Reliability::ICC.new(@ds)
@r=Rserve::Connection.new
@r.assign('ds',@ds)
@r.void_eval("library(irr);
iccs=list(
icc_1=icc(ds,'o','c','s'),
icc_k=icc(ds,'o','c','a'),
icc_c_1=icc(ds,'t','c','s'),
icc_c_k=icc(ds,'t','c','a'),
icc_a_1=icc(ds,'t','a','s'),
icc_a_k=icc(ds,'t','a','a'))
")
@iccs=@r.eval('iccs').to_ruby
$reliability_icc={ :icc=>@icc, :iccs=>@iccs, :r=>@r
}
end
@icc=$reliability_icc[:icc]
@iccs=$reliability_icc[:iccs]
@r=$reliability_icc[:r]
end
# One context per ICC type, each comparing our value, F statistic,
# degrees of freedom, p-value and bounds against R's irr::icc output.
[:icc_1, :icc_k, :icc_c_1, :icc_c_k, :icc_a_1, :icc_a_k].each do |t|
context "ICC Type #{t} " do
should "value be correct" do
@icc.type=t
@r_icc=@iccs[t.to_s]
assert_in_delta(@r_icc['value'],@icc.r)
end
should "fvalue be correct" do
@icc.type=t
@r_icc=@iccs[t.to_s]
assert_in_delta(@r_icc['Fvalue'],@icc.f.f)
end
should "num df be correct" do
@icc.type=t
@r_icc=@iccs[t.to_s]
assert_in_delta(@r_icc['df1'],@icc.f.df_num)
end
should "den df be correct" do
@icc.type=t
@r_icc=@iccs[t.to_s]
assert_in_delta(@r_icc['df2'],@icc.f.df_den)
end
should "f probability be correct" do
@icc.type=t
@r_icc=@iccs[t.to_s]
assert_in_delta(@r_icc['p.value'],@icc.f.probability)
end
should "bounds be equal" do
@icc.type=t
@r_icc=@iccs[t.to_s]
assert_in_delta(@r_icc['lbound'],@icc.lbound)
assert_in_delta(@r_icc['ubound'],@icc.ubound)
end
should "summary generated" do
assert(@icc.summary.size>0)
end
end
end
end
# FIX: LoadError is not a StandardError, so a bare rescue never caught it.
rescue LoadError
puts "requires rserve"
end
end
end
================================================
FILE: test/test_reliability_skillscale.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for SkillScaleAnalysis: multiple-choice answers are scored 1/0
# against an answer key; nils (omitted answers) stay nil in the corrected
# dataset. Data are random, so expected values are recomputed by hand.
class StatsampleReliabilitySkillScaleTestCase < MiniTest::Unit::TestCase
context Statsample::Reliability::SkillScaleAnalysis do
setup do
options=%w{a b c d e}
cases=20
@id=cases.times.map {|v| v}.to_scale
@a=cases.times.map {options[rand(5)]}.to_vector
@b=cases.times.map {options[rand(5)]}.to_vector
@c=cases.times.map {options[rand(5)]}.to_vector
@d=cases.times.map {options[rand(5)]}.to_vector
# Item 'e' has ~20% omitted (nil) answers.
# NOTE(review): rand(0) returns a Float in [0,1), so options[rand(0)]
# always yields options[0] ("a") — presumably intended to pin case 0;
# confirm before "fixing" it to rand(5), as that would also shift the
# RNG stream.
@e=cases.times.map {|i|
i==0 ? options[rand(0)] :
rand()>0.8 ? nil : options[rand(5)]
}.to_vector
@ds={'id'=>@id,'a'=>@a,'b'=>@b,'c'=>@c,'d'=>@d,'e'=>@e}.to_dataset
# Key for item 'a' is fixed; the rest are random.
@key={'a'=>"a", 'b'=>options[rand(5)], 'c'=>options[rand(5)], 'd'=>options[rand(5)],'e'=>options[rand(5)]}
@ssa=Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key)
# Hand-scored versions of each item: 1 hit, 0 miss, nil stays nil.
@ac=@a.map {|v| v==@key['a'] ? 1 : 0}.to_scale
@bc=@b.map {|v| v==@key['b'] ? 1 : 0}.to_scale
@cc=@c.map {|v| v==@key['c'] ? 1 : 0}.to_scale
@dc=@d.map {|v| v==@key['d'] ? 1 : 0}.to_scale
@ec=@e.map {|v| v.nil? ? nil : (v==@key['e'] ? 1 : 0)}.to_scale
end
should "return proper corrected dataset" do
cds={'id'=>@id, 'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset
assert_equal(cds, @ssa.corrected_dataset)
end
# The minimal dataset drops non-item fields such as 'id'.
should "return proper corrected minimal dataset" do
cdsm={'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset
assert_equal(cdsm, @ssa.corrected_dataset_minimal)
end
should "return correct vector_sum and vector_sum" do
cdsm=@ssa.corrected_dataset_minimal
assert_equal(cdsm.vector_sum, @ssa.vector_sum)
assert_equal(cdsm.vector_mean, @ssa.vector_mean)
end
# Regression test: a fixed dataset that previously crashed #summary.
should "not crash on rare case" do
a=Statsample::Vector["c","c","a","a","c","a","b","c","c","b","a","d","a","d","a","a","d","e","c","d"]
b=Statsample::Vector["e","b","e","b","c","d","a","e","e","c","b","e","e","b","d","c","e","b","b","d"]
c=Statsample::Vector["e","b","e","c","e","c","b","d","e","c","a","a","b","d","e","c","b","a","a","e"]
d=Statsample::Vector["a","b","d","d","e","b","e","b","d","c","e","a","c","d","c","c","e","d","d","b"]
e=Statsample::Vector["a","b",nil,"d","c","c","d",nil,"d","d","e","e",nil,nil,nil,"d","c",nil,"e","d"]
key={"a"=>"a", "b"=>"e", "c"=>"d", "d"=>"c", "e"=>"d"}
ds=Statsample::Dataset.new("a"=>a,"b"=>b,"c"=>c,"d"=>d,"e"=>e)
ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds, key)
assert(ssa.summary)
end
should "return valid summary" do
assert(@ssa.summary.size>0)
end
end
end
================================================
FILE: test/test_resample.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleResampleTestCase < MiniTest::Unit::TestCase
  def initialize(*args)
    super
  end

  # generate(n, low, high) must return n values bounded by [low, high].
  def test_basic
    sample = Statsample::Resample.generate(20, 1, 10)
    assert_equal(20, sample.size)
    assert(sample.min >= 1)
    assert(sample.max <= 10)
  end

  # repeat_and_save collects the block's result 400 times; the number of
  # 1s in a 20-draw sample from 1..10 should land near its expectation.
  def test_repeat_and_save
    counts = Statsample::Resample.repeat_and_save(400) do
      Statsample::Resample.generate(20, 1, 10).count(1)
    end
    assert_equal(400, counts.size)
    vector = Statsample::Vector.new(counts, :scale)
    over_three = vector.count { |x| x > 3 }
    assert(over_three >= 30 && over_three <= 70)
  end
end
================================================
FILE: test/test_rserve_extension.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# These tests exercise the Rserve marshalling extensions and require a
# running R server; the entire suite is skipped when the rserve gem is
# not installed (rescue LoadError below).
begin
require 'rserve'
require 'statsample/rserve_extension'
class StatsampleRserveExtensionTestCase < MiniTest::Unit::TestCase
context "Statsample Rserve extensions" do
setup do
@r=Rserve::Connection.new
end
teardown do
@r.close
end
should "return a valid rexp for numeric vector" do
# ~10% nil entries check that missing data round-trips as R NAs.
a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
rexp=a.to_REXP
assert(rexp.is_a? Rserve::REXP::Double)
assert_equal(rexp.to_ruby,a.data_with_nils)
@r.assign 'a',rexp
assert_equal(a.data_with_nils, @r.eval('a').to_ruby)
end
should "return a valid rserve dataframe for statsample datasets" do
a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
b=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
c=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
rexp=ds.to_REXP
assert(rexp.is_a? Rserve::REXP::GenericVector)
ret=rexp.to_ruby
assert_equal(a.data_with_nils, ret['a'])
@r.assign 'df', rexp
out_df=@r.eval('df').to_ruby
# The assigned object must look like a genuine data.frame on the R side.
assert_equal('data.frame', out_df.attributes['class'])
assert_equal(['a','b','c'], out_df.attributes['names'])
assert_equal(a.data_with_nils, out_df['a'])
end
end
end
rescue LoadError
puts "Require rserve extension"
end
================================================
FILE: test/test_srs.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleSrsTestCase < MiniTest::Unit::TestCase
  # Spot checks for the simple-random-sampling estimators.
  def test_std_error
    n0 = Statsample::SRS.estimation_n0(0.05, 0.5, 0.95)
    assert_equal(384, n0.to_i)
    n = Statsample::SRS.estimation_n(0.05, 0.5, 150, 0.95)
    assert_equal(108, n.to_i)
    sd = Statsample::SRS.proportion_sd_kp_wor(0.5, 100, 150)
    assert_in_delta(0.0289, sd, 0.001)
  end
end
================================================
FILE: test/test_statistics.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Assorted statistics tests: p-values from CDFs, core-class extensions
# (Array#recode_repeated, String#is_number?), SRS estimation and simple
# linear regression.
# NOTE(review): class name has a typo ("Statisics"); harmless since
# minitest discovers tests by subclassing, but worth renaming some day.
class StatsampleStatisicsTestCase < MiniTest::Unit::TestCase
def initialize(*args)
super
end
def test_p_using_cdf
# `tails=:left` is just an ordinary argument with a cosmetic local
# assignment — it does NOT work as a keyword argument.
assert_equal(0.25, Statsample::Test.p_using_cdf(0.25, tails=:left))
assert_equal(0.75, Statsample::Test.p_using_cdf(0.25, tails=:right))
assert_equal(0.50, Statsample::Test.p_using_cdf(0.25, tails=:both))
assert_equal(1, Statsample::Test.p_using_cdf(0.50, tails=:both))
assert_equal(0.05, Statsample::Test.p_using_cdf(0.025, tails=:both))
assert_in_delta(0.05, Statsample::Test.p_using_cdf(0.975, tails=:both),0.0001)
end
# Array#recode_repeated suffixes duplicated entries with _1, _2, ...
def test_recode_repeated
a=%w{a b c c d d d e}
exp=["a","b","c_1","c_2","d_1","d_2","d_3","e"]
assert_equal(exp,a.recode_repeated)
end
# String#is_number? is a statsample core extension accepting integers,
# decimals and scientific notation.
def test_is_number
assert("10".is_number?)
assert("-10".is_number?)
assert("0.1".is_number?)
assert("-0.1".is_number?)
assert("10e3".is_number?)
assert("10e-3".is_number?)
assert(!"1212-1212-1".is_number?)
assert(!"a10".is_number?)
assert(!"".is_number?)
end
def test_estimation_mean
v=([42]*23+[41]*4+[36]*1+[32]*1+[29]*1+[27]*2+[23]*1+[19]*1+[16]*2+[15]*2+[14,11,10,9,7]+ [6]*3+[5]*2+[4,3]).to_vector(:scale)
assert_equal(50,v.size)
assert_equal(1471,v.sum())
#limits=Statsample::SRS.mean_confidence_interval_z(v.mean(), v.sds(), v.size,676,0.80)
end
def test_estimation_proportion
# total
pop=3042
sam=200
prop=0.19
assert_in_delta(81.8, Statsample::SRS.proportion_total_sd_ep_wor(prop, sam, pop), 0.1)
# confidence limits
pop=500
sam=100
prop=0.37
a=0.95
l= Statsample::SRS.proportion_confidence_interval_z(prop, sam, pop, a)
assert_in_delta(0.28,l[0],0.01)
assert_in_delta(0.46,l[1],0.01)
end
# NOTE(review): this test is entirely commented out and asserts nothing;
# either implement it or delete it.
def test_ml
if(true)
#real=[1,1,1,1].to_vector(:scale)
#pred=[0.0001,0.0001,0.0001,0.0001].to_vector(:scale)
# puts Statsample::Bivariate.maximum_likehood_dichotomic(pred,real)
end
end
def test_simple_linear_regression
a=[1,2,3,4,5,6].to_vector(:scale)
b=[6,2,4,10,12,8].to_vector(:scale)
reg = Statsample::Regression::Simple.new_from_vectors(a,b)
# Sum-of-squares identity: SSR + SSE == SST.
assert_in_delta((reg.ssr+reg.sse).to_f,reg.sst,0.001)
assert_in_delta(Statsample::Bivariate.pearson(a,b),reg.r,0.001)
assert_in_delta(2.4,reg.a,0.01)
assert_in_delta(1.314,reg.b,0.001)
assert_in_delta(0.657,reg.r,0.001)
assert_in_delta(0.432,reg.r2,0.001)
end
end
================================================
FILE: test/test_stest.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for chi-square, Mann-Whitney U and Levene's homogeneity test.
# NOTE(review): assert_nothing_raised is Test::Unit API, not MiniTest —
# presumably supplied by helpers_tests.rb or a compat shim; confirm, or
# drop the wrapper (an unexpected exception fails the test anyway).
class StatsampleTestTestCase < MiniTest::Unit::TestCase
def test_chi_square_matrix_with_expected
real=Matrix[[95,95],[45,155]]
expected=Matrix[[68,122],[72,128]]
assert_nothing_raised do
Statsample::Test.chi_square(real,expected)
end
chi=Statsample::Test.chi_square(real,expected).chi_square
assert_in_delta(32.53,chi,0.1)
end
# With only an observed matrix, expected counts come from the marginals.
def test_chi_square_matrix_only_observed
observed=Matrix[[20,30,40],[30,40,50],[60,70,80],[10,20,40]]
assert_nothing_raised do
Statsample::Test.chi_square(observed)
end
chi=Statsample::Test.chi_square(observed)
assert_in_delta(9.5602, chi.chi_square, 0.0001)
assert_in_delta(0.1444, chi.probability, 0.0001)
assert_equal(6, chi.df)
end
def test_u_mannwhitney
a=[1,2,3,4,5,6].to_scale
b=[0,5,7,9,10,11].to_scale
# U is symmetric in the two samples.
assert_equal(7.5, Statsample::Test.u_mannwhitney(a,b).u)
assert_equal(7.5, Statsample::Test.u_mannwhitney(b,a).u)
a=[1, 7,8,9,10,11].to_scale
b=[2,3,4,5,6,12].to_scale
assert_equal(11, Statsample::Test.u_mannwhitney(a,b).u)
end
def test_levene
a=[1,2,3,4,5,6,7,8,100,10].to_scale
b=[30,40,50,60,70,80,90,100,110,120].to_scale
levene=Statsample::Test::Levene.new([a,b])
assert_levene(levene)
end
# Levene must accept a Dataset as well as an Array of vectors.
def test_levene_dataset
a=[1,2,3,4,5,6,7,8,100,10].to_scale
b=[30,40,50,60,70,80,90,100,110,120].to_scale
ds={'a'=>a,'b'=>b}.to_dataset
levene=Statsample::Test::Levene.new(ds)
assert_levene(levene)
end
# Shared assertions for both Levene constructions above.
def assert_levene(levene)
assert_in_delta(0.778, levene.f, 0.001)
assert_in_delta(0.389, levene.probability, 0.001)
end
end
================================================
FILE: test/test_stratified.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleStratifiedTestCase < MiniTest::Unit::TestCase
  def initialize(*args)
    super
  end

  # The stratified mean of two strata must equal the plain mean of the
  # pooled population when every element is sampled.
  def test_mean
    stratum_a = [10, 20, 30, 40, 50]
    stratum_b = [110, 120, 130, 140]
    pooled = (stratum_a + stratum_b).to_vector(:scale)
    va = stratum_a.to_vector(:scale)
    vb = stratum_b.to_vector(:scale)
    assert_equal(pooled.mean, Statsample::StratifiedSample.mean(va, vb))
  end
end
================================================
FILE: test/test_test_f.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleTestFTestCase < MiniTest::Unit::TestCase
  context(Statsample::Test::F) do
    setup do
      # One-way ANOVA style inputs: between/within sums of squares.
      @ssb = 84
      @ssw = 68
      @df_num = 2
      @df_den = 15
      msb = @ssb.quo(@df_num)
      msw = @ssw.quo(@df_den)
      @f = Statsample::Test::F.new(msb, msw, @df_num, @df_den)
    end
    should "have #f equal to msb/msw" do
      expected = (@ssb.quo(@df_num)).quo(@ssw.quo(@df_den))
      assert_equal(expected, @f.f)
    end
    should "have df total equal to df_num+df_den" do
      assert_equal(@df_num + @df_den, @f.df_total)
    end
    should "have probability near 0.002" do
      assert_in_delta(0.002, @f.probability, 0.0005)
    end
    should "be coerced into float" do
      assert_equal(@f.to_f, @f.f)
    end
    context("method summary") do
      setup do
        @summary = @f.summary
      end
      should "have size > 0" do
        assert(@summary.size > 0)
      end
    end
  end
end
================================================
FILE: test/test_test_kolmogorovsmirnov.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleTestKolmogorovSmirnovTestCase < MiniTest::Unit::TestCase
  context(Statsample::Test::KolmogorovSmirnov) do
    should "calculate correctly D for two given samples" do
      first = [1.1, 2.5, 5.6, 9]
      second = [1, 2.3, 5.8, 10]
      ks = Statsample::Test::KolmogorovSmirnov.new(first, second)
      assert_equal(0.25, ks.d)
    end
    should "calculate correctly D for a normal sample and Normal Distribution" do
      # Fixed N(0,1)-like sample; D computed against the theoretical CDF.
      sample = [0.30022510, -0.36664035, 0.08593404, 1.29881130, -0.49878633,
                -0.63056010, 0.28397638, -0.04913700, 0.03566644, -1.33414346]
      ks = Statsample::Test::KolmogorovSmirnov.new(sample, Distribution::Normal)
      assert_in_delta(0.282, ks.d, 0.001)
    end
    should "calculate correctly D for a variable normal and Normal Distribution" do
      # A fresh 100-point normal sample should stay well below D = 0.15.
      rng = Distribution::Normal.rng
      sample = 100.times.map { rng.call }
      ks = Statsample::Test::KolmogorovSmirnov.new(sample, Distribution::Normal)
      assert(ks.d < 0.15)
    end
    context(Statsample::Test::KolmogorovSmirnov::EmpiricDistribution) do
      should "Create a correct empirical distribution for an array" do
        # Order must not matter: the ECDF of 1..10 in reverse.
        ed = Statsample::Test::KolmogorovSmirnov::EmpiricDistribution.new([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
        assert_equal(0, ed.cdf(-2))
        assert_equal(0.5, ed.cdf(5))
        assert_equal(0.5, ed.cdf(5.5))
        assert_equal(0.9, ed.cdf(9))
        assert_equal(1, ed.cdf(11))
      end
    end
  end
end
================================================
FILE: test/test_test_t.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Test::T: one-sample and two-sample (equal and
# unequal variance) t statistics, degrees of freedom and probabilities.
class StatsampleTestTTestCase < MiniTest::Unit::TestCase
include Statsample::Test
include Math
context T do
setup do
# Two fixed samples plus their summary statistics, reused below.
@a=[30.02, 29.99, 30.11, 29.97, 30.01, 29.99].to_scale
@b=[29.89, 29.93, 29.72, 29.98, 30.02, 29.98].to_scale
@x1=@a.mean
@x2=@b.mean
@s1=@a.sd
@s2=@b.sd
@n1=@a.n
@n2=@b.n
end
should "calculate correctly standard t" do
# t = estimate / standard error, df = n - 1.
t=Statsample::Test::T.new(@x1, @s1.quo(Math.sqrt(@a.n)), @a.n-1)
assert_equal((@x1).quo(@s1.quo(Math.sqrt(@a.n))), t.t)
assert_equal(@a.n-1, t.df)
assert(t.summary.size>0)
end
should "calculate correctly t for one sample" do
# Paired design expressed as a one-sample test on the differences.
t1=[6, 4, 6, 7, 4,5,5,12,6,1].to_scale
t2=[9, 6, 5,10,10,8,7,10,6,5].to_scale
d=t1-t2
t=Statsample::Test::T::OneSample.new(d)
assert_in_delta(-2.631, t.t, 0.001)
assert_in_delta( 0.027, t.probability, 0.001)
assert_in_delta( 0.76012, t.se, 0.0001)
assert(t.summary.size>0)
end
should "calculate correctly t for two samples" do
assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2),0.001)
assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2,true),0.001)
end
should "calculate correctly df for equal and unequal variance" do
# Welch-style df for the unequal-variance case is fractional.
assert_equal(10, T.df_equal_variance(@n1,@n2))
assert_in_delta(7.03, T.df_not_equal_variance(@s1,@s2,@n1,@n2),0.001)
end
should "calculate all values for T object" do
t=Statsample::Test.t_two_samples_independent(@a,@b)
assert(t.summary.size>0)
assert_in_delta(1.959, t.t_equal_variance,0.001)
assert_in_delta(1.959, t.t_not_equal_variance,0.001)
assert_in_delta(10, t.df_equal_variance,0.001)
assert_in_delta(7.03, t.df_not_equal_variance,0.001)
assert_in_delta(0.07856, t.probability_equal_variance,0.001)
assert_in_delta(0.09095, t.probability_not_equal_variance,0.001)
end
should "be the same using shorthand" do
v=100.times.map {rand(100)}.to_scale
assert_equal(Statsample::Test.t_one_sample(v).t, T::OneSample.new(v).t)
end
should "calculate all values for one sample T test" do
# Random hypothesized mean u near the sample mean.
u=@a.mean+(1-rand*2)
tos=T::OneSample.new(@a,{:u=>u})
assert_equal((@a.mean-u).quo(@a.sd.quo(sqrt(@a.n))), tos.t)
assert_equal(@a.n-1, tos.df)
assert(tos.summary.size>0)
end
end
end
================================================
FILE: test/test_umannwhitney.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleUMannWhitneyTestCase < MiniTest::Unit::TestCase
  include Statsample::Test
  context Statsample::Test::UMannWhitney do
    setup do
      # Two fixed, non-overlapping-rank samples with known U statistics.
      @v1 = [1, 2, 3, 4, 7, 8, 9, 10, 14, 15].to_scale
      @v2 = [5, 6, 11, 12, 13, 16, 17, 18, 19].to_scale
      @u = Statsample::Test::UMannWhitney.new(@v1, @v2)
    end
    should "have same result using class or Test#u_mannwhitney" do
      shorthand = Statsample::Test.u_mannwhitney(@v1, @v2)
      assert_equal(shorthand.u, @u.u)
    end
    should "have correct U values" do
      assert_equal(73, @u.r1)
      assert_equal(117, @u.r2)
      assert_equal(18, @u.u)
    end
    should "have correct value for z" do
      assert_in_delta(-2.205, @u.z, 0.001)
    end
    should "have correct value for z and exact probability" do
      assert_in_delta(0.027, @u.probability_z, 0.001)
      assert_in_delta(0.028, @u.probability_exact, 0.001)
    end
  end
end
================================================
FILE: test/test_vector.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleTestVector < MiniTest::Unit::TestCase
include Statsample::Shorthand
# 18-element nominal vector; -99 is declared below as a user-missing value.
def setup
@c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
@c.name="Test Vector"
@c.missing_values=[-99]
end
# Shared assertion for split_by_separator results: for the fixture
# ["a","a,b","c,d","a,d",10,nil], each token's vector counts occurrences
# per row (nil row stays nil).
def assert_counting_tokens(b)
  expectations = {
    'a' => [1, 1, 0, 1, 0, nil],
    'b' => [0, 1, 0, 0, 0, nil],
    'c' => [0, 0, 1, 0, 0, nil],
    'd' => [0, 0, 1, 1, 0, nil],
    10  => [0, 0, 0, 0, 1, nil]
  }
  expectations.each do |token, counts|
    assert_equal(counts, b[token].to_a)
  end
end
# Tests Statsample.only_valid / only_valid_clone: pairwise deletion of
# cases where either vector is nil.
# NOTE(review): assert_not_same is Test::Unit API (minitest's spelling is
# refute_same) — presumably provided by helpers_tests.rb; confirm.
context Statsample do
setup do
@sample=100
@a=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
@b=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
# Hand-built expected result: keep only indexes valid in BOTH vectors.
@correct_a=Array.new
@correct_b=Array.new
@a.each_with_index do |v,i|
if !@a[i].nil? and !@b[i].nil?
@correct_a.push(@a[i])
@correct_b.push(@b[i])
end
end
@correct_a=@correct_a.to_scale
@correct_b=@correct_b.to_scale
@common=lambda do |av,bv|
assert_equal(@correct_a, av, "A no es esperado")
assert_equal(@correct_b, bv, "B no es esperado")
assert(!av.has_missing_data?, "A tiene datos faltantes")
assert(!bv.has_missing_data?, "b tiene datos faltantes")
end
end
# only_valid returns new vectors even when input is already clean.
should "return correct only_valid" do
av,bv=Statsample.only_valid @a,@b
av2,bv2=Statsample.only_valid av,bv
@common.call(av,bv)
assert_equal(av,av2)
assert_not_same(av,av2)
assert_not_same(bv,bv2)
end
# only_valid_clone returns the SAME objects when input is already clean.
should "return correct only_valid_clone" do
av,bv=Statsample.only_valid_clone @a,@b
@common.call(av,bv)
av2,bv2=Statsample.only_valid_clone av,bv
assert_equal(av,av2)
assert_same(av,av2)
assert_same(bv,bv2)
end
end
context Statsample::Vector do
# Same fixture as the class-level setup: nominal vector with -99 missing.
setup do
@c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
@c.name="Test Vector"
@c.missing_values=[-99]
end
# should_with_gsl: helper macro (from helpers_tests.rb, presumably) that
# skips the test when the GSL bindings are unavailable — confirm.
should_with_gsl "be created with GSL::Vector" do
gsl=GSL::Vector[1,2,3,4,5]
v=Statsample::Vector.new(gsl)
assert_equal([1,2,3,4,5], v.to_a)
refute(v.flawed?)
end
context "using matrix operations" do
  setup do
    @a = [1, 2, 3, 4, 5].to_scale
  end
  should "to_matrix returns a matrix with 1 row" do
    assert_equal(Matrix[[1, 2, 3, 4, 5]], @a.to_matrix)
  end
  should "to_matrix(:vertical) returns a matrix with 1 column" do
    assert_equal(Matrix.columns([[1, 2, 3, 4, 5]]), @a.to_matrix(:vertical))
  end
  should "returns valid submatrixes" do
    # Row vector times column vector: 3*4 + 2*5 = 22.
    row = [3, 2].to_vector(:scale)
    col = [4, 5].to_vector(:scale)
    assert_equal(22, (row.to_matrix * col.to_matrix(:vertical))[0, 0])
  end
end
# Vector construction equivalences: Vector[], #to_vector, #to_scale and
# the new_scale factory must all agree.
context "when initializing" do
setup do
@data=(10.times.map{rand(100)})+[nil]
@original=Statsample::Vector.new(@data, :scale)
end
should "be the sample using []" do
second=Statsample::Vector[*@data]
assert_equal(@original, second)
end
# Vector[] flattens nested arrays, expands ranges and splices vectors,
# mirroring R's c() constructor.
should "[] returns same results as R-c()" do
reference=[0,4,5,6,10].to_scale
assert_equal(reference, Statsample::Vector[0,4,5,6,10])
assert_equal(reference, Statsample::Vector[0,4..6,10])
assert_equal(reference, Statsample::Vector[[0],[4,5,6],[10]])
assert_equal(reference, Statsample::Vector[[0],[4,[5,[6]]],[10]])
assert_equal(reference, Statsample::Vector[[0],[4,5,6].to_vector,[10]])
end
should "be the same usign #to_vector" do
lazy1=@data.to_vector(:scale)
assert_equal(@original,lazy1)
end
should "be the same using #to_scale" do
lazy2=@data.to_scale
assert_equal(@original,lazy2)
assert_equal(:scale,lazy2.type)
assert_equal(@data.find_all{|v| !v.nil?},lazy2.valid_data)
end
# new_scale(n) fills with nils ...
should "could use new_scale with size only" do
v1=10.times.map {nil}.to_scale
v2=Statsample::Vector.new_scale(10)
assert_equal(v1,v2)
end
# ... new_scale(n, value) with a constant ...
should "could use new_scale with size and value" do
a=rand
v1=10.times.map {a}.to_scale
v2=Statsample::Vector.new_scale(10,a)
assert_equal(v1,v2)
end
# ... and new_scale(n) { |i| ... } with an index-based block.
should "could use new_scale with func" do
v1=10.times.map {|i| i*2}.to_scale
v2=Statsample::Vector.new_scale(10) {|i| i*2}
assert_equal(v1,v2)
end
end
# split_by_separator turns a multi-response nominal vector ("a,b") into
# one 0/1 count vector per token; nil rows stay nil.
context "#split_by_separator" do
setup do
@a = Statsample::Vector.new(["a","a,b","c,d","a,d",10,nil],:nominal)
@b=@a.split_by_separator(",")
end
should "returns a Hash" do
assert_kind_of(Hash, @b)
end
should "return a Hash with keys with different values of @a" do
expected=['a','b','c','d',10]
assert_equal(expected, @b.keys)
end
should "returns a Hash, which values are Statsample::Vector" do
@b.each_key {|k| assert_instance_of(Statsample::Vector, @b[k])}
end
should "hash values are n times the tokens appears" do
assert_counting_tokens(@b)
end
should "#split_by_separator_freq returns the number of ocurrences of tokens" do
assert_equal({'a'=>3,'b'=>1,'c'=>1,'d'=>2,10=>1}, @a.split_by_separator_freq())
end
# The separator character is configurable.
should "using a different separator give the same values" do
a = Statsample::Vector.new(["a","a*b","c*d","a*d",10,nil],:nominal)
b=a.split_by_separator("*")
assert_counting_tokens(b)
end
end
should "return correct median_absolute_deviation" do
a=[1, 1, 2, 2, 4, 6, 9].to_scale
assert_equal(1, a.median_absolute_deviation)
end
should "return correct histogram" do
a=10.times.map {|v| v}.to_scale
hist=a.histogram(2)
assert_equal([5,5], hist.bin)
3.times do |i|
assert_in_delta(i*4.5, hist.get_range(i)[0], 1e-9)
end
end
should "have a name" do
@c.name=="Test Vector"
end
should "without explicit name, returns vector with succesive numbers" do
a=10.times.map{rand(100)}.to_scale
b=10.times.map{rand(100)}.to_scale
assert_match(/Vector \d+/, a.name)
a.name=~/Vector (\d+)/
next_number=$1.to_i+1
assert_equal("Vector #{next_number}",b.name)
end
should "save to a file and load the same Vector" do
outfile=Tempfile.new("vector.vec")
@c.save(outfile.path)
a=Statsample.load(outfile.path)
assert_equal(@c,a)
end
should "#collect returns an array" do
val=@c.collect {|v| v}
assert_equal(val,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99])
end
should "#recode returns a recoded array" do
a=@c.recode{|v| @c.is_valid?(v) ? 0 : 1 }
exp=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1].to_vector
assert_equal(exp,a)
exp.recode!{|v| v==0 ? 1:0}
exp2=(([1]*15)+([0]*3)).to_vector
assert_equal(exp2,exp)
end
should "#product returns the * of all values" do
a=[1,2,3,4,5].to_vector(:scale)
assert_equal(120,a.product)
end
should "missing values" do
@c.missing_values=[10]
assert_equal([-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9], @c.valid_data.sort)
assert_equal([5,5,5,5,5,6,6,7,8,9,nil,1,2,3,4,nil,-99,-99], @c.data_with_nils)
@c.missing_values=[-99]
assert_equal(@c.valid_data.sort,[1,2,3,4,5,5,5,5,5,6,6,7,8,9,10])
assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,nil,nil])
@c.missing_values=[]
assert_equal(@c.valid_data.sort,[-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9,10])
assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99])
end
should "correct has_missing_data? with missing data" do
a=[1,2,3,nil].to_vector
assert(a.has_missing_data?)
end
should "correct has_missing_data? without missing data" do
a=[1,2,3,4,10].to_vector
assert(!a.has_missing_data?)
end
should "with explicit missing_values, should respond has_missing_data?" do
a=[1,2,3,4,10].to_vector
a.missing_values=[10]
assert(a.has_missing_data?)
end
should "label correctly fields" do
@c.labels={5=>'FIVE'}
assert_equal(["FIVE","FIVE","FIVE","FIVE","FIVE",6,6,7,8,9,10,1,2,3,4,nil,-99, -99],@c.vector_labeled.to_a)
end
should "verify" do
h=@c.verify{|d| !d.nil? and d>0}
e={15=>nil,16=>-99,17=>-99}
assert_equal(e,h)
end
should "have a summary with name on it" do
assert_match(/#{@c.name}/, @c.summary)
end
should "GSL::Vector based should push correcty" do
if Statsample.has_gsl?
v=GSL::Vector[1,2,3,4,5].to_scale
v.push(nil)
assert_equal([1,2,3,4,5,nil], v.to_a)
assert(v.flawed?)
else
skip("Requires GSL")
end
end
should "split correctly" do
a = Statsample::Vector.new(["a","a,b","c,d","a,d","d",10,nil],:nominal)
assert_equal([%w{a},%w{a b},%w{c d},%w{a d},%w{d},[10],nil], a.splitted)
end
should "multiply correct for scalar" do
a = [1,2,3].to_scale
assert_equal([5,10,15].to_scale, a*5)
end
should "multiply correct with other vector" do
a = [1,2,3].to_scale
b = [2,4,6].to_scale
assert_equal([2,8,18].to_scale, a*b)
end
should "sum correct for scalar" do
a = [1,2,3].to_scale
assert_equal([11,12,13].to_scale, a+10)
end
should "raise NoMethodError when method requires ordinal and vector is nominal" do
@c.type=:nominal
assert_raise(::NoMethodError) { @c.median }
end
should "raise NoMethodError when method requires scalar and vector is ordinal" do
@c.type=:ordinal
assert_raise(::NoMethodError) { @c.mean }
end
should "jacknife correctly with named method" do
# First example
a=[1,2,3,4].to_scale
ds=a.jacknife(:mean)
assert_equal(a.mean, ds[:mean].mean)
ds=a.jacknife([:mean,:sd])
assert_equal(a.mean, ds[:mean].mean)
assert_equal(a.sd, ds[:mean].sd)
end
should "jacknife correctly with custom method" do
# Second example
a=[17.23, 18.71,13.93,18.81,15.78,11.29,14.91,13.39, 18.21, 11.57, 14.28, 10.94, 18.83, 15.52,13.45,15.25].to_scale
ds=a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance) })
exp=[1.605, 2.972, 1.151, 3.097, 0.998, 3.308, 0.942, 1.393, 2.416, 2.951, 1.043, 3.806, 3.122, 0.958, 1.362, 0.937].to_scale
assert_similar_vector(exp, ds[:log_s2], 0.001)
assert_in_delta(2.00389, ds[:log_s2].mean, 0.00001)
assert_in_delta(1.091, ds[:log_s2].variance, 0.001)
end
should "jacknife correctly with k>1" do
a=rnorm(6)
ds=a.jacknife(:mean,2)
mean=a.mean
exp=[3*mean-2*(a[2]+a[3]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[2]+a[3]) / 4].to_scale
assert_similar_vector(exp, ds[:mean], 1e-13)
end
should "bootstrap should return a vector with mean=mu and sd=se" do
a=rnorm(100)
ds=a.bootstrap([:mean,:sd],200)
se=1/Math.sqrt(a.size)
assert_in_delta(0, ds[:mean].mean, 0.3)
assert_in_delta(se, ds[:mean].sd, 0.02)
end
end
# Frequency/proportion accessors on the shared fixture @c (15 valid cases
# per these assertions; mode is 5 with five occurrences).
def test_nominal
assert_equal(@c[1],5)
assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c.frequencies)
assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c._frequencies)
assert_equal({ 1 => 1.quo(15) ,2=>1.quo(15), 3=>1.quo(15),4=>1.quo(15),5=>5.quo(15),6=>2.quo(15),7=>1.quo(15), 8=>1.quo(15), 9=>1.quo(15),10=>1.quo(15)}, @c.proportions)
assert_equal(@c.proportion, 1.quo(15))
assert_equal(@c.proportion(2), 1.quo(15))
assert_equal([1,2,3,4,5,6,7,8,9,10], @c.factors.sort)
assert_equal(@c.mode,5)
assert_equal(@c.n_valid,15)
end
# Equality requires same class, same type and same data; a plain Array or
# an arbitrary Object never compares equal.
def test_equality
v1=[1,2,3].to_vector
v2=[1,2,3].to_vector
assert_equal(v1,v2)
v1=[1,2,3].to_vector(:nominal)
v2=[1,2,3].to_vector(:ordinal)
assert_not_equal(v1,v2)
v2=[1,2,3]
assert_not_equal(v1,v2)
v1=[1,2,3].to_vector()
v2=[1,2,3].to_vector()
assert_equal(v1,v2)
assert_equal(false, v1 == Object.new)
end
# vector_percentil maps each value to its percentile rank (ties share the
# midrank-based percentile); nils keep their positions.
def test_vector_percentil
a=[1,2,2,3,4,5,5,5,6,10].to_scale
expected=[10,25,25,40,50,70,70,70,90,100].to_scale
assert_equal(expected, a.vector_percentil)
a=[1,nil,nil,2,2,3,4,nil,nil,5,5,5,6,10].to_scale
expected=[10,nil,nil,25,25,40,50,nil,nil,70,70,70,90,100].to_scale
assert_equal(expected, a.vector_percentil)
end
# Median and percentiles with the default (midpoint) strategy; requires an
# ordinal-or-better vector type.
def test_ordinal
@c.type=:ordinal
assert_equal(5,@c.median)
assert_equal(4,@c.percentil(25))
assert_equal(7,@c.percentil(75))
v=[200000, 200000, 210000, 220000, 230000, 250000, 250000, 250000, 270000, 300000, 450000, 130000, 140000, 140000, 140000, 145000, 148000, 165000, 170000, 180000, 180000, 180000, 180000, 180000, 180000 ].to_scale
assert_equal(180000,v.median)
a=[7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 12.0, 12.0, 13.0, 14.0, 14.0, 2.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0].to_scale
assert_equal(4.5, a.percentil(25))
assert_equal(6.5, a.percentil(50))
assert_equal(9.5, a.percentil(75))
assert_equal(3.0, a.percentil(10))
end
# :linear interpolates between adjacent order statistics; inputs are
# shuffled to prove the result is order-independent.
def test_linear_percentil_strategy
values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116].shuffle.to_scale
assert_equal 102, values.percentil(0, :linear)
assert_equal 104.75, values.percentil(25, :linear)
assert_equal 108.5, values.percentil(50, :linear)
assert_equal 112.75, values.percentil(75, :linear)
assert_equal 116, values.percentil(100, :linear)
values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116, 118].shuffle.to_scale
assert_equal 102, values.percentil(0, :linear)
assert_equal 105, values.percentil(25, :linear)
assert_equal 109, values.percentil(50, :linear)
assert_equal 115, values.percentil(75, :linear)
assert_equal 118, values.percentil(100, :linear)
end
# #ranked averages ranks for ties (midranks); nils keep their positions.
def test_ranked
v1=[0.8,1.2,1.2,2.3,18].to_vector(:ordinal)
expected=[1,2.5,2.5,4,5].to_vector(:ordinal)
assert_equal(expected,v1.ranked)
v1=[nil,0.8,1.2,1.2,2.3,18,nil].to_vector(:ordinal)
expected=[nil,1,2.5,2.5,4,5,nil].to_vector(:ordinal)
assert_equal(expected,v1.ranked)
end
# Non-numeric entries are coerced to 0 in a :scale vector, per these
# assertions (sum 10, factors include 0 for "STRING").
def test_scale
a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale)
assert_equal(10, a.sum)
i=0
factors=a.factors.sort
[0,1,2,3,4].each{|v|
assert(v==factors[i])
assert(v.class==factors[i].class,"#{v} - #{v.class} != #{factors[i]} - #{factors[i].class}")
i+=1
}
end
# #centered subtracts the mean: shifting a centered vector by +mean and
# centering again must recover the original (within float tolerance).
def test_vector_centered
mean=rand()
samples=11
centered=samples.times.map {|i| i-((samples/2).floor).to_i}.to_scale
not_centered=centered.recode {|v| v+mean}
obs=not_centered.centered
centered.each_with_index do |v,i|
assert_in_delta(v,obs[i],0.0001)
end
end
# Standardization is (x - mean)/sd, preserving nils; the result has mean 0
# and sample sd 1.
def test_vector_standarized
v1=[1,2,3,4,nil].to_vector(:scale)
sds=v1.sds
expected=[((1-2.5).quo(sds)),((2-2.5).quo(sds)),((3-2.5).quo(sds)),((4-2.5).quo(sds)), nil].to_vector(:scale)
vs=v1.vector_standarized
assert_equal(expected, vs)
assert_equal(0,vs.mean)
assert_equal(1,vs.sds)
end
# Zero variance makes standardization undefined, so every value is nil.
def test_vector_standarized_with_zero_variance
v1=100.times.map {|i| 1}.to_scale
exp=100.times.map {nil}.to_scale
assert_equal(exp,v1.standarized)
end
# check_type succeeds when the vector's type is at least as informative as
# the requested one (scale > ordinal > nominal); :date is incompatible with
# all three. Failure raises NoMethodError.
def test_check_type
v=Statsample::Vector.new
v.type=:nominal
assert_raise(NoMethodError) { v.check_type(:scale)}
assert_raise(NoMethodError) { v.check_type(:ordinal)}
assert(v.check_type(:nominal).nil?)
v.type=:ordinal
assert_raise(NoMethodError) { v.check_type(:scale)}
assert(v.check_type(:ordinal).nil?)
assert(v.check_type(:nominal).nil?)
v.type=:scale
assert(v.check_type(:scale).nil?)
assert(v.check_type(:ordinal).nil?)
assert(v.check_type(:nominal).nil?)
v.type=:date
assert_raise(NoMethodError) { v.check_type(:scale)}
assert_raise(NoMethodError) { v.check_type(:ordinal)}
assert_raise(NoMethodError) { v.check_type(:nominal)}
end
# Vector + scalar / Vector / Array. nil on either side propagates to nil;
# size mismatch raises ArgumentError; a non-numeric operand raises TypeError.
def test_add
a=Statsample::Vector.new([1,2,3,4,5], :scale)
b=Statsample::Vector.new([11,12,13,14,15], :scale)
assert_equal([3,4,5,6,7], (a+2).to_a)
assert_equal([12,14,16,18,20], (a+b).to_a)
assert_raise ArgumentError do
a + @c
end
assert_raise TypeError do
a+"string"
end
a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale)
b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale)
assert_equal([nil,13,nil,16,18,20], (a+b).to_a)
assert_equal([nil,13,nil,16,18,20], (a+b.to_a).to_a)
end
# Mirror of test_add for subtraction.
def test_minus
a=Statsample::Vector.new([1,2,3,4,5], :scale)
b=Statsample::Vector.new([11,12,13,14,15], :scale)
assert_equal([-1,0,1,2,3], (a-2).to_a)
assert_equal([10,10,10,10,10], (b-a).to_a)
assert_raise ArgumentError do
a-@c
end
assert_raise TypeError do
a-"string"
end
a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale)
b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale)
assert_equal([nil,11,nil,10,10,10], (b-a).to_a)
assert_equal([nil,11,nil,10,10,10], (b-a.to_a).to_a)
end
# Sum of squared deviations from the mean: 1..6, mean 3.5 => 17.5.
def test_sum_of_squares
a=[1,2,3,4,5,6].to_vector(:scale)
assert_equal(17.5, a.sum_of_squared_deviation)
end
# Mean absolute deviation (population): 1..9, mean 5 => 20/9.
def test_average_deviation
a=[1,2,3,4,5,6,7,8,9].to_scale
assert_equal(20.quo(9), a.average_deviation_population)
end
# srand pins the RNG. Sampling all 15 valid cases without replacement must
# return exactly the valid data; asking for more than available raises.
def test_samples
srand(1)
assert_equal(100,@c.sample_with_replacement(100).size)
assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort)
assert_raise ArgumentError do
@c.sample_without_replacement(20)
end
@c.type=:scale
srand(1)
assert_equal(100, @c.sample_with_replacement(100).size)
assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort)
end
# add(value, false) defers recomputation of valid_data until
# set_valid_data is called explicitly.
def test_valid_data
a=Statsample::Vector.new([1,2,3,4,"STRING"])
a.missing_values=[-99]
a.add(1,false)
a.add(2,false)
a.add(-99,false)
a.set_valid_data
exp_valid_data=[1,2,3,4,"STRING",1,2]
assert_equal(exp_valid_data,a.valid_data)
a.add(20,false)
a.add(30,false)
assert_equal(exp_valid_data,a.valid_data)
a.set_valid_data
exp_valid_data_2=[1,2,3,4,"STRING",1,2,20,30]
assert_equal(exp_valid_data_2,a.valid_data)
end
# []= writes through to the underlying data array.
def test_set_value
@c[2]=10
expected=[5,5,10,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99].to_vector
assert_equal(expected.data,@c.data)
end
# GSL-backed computations must agree with the pure-Ruby implementations
# (suffixed _ruby). Skipped silently when GSL is not installed.
def test_gsl
if Statsample.has_gsl?
a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale)
assert_equal(2,a.mean)
assert_equal(a.variance_sample_ruby,a.variance_sample)
assert_equal(a.standard_deviation_sample_ruby,a.sds)
assert_equal(a.variance_population_ruby,a.variance_population)
assert_equal(a.standard_deviation_population_ruby,a.standard_deviation_population)
assert_nothing_raised do
a=[].to_vector(:scale)
end
a.add(1,false)
a.add(2,false)
a.set_valid_data
assert_equal(3,a.sum)
b=[1,2,nil,3,4,5,nil,6].to_vector(:scale)
assert_equal(21, b.sum)
assert_equal(3.5, b.mean)
assert_equal(6,b.gsl.size)
c=[10,20,30,40,50,100,1000,2000,5000].to_scale
assert_in_delta(c.skew, c.skew_ruby ,0.0001)
assert_in_delta(c.kurtosis, c.kurtosis_ruby ,0.0001)
end
end
# vector_cols_matrix builds a cases x vectors Matrix from parallel vectors.
def test_vector_matrix
v1=%w{a a a b b b c c}.to_vector
v2=%w{1 3 4 5 6 4 3 2}.to_vector
v3=%w{1 0 0 0 1 1 1 0}.to_vector
ex=Matrix.rows([["a", "1", "1"], ["a", "3", "0"], ["a", "4", "0"], ["b", "5", "0"], ["b", "6", "1"], ["b", "4", "1"], ["c", "3", "1"], ["c", "2", "0"]])
assert_equal(ex,Statsample.vector_cols_matrix(v1,v2,v3))
end
# Vectors must survive a Marshal dump/load round trip intact.
def test_marshalling
v1=(0..100).to_a.collect{|n| rand(100)}.to_vector(:scale)
v2=Marshal.load(Marshal.dump(v1))
assert_equal(v1,v2)
end
# #dup deep-copies data, missing_values and labels (equal but not the same
# objects); #dup_empty copies metadata but starts with empty data.
def test_dup
v1=%w{a a a b b b c c}.to_vector
v2=v1.dup
assert_equal(v1.data,v2.data)
assert_not_same(v1.data,v2.data)
assert_equal(v1.type,v2.type)
v1.type=:ordinal
assert_not_equal(v1.type,v2.type)
assert_equal(v1.missing_values,v2.missing_values)
assert_not_same(v1.missing_values,v2.missing_values)
assert_equal(v1.labels,v2.labels)
assert_not_same(v1.labels,v2.labels)
v3=v1.dup_empty
assert_equal([],v3.data)
assert_not_equal(v1.data,v3.data)
assert_not_same(v1.data,v3.data)
assert_equal(v1.type,v3.type)
v1.type=:ordinal
v3.type=:nominal
assert_not_equal(v1.type,v3.type)
assert_equal(v1.missing_values,v3.missing_values)
assert_not_same(v1.missing_values,v3.missing_values)
assert_equal(v1.labels,v3.labels)
assert_not_same(v1.labels,v3.labels)
end
# Tied values share midranks when ranked.
def test_paired_ties
a=[0,0,0,1,1,2,3,3,4,4,4].to_vector(:ordinal)
expected=[2,2,2,4.5,4.5,6,7.5,7.5,10,10,10].to_vector(:ordinal)
assert_equal(expected,a.ranked)
end
# #dichotomize: per these cases, values above the cut become 1, values at
# or below it 0; the default cut appears to be the lowest factor. nils pass
# through.
def test_dichotomize
a= [0,0,0,1,2,3,nil].to_vector
exp=[0,0,0,1,1,1,nil].to_scale
assert_equal(exp,a.dichotomize)
a= [1,1,1,2,2,2,3].to_vector
exp=[0,0,0,1,1,1,1].to_scale
assert_equal(exp,a.dichotomize)
a= [0,0,0,1,2,3,nil].to_vector
exp=[0,0,0,0,1,1,nil].to_scale
assert_equal(exp,a.dichotomize(1))
a= %w{a a a b c d}.to_vector
exp=[0,0,0,1,1,1].to_scale
assert_equal(exp, a.dichotomize)
end
# can_be_scale?/can_be_date? test convertibility; explicit missing_values
# exclude offending entries from the check.
def test_can_be_methods
a= [0,0,0,1,2,3,nil].to_vector
assert(a.can_be_scale?)
a=[0,"s",0,1,2,3,nil].to_vector
assert(!a.can_be_scale?)
a.missing_values=["s"]
assert(a.can_be_scale?)
a=[Date.new(2009,10,10), Date.today(), "2009-10-10", "2009-1-1", nil, "NOW"].to_vector
assert(a.can_be_date?)
a=[Date.new(2009,10,10), Date.today(),nil,"sss"].to_vector
assert(!a.can_be_date?)
end
# :date vectors parse strings, map :NOW/"NOW" to Date.today and turn
# missing values into nil.
def test_date_vector
a=[Date.new(2009,10,10), :NOW, "2009-10-10", "2009-1-1", nil, "NOW","MISSING"].to_vector(:date, :missing_values=>["MISSING"])
assert(a.type==:date)
expected=[Date.new(2009,10,10), Date.today(), Date.new(2009,10,10), Date.new(2009,1,1), nil, Date.today(), nil ]
assert_equal(expected, a.date_data_with_nils)
end
end
================================================
FILE: test/test_wilcoxonsignedrank.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Test::WilcoxonSignedRank (paired-sample signed-rank
# test). NOTE(review): the class was originally named
# StatsampleUMannWhitneyTestCase -- a copy-paste from the Mann-Whitney U test
# file -- which reopened that other file's class. Renamed to match this
# file's subject; two should-descriptions referencing Test#u_mannwhitney were
# fixed the same way.
class StatsampleWilcoxonSignedRankTestCase < MiniTest::Unit::TestCase
include Statsample::Test
context Statsample::Test::WilcoxonSignedRank do
context "Example 1" do
setup do
@v1=[110,122,125,120,140,124,123,137,135,145].to_scale
@v2=[125,115,130,140,140,115,140,125,140,135].to_scale
@u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2)
end
# The module-level convenience constructor must agree with the class.
should "have same result using class or Test#wilcoxon_signed_rank" do
assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w)
end
should "have correct W values" do
assert_equal(9,@u.w)
end
# nr counts the pairs with non-zero difference.
should "have correct nr values" do
assert_equal(9,@u.nr)
end
should "have correct value for z" do
assert_in_delta(0.503,@u.z,0.001)
end
should "have correct value for probability_z" do
assert_in_delta(0.614,@u.probability_z,0.001)
end
should "have correct value for probability_exact" do
assert_in_delta(0.652,@u.probability_exact,0.001)
end
should "have summary" do
assert(@u.summary!="")
end
end
context "Example 2" do
setup do
@v2=[78,24,64,45,64,52,30,50,64,50,78,22,84,40,90,72].to_scale
@v1=[78,24,62,48,68,56,25,44,56,40,68,36,68,20,58,32].to_scale
@u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2)
end
should "have same result using class or Test#wilcoxon_signed_rank" do
assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w)
end
should "have correct W values" do
assert_equal(67,@u.w)
end
should "have correct nr values" do
assert_equal(14,@u.nr)
end
should "have correct value for z" do
assert_in_delta(2.087,@u.z,0.001)
end
should "have correct value for probability_z" do
assert_in_delta(0.036,@u.probability_z,0.001)
end
should "have correct value for probability_exact" do
assert_in_delta(0.036,@u.probability_exact,0.001)
end
should "have summary" do
assert(@u.summary!="")
end
end
end
end
================================================
FILE: test/test_xls.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Round-trip tests for the Excel (.xls) reader and writer.
class StatsampleExcelTestCase < MiniTest::Unit::TestCase
context "Excel reader" do
setup do
@ds=Statsample::Excel.read(File.dirname(__FILE__)+"/fixtures/test_xls.xls")
end
should "set the number of cases" do
assert_equal(6,@ds.cases)
end
should "set correct field names" do
assert_equal(%w{id name age city a1},@ds.fields)
end
should "set a dataset equal to expected" do
id=[1,2,3,4,5,6].to_vector(:scale)
name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal)
age=[20,23,25,nil,5.5,nil].to_vector(:scale)
city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal)
a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal)
ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1})
# Field-by-field comparison first, for a clearer failure message.
ds_exp.fields.each{|f|
assert_equal(ds_exp[f],@ds[f])
}
assert_equal(ds_exp,@ds)
end
should "set to nil empty cells" do
# assert_nil replaces assert_equal(nil, ...), which is deprecated and
# rejected outright by modern Minitest.
assert_nil(@ds['age'][5])
end
end
context "Excel writer" do
setup do
a=100.times.map{rand(100)}.to_scale
b=(["b"]*100).to_vector
@ds={'b'=>b, 'a'=>a}.to_dataset(%w{b a})
tempfile=Tempfile.new("test_write.xls")
Statsample::Excel.write(@ds,tempfile.path)
@ds2=Statsample::Excel.read(tempfile.path)
end
should "return same fields as original" do
assert_equal(@ds.fields ,@ds2.fields)
end
should "return same number of cases as original" do
assert_equal(@ds.cases, @ds2.cases)
end
should "return same cases as original" do
i=0
@ds2.each_array do |row|
assert_equal(@ds.case_as_array(i),row)
i+=1
end
end
end
end
================================================
FILE: web/Rakefile
================================================
# -*- ruby -*-
# Rake tasks for building the statsample web site and example pages.
require 'rake'
require 'fileutils'
# Rake directory task: ensures the output directory for generated pages exists.
directory "examples"
# Map an example script path to a flat page basename: strip the examples/
# directory prefix, turn path separators into underscores and drop the
# ".rb" suffix (e.g. ".../examples/foo/bar.rb" -> "foo_bar").
def get_base(f)
  examples_prefix = File.dirname(__FILE__) + "/../examples/"
  relative = f.sub(examples_prefix, "")
  relative.gsub("/", "_").gsub(".rb", "")
end
# Every example script paired with its flat basename; *_data.rb helper
# scripts are excluded. EXAMPLES_BASE holds just the basenames.
EXAMPLES = Dir.glob(File.dirname(__FILE__) + "/../examples/**/*.rb")
              .map { |path| [path, get_base(path)] }
              .reject { |pair| pair[0].include?("_data") }
EXAMPLES_BASE = EXAMPLES.map { |pair| pair[1] }
desc "Build all html, rtf and pdf files"
task :build_site do
ruby "build_site.rb"
end
# Remove every generated artifact (pdf/html/rtf pages and example images).
# The original had four copy-pasted Dir.glob/rm loops; they are collapsed
# into a single pass over a brace-pattern glob plus the images glob. The
# same files are removed, in a possibly different (irrelevant) order.
task :clean do
  base = File.dirname(__FILE__)
  generated = Dir.glob(base + "/examples/*.{pdf,html,rtf}") +
              Dir.glob(base + "/examples/images/*.*")
  generated.each do |t|
    FileUtils.rm t
  end
end
load 'upload_task.rb' if File.exists? "upload_task.rb"