Repository: clbustos/statsample Branch: master Commit: d5caf4ecf82c Files: 157 Total size: 643.6 KB Directory structure: gitextract_b74amxs6/ ├── .gitignore ├── .travis.yml ├── Gemfile ├── History.txt ├── LICENSE.txt ├── Manifest.txt ├── README.md ├── Rakefile ├── benchmarks/ │ ├── correlation_matrix_15_variables.rb │ ├── correlation_matrix_5_variables.rb │ ├── correlation_matrix_methods/ │ │ ├── correlation_matrix.ds │ │ ├── correlation_matrix.html │ │ ├── correlation_matrix.rb │ │ ├── correlation_matrix.xls │ │ ├── correlation_matrix_gsl_ruby.ods │ │ ├── correlation_matrix_with_graphics.ods │ │ └── results.ds │ ├── factor_map.rb │ └── helpers_benchmark.rb ├── data/ │ └── locale/ │ └── es/ │ └── LC_MESSAGES/ │ └── statsample.mo ├── doc_latex/ │ └── manual/ │ └── equations.tex ├── examples/ │ ├── boxplot.rb │ ├── correlation_matrix.rb │ ├── dataset.rb │ ├── dominance_analysis.rb │ ├── dominance_analysis_bootstrap.rb │ ├── histogram.rb │ ├── icc.rb │ ├── levene.rb │ ├── multiple_regression.rb │ ├── multivariate_correlation.rb │ ├── parallel_analysis.rb │ ├── polychoric.rb │ ├── principal_axis.rb │ ├── reliability.rb │ ├── scatterplot.rb │ ├── t_test.rb │ ├── tetrachoric.rb │ ├── u_test.rb │ ├── vector.rb │ └── velicer_map_test.rb ├── grab_references.rb ├── lib/ │ ├── spss.rb │ ├── statsample/ │ │ ├── analysis/ │ │ │ ├── suite.rb │ │ │ └── suitereportbuilder.rb │ │ ├── analysis.rb │ │ ├── anova/ │ │ │ ├── contrast.rb │ │ │ ├── oneway.rb │ │ │ └── twoway.rb │ │ ├── anova.rb │ │ ├── bivariate/ │ │ │ └── pearson.rb │ │ ├── bivariate.rb │ │ ├── codification.rb │ │ ├── converter/ │ │ │ ├── csv.rb │ │ │ └── spss.rb │ │ ├── converters.rb │ │ ├── crosstab.rb │ │ ├── dataset.rb │ │ ├── dominanceanalysis/ │ │ │ └── bootstrap.rb │ │ ├── dominanceanalysis.rb │ │ ├── factor/ │ │ │ ├── map.rb │ │ │ ├── parallelanalysis.rb │ │ │ ├── pca.rb │ │ │ ├── principalaxis.rb │ │ │ └── rotation.rb │ │ ├── factor.rb │ │ ├── graph/ │ │ │ ├── boxplot.rb │ │ │ ├── histogram.rb │ │ │ └── 
scatterplot.rb │ │ ├── graph.rb │ │ ├── histogram.rb │ │ ├── matrix.rb │ │ ├── multiset.rb │ │ ├── regression/ │ │ │ ├── multiple/ │ │ │ │ ├── alglibengine.rb │ │ │ │ ├── baseengine.rb │ │ │ │ ├── gslengine.rb │ │ │ │ ├── matrixengine.rb │ │ │ │ └── rubyengine.rb │ │ │ ├── multiple.rb │ │ │ └── simple.rb │ │ ├── regression.rb │ │ ├── reliability/ │ │ │ ├── icc.rb │ │ │ ├── multiscaleanalysis.rb │ │ │ ├── scaleanalysis.rb │ │ │ └── skillscaleanalysis.rb │ │ ├── reliability.rb │ │ ├── resample.rb │ │ ├── rserve_extension.rb │ │ ├── shorthand.rb │ │ ├── srs.rb │ │ ├── test/ │ │ │ ├── bartlettsphericity.rb │ │ │ ├── chisquare.rb │ │ │ ├── f.rb │ │ │ ├── kolmogorovsmirnov.rb │ │ │ ├── levene.rb │ │ │ ├── t.rb │ │ │ ├── umannwhitney.rb │ │ │ └── wilcoxonsignedrank.rb │ │ ├── test.rb │ │ ├── vector/ │ │ │ └── gsl.rb │ │ ├── vector.rb │ │ └── version.rb │ └── statsample.rb ├── po/ │ ├── es/ │ │ ├── statsample.mo │ │ └── statsample.po │ └── statsample.pot ├── references.txt ├── setup.rb ├── test/ │ ├── fixtures/ │ │ ├── correlation_matrix.rb │ │ ├── hartman_23.matrix │ │ ├── repeated_fields.csv │ │ ├── stock_data.csv │ │ ├── test_csv.csv │ │ ├── test_xls.xls │ │ ├── tetmat_matrix.txt │ │ └── tetmat_test.txt │ ├── helpers_tests.rb │ ├── test_analysis.rb │ ├── test_anova_contrast.rb │ ├── test_anovaoneway.rb │ ├── test_anovatwoway.rb │ ├── test_anovatwowaywithdataset.rb │ ├── test_anovawithvectors.rb │ ├── test_awesome_print_bug.rb │ ├── test_bartlettsphericity.rb │ ├── test_bivariate.rb │ ├── test_codification.rb │ ├── test_crosstab.rb │ ├── test_csv.rb │ ├── test_dataset.rb │ ├── test_dominance_analysis.rb │ ├── test_factor.rb │ ├── test_factor_map.rb │ ├── test_factor_pa.rb │ ├── test_ggobi.rb │ ├── test_gsl.rb │ ├── test_histogram.rb │ ├── test_matrix.rb │ ├── test_multiset.rb │ ├── test_regression.rb │ ├── test_reliability.rb │ ├── test_reliability_icc.rb │ ├── test_reliability_skillscale.rb │ ├── test_resample.rb │ ├── test_rserve_extension.rb │ ├── test_srs.rb │ ├── 
test_statistics.rb │ ├── test_stest.rb │ ├── test_stratified.rb │ ├── test_test_f.rb │ ├── test_test_kolmogorovsmirnov.rb │ ├── test_test_t.rb │ ├── test_umannwhitney.rb │ ├── test_vector.rb │ ├── test_wilcoxonsignedrank.rb │ └── test_xls.rb └── web/ └── Rakefile ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ doc.yaml *.swp *.rbc coverage *~ agregar_adsense_a_doc.rb pkg doc .yardoc examples/images/* examples/*.html web/upload_task.rb .idea ================================================ FILE: .travis.yml ================================================ language: ruby rvm: - '1.9.3' - '2.0.0' - '2.1.1' script: bundle exec rake test before_install: - sudo apt-get update -qq - sudo apt-get install -y libgsl0-dev r-base r-base-dev - sudo Rscript -e "install.packages(c('Rserve','irr'),,'http://cran.us.r-project.org')" ================================================ FILE: Gemfile ================================================ source "https://www.rubygems.org" gem 'minitest' gem 'rdoc' gem 'mocha', '0.14.0' #:require=>'mocha/setup' gem 'shoulda','3.5.0' gem 'shoulda-matchers','2.2.0' gem 'hoe' #gem 'bio-statsample-timeseries' gem 'reportbuilder' gem 'dirty-memoize' gem 'distribution' gem 'extendmatrix' gem 'minimization' gem 'rserve-client' gem 'rubyvis' gem 'spreadsheet' gem 'rb-gsl' gem 'awesome_print' ================================================ FILE: History.txt ================================================ === 1.4.0 / 2014-10-11 * Replaced README.txt for README.md * Replace File.exists? for File.exist? + New Dataset.join to join two dataset based on some fields * Deleted MLE based regression (Probit and logistic). Now all GML methods are on statsample-glm === 1.3.1 / 2014-06-26 * Example referred to a SimpleRegression class which doesn't exist. Updated to working example. 
* Merge pull request #15 from Blahah/patch-1 * Updated Gemfile * Updated README.txt for v1.3.0 * Updated to ruby 2.1.0 === 1.3.0 / 2013-09-19 * Merge remote-tracking branch 'vpereira/master' into vpereira * New Wilcoxon Signed Rank test * Remove TimeSeries class. Now is available on gem "bio-statsample-timeseries" [GSOC 2013 project :) ] * Update shoulda support * added Bundle depds * improved the csv read method (requires tests) * open svg on mac osx === 1.2.0 / 2011-12-15 * Added support for time series (TimeSeries object): MA, EMA, MACD, acf, lag and delta. [Rob Britton] * Changed summary attribute to properly display 'b' value for simple linear regression [hstove] * Merge pull request #6 from hstove/patch-1Changed summary attribute to properly display 'b' value for simple linear regression [Claudio Bustos] * fix example code for CovariateMatrix [James Kebinger] === 1.1.0 / 2011-06-02 * New Statsample::Anova::Contrast * Jacknife and bootstrap for Vector. Thanks to John Firebaugh for the idea * Improved Statsample::Analysis API * Updated CSV.read. Third argument is a Hash with options to CSV class * Added restriction on Statsample::Excel.read * Updated spanish po * Better summary for Vector * Improving summary of t related test (confidence interval and estimate output) * Replaced c for vector on Statsample::Analysis examples * Added Vector#median_absolute_deviation * First implementation of Kolmogorov Smirnov test. Returns correct D value, but without Kolmogorov distribution isn't very useful. === 1.0.1 / 2011-01-28 * Updated spanish po. * Update distribution gem dependence. On Ruby 1.8.7, distribution 0.2.0 raises an error. === 1.0.0 / 2011-01-27 * Added Statsample::Analysis, a beautiful DSL to perform fast statistical analysis using statsample. See directory /examples * Created benchmarks directory * Removed Distribution module from statsample and moved to a gem. Changes on code to reflect new API * Optimized simple regression. 
Better library detection * New 'should_with_gsl' to test methods with gsl. Refactored Factor::MAP * Almost complete GSL cleanup on Vector * Updated some doc on Vector * Used GSL::Matrix on Factor classes when available * SkillScaleAnalysis doesn't crash with one or more vectors with 0 variance * Modified examples using Statsample::Analysis * Simplified eigen calculations * Updated some examples. Added correlation matrix speed suite * Correlation matrix optimized. Better specs * Optimized correlation matrix. Use gsl matrix algebra or pairwise correlations depending on empiric calculated equations. See benchmarks/correlation_matrix.rb to see implementation of calculation * Moved tests fixtures from data to test/fixtures * Fixed some errors on tests * Bug fix: constant_se on binomial regression have an error * All test should work on ruby 1.9.3 * New Vector.[] and Vector.new_scale * Detect linearly dependent predictors on OLS. === 0.18.0 / 2011-01-07 * New Statsample.load_excel * New Statsample.load_csv * Statsample::Dataset#[] accepts an array of fields and uses clone * New Dataset#correlation_matrix and Statsample::Dataset#covariance_matrix * Statsample::Dataset.filter add labels to vectors * Principal Components generation complete on PCA (covariance matrix prefered) * Added note on Statsample::Factor::PCA about erratic signs on eigenvalues, * Statsample::Factor::PCA.component_matrix calculated different for covariance matrix * Improved summary for PCA using covariance matrix * New attribute :label_angle for Statsample::Graph::Boxplot * Fixed Scatterplots scaling problems * New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x, * New Statsample::Multiset#union allows to create a new dataset based on a m * New Statsample::Multiset#each to traverse through datasets * Bug fix: Vector#standarized and Vector#percentile crash on nil data * Bug fix: Vector#mean and Vector#sd crash on data without valid values * Modified methods names on 
Statsample::Factor::PCA : feature_vector to feature_matrix, data_transformation to principal_components * Added Statsample::Vector.vector_centered * Factor::MAP.with_dataset() implemented * Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes * Added documentation for Graph::Histogram * Added MPA to Reliability::MultiScaleAnalysis * Added custom names for returned vectors and datasets * Updated spanish traslation * Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing * Updated Histogram class, with several new methods compatibles with GSL::Histogram === 0.17.0 / 2010-12-09 * Added Statsample::Graph::Histogram and Statsample::Graph::Boxplot * Added Statsample::Reliability::SkillScaleAnalysis for analysis of skill based scales. * Delete combination and permutation clases. Backport for ruby 1.8.7 widely available * Deleted unused variables (thanks, ruby-head) === 0.16.0 / 2010-11-13 * Works on ruby 1.9.2 and HEAD. Updated Rakefile and manifest * Removed all graph based on Svg::Graph. * First operative version of Graph with Rubyvis * Corrected bug on Distribution::Normal.cdf. * Added reference on references.txt * Ruby-based random gaussian distribution generator when gsl not available * Added population average deviation [Al Chou] === 0.15.1 / 2010-10-20 * Statsample::Excel and Statsample::PlainText add name to vectors equal to field name * Statsample::Dataset.delete_vector accept multiple fields. * Statsample::Dataset.dup_only_valid allows duplication of specific fields * ScaleAnalysis doesn't crash on one-item scales * Updated references === 0.15.0 / 2010-09-07 * Added class Statsample::Reliability::ICC for calculation of Intra-class correlation (Shrout & Fleiss, 1979; McGraw & Wong, 1996). Tested with SPSS and R values. * References: Updated and standarized references on many classes. 
Added grab_references.rb script, to create a list of references for library * Added Spearman-Brown prophecy on Reliability module * Distribution::F uses Gsl when available * Added mean r.p.b. and item sd on Scale Analysis * Corrected bug on Vector.ary_method and example of Anova Two Way using vector. === 0.14.1 / 2010-08-18 * Added extra information on $DEBUG=true. * Changed ParallelAnalysis: with_random_data parameters, bootstrap_method options are data and random, resolve bug related to number of factors to preserve, resolved bug related to original eigenvalues, can support failed bootstrap of data for Tetrachoric correlation. * Optimized eigenpairs on Matrix when GSL is available. * Added test for parallel analysis using data bootstraping * Updated .pot and Manifest.txt * Added test for kmo(global and univariate), bartlett and anti-image. Kmo and Bartlett have test based on Dziuban and Shirkey with correct results * Complete set of test to test if a correlation matrix is appropriate for factor analysis: test of sphericity, KMO and anti-image (see Dziuban and Shirkey, 1974) * Updated Parallel Analysis to work on Principal Axis Analysis based on O'Connors formulae * Added reference for Statsample::Factor::MAP === 0.14.0 / 2010-08-16 * Added Statsample::Factor::MAP, to execute Velicer's (1976) MAP to determine the number of factors to retain on EFA * Bug fix on test suite on Ruby 1.8.7 * Horn's Parallel Analysis operational and tested for pure random data * Fixed bug on Excel writer on Ruby1.9 (frozen string on header raises an error). * Extra information on Factorial Analysis on summaries * Fixed bug on Factor::Rotation when used ::Matrix without field method. * Added Vector#vector_percentil method * Summaries for PCA, Rotation, MultiScale and ScaleAnalysis created or improved. * Factor::PCA could have rotation and parallel analysis on summary. 
* Cronbach's alpha from covariance matrix raise an error on size<2 * MultiScaleAnalysis could have Parallel Analysis on summary. * Added Chi Square test * Added new information on README.txt === 0.13.1 / 2010-07-03 * Rserve extensions for dataset and vector operational * On x86_64, variance from gsl is not exactly equal to sum of variance-covariance on Statsample::Reliability::Scale, but in delta 1e-10 * Updated README.txt * Reliability::ScaleAnalysis uses covariance matrix for 'if deleted' calculations to optimize memory and speed. Test for 'if deleted' statistics * More string translated. Added dependency on tetrachoric on parallel analysis === 0.13.0 / 2010-06-13 * Polychoric and Tetrachoric moved to gem statsample-bivariate-extension * All classes left with summary method include Summarizable now. Every method which return localizable string is now parsed with _() * Correct implementation of Reliability::MultiScaleAnalysis. * Spanish translation for Mann-Whitney's U * Added example for Mann-Whitney's U test * Better summary for Mann-Whitney's U Test * Added Statsample::Bivariate::Pearson class to retrieve complete analysis for r correlations * Bug fix on DominanceAnalysis::Bootstrap === 0.12.0 / 2010-06-09 * Modified Rakefile to remove dependencies based on C extensions. These are moved to statsample-optimization * T test with unequal variance fixed on i686 * API Change: Renamed Reliability::ItemAnalysis and moved to independent file * New Reliability::MultiScaleAnalysis for easy analysis of scales on a same survey, includind reliability, correlation matrix and Factor Analysis * Updated README to reflect changes on Reliability module * SvgGraph works with reportbuilder. * Added methods on Polychoric based on Olsson(1979): the idea is estimate using second derivatives. 
* Distribution test changed (reduced precision on 32 bits system === 0.11.2 / 2010-05-05 * Updated dependency for 'extendedmatrix' to 0.2 (Matrix#build method) === 0.11.1 / 2010-05-04 * Removed Matrix almost all Matrix extensions and replaced by dependency on 'extendmatrix' gem * Added dependency to gsl >=1.12.109. Polychoric with joint method fails without this explicit dependency === 0.11.0 / 2010-04-16 New features: * Added Statsample::Anova::TwoWay and Statsample::Anova::TwoWayWithVectors * Added Statsample.clone_only valid and Statsample::Dataset.clone_only_valid, for cheap copy on already clean vectors Optimizations and bug fix * Removed library statistics2 from package. Used gem statistics2 instead, because have a extension version * Added example for Reliability class * Bug fix on Statsample::DominanceAnalysis === 0.10.0 / 2010-04-13 API modifications * Refactoring of Statsample::Anova module. * Statsample::Anova::OneWay :implementation of generic ANOVA One-Way, used by Multiple Regression, for example. * Statsample::Anova::OneWayWithVectors: implementation of ANOVA One-Way to test differences of means. New features * New Statsample::Factor::Parallel Analysis, to performs Horn's 'parallel analysis' to a PCA, to adjust for sample bias on retention of components. * New Statsample.only_valid_clone and Statsample::Dataset.clone, which allows to create shallow copys of valid vector and datasets. Used by correlation matrix methods to optimize calculations * New module Statsample::Summarizable, which add GetText and ReportBuilder support to classes. Better summaries for Vector, Dataset, Crosstab, PrincipalAxis, PCA and Regression::Multiple classes Optimizations and bug fix * Refactoring of Statsample::Regression::Multiple classes. Still needs works * Bug fix on Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis * Bug fix on Statsample::Bivariate::Polychoric.new_with_vectors. Should be defined class method, no instance method. 
* Optimized correlation and covariance matrix. Only calculates the half of matrix and the other half is returned from cache * More tests coverage. RCOV Total: 82.51% , Code: 77.83% === 0.9.0 / 2010-04-04 * New Statsample::Test::F. Anova::OneWay subclasses it and Regression classes uses it. === 0.8.2 / 2010-04-01 * Statsample::PromiseAfter replaced by external package DirtyMemoize [http://rubygems.org/gems/dirty-memoize] === 0.8.1 / 2010-03-29 * Fixed Regression summaries === 0.8.0 / 2010-03-29 * New Statsample::Test::T module, with classes and methods to do Student's t tests for one and two samples. * Statsample::PromiseAfter module to set a number of variables without explicitly call the compute or iterate method * All tests ported to MiniUnit * Directory 'demo' renamed to 'examples' * Bug fix on report_building on Statsample::Regression::Multiple classes === 0.7.0 / 2010-03-25 * Ported to ReportBuilder 1.x series * Implementation of ruby based covariance and correlation changed to a clearer code * Statsample::Vector#svggraph_frequencies accepts IO * Some test ported to Miniunit * CSV on Ruby1.8 uses FasterCSV === 0.6.7 / 2010-03-23 * Bug fix: dependency on ReportBuilder should be set to "~>0.2.0", not "0.2" === 0.6.6 / 2010-03-22 * Set ReportBuilder dependency to '0.2.~' version, because future API break * Removed Alglib dependency * Factor::PrincipalAxis and Factor::PCA reworked * Standarization of documentation on almost every file * New Statsample::Test::Levene, to test equality of variances * Constant HAS_GSL replaced by Statsample.has_gsl? * PCA and Principal Axis test based on R and SPSS results * Bug fix on test_dataset.rb / test_saveload * Added Rakefile * Demos for levene, Principal Axis === 0.6.5 / 2010-02-24 * Bug fix on test: Use tempfile instead of tempdir * Multiple Regression: Calculation of constant standard error , using covariance matrix. 
* Calculation of R^2_yx and P^2_yx for Regresion on Multiple Dependents variables * Dominance Analysis could use Correlation or Covariance Matrix as input. * Dominance Analysis extension to multiple dependent variables (Azen & Budescu, 2006) * Two-step estimate of Polychoric correlation uses minimization gem, so could be executed without rb-gsl === 0.6.4 / 2010-02-19 * Dominance Analysis and Dominance Analysis Bootstrap allows multivariate dependent analysis. * Test suite for Dominance Analysis, using Azen and Budescu papers as references * X^2 for polychoric correlation === 0.6.3 / 2010-02-15 * Statsample::Bivariate::Polychoric have joint estimation. * Some extra documentation and bug fixs === 0.6.2 / 2010-02-11 * New Statsample::Bivariate::Polychoric. For implement: X2 and G2 * New matrix.rb, for faster development of Contingence Tables and Correlation Matrix === 0.6.1 / 2010-02-08 * Bug fix on DominanceAnalysis summary for Ruby1.9 * Some extra documentation === 0.6.0 / 2010-02-05 * New Statsample::Factor module. Include classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and rotate component matrix ( Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations * New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many * New class Statsample::Permutation to produce permutations of a given array * New class Statsample::Histogram, with same interface as GSL one * New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Gives z based and exact calculation of probability * Improved support for ReportBuilder * Statsample::Codification module reworked * Fixed bugs on Dominance Analysis classes * Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew === 0.5.1 / 2009-10-06 * New class Statsample::Bivariate::Tetrachoric, for calculation of tetrachoric correlations. See http://www.john-uebersax.com/stat/tetra.htm for information. 
* New Statsample::Dataset.merge * New Statsample::Vector.dichotomize * New ItemReliability.item_difficulty_analysis * New module Statsample::SPSS, to export information to SPSS. For now, only tetrachoric correlation matrix are provided * All SpreadSheet based importers now accept repeated variable names and renames they on the fly * MultipleRegression::BaseEngine moved to new file * Bug fix for MultipleRegression::GslEngine checks for Alglib, not GSL === 0.5.0 / 2009-09-26 * Vector now uses a Hash as a third argument * Tested on Ruby 1.8.6, 1.8.7 and 1.9.1 with multiruby === 0.4.1 / 2009-09-12 * More methods and usage documentation * Logit tests * Bug fix: rescue for requires doesn't specify LoadError * Binomial::BaseEngine new methods: coeffs_se, coeffs, constant and constant_se === 0.4.0 / 2009-09-10 * New Distribution module, based on statistics2.rb by Shin-ichiro HARA. Replaces all instances of GSL distributions pdf and cdf calculations for native calculation. * New Maximum Likehood Estimation for Logit, Probit and Normal Distribution using Von Tessin(2005) algorithm. See MLE class and subclasses for more information. * New Binomial regression subclasses (Logit and Probit), usign MLE class * Added tests for gsl, Distribution, MLE and Logit * Bug fix on svggraph.rb. Added check_type for scale graphics * Bug fix on gdchart. Replaced old Nominal, Ordinal and Scale for Vector === 0.3.4 / 2009-08-21 * Works with statsample-optimization 2.0.0 * Vector doesn't uses delegation. All methods are part of Vector * Added Combination. Generates all combination of n elements taken r at a time * Bivariate#prop_pearson now can uses as a second parameter :both, :left, :right, :positive or :negative * Added LICENSE.txt === 0.3.3 / 2009-08-11 * Added i18n support. 
For now, only spanish translation available * Bug fix: Test now load libraries on ../lib path * Excel and CSV importers automatically modify type of vector to Scale when all data are numbers or nils values === 0.3.2 / 2009-08-04 * Added Regression::Multiple::GslEngine * Added setup.rb * Crosstab#row_label and #column_name * DominanceAnalysis and DominanceAnalysisBootstrap uses Dataset#labels for Vector names. === 0.3.1 / 2009-08-03 * Name and logic of Regression classes changed. Now, you have Regression::Simple class and Regression::Multiple module with two engines: RubyEngine and AlglibEngne * New Crosstab#summary === 0.3.0 / 2009-08-02 * Statsample renamed to Statsample * Optimization extension goes to another gem: ruby-statsample-optimization === 0.2.0 / 2009-08-01 * One Way Anova on Statsample::Anova::OneWay * Dominance Analysis!!!! The one and only reason to develop a Multiple Regression on pure ruby. * Multiple Regression on Multiple Regression module. Pairwise (pure ruby) or MultipleRegressionPairwise and Listwise (optimized) on MultipleRegressionAlglib and * New Dataset#to_gsl_matrix, #from_to,#[..],#bootstrap,#vector_missing_values, #vector_count_characters, #each_with_index, #collect_with_index * New Vector#box_cox_transformation * Module Correlation renamed to Bivariate * Some fancy methods and classes to create Summaries * Some documentation about Algorithm used on doc_latex * Deleted 'distributions' extension. Ruby/GSL has all the pdf and cdf you ever need. * Tests work without any dependency. Only nags about missing deps. * Test for MultipleRegression, Anova, Excel, Bivariate.correlation_matrix and many others === 0.1.9 / 2009-05-22 * Class Vector: new method vector_standarized_pop, []=, min,max * Class Dataset: global variable $RUBY_SS_ROW stores the row number on each() and related methods. dup() with argument returns a copy of the dataset only for given fields. 
New methods: standarize, vector_mean, collect, verify,collect_matrix * Module Correlation: new methods covariance, t_pearson, t_r, prop_pearson, covariance_matrix, correlation_matrix, correlation_probability_matrix * Module SRS: New methods estimation_n0 and estimation_n * Module Reliability: new ItemCharacteristicCurve class * New HtmlReport class * New experimental SPSS Class. * Converters: Module CSV with new options. Added write() method for GGobi module * New Mx exporter (http://www.vcu.edu/mx/) * Class SimpleRegression: new methods standard error * Added tests for regression and reliability, Vector#vector_mean, Dataset#dup (partial) and Dataset#verify === 0.1.8 / 2008-12-10 * Added Regression and Reliability modules * Class Vector: added methods vector_standarized, recode, inspect, ranked * Class Dataset: added methods vector_by_calculation, vector_sum, filter_field * Module Correlation: added methods like spearman, point biserial and tau-b * Added tests for Vector#ranked, Vector#vector_standarized, Vector#sum_of_squared_deviation, Dataset#vector_by_calculation, Dataset#vector_sum, Dataset#filter_field and various test for Correlation module * Added demos: item_analysis and sample_test === 0.1.7 / 2008-10-1 * New module for codification * ... === 0.1.6 / 2008-09-26 * New modules for SRS and stratified sampling * Statsample::Database for read and write onto databases. You could use Database and CSV on-tandem for mass-editing and reimport of databases === 0.1.5 / 2008-08-29 * New extension statsampleopt for optimizing some functions on Statsample submodules * New submodules Correlation and Test === 0.1.4 / 2008-08-27 * New extension, with cdf functions for chi-square, t, gamma and normal distributions. 
Based on dcdflib (http://www.netlib.org/random/) Also, has a function to calculate the tail for a noncentral T distribution === 0.1.3 / 2008-08-22 * Operational versions of Vector, Dataset, Crosstab and Resample * Read and write CSV files * Calculate chi-square for 2 matrixes === 0.1.1 - 0.1.2 / 2008-08-18 * Included several methods on Ruby::Type classes * Organized dirs with sow === 0.1.0 / 2008-08-12 * First version. ================================================ FILE: LICENSE.txt ================================================ Copyright (c) 2009-2014, Claudio Bustos All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: Manifest.txt ================================================ .travis.yml Gemfile Gemfile.lock History.txt LICENSE.txt Manifest.txt README.md Rakefile benchmarks/correlation_matrix_15_variables.rb benchmarks/correlation_matrix_5_variables.rb benchmarks/correlation_matrix_methods/correlation_matrix.ds benchmarks/correlation_matrix_methods/correlation_matrix.html benchmarks/correlation_matrix_methods/correlation_matrix.rb benchmarks/correlation_matrix_methods/correlation_matrix.xls benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods benchmarks/correlation_matrix_methods/results.ds benchmarks/factor_map.rb benchmarks/helpers_benchmark.rb data/locale/es/LC_MESSAGES/statsample.mo doc_latex/manual/equations.tex examples/boxplot.rb examples/correlation_matrix.rb examples/dataset.rb examples/dominance_analysis.rb examples/dominance_analysis_bootstrap.rb examples/histogram.rb examples/icc.rb examples/levene.rb examples/multiple_regression.rb examples/multivariate_correlation.rb examples/parallel_analysis.rb examples/polychoric.rb examples/principal_axis.rb examples/reliability.rb examples/scatterplot.rb examples/t_test.rb examples/tetrachoric.rb examples/u_test.rb examples/vector.rb examples/velicer_map_test.rb grab_references.rb lib/spss.rb lib/statsample.rb lib/statsample/analysis.rb lib/statsample/analysis/suite.rb 
lib/statsample/analysis/suitereportbuilder.rb lib/statsample/anova.rb lib/statsample/anova/contrast.rb lib/statsample/anova/oneway.rb lib/statsample/anova/twoway.rb lib/statsample/bivariate.rb lib/statsample/bivariate/pearson.rb lib/statsample/codification.rb lib/statsample/converter/csv.rb lib/statsample/converter/spss.rb lib/statsample/converters.rb lib/statsample/crosstab.rb lib/statsample/dataset.rb lib/statsample/dominanceanalysis.rb lib/statsample/dominanceanalysis/bootstrap.rb lib/statsample/factor.rb lib/statsample/factor/map.rb lib/statsample/factor/parallelanalysis.rb lib/statsample/factor/pca.rb lib/statsample/factor/principalaxis.rb lib/statsample/factor/rotation.rb lib/statsample/graph.rb lib/statsample/graph/boxplot.rb lib/statsample/graph/histogram.rb lib/statsample/graph/scatterplot.rb lib/statsample/histogram.rb lib/statsample/matrix.rb lib/statsample/multiset.rb lib/statsample/regression.rb lib/statsample/regression/multiple.rb lib/statsample/regression/multiple/alglibengine.rb lib/statsample/regression/multiple/baseengine.rb lib/statsample/regression/multiple/gslengine.rb lib/statsample/regression/multiple/matrixengine.rb lib/statsample/regression/multiple/rubyengine.rb lib/statsample/regression/simple.rb lib/statsample/reliability.rb lib/statsample/reliability/icc.rb lib/statsample/reliability/multiscaleanalysis.rb lib/statsample/reliability/scaleanalysis.rb lib/statsample/reliability/skillscaleanalysis.rb lib/statsample/resample.rb lib/statsample/rserve_extension.rb lib/statsample/shorthand.rb lib/statsample/srs.rb lib/statsample/test.rb lib/statsample/test/bartlettsphericity.rb lib/statsample/test/chisquare.rb lib/statsample/test/f.rb lib/statsample/test/kolmogorovsmirnov.rb lib/statsample/test/levene.rb lib/statsample/test/t.rb lib/statsample/test/umannwhitney.rb lib/statsample/test/wilcoxonsignedrank.rb lib/statsample/vector.rb lib/statsample/vector/gsl.rb lib/statsample/version.rb po/es/statsample.mo po/es/statsample.po po/statsample.pot 
references.txt setup.rb test/fixtures/bank2.dat test/fixtures/correlation_matrix.rb test/fixtures/hartman_23.matrix test/fixtures/repeated_fields.csv test/fixtures/stock_data.csv test/fixtures/test_csv.csv test/fixtures/test_xls.xls test/fixtures/tetmat_matrix.txt test/fixtures/tetmat_test.txt test/helpers_tests.rb test/test_analysis.rb test/test_anova_contrast.rb test/test_anovaoneway.rb test/test_anovatwoway.rb test/test_anovatwowaywithdataset.rb test/test_anovawithvectors.rb test/test_bartlettsphericity.rb test/test_bivariate.rb test/test_codification.rb test/test_crosstab.rb test/test_csv.rb test/test_dataset.rb test/test_dominance_analysis.rb test/test_factor.rb test/test_factor_map.rb test/test_factor_pa.rb test/test_ggobi.rb test/test_gsl.rb test/test_histogram.rb test/test_matrix.rb test/test_multiset.rb test/test_regression.rb test/test_reliability.rb test/test_reliability_icc.rb test/test_reliability_skillscale.rb test/test_resample.rb test/test_rserve_extension.rb test/test_srs.rb test/test_statistics.rb test/test_stest.rb test/test_stratified.rb test/test_test_f.rb test/test_test_kolmogorovsmirnov.rb test/test_test_t.rb test/test_umannwhitney.rb test/test_vector.rb test/test_wilcoxonsignedrank.rb test/test_xls.rb web/Rakefile ================================================ FILE: README.md ================================================ # Statsample Homepage :: https://github.com/sciruby/statsample [![Build Status](https://travis-ci.org/clbustos/statsample.svg?branch=master)](https://travis-ci.org/clbustos/statsample) [![Gem Version](https://badge.fury.io/rb/statsample.svg)](http://badge.fury.io/rb/statsample) ## DESCRIPTION A suite for basic and advanced statistics on Ruby. Tested on Ruby 2.1.1p76 (June 2014), 1.8.7, 1.9.1, 1.9.2 (April, 2010), ruby-head(June, 2011) and JRuby 1.4 (Ruby 1.8.7 compatible). Include: * Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others). 
* Imports and exports datasets from and to Excel, CSV and plain text files. * Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlations are provided by the +statsample-bivariate-extension+ gem. * Intra-class correlation * Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA. * Tests: F, T, Levene, U-Mannwhitney. * Regression: Simple, Multiple (OLS), Probit and Logit * Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors. * Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it. * Basic time series support * Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu) * Sample calculation related formulas * Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+ * Creates reports on text, html and rtf, using ReportBuilder gem * Graphics: Histogram, Boxplot and Scatterplot ## Principles * Software Design: * One module/class for each type of analysis * Options can be set as hash on initialize() or as setter methods * Clean API for interactive sessions * summary() returns all necessary information for interactive sessions * All statistical data available through methods on objects * All (important) methods should be tested. Better with random data. * Statistical Design * Results are tested against text results, SPSS and R outputs.
* Go beyond Null Hypothesis Testing, using confidence intervals and effect sizes when possible * (When possible) All references for methods are documented, providing sensible information on documentation ## Features * Classes for manipulation and storage of data: * Statsample::Vector: An extension of an array, with statistical methods like sum, mean and standard deviation * Statsample::Dataset: a group of Statsample::Vector, analogous to an Excel spreadsheet or a data frame in R. The base of almost all operations on statsample. * Statsample::Multiset: multiple datasets with same fields and type of vectors * Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. You can also create contrasts using Statsample::Anova::Contrast * Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Includes methods to create correlation and covariance matrices * Multiple types of regression. * Simple Regression : Statsample::Regression::Simple * Multiple Regression: Statsample::Regression::Multiple * Logit Regression: Statsample::Regression::Binomial::Logit * Probit Regression: Statsample::Regression::Binomial::Probit * Factorial Analysis algorithms on Statsample::Factor module. * Classes for Extraction of factors: * Statsample::Factor::PCA * Statsample::Factor::PrincipalAxis * Classes for Rotation of factors: * Statsample::Factor::Varimax * Statsample::Factor::Equimax * Statsample::Factor::Quartimax * Classes for calculation of factors to retain * Statsample::Factor::ParallelAnalysis performs Horn's 'parallel analysis' to a principal components analysis to adjust for sample bias in the retention of components.
* Statsample::Factor::MAP performs Velicer's Minimum Average Partial (MAP) test, which retains components as long as the variance in the correlation matrix represents systematic variance. * Dominance Analysis. Based on Budescu and Azen papers, dominance analysis is a method to analyze the relative importance of one predictor relative to another on multiple regression * Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables * Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recommended by [Azen & Budescu (2003)](http://psycnet.apa.org/journals/met/8/2/129/). * Module Statsample::Codification, to help to codify open questions * Converters to import and export data: * Statsample::Database : Can create sql to create tables, read and insert data * Statsample::CSV : Read and write CSV files * Statsample::Excel : Read and write Excel files * Statsample::Mx : Write Mx Files * Statsample::GGobi : Write Ggobi files * Module Statsample::Crosstab provides functions to create crosstabs for categorical data * Module Statsample::Reliability provides functions to analyze scales with psychometric methods. * Class Statsample::Reliability::ScaleAnalysis provides statistics like mean, standard deviation for a scale, Cronbach's alpha and standardized Cronbach's alpha, and for each item: mean, correlation with total scale, mean if deleted, Cronbach's alpha if deleted. * Class Statsample::Reliability::MultiScaleAnalysis provides a DSL to easily analyze reliability of multiple scales and retrieve correlation matrix and factor analysis of them. * Class Statsample::Reliability::ICC provides intra-class correlation, using Shrout & Fleiss (1979) and McGraw & Wong (1996) formulations.
* Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several types of samples * Module Statsample::Test provides several methods and classes to perform inferential statistics * Statsample::Test::BartlettSphericity * Statsample::Test::ChiSquare * Statsample::Test::F * Statsample::Test::KolmogorovSmirnov (only D value) * Statsample::Test::Levene * Statsample::Test::UMannWhitney * Statsample::Test::T * Statsample::Test::WilcoxonSignedRank * Module Graph provides several classes to create beautiful graphs using rubyvis * Statsample::Graph::Boxplot * Statsample::Graph::Histogram * Statsample::Graph::Scatterplot * Gem bio-statsample-timeseries provides module Statsample::TimeSeries with support for time series, including ARIMA estimation using Kalman-Filter. * Gem statsample-sem provides a DSL to R libraries +sem+ and +OpenMx+ * Gem statsample-glm provides you with GLM methods, to work with Logistic, Poisson and Gaussian regression, using ML or IRWLS. * Close integration with gem reportbuilder, to easily create reports on text, html and rtf formats. # Examples of use: See the [examples folder](https://github.com/clbustos/statsample/tree/master/examples/) too.
## Boxplot ```ruby require 'statsample' ss_analysis(Statsample::Graph::Boxplot) do n=30 a=rnorm(n-1,50,10) b=rnorm(n, 30,5) c=rnorm(n,5,1) a.push(2) boxplot(:vectors=>[a,b,c], :width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0) end Statsample::Analysis.run # Open svg file on *nix application defined ``` ## Correlation matrix ```ruby require 'statsample' # Note R like generation of random gaussian variable # and correlation matrix ss_analysis("Statsample::Bivariate.correlation_matrix") do samples=1000 ds=data_frame( 'a'=>rnorm(samples), 'b'=>rnorm(samples), 'c'=>rnorm(samples), 'd'=>rnorm(samples)) cm=cor(ds) summary(cm) end Statsample::Analysis.run_batch # Echo output to console ``` # Requirements Optional: * Plotting: gnuplot and rbgnuplot, SVG::Graph * Factorial analysis and polychoric correlation (joint estimate and polychoric series): gsl library and rb-gsl (https://rubygems.org/gems/rb-gsl/). You should install it using gem install rb-gsl. *Note*: Use gsl 1.12.109 or later. # Resources * Source code on github :: http://github.com/clbustos/statsample * Docs :: http://statsample.apsique.cl/ * Bug report and feature request :: http://github.com/clbustos/statsample/issues * E-mailing list :: http://groups.google.com/group/statsample # Installation ```bash $ sudo gem install statsample ``` On *nix, you should install statsample-optimization to retrieve gems gsl, statistics2 and a C extension to speed up some methods. Precompiled versions are available for Ruby 1.9 on x86, x86_64 and mingw32 archs. ```bash $ sudo gem install statsample-optimization ``` If you use Ruby 1.8, you should compile statsample-optimization, using parameter --platform ruby ```bash $ sudo gem install statsample-optimization --platform ruby ``` If you need to work on Structural Equation Modeling, you could see +statsample-sem+.
You need R with +sem+ or +OpenMx+ [http://openmx.psyc.virginia.edu/] libraries installed ```bash $ sudo gem install statsample-sem ``` Available setup.rb file ```bash sudo gem ruby setup.rb ``` ## License BSD-3 (See LICENSE.txt) Could change between version, without previous warning. If you want a specific license, just choose the version that you need. ================================================ FILE: Rakefile ================================================ #!/usr/bin/ruby # -*- ruby -*- # -*- coding: utf-8 -*- $:.unshift(File.dirname(__FILE__)+'/lib/') require 'rubygems' require 'statsample' require 'hoe' require 'rdoc' Hoe.plugin :git Hoe.plugin :doofus desc "Ruby Lint" task :lint do executable=Config::CONFIG['RUBY_INSTALL_NAME'] Dir.glob("lib/**/*.rb") {|f| if !system %{#{executable} -w -c "#{f}"} puts "Error on: #{f}" end } end task :release do system %{git push origin master} end task "clobber_docs" do # Only to omit warnings end desc "Update pot/po files." task "gettext:updatepo" do require 'gettext/tools' GetText.update_pofiles("statsample", Dir.glob("{lib,bin}/**/*.{rb,rhtml}"), "statsample #{Statsample::VERSION}") end desc "Create mo-files" task "gettext:makemo" do require 'gettext/tools' GetText.create_mofiles() # GetText.create_mofiles(true, "po", "locale") # This is for "Ruby on Rails". 
end h=Hoe.spec('statsample') do self.version=Statsample::VERSION self.urls=["https://github.com/clbustos/statsample"] #self.testlib=:minitest self.readme_file = 'README.md' self.urls = ['https://github.com/clbustos/statsample'] self.developer('Claudio Bustos', 'clbustos@gmail.com') self.extra_deps << ["spreadsheet","~>0.6"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client"] << ["rubyvis"] << ["distribution"] self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>3"] << ["minitest", "~>2"] << ["gettext", "~>0"] << ["mocha", "~>0"] << ["hoe-git", "~>0"] self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression" self.post_install_message = <<-EOF *************************************************** Thanks for installing statsample. On *nix, you could install statsample-optimization to retrieve gems gsl, statistics2 and a C extension to speed some methods. $ sudo gem install statsample-optimization On Ubuntu, install build-essential and libgsl0-dev using apt-get. Compile ruby 1.8 or 1.9 from source code first. $ sudo apt-get install build-essential libgsl0-dev ***************************************************** EOF self.need_rdoc=false end if Rake.const_defined?(:RDocTask) Rake::RDocTask.new(:docs) do |rd| rd.main = h.readme_file rd.options << '-d' if (`which dot` =~ /\/dot/) unless ENV['NODOT'] || Hoe::WINDOZE rd.rdoc_dir = 'doc' rd.rdoc_files.include("lib/**/*.rb") rd.rdoc_files += h.spec.extra_rdoc_files rd.rdoc_files.reject! 
{|f| f=="Manifest.txt"} title = h.spec.rdoc_options.grep(/^(-t|--title)=?$/).first if title then rd.options << title unless title =~ /\=/ then # for ['-t', 'title here'] title_index = spec.rdoc_options.index(title) rd.options << spec.rdoc_options[title_index + 1] end else title = "#{h.name}-#{h.version} Documentation" title = "#{h.rubyforge_name}'s " + title if h.rubyforge_name != h.name rd.options << '--title' << title end end end desc 'Publish rdocs with analytics support' task :publicar_docs => [:clean] do # ruby %{agregar_adsense_a_doc.rb} path = File.expand_path("./doc.yaml") config = YAML.load(File.read(path)) host = "#{config["user"]}@#{config["host"]}" remote_dir = config["dir"] local_dir = h.local_rdoc_dir Dir.glob(local_dir+"/**/*") {|file| sh %{chmod 755 #{file}} } sh %{rsync #{h.rsync_args} #{local_dir}/ #{host}:#{remote_dir}} end # vim: syntax=Ruby ================================================ FILE: benchmarks/correlation_matrix_15_variables.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb')) extend BenchPress cases=250 vars=20 name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)" author 'Clbustos' date '2011-01-18' summary " A correlation matrix could be constructed using matrix algebra or mannualy, calculating covariances, means and sd for each pair of vectors. 
In this test, we test the calculation using #{vars} variables with #{cases} cases on each vector " reps 200 #number of repetitions ds=vars.times.inject({}) {|ac,v| ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()} ac }.to_dataset measure "Statsample::Bivariate.correlation_matrix_optimized" do Statsample::Bivariate.correlation_matrix_optimized(ds) end measure "Statsample::Bivariate.correlation_matrix_pairwise" do Statsample::Bivariate.correlation_matrix_pairwise(ds) end ================================================ FILE: benchmarks/correlation_matrix_5_variables.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb')) extend BenchPress cases=500 vars=5 name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)" author 'Clbustos' date '2011-01-18' summary " A correlation matrix could be constructed using matrix algebra or mannualy, calculating covariances, means and sd for each pair of vectors. In this test, we test the calculation using #{vars} variables with #{cases} cases on each vector " reps 200 #number of repetitions ds=vars.times.inject({}) {|ac,v| ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()} ac }.to_dataset measure "Statsample::Bivariate.correlation_matrix_optimized" do Statsample::Bivariate.correlation_matrix_optimized(ds) end measure "Statsample::Bivariate.correlation_matrix_pairwise" do Statsample::Bivariate.correlation_matrix_pairwise(ds) end ================================================ FILE: benchmarks/correlation_matrix_methods/correlation_matrix.html ================================================ Correlation matrix analysis

Correlation matrix analysis

List of contents
List of tables

Multiple reggresion of cases,vars,c_v on time_optimized

Engine: Statsample::Regression::Multiple::RubyEngine

Cases(listwise)=63(63)

R=0.978844

R^2=0.958137

R^2 Adj=0.956008

Std.Error R=3.092024

Equation=4.031667 + 0.018039cases + 0.244790vars + 0.001197c_v

ANOVA

ANOVA Table
sourcessdfmsfp
Regression12910.09834303.366450.1140.000
Error564.076599.561
Total13474.174624312.927
Beta coefficients
coeffbbetaset
Constant4.031667-0.7526045.356953
cases0.0180390.3815870.0019619.200093
vars0.2447900.2243900.0360556.789335
c_v0.0011970.5841740.00009412.738410

Multiple reggresion of cases,vars,c_v on time_pairwise

Engine: Statsample::Regression::Multiple::RubyEngine

Cases(listwise)=63(63)

R=0.999637

R^2=0.999275

R^2 Adj=0.999238

Std.Error R=0.538365

Equation=-0.520303 + -0.000708cases + 1.234451vars + 0.000735c_v

ANOVA

ANOVA Table
sourcessdfmsfp
Regression23554.27137851.42427089.1340.000
Error17.100590.290
Total23571.372627851.714
Beta coefficients
coeffbbetaset
Constant-0.520303-0.131039-3.970594
cases-0.000708-0.0113240.000341-2.074007
vars1.2344510.8555460.006278196.641087
c_v0.0007350.2711380.00001644.912972
================================================ FILE: benchmarks/correlation_matrix_methods/correlation_matrix.rb ================================================ # This test create a database to adjust the best algorithm # to use on correlation matrix require(File.expand_path(File.dirname(__FILE__)+'/../helpers_benchmark.rb')) require 'statsample' require 'benchmark' def create_dataset(vars,cases) ran=Distribution::Normal.rng ds=vars.times.inject({}) {|ac,v| ac["x#{v}"]=Statsample::Vector.new_scale(cases) {ran.call} ac }.to_dataset end def prediction_pairwise(vars,cases) Statsample::Bivariate.prediction_pairwise(vars,cases) / 10 end def prediction_optimized(vars,cases) Statsample::Bivariate.prediction_optimized(vars,cases) / 10 end if !File.exists?("correlation_matrix.ds") or File.mtime(__FILE__) > File.mtime("correlation_matrix.ds") reps=100 #number of repetitions ds_sizes=[5,10,30,50,100,150,200,500,1000] ds_vars=[3,4,5,10,20,30,40] #ds_sizes=[5,10] #ds_vars=[3,5,20] rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise}) ds_sizes.each do |cases| ds_vars.each do |vars| ds=create_dataset(vars,cases) time_optimized= Benchmark.realtime do reps.times { Statsample::Bivariate.correlation_matrix_optimized(ds) ds.clear_gsl } end time_pairwise= Benchmark.realtime do reps.times { Statsample::Bivariate.correlation_matrix_pairwise(ds) } end puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)] rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)}) end end else rs=Statsample.load("correlation_matrix.ds") end rs.fields.each {|f| rs[f].type=:scale} rs['c_v']=rs.collect {|row| row['cases']*row['vars']} rs.update_valid_data rs.save("correlation_matrix.ds") Statsample::Excel.write(rs,"correlation_matrix.xls") rb=ReportBuilder.new(:name=>"Correlation matrix 
analysis") rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized','c_v']],'time_optimized', :digits=>6)) rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise','c_v']],'time_pairwise', :digits=>6)) rb.save_html("correlation_matrix.html") ================================================ FILE: benchmarks/factor_map.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb')) extend BenchPress name "Statsample::Factor::Map with and without GSL" author 'Clbustos' date '2011-01-18' summary "Velicer's MAP uses a lot of Matrix algebra. How much we can improve the timing using GSL? " reps 20 #number of repetitions m=Matrix[ [ 1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382], [ 0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415], [ 0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345], [ 0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365], [ 0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629], [ 0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577], [ 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539], [ 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1] ] map=Statsample::Factor::MAP.new(m) measure "Statsample::Factor::MAP without GSL" do map.use_gsl=false map.compute end measure "Statsample::Factor::MAP with GSL" do map.use_gsl=true map.compute end ================================================ FILE: benchmarks/helpers_benchmark.rb ================================================ $:.unshift(File.expand_path(File.dirname(__FILE__)+'/../lib/')) $:.unshift(File.expand_path(File.dirname(__FILE__)+'/')) require 'statsample' require 'bench_press' ================================================ FILE: doc_latex/manual/equations.tex ================================================ \part{Equations} \section{Convention} \begin{align*} n &= \text{sample size}\\ N &= \text{population size}\\ p &= \text{proportion inside a sample}\\ P &= \text{proportion inside a population} 
\end{align*} \section{Ruby::Regression::Multiple} To compute the standard error of coefficients, you obtain the estimated variance-covariance matrix of error. Let $\mathbf{X}$ be the matrix of predictors data, including a constant column; $\mathbf{MSE}$ the mean square error; $SSE$ the sum of squares of errors; $n$ the number of cases; $p$ the number of predictors \begin{equation} \mathbf{MSE}=\frac{SSE}{n-p-1} \end{equation} \begin{equation} \mathbf{E}=(\mathbf{X'}\mathbf{X})^{-1}\mathbf{MSE} \end{equation} The square roots of the diagonal elements of $\mathbf{E}$ are the standard errors of the coefficients. \section{Ruby::SRS} Finite Population Correction (FPC) is used on standard error calculation on populations below 10,000. Function \begin{verbatim} fpc_var(sam,pop) \end{verbatim} calculates FPC for variance with \begin{equation} fpc_{var} = \frac{N-n} {N-1} \end{equation} with $n$ as sam and $N$ as pop. Function \begin{verbatim} fpc = fpc(sam,pop) \end{verbatim} calculates FPC for standard deviation with \begin{equation} fpc_{sd} = \sqrt{\frac{N-n} {N-1}} \label{fpc} \end{equation} with $n$ as sample size and $N$ as population size.
\subsection{Sample Size estimation for proportions} On infinite poblations, you should use method \begin{verbatim} estimation_n0(d,prop,margin=0.95) \end{verbatim} which uses \begin{equation} n = \frac{t^2(pq)}{d^2} \label{n_i} \end{equation} where \begin{align*} t &= \text{t value for given level of confidence ( 1.96 for 95\% )}\\ d &= \text{margin of error} \end{align*} On finite poblations, you should use \begin{verbatim} estimation_n(d,prop,n_pobl, margin=0.95) \end{verbatim} which uses \begin{equation} n = \frac{n_i}{1+(\frac{n_i-1}{N})} \end{equation} Where $n_i$ is n on \ref{n_i} and N is population size ================================================ FILE: examples/boxplot.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Graph::Boxplot) do n=30 a=rnorm(n-1,50,10) b=rnorm(n, 30,5) c=rnorm(n,5,1) a.push(2) boxplot(:vectors=>[a,b,c],:width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0) end if __FILE__==$0 Statsample::Analysis.run end ================================================ FILE: examples/correlation_matrix.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do samples=1000 ds=data_frame( 'a'=>rnorm(samples), 'b'=>rnorm(samples), 'c'=>rnorm(samples), 'd'=>rnorm(samples)) cm=cor(ds) summary(cm) end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/dataset.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Dataset) do samples=1000 a=Statsample::Vector.new_scale(samples) {r=rand(5); r==4 ? nil: r} b=Statsample::Vector.new_scale(samples) {r=rand(5); r==4 ? 
nil: r} ds={'a'=>a,'b'=>b}.to_dataset summary(ds) end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/dominance_analysis.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::DominanceAnalysis) do sample=300 a=rnorm(sample) b=rnorm(sample) c=rnorm(sample) d=rnorm(sample) ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset attach(ds) ds['y']=a*5+b*3+cc*2+d+rnorm(300) cm=cor(ds) summary(cm) lr=lr(ds,'y') summary(lr) da=dominance_analysis(ds,'y') summary(da) da=dominance_analysis(ds,'y',:name=>"Dominance Analysis using group of predictors", :predictors=>['a', 'b', %w{cc d}]) summary(da) end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/dominance_analysis_bootstrap.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do sample=300 a=rnorm(sample) b=rnorm(sample) c=rnorm(sample) d=rnorm(sample) a.name="a" b.name="b" c.name="c" d.name="d" ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset attach(ds) ds['y1']=a*5+b*2+cc*2+d*2+rnorm(sample,0,10) ds['y2']=a*10+rnorm(sample) dab=dominance_analysis_bootstrap(ds, ['y1','y2'], :debug=>true) dab.bootstrap(100,nil) summary(dab) ds2=ds['a'..'y1'] dab2=dominance_analysis_bootstrap(ds2, 'y1', :debug=>true) dab2.bootstrap(100,nil) summary(dab2) end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/histogram.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Graph::Histogram) do histogram(rnorm(3000,0,20)) end if __FILE__==$0 Statsample::Analysis.run end 
================================================ FILE: examples/icc.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Reliability::ICC) do size=1000 a=Statsample::Vector.new_scale(size) {rand(10)} b=a.recode{|i|i+rand(4)-2} c=a.recode{|i|i+rand(4)-2} d=a.recode{|i|i+rand(4)-2} @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset @icc=Statsample::Reliability::ICC.new(@ds) summary(@icc) @icc.type=:icc_3_1 summary(@icc) @icc.type=:icc_a_k summary(@icc) end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/levene.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Test::Levene) do a=[1,2,3,4,5,6,7,8,100,10].to_scale b=[30,40,50,60,70,80,90,100,110,120].to_scale summary(levene([a,b])) end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/multiple_regression.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Regression::Multiple) do samples=2000 ds=dataset('a'=>rnorm(samples),'b'=>rnorm(samples),'cc'=>rnorm(samples),'d'=>rnorm(samples)) attach(ds) ds['y']=a*5+b*3+cc*2+d+rnorm(samples) summary lr(ds,'y') end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/multivariate_correlation.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' require 'mathn' Statsample::Analysis.store(Statsample::Regression::Multiple::MultipleDependent) do complete=Matrix[ [1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08], [0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15], 
[0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12], [0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02], [-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02], [0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36], [0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05], [-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03], [0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]] complete.extend Statsample::CovariateMatrix complete.fields=%w{adhd cd odd sex age monly mwork mage poverty} lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd}) echo "R^2_yx #{lr.r2yx}" echo "P^2_yx #{lr.p2yx}" end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/parallel_analysis.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' samples=150 variables=30 iterations=50 Statsample::Analysis.store(Statsample::Factor::ParallelAnalysis) do rng = Distribution::Normal.rng() f1=rnorm(samples) f2=rnorm(samples) f3=rnorm(samples) vectors={} variables.times do |i| vectors["v#{i}"]=samples.times.collect {|nv| f1[nv]*i+(f2[nv]*(15-i))+((f3[nv]*(30-i))*1.5)*rng.call}.to_scale vectors["v#{i}"].name="Vector #{i}" end ds=vectors.to_dataset pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>iterations, :debug=>true) pca=pca(cor(ds)) echo "There are 3 real factors on data" summary pca echo "Traditional Kaiser criterion (k>1) returns #{pca.m} factors" summary pa echo "Parallel Analysis returns #{pa.number_of_factors} factors to preserve" end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/polychoric.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') $:.unshift("/home/cdx/usr/lib/statsample-bivariate-extension/lib/") require 'statsample' Statsample::Analysis.store(Statsample::Bivariate::Polychoric) do ct=Matrix[[rand(10)+50, rand(10)+50, 
rand(10)+1], [rand(20)+5, rand(50)+4, rand(10)+1], [rand(8)+1, rand(12)+1, rand(10)+1]] # Estimation of polychoric correlation using two-step (default) poly=polychoric(ct, :name=>"Polychoric with two-step", :debug=>false) summary poly # Estimation of polychoric correlation using joint method (slow) poly=polychoric(ct, :method=>:joint, :name=>"Polychoric with joint") summary poly # Uses polychoric series (not recomended) poly=polychoric(ct, :method=>:polychoric_series, :name=>"Polychoric with polychoric series") summary poly end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/principal_axis.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Factor::PrincipalAxis) do matrix=Matrix[ [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]] matrix.extend Statsample::CovariateMatrix #matrix.fields=%w{a b c d} fa=principal_axis(matrix,:m=>1,:smc=>false) summary fa end if __FILE__==$0 Statsample::Analysis.run_batch end ================================================ FILE: examples/reliability.rb ================================================ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib') require 'statsample' Statsample::Analysis.store(Statsample::Reliability) do samples=100 a=rnorm(samples) ds=Statsample::Dataset.new 20.times do |i| ds["v#{i}"]=a+rnorm(samples,0,0.2) end ds.update_valid_data rel=Statsample::Reliability::ScaleAnalysis.new(ds) summary rel ms=Statsample::Reliability::MultiScaleAnalysis.new(:name=>"Multi Scale analyss") do |m| m.scale "Scale 1", ds.clone(%w{v1 v2 v3 v4 v5 v6 v7 v8 v9 v10}) m.scale "Scale 2", ds.clone(%w{v11 v12 v13 v14 v15 v16 v17 v18 v19}) 
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib')
require 'statsample'

# Example: a one-sample t test against u=50 followed by an independent
# two-samples t test, both on vectors produced by rnorm.
Statsample::Analysis.store(Statsample::Test::T) do
  first_sample = rnorm(10)
  one_sample_test = Statsample::Test.t_one_sample(first_sample, {:u=>50})
  summary one_sample_test

  # Second vector is generated with rnorm(10, 2), after the first test,
  # exactly as in the original ordering.
  second_sample = rnorm(10,2)
  two_samples_test = Statsample::Test.t_two_samples_independent(first_sample, second_sample)
  summary two_samples_test
end

# Only run the stored analysis when the file is executed directly.
Statsample::Analysis.run_batch if __FILE__==$0
#!/usr/bin/env ruby1.9
# Scans every *.rb file below the current directory for "== Reference"
# documentation sections, collects the bullet lines ("* ...") that follow
# them and writes the unique, sorted list to references.txt using
# ReportBuilder.
require 'reportbuilder'

refs=[]
Dir.glob "**/*.rb" do |f|
  # Skip anything whose path mentions "pkg".
  next if f=~/pkg/
  reference=false
  # File.foreach closes the file once iteration finishes; the previous
  # File.open(f).each_line left one open handle per scanned file.
  File.foreach(f) do |l|
    if l=~/== Reference/
      reference=true
    elsif reference
      if l=~/\*\s+(.+)/
        refs.push $1
      else
        # First non-bullet line ends the reference section.
        reference=false
      end
    end
  end
end

rb=ReportBuilder.new(:name=>"References") do |g|
  refs.uniq.sort.each do |r|
    g.text "* #{r}"
  end
end
rb.save_text("references.txt")
# Assigns every entry of +config+ through the matching writer method.
#
# config:: Hash of attribute name (Symbol or String) => value.
#
# Keys without a public "<key>=" writer are ignored. The writer is checked
# with respond_to? because Object#methods returns Symbols on Ruby >= 1.9,
# so the former `methods.include? key.to_s` test was always false and no
# option was ever applied.
#
# Returns +config+ (the result of Hash#each).
def init_with config
  config.each {|key,value|
    writer="#{key}="
    self.send(writer,value) if respond_to?(writer)
  }
end
# Executes the stored analysis block.
# A block that declares no mandatory parameter (arity < 1) is evaluated
# with instance_eval, so it runs with this object as self; otherwise the
# block is called with this object as its argument.
# Returns whatever the block returns.
def run
  if @block.arity >= 1
    @block.call(self)
  else
    instance_eval(&@block)
  end
end
module Statsample
  module Analysis
    # Suite variant that sends its output to a ReportBuilder object
    # instead of an output stream: summary, desc, echo and the graph
    # helpers all add elements to #rb.
    class SuiteReportBuilder < Suite
      # ReportBuilder (or section) that receives every generated element.
      attr_accessor :rb

      # opts:: Hash of options, or a bare name.
      #        [:rb] existing builder to add to; a new ReportBuilder named
      #              after the suite is created when it is missing.
      def initialize(opts=Hash.new,&block)
        if !opts.is_a? Hash
          opts={:name=>opts}
        end
        super(opts,&block)
        @rb=opts[:rb] || ReportBuilder.new(:name=>name)
      end

      # Runs the block (if any) and saves the report on +filename+.
      def generate(filename)
        run if @block
        @rb.save(filename)
      end

      # Runs the block (if any) and returns the report as text.
      def to_text
        run if @block
        @rb.to_text
      end

      # Adds the object itself to the report.
      def summary(o)
        @rb.add(o)
      end

      # Adds the description to the report.
      def desc(d)
        @rb.add(d)
      end

      # Adds every argument to the report.
      def echo(*args)
        args.each do |a|
          @rb.add(a)
        end
      end

      # Adds the boxplot to the report instead of showing it.
      def boxplot(*args)
        @rb.add(old_boxplot(*args))
      end

      # Adds the histogram to the report instead of showing it.
      def histogram(*args)
        @rb.add(old_histogram(*args))
      end

      # Adds the scatterplot to the report instead of showing it.
      # This method used to be a second, duplicated definition of
      # #boxplot, so scatterplots fell back to Suite#scatterplot, which
      # opens an external viewer and never reaches the report.
      def scatterplot(*args)
        @rb.add(old_scatterplot(*args))
      end
    end
  end
end
# Runs the stored analysis named in +args+, in the given order.
# With no arguments every stored analysis is run.
# Raises a RuntimeError, before running anything, when any requested
# name is not stored.
# Returns the list of names that were run.
def self.run(*args)
  names = args.size==0 ? stored_analysis.keys : args
  unknown = names - stored_analysis.keys
  raise "Analysis #{names} doesn't exists" unless unknown.size==0
  names.each { |name| stored_analysis[name].run }
end
# Add analysis +*args+ to a reportbuilder object.
# Without arguments, add all stored analysis.
# Each analysis is wrapped inside a ReportBuilder::Section object.
# This is the method used by save() and to_text().
#
# rb::   object that receives the sections through #add
# args:: names of the stored analysis to add
#
# Raises a RuntimeError naming the unknown analysis when any requested
# name is not stored. The message previously interpolated +name+, which
# at that point is not the block variable used further down and so never
# identified the missing analysis.
def self.add_to_reportbuilder(rb, *args)
  args=stored_analysis.keys if args.size==0
  missing=args - stored_analysis.keys
  raise "Analysis #{missing} doesn't exists" if missing.size>0
  args.each do |name|
    section=ReportBuilder::Section.new(:name=>stored_analysis[name].name)
    rb_an=stored_analysis[name].add_to_reportbuilder(section)
    rb.add(section)
    rb_an.run
  end
end
# = Generic Anova one-way.
# You could enter the sum of squares or the mean squares. You
# should enter the degrees of freedom for numerator and denominator.
# == Usage
#  anova=Statsample::Anova::OneWay.new(:ss_num=>10,:ss_den=>20, :df_num=>2, :df_den=>10, :name=>"ANOVA for....")
class OneWay
  include Summarizable
  attr_reader :df_num, :df_den, :ss_num, :ss_den, :ms_num, :ms_den, :ms_total, :df_total, :ss_total
  # Name of ANOVA Analisys
  attr_accessor :name
  # Label of the denominator row on the summary table
  attr_accessor :name_denominator
  # Label of the numerator row on the summary table
  attr_accessor :name_numerator

  # opts:: Hash with
  #        [:df_num, :df_den] degrees of freedom (mandatory)
  #        [:ss_num, :ss_den] sums of squares, or
  #        [:ms_num, :ms_den] mean squares
  #        plus any attribute with a writer (:name, :name_numerator, ...).
  #
  # Raises ArgumentError if the degrees of freedom, or both the sums of
  # squares and the mean squares, are missing.
  def initialize(opts=Hash.new)
    @name=@name_numerator=@name_denominator=nil
    # Work on a copy: the statistic keys are removed below and the
    # caller's hash should not lose them.
    opts=opts.dup
    # First see if sum of squares or mean squares are entered
    raise ArgumentError, "You should set d.f." unless (opts.has_key? :df_num and opts.has_key? :df_den)
    @df_num=opts.delete :df_num
    @df_den=opts.delete :df_den
    @df_total=@df_num+@df_den
    if(opts.has_key? :ss_num and opts.has_key? :ss_den)
      @ss_num = opts.delete :ss_num
      @ss_den =opts.delete :ss_den
      @ms_num =@ss_num.quo(@df_num)
      @ms_den =@ss_den.quo(@df_den)
    elsif (opts.has_key? :ms_num and opts.has_key? :ms_den)
      @ms_num =opts.delete :ms_num
      @ms_den =opts.delete :ms_den
      @ss_num =@ms_num * @df_num
      # Was @ss_den * @df_den, which multiplied a still-unset value.
      @ss_den =@ms_den * @df_den
    else
      # Without this guard the sums below failed on nil.
      raise ArgumentError, "You should set ss or ms for numerator and denominator"
    end
    @ss_total=@ss_num+@ss_den
    @ms_total=@ms_num+@ms_den
    # The numerator of F carries the explained variance, the denominator
    # the unexplained one (the two default labels used to be swapped).
    opts_default={:name=>"ANOVA",
      :name_denominator=>_("Unexplained variance"),
      :name_numerator=>_("Explained variance")}
    @opts=opts_default.merge(opts)
    # Iterate the merged options so the defaults are applied too; looping
    # over the caller's keys only left @name and the labels as nil.
    @opts.each {|k,v| send("#{k}=", v) if self.respond_to? "#{k}=" }
    @f_object=Statsample::Test::F.new(@ms_num, @ms_den, @df_num,@df_den)
  end

  # F value
  def f
    @f_object.f
  end

  # P-value of F test
  def probability
    @f_object.probability
  end

  def report_building(builder) #:nodoc:
    builder.section(:name=>@name) do |b|
      report_building_table(b)
    end
  end

  def report_building_table(builder) #:nodoc:
    builder.table(:name=>_("%s Table") % @name, :header=>%w{source ss df ms f p}.map {|v| _(v)}) do |t|
      t.row([@name_numerator, sprintf("%0.3f",@ss_num), @df_num, sprintf("%0.3f",@ms_num), sprintf("%0.3f",f), sprintf("%0.3f", probability)])
      t.row([@name_denominator, sprintf("%0.3f",@ss_den), @df_den, sprintf("%0.3f",@ms_den), "", ""])
      t.row([_("Total"), sprintf("%0.3f",@ss_total), @df_total, sprintf("%0.3f",@ms_total),"",""])
    end
  end
end
# Accepts the vectors either as a single Array (first argument) or as
# separate Statsample::Vector arguments, optionally followed by a Hash
# of options:
#   OneWayWithVectors.new([v1,v2,v3], :name=>"My anova")
#   OneWayWithVectors.new(v1,v2,v3, :name=>"My anova")
# The sums of squares and degrees of freedom passed to OneWay are always
# computed from the vectors and override any value given in the options.
def initialize(*args)
  if args[0].is_a? Array
    @vectors=args.shift
  else
    @vectors=args.find_all {|v| v.is_a? Statsample::Vector}
  end
  # Look for the options hash in both call styles; it used to be read
  # only in the second one, so options given after an Array of vectors
  # were silently ignored.
  opts=args.find {|v| v.is_a? Hash}
  opts||=Hash.new
  opts_default={:name=>_("Anova One-Way"),
    :name_numerator=>_("Between Groups"),
    :name_denominator=>_("Within Groups"),
    :summary_descriptives=>false,
    :summary_levene=>true,
    :summary_contrasts=>true
  }
  @opts=opts_default.merge(opts).merge(:ss_num=>ssbg, :ss_den=>sswg, :df_num=>df_bg, :df_den=>df_wg)
  @contrasts=[]
  super(@opts)
end
# = Generic Anova two-way.
# You could enter the sum of squares or the mean squares for a, b, axb and within.
# You should enter the degrees of freedom for a,b and within, because df_axb=df_a*df_b
# == Usage
#  anova=Statsample::Anova::TwoWay.new(:ss_a=>10,:ss_b=>20,:ss_axb=>10, :ss_within=>20, :df_a=>2, :df_b=>3, :df_within=>100, :name=>"ANOVA for....")
class TwoWay
  include Summarizable
  attr_reader :df_a, :df_b, :df_axb, :df_within, :df_total
  attr_reader :ss_a, :ss_b, :ss_axb, :ss_within, :ss_total
  attr_reader :ms_a, :ms_b, :ms_axb, :ms_within, :ms_total
  # Name of ANOVA Analisys
  attr_accessor :name
  # Name of a factor
  attr_accessor :name_a
  # Name of b factor
  attr_accessor :name_b
  # Name of within factor
  attr_accessor :name_within

  attr_reader :f_a_object, :f_b_object, :f_axb_object

  # opts:: Hash with
  #        [:df_a, :df_b, :df_within] degrees of freedom (mandatory)
  #        [:ss_a, :ss_b, :ss_axb, :ss_within] sums of squares, or
  #        [:ms_a, :ms_b, :ms_axb, :ms_within] mean squares
  #        [:name, :name_a, :name_b, :name_within] labels
  #
  # Raises ArgumentError when a d.f. is missing and RuntimeError when
  # neither the complete set of ss nor the complete set of ms is given.
  def initialize(opts=Hash.new)
    # Work on a copy: the statistic keys are removed below and the
    # caller's hash should not lose them.
    opts=opts.dup
    # First see if sum of squares or mean squares are entered
    raise ArgumentError, "You should set all d.f." unless [:df_a, :df_b, :df_within].all? {|v| opts.has_key? v}

    @df_a=opts.delete :df_a
    @df_b=opts.delete :df_b
    @df_axb=@df_a*@df_b
    @df_within=opts.delete :df_within
    @df_total=@df_a+@df_b+@df_axb+@df_within

    if [:ss_a, :ss_b, :ss_axb, :ss_within].all? {|v| opts.has_key? v}
      @ss_a = opts.delete :ss_a
      @ss_b = opts.delete :ss_b
      @ss_axb = opts.delete :ss_axb
      @ss_within = opts.delete :ss_within

      @ms_a =@ss_a.quo(@df_a)
      @ms_b =@ss_b.quo(@df_b)
      @ms_axb =@ss_axb.quo(@df_axb)
      @ms_within =@ss_within.quo(@df_within)
    elsif [:ms_a, :ms_b, :ms_axb, :ms_within].all? {|v| opts.has_key? v}
      @ms_a = opts.delete :ms_a
      @ms_b = opts.delete :ms_b
      @ms_axb = opts.delete :ms_axb
      @ms_within = opts.delete :ms_within

      @ss_a =@ms_a*@df_a
      @ss_b =@ms_b*@df_b
      @ss_axb =@ms_axb*@df_axb
      @ss_within =@ms_within*@df_within
    else
      # The message used to read "ss or ss".
      raise "You should set all ss or ms"
    end
    @ss_total=@ss_a+@ss_b+@ss_axb+@ss_within
    @ms_total=@ms_a+@ms_b+@ms_axb+@ms_within
    opts_default={:name=>_("ANOVA Two-Way"),
      :name_a=>_("A"),
      :name_b=>_("B"),
      :name_within=>_("Within")
    }
    @opts=opts_default.merge(opts)
    # Only the label options listed in opts_default are assigned here.
    opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
    @f_a_object=Statsample::Test::F.new(@ms_a,@ms_within,@df_a,@df_within)
    @f_b_object=Statsample::Test::F.new(@ms_b,@ms_within,@df_b,@df_within)
    @f_axb_object=Statsample::Test::F.new(@ms_axb,@ms_within,@df_axb,@df_within)
  end

  # F value for factor a
  def f_a
    @f_a_object.f
  end

  # F value for factor b
  def f_b
    @f_b_object.f
  end

  # F value for the a x b interaction
  def f_axb
    @f_axb_object.f
  end

  # P-value of the F test for factor a
  def f_a_probability
    @f_a_object.probability
  end

  # P-value of the F test for factor b
  def f_b_probability
    @f_b_object.probability
  end

  # P-value of the F test for the a x b interaction
  def f_axb_probability
    @f_axb_object.probability
  end

  def report_building(builder) #:nodoc:
    builder.section(:name=>@name) do |b|
      report_building_table(b)
    end
  end

  def report_building_table(builder) #:nodoc:
    builder.table(:name=>_("%s Table") % @name, :header=>%w{source ss df ms f p}.map {|v| _(v)}) do |t|
      t.row([@name_a, "%0.3f" % @ss_a, @df_a, "%0.3f" % @ms_a , "%0.3f" % f_a, "%0.4f" % f_a_probability] )
      t.row([@name_b, "%0.3f" % @ss_b, @df_b, "%0.3f" % @ms_b , "%0.3f" % f_b, "%0.4f" % f_b_probability] )
      t.row(["%s X %s" % [@name_a, @name_b], "%0.3f" % @ss_axb, @df_axb, "%0.3f" % @ms_axb , "%0.3f" % f_axb, "%0.4f" % f_axb_probability] )
      t.row([@name_within, "%0.3f" % @ss_within, @df_within, nil,nil,nil] )
      t.row([_("Total"), "%0.3f" % @ss_total, @df_total, nil,nil,nil] )
    end
  end
end
# Builds a two-way anova from three vectors.
#
# opts:: Hash with the mandatory keys
#        [:a]         vector of the first factor
#        [:b]         vector of the second factor
#        [:dependent] vector of the dependent variable
#        plus the optional :name, :name_a, :name_b, :name_within,
#        :summary_descriptives (default true) and :summary_levene
#        (default false).
#
# For now, only equal sample cells allowed: raises a RuntimeError if the
# a x b cells differ in size, or if a mandatory key is missing.
def initialize(opts=Hash.new)
  raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
  @a_var='a'
  @b_var='b'
  @dep_var='dependent'
  # Keep only the cases that are valid on the three vectors.
  @a_vector, @b_vector, @dep_vector=Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]

  ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}.to_dataset
  @ds=ds.clone_only_valid
  # Number of levels of each factor
  _p=@a_vector.factors.size
  _q=@b_vector.factors.size
  # Grand mean of the dependent variable
  @x_general=@dep_vector.mean
  @axb_means={}
  @axb_sd={}
  @vectors=[]
  n=nil
  # Mean and sd of every a x b cell; n is the size of the first cell and
  # every other cell has to match it.
  @ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
    @axb_means[k]=v.mean
    @axb_sd[k]=v.sd
    @vectors << v
    n||=v.size
    raise "All cell sizes should be equal" if n!=v.size
  }

  # Marginal means for each level of a and of b
  @a_means={}
  @ds.to_multiset_by_split(a_var).each_vector(dep_var) {|k,v|
    @a_means[k]=v.mean
  }
  @b_means={}
  @ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
    @b_means[k]=v.mean
  }
  ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v|
    ac+(@a_means[v]-@x_general)**2
  }
  ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
    ac+(@b_means[v]-@x_general)**2
  }
  # Squared deviation of every case from the mean of its own cell
  ss_within=@ds.collect {|row|
    (row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
  }.sum
  ss_axb=n*@axb_means.inject(0) {|ac,v|
    j,k=v[0]
    xjk=v[1]
    ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
  }
  df_a=_p-1
  df_b=_q-1
  df_within=(_p*_q)*(n-1)

  opts_default={:name=>_("Anova Two-Way on %s") % @ds[dep_var].name,
    :name_a=>@ds[a_var].name,
    :name_b=>@ds[b_var].name,
    :summary_descriptives=>true,
    :summary_levene=>false}
  merged=opts_default.merge(opts)

  @opts=merged.merge({:ss_a=>ss_a,:ss_b=>ss_b, :ss_axb=>ss_axb, :ss_within=>ss_within, :df_a=>df_a, :df_b=>df_b, :df_within=>df_within})

  super(@opts)
  # TwoWay#initialize assigns only :name, :name_a, :name_b and
  # :name_within, so the two summary switches (and the defaults declared
  # above) were never stored and both accessors stayed nil.
  @summary_descriptives=merged[:summary_descriptives]
  @summary_levene=merged[:summary_levene]
end
module Statsample
  # Shortcut constructors for the Anova classes: every method forwards
  # its arguments untouched to the +new+ of the matching class.
  module Anova
    # Same as OneWay.new(*args)
    def self.oneway(*args)
      OneWay.new(*args)
    end

    # Same as TwoWay.new(*args)
    def self.twoway(*args)
      TwoWay.new(*args)
    end

    # Same as OneWayWithVectors.new(*args)
    def self.oneway_with_vectors(*args)
      OneWayWithVectors.new(*args)
    end

    # Same as TwoWayWithVectors.new(*args)
    def self.twoway_with_vectors(*args)
      TwoWayWithVectors.new(*args)
    end
  end
end
#
# == Usage
#   a = [1,2,3,4,5,6].to_scale
#   b = [2,3,4,5,6,7].to_scale
#   pearson = Statsample::Bivariate::Pearson.new(a,b)
#   puts pearson.r
#   puts pearson.t
#   puts pearson.probability
#   puts pearson.summary
#
class Pearson
  include Statsample::Test
  include Summarizable
  # Name of correlation
  attr_accessor :name
  # Tails for probability (:both, :left or :right)
  attr_accessor :tails
  # Number of cases valid on both vectors
  attr_accessor :n

  # v1, v2:: vectors to correlate; only the cases valid on both are kept
  # opts::   Hash of options; every key with a writer is assigned
  #          [:name]  name of the correlation
  #          [:tails] :both (default), :left or :right
  def initialize(v1,v2,opts=Hash.new)
    @v1_name,@v2_name = v1.name,v2.name
    @v1,@v2 = Statsample.only_valid_clone(v1,v2)
    @n=@v1.size
    opts_default={
      :name=>_("Correlation (%s - %s)") % [@v1_name, @v2_name],
      :tails=>:both
    }
    # Defaults first, caller options last: the former
    # opts.merge(opts_default) made the defaults win, so :name and
    # :tails given by the caller were always discarded.
    @opts=opts_default.merge(opts)
    @opts.each{|k,v|
      self.send("#{k}=",v) if self.respond_to? "#{k}="
    }
  end

  # Pearson correlation coefficient between the two vectors
  def r
    Statsample::Bivariate.pearson(@v1,@v2)
  end

  # t value for the correlation
  def t
    Statsample::Bivariate.t_pearson(@v1,@v2)
  end

  # Probability of t, on n-2 degrees of freedom, for the selected tails
  def probability
    p_using_cdf(Distribution::T.cdf(t, @v1.size-2), tails)
  end

  def report_building(builder)
    builder.text(_("%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s tails)") % [@name, r,t, (n-2), probability, tails])
  end
end
# Retrieves the value for t test for a pearson correlation
# giving r and vector size:
#   t = r * sqrt((size - 2) / (1 - r^2))
# Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
def t_r(r,size)
  degrees_of_freedom = (size - 2).to_f
  unexplained = 1 - r**2
  r * Math::sqrt(degrees_of_freedom / unexplained)
end
# Probability value for a given t, sample size and number of tails,
# read from the cumulative t distribution with size-2 degrees of freedom.
# Accepted values for +tails+:
# * :both or 2 : for r!=0 (default)
# * :right, :positive or 1 : for r > 0
# * :left, :negative : for r < 0
def prop_pearson(t, size, tails=:both)
  # Normalize the aliases to :both, :right or :left.
  direction =
    if tails==2
      :both
    elsif tails==1 or tails==:positive
      :right
    elsif tails==:negative
      :left
    else
      tails
    end
  two_tailed = (direction==:both)
  multiplier = two_tailed ? 2 : 1
  # A two-tailed test always reads the lower tail.
  t = -t if two_tailed and t>0
  lower_tail = Distribution::T.cdf(t, size-2)
  return 1.0-(lower_tail*multiplier) if direction==:right
  lower_tail*multiplier
end
# Covariance matrix calculated pair by pair through ds.collect_matrix.
# * a cell is nil when either vector is not of :scale type
# * the diagonal holds the variance of the vector
# * every other cell holds covariance(row, col); the value computed for
#   [row, col] is reused when the mirrored cell [col, row] is requested.
def covariance_matrix_pairwise(ds)
  computed = {}
  ds.collect_matrix do |row,col|
    next nil unless ds[row].type==:scale and ds[col].type==:scale
    next ds[row].variance if row==col
    mirrored = computed[[col,row]]
    next mirrored unless mirrored.nil?
    computed[[row,col]] = covariance(ds[row],ds[col])
  end
end
# Order of rows and columns depends on Dataset#fields order
#
# Cells on the diagonal, or involving a non-scale vector, are nil. Every other
# cell is the probability of the pairwise Pearson t statistic, using the
# number of pairwise-valid cases as the sample size.
def correlation_probability_matrix(ds, tails=:both)
  rows = ds.fields.collect do |row|
    ds.fields.collect do |col|
      if row == col || ds[row].type != :scale || ds[col].type != :scale
        nil
      else
        # Build the pairwise-valid clones only for cells that need a p-value.
        v1a, _v2a = Statsample.only_valid_clone(ds[row], ds[col])
        prop_pearson(t_pearson(ds[row], ds[col]), v1a.size, tails)
      end
    end
  end
  Matrix.rows(rows)
end

# Spearman ranked correlation coefficient (rho) between 2 vectors
def spearman(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
  pearson(v1r,v2r)
end

# Calculate Point biserial correlation. Equal to Pearson correlation, with
# one dichotomous value replaced by "0" and the other by "1"
#
# Raises TypeError when the first vector does not have exactly two factors
# or when the second vector is not of type :scale.
def point_biserial(dichotomous,continous)
  ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
  raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
  raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
  f0=ds['d'].factors.sort[0]
  m0=ds.filter_field('c') {|c| c['d']==f0}
  m1=ds.filter_field('c') {|c| c['d']!=f0}
  ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
end

# Kendall Rank Correlation Coefficient (Tau a)
# Based on Hervé Abdi article
def tau_a(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  # n must be the number of pairwise-valid cases: the ordered pairs below are
  # built from the valid-only clones, so using the raw vector size would
  # inflate the denominator whenever data are missing.
  n=v1a.size
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
  o1=ordered_pairs(v1r)
  o2=ordered_pairs(v2r)
  delta= o1.size*2-(o2 & o1).size*2
  1-(delta * 2 / (n*(n-1)).to_f)
end

# Calculates Goodman and Kruskal’s Tau b correlation.
# Tb is an asymmetric P-R-E measure of association for nominal scales
# (Mielke, X)
#
# Tau-b defines perfect association as strict monotonicity. Although it
# requires strict monotonicity to reach 1.0, it does not penalize ties as
# much as some other measures.
# == Reference
# Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA.
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm def tau_b(matrix) v=pairs(matrix) ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f) end # Calculates Goodman and Kruskal's gamma. # # Gamma is the surplus of concordant pairs over discordant pairs, as a # percentage of all pairs ignoring ties. # # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm def gamma(matrix) v=pairs(matrix) (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f end # Calculate indexes for a matrix the rows and cols has to be ordered def pairs(matrix) # calculate concordant #p matrix rs=matrix.row_size cs=matrix.column_size conc=disc=ties_x=ties_y=0 (0...(rs-1)).each do |x| (0...(cs-1)).each do |y| ((x+1)...rs).each do |x2| ((y+1)...cs).each do |y2| # #p sprintf("%d:%d,%d:%d",x,y,x2,y2) conc+=matrix[x,y]*matrix[x2,y2] end end end end (0...(rs-1)).each {|x| (1...(cs)).each{|y| ((x+1)...rs).each{|x2| (0...y).each{|y2| # #p sprintf("%d:%d,%d:%d",x,y,x2,y2) disc+=matrix[x,y]*matrix[x2,y2] } } } } (0...(rs-1)).each {|x| (0...(cs)).each{|y| ((x+1)...(rs)).each{|x2| ties_x+=matrix[x,y]*matrix[x2,y] } } } (0...rs).each {|x| (0...(cs-1)).each{|y| ((y+1)...(cs)).each{|y2| ties_y+=matrix[x,y]*matrix[x,y2] } } } {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x} end def ordered_pairs(vector) d=vector.data a=[] (0...(d.size-1)).each{|i| ((i+1)...(d.size)).each {|j| a.push([d[i],d[j]]) } } a end =begin def sum_of_codeviated(v1,v2) v1a,v2a=Statsample.only_valid(v1,v2) sum=0 (0...v1a.size).each{|i| sum+=v1a[i]*v2a[i] } sum-((v1a.sum*v2a.sum) / v1a.size.to_f) end =end # Report the minimum number of cases valid of a covariate matrix # based on a dataset def min_n_valid(ds) min=ds.cases m=n_valid_matrix(ds) for x in 0...m.row_size for y in 0...m.column_size min=m[x,y] if m[x,y] < min end end min end end end end ================================================ FILE: lib/statsample/codification.rb ================================================ require 
'yaml' module Statsample # This module aids to code open questions # * Select one or more vectors of a dataset, to create a yaml files, on which each vector is a hash, which keys and values are the vector's factors . If data have Statsample::SPLIT_TOKEN on a value, each value will be separated on two or more hash keys. # * Edit the yaml and replace the values of hashes with your codes. If you need to create two or mores codes for an answer, use the separator (default Statsample::SPLIT_TOKEN) # * Recode the vectors, loading the yaml file: # * recode_dataset_simple!() : The new vectors have the same name of the original plus "_recoded" # * recode_dataset_split!() : Create equal number of vectors as values. See Vector.add_vectors_by_split() for arguments # # Usage: # recode_file="recodification.yaml" # phase=:first # flag # if phase==:first # File.open(recode_file,"w") {|fp| # Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp) # } # # Edit the file recodification.yaml and verify changes # elsif phase==:second # File.open(recode_file,"r") {|fp| # Statsample::Codification.verify(fp,['vector1']) # } # # Add new vectors to the dataset # elsif phase==:third # File.open(recode_file,"r") {|fp| # Statsample::Codification.recode_dataset_split!(ds,fp,"*") # } # end # module Codification class << self # Create a hash, based on vectors, to create the dictionary. # The keys will be vectors name on dataset and the values # will be hashes, with keys = values, for recodification def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN) raise ArgumentError,"Array should't be empty" if vectors.size==0 pro_hash=vectors.inject({}){|h,v_name| raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? 
v_name v=dataset[v_name] split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?} factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac } h[v_name]=factors h } pro_hash end # Create a yaml to create a dictionary, based on vectors # The keys will be vectors name on dataset and the values # will be hashes, with keys = values, for recodification # # v1=%w{a,b b,c d}.to_vector # ds={"v1"=>v1}.to_dataset # Statsample::Codification.create_yaml(ds,['v1']) # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n" def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN) pro_hash=create_hash(dataset, vectors, sep) YAML.dump(pro_hash,io) end # Create a excel to create a dictionary, based on vectors. # Raises an error if filename exists # The rows will be: # * field: name of vector # * original: original name # * recoded: new code def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN) require 'spreadsheet' if File.exist?(filename) raise "Exists a file named #{filename}. Delete ir before overwrite." end book = Spreadsheet::Workbook.new sheet = book.create_worksheet sheet.row(0).concat(%w{field original recoded}) i=1 create_hash(dataset, vectors, sep).sort.each do |field, inner_hash| inner_hash.sort.each do |k,v| sheet.row(i).concat([field.dup,k.dup,v.dup]) i+=1 end end book.write(filename) end # From a excel generates a dictionary hash # to use on recode_dataset_simple!() or recode_dataset_split!(). # def excel_to_recoded_hash(filename) require 'spreadsheet' h={} book = Spreadsheet.open filename sheet= book.worksheet 0 row_i=0 sheet.each do |row| row_i+=1 next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil? h[row[0]]={} if h[row[0]].nil? 
h[row[0]][row[1]]=row[2] end h end def inverse_hash(h, sep=Statsample::SPLIT_TOKEN) h.inject({}) do |a,v| v[1].split(sep).each do |val| a[val]||=[] a[val].push(v[0]) end a end end def dictionary(h, sep=Statsample::SPLIT_TOKEN) h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a } end def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN) dict=dictionary(h,sep) new_data=v.splitted(sep) new_data.collect do |c| if c.nil? nil else c.collect{|value| dict[value] }.flatten.uniq end end end def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN) _recode_dataset(dataset,dictionary_hash ,sep,false) end def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN) _recode_dataset(dataset, dictionary_hash, sep,true) end def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false) v_names||=h.keys v_names.each do |v_name| raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c| if c.nil? 
nil else c.join(sep) end }.to_vector if(split) recoded.split_by_separator(sep).each {|k,v| dataset[v_name+"_"+k]=v } else dataset[v_name+"_recoded"]=recoded end end end def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>) require 'pp' v_names||=h.keys v_names.each{|v_name| inverse=inverse_hash(h[v_name],sep) io.puts "- Field: #{v_name}" inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v| io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'" } } end end end end ================================================ FILE: lib/statsample/converter/csv.rb ================================================ module Statsample class CSV < SpreadsheetBase if RUBY_VERSION<"1.9" require 'fastercsv' CSV_klass=::FasterCSV else require 'csv' CSV_klass=::CSV end class << self def read19(filename,ignore_lines=0,csv_opts=Hash.new) #default first line is header csv_opts.merge!(:headers=>true, :header_converters => :symbol) csv = CSV_klass::Table.new(CSV_klass::read(filename,'r',csv_opts)) csv_headers = if csv_opts[:headers] csv.headers else #as in R, if no header we name the headers as V1,V2,V3,V4,.. 1.upto(csv.first.length).collect { |i| "V#{i}" } end #we invert row -> column. It means csv[0] is the first column and not row. Similar to R csv.by_col! 
thash = {} csv_headers.each_with_index do |header,idx| thash[header] = Statsample::Vector.new(csv[idx].drop(ignore_lines)) end Statsample::Dataset.new(thash) end # Returns a Dataset based on a csv file # # USE: # ds=Statsample::CSV.read("test_csv.csv") def read(filename, empty=[''],ignore_lines=0,csv_opts=Hash.new) first_row=true fields=[] #fields_data={} ds=nil line_number=0 csv=CSV_klass.open(filename,'rb', csv_opts) csv.each do |row| line_number+=1 if(line_number<=ignore_lines) #puts "Skip line" next end row.collect!{|c| c.to_s } if first_row fields=extract_fields(row) ds=Statsample::Dataset.new(fields) first_row=false else rowa=process_row(row,empty) ds.add_case(rowa,false) end end convert_to_scale_and_date(ds,fields) ds.update_valid_data ds end # Save a Dataset on a csv file # # USE: # Statsample::CSV.write(ds,"test_csv.csv") def write(dataset,filename, convert_comma=false,*opts) writer=CSV_klass.open(filename,'w',*opts) writer << dataset.fields dataset.each_array do|row| if(convert_comma) row.collect!{|v| v.to_s.gsub(".",",")} end writer << row end writer.close end end end end ================================================ FILE: lib/statsample/converter/spss.rb ================================================ module Statsample module SPSS class << self # Export a SPSS Matrix with tetrachoric correlations . # # Use: # ds=Statsample::Excel.read("my_data.xls") # puts Statsample::SPSS.tetrachoric_correlation_matrix(ds) def tetrachoric_correlation_matrix(ds) dsv=ds.dup_only_valid # Delete all vectors doesn't have variation dsv.fields.each{|f| if dsv[f].factors.size==1 dsv.delete_vector(f) else dsv[f]=dsv[f].dichotomize end } tcm=Statsample::Bivariate.tetrachoric_correlation_matrix(dsv) n=dsv.fields.collect {|f| sprintf("%d",dsv[f].size) } meanlist=dsv.fields.collect{|f| sprintf("%0.3f", dsv[f].mean) } stddevlist=dsv.fields.collect{|f| sprintf("%0.3f", dsv[f].sd) } out=<<-HEREDOC MATRIX DATA VARIABLES=ROWTYPE_ #{dsv.fields.join(",")}. 
BEGIN DATA
N #{n.join(" ")}
MEAN #{meanlist.join(" ")}
STDDEV #{stddevlist.join(" ")}
HEREDOC
        # One CORR line per row, holding the lower triangle including the
        # diagonal (columns 0..i).
        tcm.row_size.times {|i|
          out +="CORR "
          (i+1).times {|j|
            out+=sprintf("%0.3f",tcm[i,j])+" "
          }
          out +="\n"
        }
        out+="END DATA.\nEXECUTE.\n"
      end
    end
  end
end
================================================
FILE: lib/statsample/converters.rb
================================================
require 'statsample/converter/spss'
module Statsample
  # Create and dumps Datasets on a database
  module Database
    class << self
      # Read a database query and returns a Dataset
      #
      # Columns whose type_name is 'INTEGER' or 'DOUBLE' become :scale
      # vectors; every other column becomes :nominal.
      #
      # USE:
      #
      #  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
      #  Statsample::Database.read(dbh, "SELECT * FROM test")
      #
      def read(dbh,query)
        require 'dbi'
        sth=dbh.execute(query)
        vectors={}
        fields=[]
        sth.column_info.each {|c|
          vectors[c['name']]=Statsample::Vector.new([])
          vectors[c['name']].name=c['name']
          vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
          fields.push(c['name'])
        }
        ds=Statsample::Dataset.new(vectors,fields)
        # Cases are added without per-row validation; the dataset is
        # revalidated once after the loop.
        sth.fetch do |row|
          ds.add_case(row.to_a, false )
        end
        ds.update_valid_data
        ds
      end
      # Insert each case of the Dataset on the selected table
      #
      # Values are bound through "?" placeholders; the table and field names
      # are interpolated into the statement as-is.
      #
      # USE:
      #
      #  ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
      #  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
      #  Statsample::Database.insert(ds,dbh,"test")
      #
      def insert(ds, dbh, table)
        require 'dbi'
        query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
        sth=dbh.prepare(query)
        ds.each_array{|c| sth.execute(*c) }
        return true
      end
      # Create a sql, based on a given Dataset
      #
      # USE:
      #
      #  ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
      #  Statsample::Database.create_sql(ds,'names')
      #  ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
      #
      def create_sql(ds,table,charset="UTF8")
        sql="CREATE TABLE #{table} ("
        fields=ds.fields.collect{|f|
          v=ds[f]
          f+" "+v.db_type
        }
sql+fields.join(",\n ")+") CHARACTER SET=#{charset};" end end end module Mondrian class << self def write(dataset,filename) File.open(filename,"wb") do |fp| fp.puts dataset.fields.join("\t") dataset.each_array_with_nils do |row| row2=row.collect{|v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") } fp.puts row2.join("\t") end end end end end class SpreadsheetBase class << self def extract_fields(row) i=0; fields=row.to_a.collect{|c| if c.nil? i+=1 "var%05d" % i else c.to_s.downcase end } fields.recode_repeated end def process_row(row,empty) row.to_a.map do |c| if empty.include?(c) nil else if c.is_a? String and c.is_number? if c=~/^\d+$/ c.to_i else c.gsub(",",".").to_f end else c end end end end def convert_to_scale_and_date(ds,fields) fields.each do |f| if ds[f].can_be_scale? ds[f].type=:scale elsif ds[f].can_be_date? ds[f].type=:date end end end end end class PlainText < SpreadsheetBase class << self def read(filename, fields) ds=Statsample::Dataset.new(fields) fp=File.open(filename,"r") fp.each_line do |line| row=process_row(line.strip.split(/\s+/),[""]) next if row==["\x1A"] ds.add_case_array(row) end convert_to_scale_and_date(ds,fields) ds.update_valid_data fields.each {|f| ds[f].name=f } ds end end end class Excel < SpreadsheetBase class << self # Write a Excel spreadsheet based on a dataset # * TODO: Format nicely date values def write(dataset,filename) require 'spreadsheet' book = Spreadsheet::Workbook.new sheet = book.create_worksheet format = Spreadsheet::Format.new :color => :blue, :weight => :bold sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings sheet.row(0).default_format = format i=1 dataset.each_array{|row| sheet.row(i).concat(row) i+=1 } book.write(filename) end # This should be fixed. # If we have a Formula, should be resolver first def preprocess_row(row, dates) i=-1 row.collect!{|c| i+=1 if c.is_a? Spreadsheet::Formula if(c.value.is_a? Spreadsheet::Excel::Error) nil else c.value end elsif dates.include? i and !c.nil? and c.is_a? 
Numeric row.date(i) else c end } end private :process_row, :preprocess_row # Returns a dataset based on a xls file # USE: # ds = Statsample::Excel.read("test.xls") # def read(filename, opts=Hash.new) require 'spreadsheet' raise "options should be Hash" unless opts.is_a? Hash opts_default={ :worksheet_id=>0, :ignore_lines=>0, :empty=>[''] } opts=opts_default.merge opts worksheet_id=opts[:worksheet_id] ignore_lines=opts[:ignore_lines] empty=opts[:empty] first_row=true fields=[] fields_data={} ds=nil line_number=0 book = Spreadsheet.open filename sheet= book.worksheet worksheet_id sheet.each do |row| begin dates=[] row.formats.each_index{|i| if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY" dates.push(i) end } line_number+=1 next if(line_number<=ignore_lines) preprocess_row(row,dates) if first_row fields=extract_fields(row) ds=Statsample::Dataset.new(fields) first_row=false else rowa=process_row(row,empty) (fields.size - rowa.size).times { rowa << nil } ds.add_case(rowa,false) end rescue => e error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}" raise end end convert_to_scale_and_date(ds, fields) ds.update_valid_data fields.each {|f| ds[f].name=f } ds.name=filename ds end end end module Mx class << self def write(dataset,filename,type=:covariance) puts "Writing MX File" File.open(filename,"w") do |fp| fp.puts "! #{filename}" fp.puts "! Output generated by Statsample" fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}" fp.puts "Labels "+dataset.fields.join(" ") case type when :raw fp.puts "Rectangular" dataset.each do |row| out=dataset.fields.collect do |f| if dataset[f].is_valid? row[f] row[f] else "." end end fp.puts out.join("\t") end fp.puts "End Rectangular" when :covariance fp.puts " CMatrix Full" cm=Statsample::Bivariate.covariance_matrix(dataset) d=(0...(cm.row_size)).collect {|row| (0...(cm.column_size)).collect{|col| cm[row,col].nil? ? "." 
: sprintf("%0.3f", cm[row,col])
              }.join(" ")
            }.join("\n")
            fp.puts d
          end
        end
      end
    end
  end
  module GGobi
    class << self
      # Writes the text produced by #out for +dataset+ to +filename+.
      def write(dataset,filename,opt={})
        File.open(filename,"w") {|fp|
          fp.write(self.out(dataset,opt))
        }
      end
      # Builds the export text for +dataset+.
      # Options: :dataname, :description and :missing (text for nil values).
      def out(dataset,opt={})
        require 'ostruct'
        default_opt = {:dataname => "Default", :description=>"", :missing=>"NA"}
        default_opt.merge! opt
        # carrier collects, while the variable definitions are generated,
        # which fields are categorical and the integer code of each factor.
        carrier=OpenStruct.new
        carrier.categorials=[]
        carrier.conversions={}
        variables_def=dataset.fields.collect{|k|
          variable_definition(carrier,dataset[k],k)
        }.join("\n")
        # column index => field name, for the categorical fields only
        indexes=carrier.categorials.inject({}) {|s,c|
          s[dataset.fields.index(c)]=c
          s
        }
        records=""
        dataset.each_array {|c|
          # Replace categorical values by their integer codes.
          indexes.each{|ik,iv|
            c[ik]=carrier.conversions[iv][c[ik]]
          }
          records << "#{values_definition(c, default_opt[:missing])}\n"
        }
        # NOTE(review): the heredoc opener and its markup look like they were
        # stripped from this copy of the file (only the interpolations and
        # the EOC terminator remain) -- restore from the upstream source.
        out=< #{default_opt[:description]} #{variables_def} #{records} EOC
        out
      end
      # Renders one case as a space-separated line: nil becomes +missing+,
      # numbers are printed as is, and whitespace runs inside other values
      # are replaced by "_".
      def values_definition(c,missing)
        c.collect{|v|
          if v.nil?
            "#{missing}"
          elsif v.is_a? Numeric
            "#{v}"
          else
            "#{v.gsub(/\s+/,"_")}"
          end
        }.join(" ")
      end
      # Outputs a string for a variable definition
      # v = vector
      # name = name of the variable
      # nickname = nickname
      #
      # For nominal vectors (or vectors holding any String) the field is
      # registered in carrier.categorials and each factor is assigned the
      # integer code i (1-based) in carrier.conversions[name].
      # NOTE(review): the string literals in this method appear to have lost
      # their markup in this copy of the file -- confirm against upstream.
      def variable_definition(carrier,v,name,nickname=nil)
        nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" )
        if v.type==:nominal or v.data.find {|d| d.is_a? String }
          carrier.categorials.push(name)
          carrier.conversions[name]={}
          factors=v.factors
          out ="\n"
          out << "\n"
          out << (1..factors.size).to_a.collect{|i|
            carrier.conversions[name][factors[i-1]]=i
            "#{v.labeling(factors[i-1])}"
          }.join("\n")
          out << "\n\n"
          out
        elsif v.data.find {|d| d.is_a? Float}
          ""
        else
          ""
        end
      end
    end
  end
end
require 'statsample/converter/csv.rb'
================================================
FILE: lib/statsample/crosstab.rb
================================================
module Statsample
  # Class to create crosstab of data
  # With this, you can create reports and do chi square test
  # The first vector will be at rows and the second will be the columns
  #
  class Crosstab
    include Summarizable
    attr_reader :v_rows, :v_cols
    attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
    # Builds the crosstab of +v1+ (rows) against +v2+ (columns), keeping only
    # the cases returned by Statsample.only_valid_clone. Each key of +opts+
    # is applied through the matching "key=" writer.
    # Raises ArgumentError when the vectors differ in size.
    def initialize(v1, v2, opts=Hash.new)
      #raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
      raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
      @v_rows, @v_cols=Statsample.only_valid_clone(v1.to_vector,v2.to_vector)
      @cases=@v_rows.size
      @row_label=v1.name
      @column_label=v2.name
      @name=nil
      @percentage_row = @percentage_column = @percentage_total=false
      opts.each{|k,v|
        self.send("#{k}=",v) if self.respond_to?
k
      }
      @name||=_("Crosstab %s - %s") % [@row_label, @column_label]
    end
    # Sorted factors of the row vector.
    def rows_names
      @v_rows.factors.sort
    end
    # Sorted factors of the column vector.
    def cols_names
      @v_cols.factors.sort
    end
    # Frequencies of the row vector (marginal totals by row).
    def rows_total
      @v_rows.frequencies
    end
    # Frequencies of the column vector (marginal totals by column).
    def cols_total
      @v_cols.frequencies
    end
    # Hash of [row, col] => count. Every combination of row and column
    # factor is present; combinations that never occur have a count of 0.
    def frequencies
      base=rows_names.inject([]){|s,row|
        s+=cols_names.collect{|col| [row,col]}
      }.inject({}) {|s,par|
        s[par]=0
        s
      }
      base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
    end
    # Observed counts as a Matrix, rows and columns in sorted factor order.
    def to_matrix
      f=frequencies
      rn=rows_names
      cn=cols_names
      Matrix.rows(rn.collect{|row|
        cn.collect{|col| f[[row,col]]}
      })
    end
    # Nested hash row => {col => count}.
    def frequencies_by_row
      f=frequencies
      rows_names.inject({}){|sr,row|
        sr[row]=cols_names.inject({}) {|sc,col| sc[col]=f[[row,col]]; sc}
        sr
      }
    end
    # Nested hash col => {row => count}.
    def frequencies_by_col
      f=frequencies
      cols_names.inject({}){|sc,col|
        sc[col]=rows_names.inject({}) {|sr,row| sr[row]=f[[row,col]]; sr}
        sc
      }
    end
    # Chi square, based on expected and real matrix
    def chi_square
      require 'statsample/test'
      Statsample::Test.chi_square(self.to_matrix, matrix_expected)
    end
    # Useful to obtain chi square
    #
    # Each cell is (row total * column total) / number of valid cases.
    def matrix_expected
      rn=rows_names
      cn=cols_names
      rt=rows_total
      ct=cols_total
      t=@v_rows.size
      m=rn.collect{|row|
        cn.collect{|col|
          (rt[row]*ct[col]).quo(t)
        }
      }
      Matrix.rows(m)
    end
    # Hash with every column factor mapped to 0.
    def cols_empty_hash
      cols_names.inject({}) {|a,x| a[x]=0;a}
    end
    # Adds to +builder+ a section with the chi square text, the raw
    # frequency table with marginal totals and, when enabled through the
    # percentage_* flags, the row / column / total percentage tables.
    def report_building(builder)
      builder.section(:name=>@name) do |generator|
        fq=frequencies
        rn=rows_names
        cn=cols_names
        total=0
        total_cols=cols_empty_hash
        generator.text "Chi Square: #{chi_square}"
        generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
        generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
        t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
        rn.each do |row|
          total_row=0
          t_row=[@v_rows.labeling(row)]
          cn.each do |col|
            data=fq[[row,col]]
            total_row+=fq[[row,col]]
            total+=fq[[row,col]]
            total_cols[col]+=fq[[row,col]]
            t_row.push(data)
          end
          t_row.push(total_row)
          t.row(t_row)
        end
        t.hr
        t_row=[_("Total")]
        cn.each do |v|
          t_row.push(total_cols[v])
        end
        t_row.push(total)
        t.row(t_row)
        generator.parse_element(t)
        if(@percentage_row)
          table_percentage(generator,:row)
        end
        if(@percentage_column)
          table_percentage(generator,:column)
        end
        if(@percentage_total)
          table_percentage(generator,:total)
        end
      end
    end
    # Adds to +generator+ a percentage table. +type+ selects the base of
    # each cell: :row (row total), :column (column total) or :total (all
    # valid cases).
    def table_percentage(generator,type)
      fq=frequencies
      cn=cols_names
      rn=rows_names
      rt=rows_total
      ct=cols_total
      type_name=case type
        when :row then _("% Row")
        when :column then _("% Column")
        when :total then _("% Total")
      end
      t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c) } + [_("Total")])
      rn.each do |row|
        t_row=[@v_rows.labeling(row)]
        cn.each do |col|
          total=case type
            when :row then rt[row]
            when :column then ct[col]
            when :total then @cases
          end
          data = sprintf("%0.2f%%", fq[[row,col]]*100.0/ total )
          t_row.push(data)
        end
        # Base for the last ("Total") cell of the row.
        total=case type
          when :row then rt[row]
          when :column then @cases
          when :total then @cases
        end
        t_row.push(sprintf("%0.2f%%", rt[row]*100.0/total))
        t.row(t_row)
      end
      t.hr
      t_row=[_("Total")]
      cn.each{|col|
        # Base for the cells of the final "Total" row.
        total=case type
          when :row then @cases
          when :column then ct[col]
          when :total then @cases
        end
        t_row.push(sprintf("%0.2f%%", ct[col]*100.0/total))
      }
      t_row.push("100%")
      t.row(t_row)
      generator.parse_element(t)
    end
  end
end
================================================
FILE: lib/statsample/dataset.rb
================================================
require 'statsample/vector'
class Hash
  # Creates a Statsample::Dataset based on a Hash
  def to_dataset(*args)
    Statsample::Dataset.new(self, *args)
  end
end
class Array
  def prefix(s) #
:nodoc:
    # Each element converted to String, with s prepended.
    self.collect{|c| s+c.to_s }
  end
  # Each element converted to String, with s appended.
  def suffix(s) # :nodoc:
    self.collect{|c| c.to_s+s }
  end
end
module Statsample
  # Wraps an exception raised while iterating a Dataset, keeping a reference
  # to the dataset so the message can include the offending row.
  class DatasetException < RuntimeError # :nodoc:
    attr_reader :ds,:exp
    def initialize(ds,e)
      @ds=ds
      @exp=e
    end
    # Message and backtrace of the wrapped exception, plus the current row
    # of the dataset when its iteration pointer is set.
    def to_s
      m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
      m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
      m
    end
  end
  # Set of cases with values for one or more variables,
  # analog to a dataframe on R or a standard data file of SPSS.
  # Every vector has #field name, which represent it. By default,
  # the vectors are ordered by it field name, but you can change it
  # the fields order manually.
  # The Dataset work as a Hash, with keys are field names
  # and values are Statsample::Vector
  #
  #
  # ==Usage
  # Create a empty dataset:
  #   Dataset.new()
  # Create a dataset with three empty vectors, called v1, v2 and v3:
  #   Dataset.new(%w{v1 v2 v3})
  # Create a dataset with two vectors, called v1
  # and v2:
  #   Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
  # Create a dataset with two given vectors (v1 and v2),
  # with vectors on inverted order:
  #   Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
  #
  # The fast way to create a dataset uses Hash#to_dataset, with
  # field order as arguments
  #   v1 = [1,2,3].to_scale
  #   v2 = [1,2,3].to_scale
  #   ds = {'v1'=>v1, 'v2'=>v2}.to_dataset(%w{v2 v1})
  class Dataset
    include Writable
    include Summarizable
    # Hash of Statsample::Vector
    attr_reader :vectors
    # Ordered ids of vectors
    attr_reader :fields
    # Name of dataset
    attr_accessor :name
    # Number of cases
    attr_reader :cases
    # Location of pointer on enumerations methods (like #each)
    attr_reader :i
    # Generates a new dataset, using three vectors
    # - Rows
    # - Columns
    # - Values
    #
    # For example, you have these values
    #
    #   x   y   v
    #   a   a   0
    #   a   b   1
    #   b   a   1
    #   b   b   0
    #
    # You obtain
    #   id  a   b
    #   a   0   1
    #   b   1   0
    #
    # Useful to process outputs from databases
    def self.crosstab_by_asignation(rows,columns,values)
      raise "Three vectors should be equal size" if
rows.size!=columns.size or rows.size!=values.size
      cols_values=columns.factors
      cols_n=cols_values.size
      # row factor => {column factor => nil}, filled with the values below
      h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){ |a1,v1| a1[v1]=nil; a1 } ;a}
      values.each_index{|i|
        h_rows[rows[i]][columns[i]]=values[i]
      }
      ds=Dataset.new(["_id"]+cols_values)
      cols_values.each{|c|
        ds[c].type=values.type
      }
      rows.factors.each {|row|
        n_row=Array.new(cols_n+1)
        n_row[0]=row
        cols_values.each_index {|i|
          n_row[i+1]=h_rows[row][cols_values[i]]
        }
        ds.add_case_array(n_row)
      }
      ds.update_valid_data
      ds
    end
    # Return true if any vector has missing data
    def has_missing_data?
      @vectors.any? {|k,v| v.has_missing_data?}
    end
    # Return a nested hash using fields as keys and
    # an array constructed of hashes with other values.
    # If block provided, is used to provide the
    # values, with parameters +row+ of dataset,
    # +current+ last hash on hierarchy and
    # +name+ of the key to include
    def nest(*tree_keys,&block)
      tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
      out=Hash.new
      each do |row|
        current=out
        # Create tree
        tree_keys[0,tree_keys.size-1].each do |f|
          root=row[f]
          current[root]||=Hash.new
          current=current[root]
        end
        name=row[tree_keys.last]
        if !block
          current[name]||=Array.new
          current[name].push(row.delete_if{|key,value| tree_keys.include? key})
        else
          current[name]=block.call(row, current,name)
        end
      end
      out
    end
    # Creates a new dataset. A dataset is a set of ordered named vectors
    # of the same size.
    #
    # [vectors] With an array, creates a set of empty vectors named as
    # values on the array. With a hash, each Vector is assigned as
    # a variable of the Dataset named as its key
    # [fields] Array of names for vectors. Is only used for set the
    # order of variables. If empty, vectors keys on alfabethic order as
    # used as fields.
    def initialize(vectors={}, fields=[])
      # Class-wide counter used to give each dataset a default name.
      @@n_dataset||=0
      @@n_dataset+=1
      @name=_("Dataset %d") % @@n_dataset
      @cases=0
      @gsl=nil
      @i=nil
      if vectors.instance_of?
Array
        @fields=vectors.dup
        @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
      else
        # Check vectors
        @vectors=vectors
        @fields=fields
        check_order
        check_length
      end
    end
    #
    # Creates a copy of the given dataset, deleting all the cases with
    # missing data on one of the vectors.
    #
    # @param array of fields to include. No value include all fields
    #
    def dup_only_valid(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields if fields_to_include.size==0
      if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
        ds=Dataset.new(fields_to_include)
        fields_to_include.each {|f| ds[f].type=@vectors[f].type}
        each {|row|
          # Keep the case only if no included field has an invalid value.
          unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
            row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
            ds.add_case(row_2)
          end
        }
      else
        ds=dup fields_to_include
      end
      ds.name= self.name
      ds
    end
    #
    # Returns a duplicate of the Dataset.
    # All vectors are copied, so any modification on new
    # dataset doesn't affect original dataset's vectors.
    # If fields given as parameter, only include those vectors.
    #
    # @param array of fields to include. No value include all fields
    # @return {Statsample::Dataset}
    def dup(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields if fields_to_include.size==0
      vectors={}
      fields=[]
      fields_to_include.each{|f|
        raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
        vectors[f]=@vectors[f].dup
        fields.push(f)
      }
      ds=Dataset.new(vectors,fields)
      ds.name= self.name
      ds
    end
    # Returns an array with the fields from first argument to last argument
    def from_to(from,to)
      raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
      raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include?
to
      @fields.slice(@fields.index(from)..@fields.index(to))
    end
    # Returns (when possible) a cheap copy of dataset.
    # If no vector have missing values, returns original vectors.
    # If missing values presents, uses Dataset.dup_only_valid.
    #
    # @param array of fields to include. No value include all fields
    # @return {Statsample::Dataset}
    def clone_only_valid(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields.dup if fields_to_include.size==0
      if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
        dup_only_valid(fields_to_include)
      else
        clone(fields_to_include)
      end
    end
    # Returns a shallow copy of Dataset.
    # Object id will be distinct, but @vectors will be the same.
    # @param array of fields to include. No value include all fields
    # @return {Statsample::Dataset}
    def clone(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields.dup if fields_to_include.size==0
      ds=Dataset.new
      fields_to_include.each{|f|
        raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
        ds[f]=@vectors[f]
      }
      ds.fields=fields_to_include
      ds.name=@name
      ds.update_valid_data
      ds
    end
    # Creates a copy of the given dataset, without data on vectors
    #
    # @return {Statsample::Dataset}
    def dup_empty
      vectors=@vectors.inject({}) {|a,v|
        a[v[0]]=v[1].dup_empty
        a
      }
      Dataset.new(vectors,@fields.dup)
    end
    # Merge vectors from two datasets
    # In case of name collition, the vectors names are changed to
    # x_1, x_2 ....
#
# Both datasets must have the same number of cases; a RuntimeError is raised
# otherwise.
#
# @return {Statsample::Dataset}
def merge(other_ds)
  raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases})" unless @cases==other_ds.cases
  types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
  new_fields = (@fields+other_ds.fields).recode_repeated
  ds_new=Statsample::Dataset.new(new_fields)
  # new_fields and types are parallel arrays: same order, same length.
  new_fields.each_index{|i|
    field=new_fields[i]
    ds_new[field].type=types[i]
  }
  @cases.times {|i|
    row=case_as_array(i)+other_ds.case_as_array(i)
    ds_new.add_case_array(row)
  }
  ds_new.update_valid_data
  ds_new
end

# Join 2 Datasets by given fields
# type is one of :left and :inner, default is :left
#
# Rows of this dataset are matched with rows of +other_ds+ whose values on
# +fields_2+ equal this row's values on +fields_1+. A row with several matches
# yields one case per match. With :left, a row without a match is kept and the
# fields coming from +other_ds+ are nil; with any other type it is dropped.
#
# @return {Statsample::Dataset}
def join(other_ds,fields_1=[],fields_2=[],type=:left)
  fields_new = other_ds.fields - fields_2
  fields = self.fields + fields_new

  # Index the other dataset: key values => list of its non-key columns.
  other_ds_hash = {}
  other_ds.each do |row|
    key = row.select{|k,v| fields_2.include?(k)}.values
    value = row.select{|k,v| fields_new.include?(k)}
    if other_ds_hash[key].nil?
      other_ds_hash[key] = [value]
    else
      other_ds_hash[key] << value
    end
  end

  new_ds = Dataset.new(fields)

  self.each do |row|
    key = row.select{|k,v| fields_1.include?(k)}.values
    new_case = row.dup
    matches = other_ds_hash[key]

    if matches.nil?
      if type == :left
        fields_new.each{|field| new_case[field] = nil}
        new_ds.add_case(new_case)
      end
    else
      matches.each do |new_values|
        new_ds.add_case new_case.merge(new_values)
      end
    end
  end
  new_ds
end
# Returns a dataset with standarized data.
# Creates a dataset of n cases, each one drawn at random (with
# replacement) from the cases of this dataset.
# If n is not given, uses the original number of cases.
#
# Every index is drawn from 0...@cases, so a sample larger or smaller
# than the original dataset still covers all original cases and never
# reads past the last one.
#
# @param n [Integer, nil] size of the new sample
# @return {Statsample::Dataset}
def bootstrap(n=nil)
  n ||= @cases
  ds_boot = dup_empty
  n.times do
    ds_boot.add_case_array(case_as_array(rand(@cases)))
  end
  # add_case_array performs no checks; validate once at the end.
  ds_boot.update_valid_data
  ds_boot
end
# Returns a vector with the sum of the given fields for every case.
# If the fields parameter is empty, all fields are summed.
# A case gets nil when any of the requested fields has a nil/false
# entry in its data_with_nils at that position.
def vector_sum(fields=nil)
  fields ||= @fields
  totals = collect_with_index do |row, i|
    missing = fields.find { |name| !@vectors[name].data_with_nils[i] }
    next nil if missing
    fields.reduce(0) { |acc, name| acc + row[name].to_f }
  end
  totals.name = _("Sum from %s") % @name
  totals
end
# Returns a vector with the mean for a set of fields.
# If the fields parameter is empty, returns the mean for all fields.
# If max_invalid > 0, returns the mean for all cases with
# 0 to max_invalid invalid (nil) fields.
# A case with no valid value at all always gets nil, whatever
# max_invalid is, instead of dividing by zero.
#
# @param fields [Array, nil] fields to average; nil means every field
# @param max_invalid [Integer] nil values tolerated per case
def vector_mean(fields=nil, max_invalid=0)
  fields = check_fields(fields)
  size = fields.size
  means = []
  each_with_index do |row, i|
    sum = 0
    invalids = 0 # number of invalid values on this case
    fields.each do |f|
      if @vectors[f].data_with_nils[i].nil?
        invalids += 1
      else
        sum += row[f].to_f
      end
    end
    valid = size - invalids
    if invalids > max_invalid || valid.zero?
      means.push(nil)
    else
      means.push(sum.quo(valid))
    end
  end
  means = means.to_vector(:scale)
  means.name = _("Means from %s") % @name
  means
end
# Keeps +fields+ congruent with the keys of +vectors+: when both lists
# differ, fields without a vector are dropped and vectors missing from
# fields are appended in sorted order.
def check_order #:nodoc:
  known = @vectors.keys
  return if known.sort == @fields.sort
  kept = @fields & known
  @fields = kept + (known.sort - kept)
end
# Recodes a vector in place: for every case, the block receives the case
# as a hash and its result replaces the value stored in vector_name.
# The vector's valid data is refreshed once all cases are written.
def recode!(vector_name)
  @cases.times do |index|
    @vectors[vector_name].data[index] = yield(case_as_hash(index))
  end
  @vectors[vector_name].set_valid_data
end
# Creates a new dataset with all the cases for which the block returns
# true. The new dataset is named after this one, with a "(filtered)"
# suffix.
def filter
  kept = dup_empty
  each do |kase|
    next unless yield(kase)
    # Validation is deferred until every case has been inserted.
    kept.add_case(kase, false)
  end
  kept.update_valid_data
  kept.name = _("%s(filtered)") % @name
  kept
end
# Returns a vector based on a string with a calculation over the fields
# of the dataset.
#
# Every field name found in the text is replaced by a reference to the
# current case: row['field'].to_f for :scale vectors, row['field'] for
# any other type. Cases with a nil value on any field yield nil.
#
# SECURITY: the resulting text is eval'ed, so any valid Ruby expression
# is executed. Never pass text that comes from an untrusted source.
#
# For example:
#   a=[1,2].to_vector(:scale)
#   b=[3,4].to_vector(:scale)
#   ds={'a'=>a,'b'=>b}.to_dataset
#   ds.compute("a+b")
#   => Vector [4,6]
#
# @param text [String] expression to evaluate; it is not modified
def compute(text)
  replacements = {}
  @fields.each do |f|
    replacements[f] = @vectors[f].type == :scale ? "row['#{f}'].to_f" : "row['#{f}']"
  end
  # One single pass, longest names first: a field name can't be matched
  # inside the text inserted for another field, nor inside a longer name.
  pattern = Regexp.union(@fields.sort_by { |f| -f.size })
  expression = text.gsub(pattern) { |f| replacements[f] }
  collect_with_index do |row, i|
    if @fields.any? { |f| @vectors[f].data_with_nils[i].nil? }
      nil
    else
      eval(expression)
    end
  end
end
# Creates a new dataset for one to many relations
# on a dataset, based on a pattern of field names.
#
# For example, you have a survey for number of children
# with this structure:
#   id, name, child_name_1, child_age_1, child_name_2, child_age_2
# with
#   ds.one_to_many(%w{id}, "child_%v_%n")
# the fields of the first parameter will be copied verbatim
# to the new dataset, and for the fields which respond to the second
# pattern one case will be added for each different %n that has data.
#
# The new dataset has, in this order: the parent fields, "_col_id"
# (the %n of the case) and one field for every distinct %v.
#
# The whole field name has to match the pattern, with %v before %n;
# every other character of the pattern is taken literally.
# parent_fields is not modified.
#
# @param parent_fields [Array] fields copied on every generated case
# @param pattern [String] field name pattern with %v (name) and %n (number)
# @return {Statsample::Dataset}
def one_to_many(parent_fields, pattern)
  # Anchored, so multi-digit numbers are captured whole (car_color10 => 10).
  matcher = Regexp.new('\A' + Regexp.escape(pattern).gsub("%v", "(.+?)").gsub("%n", "(\\d+?)") + '\z')
  vars = []
  max_n = 0
  h = parent_fields.inject({}) { |a, v| a[v] = Statsample::Vector.new([], @vectors[v].type); a }
  # Adding _col_id
  h['_col_id'] = [].to_scale
  @fields.each do |f|
    found = matcher.match(f)
    next unless found
    var_name = found[1]
    unless vars.include?(var_name)
      vars.push(var_name)
      h[var_name] = Statsample::Vector.new([], @vectors[f].type)
    end
    n = found[2].to_i
    max_n = n if max_n < n
  end
  ds = Dataset.new(h, parent_fields + ["_col_id"] + vars)
  each do |row|
    row_out = {}
    parent_fields.each { |f| row_out[f] = row[f] }
    1.upto(max_n) do |n|
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub("%v", v.to_s).gsub("%n", n.to_s)]
        row_out[v] = data
        any_data = true unless data.nil?
      end
      next unless any_data
      row_out["_col_id"] = n
      ds.add_case(row_out, false)
    end
  end
  ds.update_valid_data
  ds
end
-------------------------------------------------------------------------------------------- # | Conditional dominance | # -------------------------------------------------------------------------------------------- # | a - b | 1.0 | 0.6300 | 0.458 | 0.580 | 0.320 | 0.100 | 0.580 | # | a - c | 1.0 | 0.9700 | 0.156 | 0.960 | 0.020 | 0.020 | 0.960 | # | a - d | 1.0 | 0.9800 | 0.121 | 0.970 | 0.010 | 0.020 | 0.970 | # | b - c | 1.0 | 0.8850 | 0.283 | 0.840 | 0.070 | 0.090 | 0.840 | # | b - d | 1.0 | 0.9500 | 0.181 | 0.920 | 0.020 | 0.060 | 0.920 | # | c - d | 0.5 | 0.5800 | 0.360 | 0.350 | 0.190 | 0.460 | 0.460 | # -------------------------------------------------------------------------------------------- # | General Dominance | # -------------------------------------------------------------------------------------------- # | a - b | 1.0 | 0.6500 | 0.479 | 0.650 | 0.350 | 0.000 | 0.650 | # | a - c | 1.0 | 0.9800 | 0.141 | 0.980 | 0.020 | 0.000 | 0.980 | # | a - d | 1.0 | 0.9900 | 0.100 | 0.990 | 0.010 | 0.000 | 0.990 | # | b - c | 1.0 | 0.9000 | 0.302 | 0.900 | 0.100 | 0.000 | 0.900 | # | b - d | 1.0 | 0.9700 | 0.171 | 0.970 | 0.030 | 0.000 | 0.970 | # | c - d | 1.0 | 0.5600 | 0.499 | 0.560 | 0.440 | 0.000 | 0.560 | # -------------------------------------------------------------------------------------------- # # Table: General averages # --------------------------------------- # | var | mean | se | p.5 | p.95 | # --------------------------------------- # | a | 0.133 | 0.049 | 0.062 | 0.218 | # | b | 0.106 | 0.048 | 0.029 | 0.199 | # | c | 0.035 | 0.032 | 0.002 | 0.106 | # | d | 0.023 | 0.019 | 0.002 | 0.062 | # --------------------------------------- # # == References: # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. Psychological Methods, 8(2), 129-148. 
# Create a new Dominance Analysis Bootstrap Object
#
# * ds: A Dataset object
# * y_var: Name of dependent variable. If it is an Array, every field
#   on it is excluded from the predictors and the regression class is
#   set to Regression::Multiple::MultipleDependent
# * opts: Any other attribute of the class. Only keys with a public
#   writer (key=) are applied; any other key is ignored.
def initialize(ds, y_var, opts=Hash.new)
  @ds = ds
  @y_var = y_var
  @n = ds.cases
  @n_samples = 0
  @alpha = ALPHA
  @debug = false
  if y_var.is_a? Array
    @fields = ds.fields - y_var
    @regression_class = Regression::Multiple::MultipleDependent
  else
    @fields = ds.fields - [y_var]
    @regression_class = Regression::Multiple::MatrixEngine
  end
  @samples_ga = @fields.inject({}) { |a, v| a[v] = []; a }
  @name = _("Bootstrap dominance Analysis: %s over %s") % [ds.fields.join(","), @y_var]
  opts.each do |k, v|
    # Check the writer itself: read-only attributes (fields, samples_*)
    # answer to k but not to k=.
    setter = "#{k}="
    send(setter, v) if respond_to?(setter)
  end
  create_samples_pairs
end
# Resets the containers for total, conditional and general dominance
# samples, and builds the list of every pair of fields (@pairs).
# Each container gets its own empty array for every pair.
def create_samples_pairs
  @samples_td = {}
  @samples_cd = {}
  @samples_gd = {}
  @pairs = []
  @fields.combination(2).each do |pair|
    @pairs.push(pair)
    [@samples_td, @samples_cd, @samples_gd].each { |samples| samples[pair] = [] }
  end
  @pairs
end
# Formats v as a fixed-point number with n decimals (3 by default).
def f(v, n=3)
  format("%0.#{n}f", v)
end
# # == Use # # a=1000.times.collect {rand}.to_scale # b=1000.times.collect {rand}.to_scale # c=1000.times.collect {rand}.to_scale # ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset # ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()} # da=Statsample::DominanceAnalysis.new(ds,'y') # puts da.summary # # === Output: # # Report: Report 2010-02-08 19:10:11 -0300 # Table: Dominance Analysis result # ------------------------------------------------------------ # | | r2 | sign | a | b | c | # ------------------------------------------------------------ # | Model 0 | | | 0.648 | 0.265 | 0.109 | # ------------------------------------------------------------ # | a | 0.648 | 0.000 | -- | 0.229 | 0.104 | # | b | 0.265 | 0.000 | 0.612 | -- | 0.104 | # | c | 0.109 | 0.000 | 0.643 | 0.260 | -- | # ------------------------------------------------------------ # | k=1 Average | | | 0.627 | 0.244 | 0.104 | # ------------------------------------------------------------ # | a*b | 0.877 | 0.000 | -- | -- | 0.099 | # | a*c | 0.752 | 0.000 | -- | 0.224 | -- | # | b*c | 0.369 | 0.000 | 0.607 | -- | -- | # ------------------------------------------------------------ # | k=2 Average | | | 0.607 | 0.224 | 0.099 | # ------------------------------------------------------------ # | a*b*c | 0.976 | 0.000 | -- | -- | -- | # ------------------------------------------------------------ # | Overall averages | | | 0.628 | 0.245 | 0.104 | # ------------------------------------------------------------ # # Table: Pairwise dominance # ----------------------------------------- # | Pairs | Total | Conditional | General | # ----------------------------------------- # | a - b | 1.0 | 1.0 | 1.0 | # | a - c | 1.0 | 1.0 | 1.0 | # | b - c | 1.0 | 1.0 | 1.0 | # ----------------------------------------- # # == Reference: # * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. Psychological Bulletin, 114, 542-551. # * Azen, R. 
# Label for a predictor: a group of predictors (an Array) is shown as
# its names joined by commas inside parentheses; anything else is
# returned as is.
def self.predictor_name(variable)
  return variable unless variable.is_a? Array
  "(#{variable.join(',')})"
end
# For every model, registers the contribution of each predictor that is
# not part of it, using the r2 of the model extended with that predictor.
def fill_models
  @models.each do |model|
    absent = @predictors.reject { |predictor| model.include?(predictor) }
    absent.each do |predictor|
      md(model).add_contribution(predictor, md(model + [predictor]).r2)
    end
  end
end
# Get the data of all models of size k, in the order of @models
def md_k(k)
  @models.select { |model| model.size == k }.map { |model| md(model) }
end
# Data of one model (one subset of predictors) of a dominance analysis:
# the regression for that subset and the contribution to r2 of every
# predictor which is not part of the subset.
class ModelData # :nodoc:
  # Hash with predictors as keys. Values are nil until set by
  # #add_contribution.
  attr_reader :contributions

  # * independent: predictors included on this model
  # * data: input handed to the regression class
  # * da: the analysis; provides predictors, dependent, cases,
  #   method_association and regression_class
  def initialize(independent, data, da)
    @independent = independent
    @data = data
    @predictors = da.predictors
    @dependent = da.dependent
    @cases = da.cases
    @method = da.method_association
    @contributions = @independent.each_with_object({}) { |v, a| a[v] = nil }
    # With a single dependent, the regression receives its name and
    # not an array.
    y = @dependent.size == 1 ? @dependent[0] : @dependent
    @lr = da.regression_class.new(data, y, :cases => @cases)
  end

  # Stores the contribution of predictor f: the difference between v
  # (r2 of the model which also includes f) and the r2 of this model.
  def add_contribution(f, v)
    @contributions[f] = v - r2
  end

  # Association measure of the model, taken from the regression with
  # the method set on the analysis (method_association).
  def r2
    @lr.send(@method)
  end

  # Names of the predictors of the model, joined by "*"
  def name
    @independent.collect { |variable| DominanceAnalysis.predictor_name(variable) }.join("*")
  end

  # Row for the report table: name, r2, significance ("???" when the
  # number of cases is unknown) and one cell per predictor, with "--"
  # for predictors without a contribution.
  def add_table_row
    sign = @cases ? sprintf("%0.3f", @lr.probability) : "???"
    cells = @predictors.collect do |k|
      v = @contributions[k]
      v.nil? ? "--" : sprintf("%0.3f", v)
    end
    [name, sprintf("%0.3f", r2), sign] + cells
  end

  # Two-line text summary: model name with r2 and p, followed by the
  # contribution of every predictor.
  def summary
    # Exactly one argument per directive: a surplus argument makes
    # sprintf raise ArgumentError when $DEBUG is set.
    out = sprintf("%s: r2=%0.3f(p=%0.2f)\n", name, r2, @lr.significance)
    out << @predictors.collect { |k|
      v = @contributions[k]
      v.nil? ? "--" : sprintf("%s=%0.3f", k, v)
    }.join(" | ")
    out << "\n"
    out
  end
end # end ModelData
SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402. # class MAP include Summarizable include DirtyMemoize # Name of analysis attr_accessor :name attr_reader :eigenvalues # Number of factors to retain attr_reader :number_of_factors # Average squared correlations attr_reader :fm # Smallest average squared correlation attr_reader :minfm attr_accessor :use_gsl def self.with_dataset(ds,opts=Hash.new) new(ds.correlation_matrix,opts) end def initialize(matrix, opts=Hash.new) @matrix=matrix opts_default={ :use_gsl=>true, :name=>_("Velicer's MAP") } @opts=opts_default.merge(opts) opts_default.keys.each {|k| send("#{k}=", @opts[k]) } end def compute gsl_m=(use_gsl and Statsample.has_gsl?) ? @matrix.to_gsl : @matrix klass_m=gsl_m.class eigvect,@eigenvalues=gsl_m.eigenvectors_matrix, gsl_m.eigenvalues eigenvalues_sqrt=@eigenvalues.collect {|v| Math.sqrt(v)} loadings=eigvect*(klass_m.diagonal(*eigenvalues_sqrt)) fm=Array.new(@matrix.row_size) ncol=@matrix.column_size fm[0]=(gsl_m.mssq - ncol).quo(ncol*(ncol-1)) (ncol-1).times do |m| puts "MAP:Eigenvalue #{m+1}" if $DEBUG a=loadings[0..(loadings.row_size-1),0..m] partcov= gsl_m - (a*a.transpose) d=klass_m.diagonal(*(partcov.diagonal.collect {|v| Math::sqrt(1/v)})) pr=d*partcov*d fm[m+1]=(pr.mssq-ncol).quo(ncol*(ncol-1)) end minfm=fm[0] nfactors=0 @errors=[] fm.each_with_index do |v,s| if defined?(Complex) and v.is_a? ::Complex @errors.push(s) else if v < minfm minfm=v nfactors=s end end end @number_of_factors=nfactors @fm=fm @minfm=minfm end def report_building(g) #:nodoc: g.section(:name=>@name) do |s| s.table(:name=>_("Eigenvalues"),:header=>[_("Value")]) do |t| eigenvalues.each_with_index do |e,i| t.row([@errors.include?(i) ? 
"*" : "%0.6f" % e]) end end s.table(:name=>_("Velicer's Average Squared Correlations"), :header=>[_("number of components"),_("average square correlation")]) do |t| fm.each_with_index do |v,i| t.row(["%d" % i, @errors.include?(i) ? "*" : "%0.6f" % v]) end end s.text(_("The smallest average squared correlation is : %0.6f" % minfm)) s.text(_("The number of components is : %d" % number_of_factors)) end end dirty_memoize :number_of_factors, :fm, :minfm, :eigenvalues end end end ================================================ FILE: lib/statsample/factor/parallelanalysis.rb ================================================ module Statsample module Factor # Performs Horn's 'parallel analysis' to a principal components analysis # to adjust for sample bias in the retention of components. # Can create the bootstrap samples using random data, using number # of cases and variables, parameters for actual data (mean and standard # deviation of each variable) or bootstrap sampling for actual data. # == Description # "PA involves the construction of a number of correlation matrices of random variables based on the same sample size and number of variables in the real data set. The average eigenvalues from the random correlation matrices are then compared to the eigenvalues from the real data correlation matrix, such that the first observed eigenvalue is compared to the first random eigenvalue, the second observed eigenvalue is compared to the second random eigenvalue, and so on." (Hayton, Allen & Scarpello, 2004, p.194) # == Usage # *With real dataset* # # ds should be any valid dataset # pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>100, :bootstrap_method=>:data) # # *With number of cases and variables* # pa=Statsample::Factor::ParallelAnalysis.with_random_data(100,8) # # == Reference # * Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. 
Organizational Research Methods, 7 (2), 191-205. # * O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402. # * Liu, O., & Rijmen, F. (2008). A modified procedure for parallel analysis of ordered categorical data. Behavior Research Methods, 40(2), 556-562. class ParallelAnalysis def self.with_random_data(cases,vars,opts=Hash.new) require 'ostruct' ds=OpenStruct.new ds.fields=vars.times.map {|i| "v#{i+1}"} ds.cases=cases opts=opts.merge({:bootstrap_method=> :random, :no_data=>true}) new(ds, opts) end include DirtyMemoize include Summarizable # Number of random sets to produce. 50 by default attr_accessor :iterations # Name of analysis attr_accessor :name # Dataset. You could use mock vectors when use bootstrap method attr_reader :ds # Bootstrap method. :random used by default # * :random: uses number of variables and cases for the dataset # * :data : sample with replacement from actual data. attr_accessor :bootstrap_method # Uses smc on diagonal of matrixes, to perform simulation # of a Principal Axis analysis. # By default, false. attr_accessor :smc # Percentil over bootstrap eigenvalue should be accepted. 95 by default attr_accessor :percentil # Correlation matrix used with :raw_data . :correlation_matrix used by default attr_accessor :matrix_method # Number of eigenvalues to calculate. Should be set for # Principal Axis Analysis. attr_accessor :n_variables # Dataset with bootstrapped eigenvalues attr_reader :ds_eigenvalues # Perform analysis without actual data. 
attr_accessor :no_data # Show extra information if true attr_accessor :debug attr_accessor :use_gsl def initialize(ds, opts=Hash.new) @ds=ds @fields=@ds.fields @n_variables=@fields.size @n_cases=ds.cases opts_default={ :name=>_("Parallel Analysis"), :iterations=>50, # See Liu and Rijmen (2008) :bootstrap_method => :random, :smc=>false, :percentil=>95, :debug=>false, :no_data=>false, :matrix_method=>:correlation_matrix } @use_gsl=Statsample.has_gsl? @opts=opts_default.merge(opts) @opts[:matrix_method]==:correlation_matrix if @opts[:bootstrap_method]==:parameters opts_default.keys.each {|k| send("#{k}=", @opts[k]) } end # Number of factor to retent def number_of_factors total=0 ds_eigenvalues.fields.each_with_index do |f,i| if (@original[i]>0 and @original[i]>ds_eigenvalues[f].percentil(percentil)) total+=1 else break end end total end def report_building(g) #:nodoc: g.section(:name=>@name) do |s| s.text _("Bootstrap Method: %s") % bootstrap_method s.text _("Uses SMC: %s") % (smc ? _("Yes") : _("No")) s.text _("Correlation Matrix type : %s") % matrix_method s.text _("Number of variables: %d") % @n_variables s.text _("Number of cases: %d") % @n_cases s.text _("Number of iterations: %d") % @iterations if @no_data s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("generated eigenvalue"), "p.#{percentil}"]) do |t| ds_eigenvalues.fields.each_with_index do |f,i| v=ds_eigenvalues[f] t.row [i+1, "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), ] end end else s.text _("Number or factors to preserve: %d") % number_of_factors s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("data eigenvalue"), _("generated eigenvalue"),"p.#{percentil}",_("preserve?")]) do |t| ds_eigenvalues.fields.each_with_index do |f,i| v=ds_eigenvalues[f] t.row [i+1, "%0.4f" % @original[i], "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), (v.percentil(percentil)>0 and @original[i] > v.percentil(percentil)) ? "Yes":""] end end end end end # Perform calculation. 
Shouldn't be called directly for the user def compute @original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data @ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v}) @ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale} if bootstrap_method==:parameter or bootstrap_method==:random rng = Distribution::Normal.rng end @iterations.times do |i| begin puts "#{@name}: Iteration #{i}" if $DEBUG or debug # Create a dataset of dummy values ds_bootstrap=Statsample::Dataset.new(@ds.fields) @fields.each do |f| if bootstrap_method==:random ds_bootstrap[f]=@n_cases.times.map {|c| rng.call}.to_scale elsif bootstrap_method==:data ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases) else raise "bootstrap_method doesn't recogniced" end end ds_bootstrap.update_valid_data matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap) matrix=matrix.to_gsl if @use_gsl if smc smc_v=matrix.inverse.diagonal.map{|ii| 1-(1.quo(ii))} smc_v.each_with_index do |v,ii| matrix[ii,ii]=v end end ev=matrix.eigenvalues @ds_eigenvalues.add_case_array(ev) rescue Statsample::Bivariate::Tetrachoric::RequerimentNotMeet => e puts "Error: #{e}" if $DEBUG redo end end @ds_eigenvalues.update_valid_data end dirty_memoize :number_of_factors, :ds_eigenvalues dirty_writer :iterations, :bootstrap_method, :percentil, :smc end end end ================================================ FILE: lib/statsample/factor/pca.rb ================================================ # encoding: UTF-8 module Statsample module Factor # Principal Component Analysis (PCA) of a covariance or # correlation matrix.. # # NOTE: Sign of second and later eigenvalues could be different # using Ruby or GSL, so values for PCs and component matrix # should differ, because extendmatrix and gsl's methods to calculate # eigenvectors are different. Using R is worse, cause first # eigenvector could have negative values! 
# For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis # # == Usage: # require 'statsample' # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale # ds={'a'=>a,'b'=>b}.to_dataset # cor_matrix=Statsample::Bivariate.correlation_matrix(ds) # pca=Statsample::Factor::PCA.new(cor_matrix) # pca.m # => 1 # pca.eigenvalues # => [1.92592927269225, 0.0740707273077545] # pca.component_matrix # => GSL::Matrix # [ 9.813e-01 # 9.813e-01 ] # pca.communalities # => [0.962964636346122, 0.962964636346122] # # == References: # * SPSS Manual # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf # * Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer # class PCA include Summarizable # Name of analysis attr_accessor :name # Number of factors. Set by default to the number of factors # with eigen values > 1 attr_accessor :m # Use GSL if available attr_accessor :use_gsl # Add to the summary a rotation report attr_accessor :summary_rotation # Add to the summary a parallel analysis report attr_accessor :summary_parallel_analysis # Type of rotation. By default, Statsample::Factor::Rotation::Varimax attr_accessor :rotation_type attr_accessor :matrix_type def initialize(matrix, opts=Hash.new) @use_gsl=nil @name=_("Principal Component Analysis") @matrix=matrix @n_variables=@matrix.column_size @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)} @matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation @m=nil @rotation_type=Statsample::Factor::Varimax opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } if @use_gsl.nil? @use_gsl=Statsample.has_gsl? end if @matrix.respond_to? :fields @variables_names=@matrix.fields else @variables_names=@n_variables.times.map {|i| "V#{i+1}"} end calculate_eigenpairs if @m.nil? 
# Set number of factors with eigenvalues > 1 @m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size end end def rotation @rotation_type.new(component_matrix) end def total_eigenvalues eigenvalues.inject(0) {|ac,v| ac+v} end def create_centered_ds h={} @original_ds.factors.each {|f| mean=@original_ds[f].mean h[f]=@original_ds[f].recode {|c| c-mean} } @ds=h.to_dataset end # Feature matrix for +m+ factors # Returns +m+ eigenvectors as columns. # So, i=variable, j=component def feature_matrix(m=nil) m||=@m if @use_gsl omega_m=GSL::Matrix.zeros(@n_variables,m) ev=eigenvectors m.times do |i| omega_m.set_column(i,ev[i]) end omega_m else omega_m=::Matrix.build(@n_variables, m) {0} m.times do |i| omega_m.column= i, @eigenpairs[i][1] end omega_m end end # Returns Principal Components for +input+ matrix or dataset # The number of PC to return is equal to parameter +m+. # If +m+ isn't set, m set to number of PCs selected at object creation. # Use covariance matrix def principal_components(input, m=nil) if @use_gsl data_matrix=input.to_gsl else data_matrix=input.to_matrix end m||=@m raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables fv=feature_matrix(m) pcs=(fv.transpose*data_matrix.transpose).transpose pcs.extend Statsample::NamedMatrix pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)} pcs.to_dataset end def component_matrix(m=nil) var="component_matrix_#{matrix_type}" send(var,m) end # Matrix with correlations between components and # variables. 
Based on Härdle & Simar (2003, p.243) def component_matrix_covariance(m=nil) m||=@m raise "m should be > 0" if m<1 ff=feature_matrix(m) cm=::Matrix.build(@n_variables, m) {0} @n_variables.times {|i| m.times {|j| cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i]) } } cm.extend NamedMatrix cm.name=_("Component matrix (from covariance)") cm.fields_x = @variables_names cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)} cm end # Matrix with correlations between components and # variables def component_matrix_correlation(m=nil) m||=@m raise "m should be > 0" if m<1 omega_m=::Matrix.build(@n_variables, m) {0} gammas=[] m.times {|i| omega_m.column=i, @eigenpairs[i][1] gammas.push(Math::sqrt(@eigenpairs[i][0])) } gamma_m=::Matrix.diagonal(*gammas) cm=(omega_m*(gamma_m)).to_matrix cm.extend CovariateMatrix cm.name=_("Component matrix") cm.fields_x = @variables_names cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)} cm end def communalities(m=nil) m||=@m h=[] @n_variables.times do |i| sum=0 m.times do |j| sum+=(@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2) end h.push(sum) end h end # Array with eigenvalues def eigenvalues @eigenpairs.collect {|c| c[0] } end def eigenvectors @eigenpairs.collect {|c| @use_gsl ? c[1].to_gsl : c[1].to_vector } end def calculate_eigenpairs @eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby end def report_building(builder) # :nodoc: builder.section(:name=>@name) do |generator| generator.text _("Number of factors: %d") % m generator.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction"), _("%")]) do |t| communalities(m).each_with_index {|com, i| perc=com*100.quo(@matrix[i,i]) t.row([@variables_names[i], "%0.3f" % @matrix[i,i] , "%0.3f" % com, "%0.3f" % perc]) } end te=total_eigenvalues generator.table(:name=>_("Total Variance Explained"), :header=>[_("Component"), _("E.Total"), _("%"), _("Cum. 
%")]) do |t| ac_eigen=0 eigenvalues.each_with_index {|eigenvalue,i| ac_eigen+=eigenvalue t.row([_("Component %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(te)), sprintf("%0.3f",ac_eigen*100.quo(te))]) } end generator.parse_element(component_matrix(m)) if (summary_rotation) generator.parse_element(rotation) end end end private :calculate_eigenpairs, :create_centered_ds end end end ================================================ FILE: lib/statsample/factor/principalaxis.rb ================================================ module Statsample module Factor # Principal Axis Analysis for a covariance or correlation matrix. # # For PCA, use Statsample::Factor::PCA # # == Usage: # require 'statsample' # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale # ds={'a'=>a,'b'=>b}.to_dataset # cor_matrix=Statsample::Bivariate.correlation_matrix(ds) # pa=Statsample::Factor::PrincipalAxis.new(cor_matrix) # pa.iterate(1) # pa.m # => 1 # pca.component_matrix # => GSL::Matrix # [ 9.622e-01 # 9.622e-01 ] # pca.communalities # => [0.962964636346122, 0.962964636346122] # # == References: # * SPSS Manual # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf # class PrincipalAxis include DirtyMemoize include Summarizable # Name of analysis attr_accessor :name # Number of factors. Set by default to the number of factors # with eigenvalues > 1 (Kaiser criterion). # # _Warning:_ Kaiser criterion overfactors! Give yourself some time # and use Horn's Parallel Analysis. # attr_accessor :m # Number of iterations required to converge attr_reader :iterations # Initial eigenvalues attr_reader :initial_eigenvalues # Tolerance for iterations attr_accessor :epsilon # Use SMC(squared multiple correlations) as diagonal. 
If false, use 1 attr_accessor :smc # Maximum number of iterations attr_accessor :max_iterations # Eigenvalues of factor analysis attr_reader :eigenvalues # Minimum difference between succesive iterations on sum of communalities DELTA=1e-3 # Maximum number of iterations MAX_ITERATIONS=25 def initialize(matrix, opts=Hash.new) @matrix=matrix if @matrix.respond_to? :fields @fields=@matrix.fields else @fields=@matrix.row_size.times.map {|i| _("Variable %d") % (i+1)} end @n_variables=@matrix.row_size @name="" @m=nil @initial_eigenvalues=nil @initial_communalities=nil @component_matrix=nil @delta=DELTA @smc=true @max_iterations=MAX_ITERATIONS opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } if @matrix.respond_to? :fields @variables_names=@matrix.fields else @variables_names=@n_variables.times.map {|i| "V#{i+1}"} end if @m.nil? pca=PCA.new(::Matrix.rows(@matrix.to_a)) @m=pca.m end @clean=true end # Communality for all variables given m factors def communalities(m=nil) if m!=@m or @clean iterate(m) raise "Can't calculate comunality" if @communalities.nil? 
end @communalities end # Component matrix for m factors def component_matrix(m=nil) if m!=@m or @clean iterate(m) end @component_matrix end # Iterate to find the factors def iterate(m=nil) @clean=false m||=@m @m=m t = @max_iterations work_matrix=@matrix.to_a prev_com=initial_communalities pca=PCA.new(::Matrix.rows(work_matrix)) @initial_eigenvalues=pca.eigenvalues prev_sum=prev_com.inject(0) {|ac,v| ac+v} @iterations=0 t.times do |i| "#{@name}: Iteration #{i}" if $DEBUG @iterations+=1 prev_com.each_with_index{|v,it| work_matrix[it][it]=v } pca=PCA.new(::Matrix.rows(work_matrix)) @communalities=pca.communalities(m) @eigenvalues=pca.eigenvalues com_sum = @communalities.inject(0) {|ac,v| ac+v} #jump=true break if (com_sum-prev_sum).abs < @delta @communalities.each_with_index do |v2,i2| raise "Variable #{i2} with communality > 1" if v2>1.0 end prev_sum=com_sum prev_com=@communalities end @component_matrix=pca.component_matrix(m) @component_matrix.extend CovariateMatrix @component_matrix.name=_("Factor Matrix") @component_matrix.fields_x = @variables_names @component_matrix.fields_y = m.times.map {|i| "factor_#{i+1}"} end alias :compute :iterate def initial_communalities if @initial_communalities.nil? if @smc # Based on O'Connors(2000) @initial_communalities=@matrix.inverse.diagonal.map{|i| 1-(1.quo(i))} =begin @initial_communalities=@matrix.column_size.times.collect {|i| rxx , rxy = PrincipalAxis.separate_matrices(@matrix,i) matrix=(rxy.t*rxx.inverse*rxy) matrix[0,0] } =end else @initial_communalities=[1.0]*@matrix.column_size end end @initial_communalities end # Returns two matrixes from a correlation matrix # with regressors correlation matrix and criteria xy # matrix. 
def self.separate_matrices(matrix, y) ac=[] matrix.column_size.times do |i| ac.push(matrix[y,i]) if i!=y end rxy=Matrix.columns([ac]) rows=[] matrix.row_size.times do |i| if i!=y row=[] matrix.row_size.times do |j| row.push(matrix[i,j]) if j!=y end rows.push(row) end end rxx=Matrix.rows(rows) [rxx,rxy] end def report_building(generator) iterate if @clean generator.section(:name=>@name) do |s| s.text _("Number of factors: %d") % m s.text _("Iterations: %d") % @iterations s.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction")]) do |t| communalities(m).each_with_index {|com,i| t.row([@fields[i], sprintf("%0.4f", initial_communalities[i]), sprintf("%0.3f", com)]) } end s.table(:name=>_("Total Variance"), :header=>[_("Factor"), _("I.E.Total"), _("I.E. %"), _("I.E.Cum. %"), _("S.L.Total"), _("S.L. %"), _("S.L.Cum. %") ]) do |t| ac_eigen,ac_i_eigen=0,0 @initial_eigenvalues.each_with_index {|eigenvalue,i| ac_i_eigen+=eigenvalue ac_eigen+=@eigenvalues[i] new_row=[ _("Factor %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(@n_variables)), sprintf("%0.3f",ac_i_eigen*100.quo(@n_variables)) ] if i<@m new_row.concat [ sprintf("%0.3f", @eigenvalues[i]), sprintf("%0.3f%%", @eigenvalues[i]*100.quo(@n_variables)), sprintf("%0.3f",ac_eigen*100.quo(@n_variables)) ] else new_row.concat ["","",""] end t.row new_row } end s.parse_element(component_matrix) end end dirty_writer :max_iterations, :epsilon, :smc dirty_memoize :eigenvalues, :iterations, :initial_eigenvalues end end end ================================================ FILE: lib/statsample/factor/rotation.rb ================================================ module Statsample module Factor # Base class for component matrix rotation. # # == Reference: # * SPSS Manual # * Lin, J. (2007). VARIMAX_K58 [Source code]. 
[http://www.johnny-lin.com/idl_code/varimax_k58.pro] # # Use subclasses Varimax, Equimax or Quartimax for desired type of rotation # Use: # a = Matrix[ [ 0.4320, 0.8129, 0.3872] # , [ 0.7950, -0.5416, 0.2565] # , [ 0.5944, 0.7234, -0.3441] # , [ 0.8945, -0.3921, -0.1863] ] # rotation = Statsample::Factor::Varimax(a) # rotation.iterate # p rotation.rotated # p rotation.component_transformation_matrix # class Rotation EPSILON=1e-15 MAX_ITERATIONS=25 include Summarizable include DirtyMemoize attr_reader :iterations, :rotated, :component_transformation_matrix, :h2 # Maximum number of iterations attr_accessor :max_iterations # Maximum precision attr_accessor :epsilon attr_accessor :use_gsl dirty_writer :max_iterations, :epsilon dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2 def initialize(matrix, opts=Hash.new) @name=_("%s rotation") % rotation_name @matrix=matrix @n=@matrix.row_size # Variables, p on original @m=@matrix.column_size # Factors, r on original @component_transformation_matrix=nil @max_iterations=MAX_ITERATIONS @epsilon=EPSILON @rotated=nil @h2=(@matrix.collect {|c| c**2} * Matrix.column_vector([1]*@m)).column(0).to_a @use_gsl=Statsample.has_gsl? opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } end def report_building(g) g.section(:name=>@name) do |s| s.parse_element(rotated) s.parse_element(component_transformation_matrix) end end alias_method :communalities, :h2 alias_method :rotated_component_matrix, :rotated def compute iterate end # Start iteration def iterate k_matrix=@use_gsl ? GSL::Matrix : ::Matrix t=k_matrix.identity(@m) b=(@use_gsl ? @matrix.to_gsl : @matrix.dup) h=k_matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)} h_inverse=h.collect {|c| c!=0 ? 
1/c : 0 } bh=h_inverse * b @not_converged=true @iterations=0 while @not_converged break if @iterations>@max_iterations @iterations+=1 #puts "Iteration #{iterations}" num_pairs=@m*(@m-1).quo(2) (0..(@m-2)).each do |i| #+ go through factor index 0:r-1-1 (begin) ((i+1)..(@m-1)).each do |j| #+ pair i to "rest" of factors (begin) xx = bh.column(i) yy = bh.column(j) tx = t.column(i) ty = t.column(j) uu = @n.times.collect {|var_i| xx[var_i]**2-yy[var_i]**2} vv = @n.times.collect {|var_i| 2*xx[var_i]*yy[var_i]} a = @n.times.inject(0) {|ac,var_i| ac+ uu[var_i] } b = @n.times.inject(0) {|ac,var_i| ac+ vv[var_i] } c = @n.times.inject(0) {|ac,var_i| ac+ (uu[var_i]**2 - vv[var_i]**2) } d = @n.times.inject(0) {|ac,var_i| ac+ (2*uu[var_i]*vv[var_i]) } num=x(a,b,c,d) den=y(a,b,c,d) phi=Math::atan2(num,den) / 4.0 # puts "#{i}-#{j}: #{phi}" if(Math::sin(phi.abs) >= @epsilon) xx_rot=( Math::cos(phi)*xx)+(Math::sin(phi)*yy) yy_rot=((-Math::sin(phi))*xx)+(Math::cos(phi)*yy) tx_rot=( Math::cos(phi)*tx)+(Math::sin(phi)*ty) ty_rot=((-Math::sin(phi))*tx)+(Math::cos(phi)*ty) bh=bh.to_a @n.times {|row_i| bh[row_i][i] = xx_rot[row_i] bh[row_i][j] = yy_rot[row_i] } t=t.to_a @m.times {|row_i| t[row_i][i]=tx_rot[row_i] t[row_i][j]=ty_rot[row_i] } #if @use_gsl bh=k_matrix.[](*bh) t=k_matrix.[](*t) #else # bh=Matrix.rows(bh) # t=Matrix.rows(t) #end else num_pairs=num_pairs-1 @not_converged=false if num_pairs==0 end # if end #j end #i end # while @rotated=h*bh @rotated.extend CovariateMatrix @rotated.name=_("Rotated Component matrix") if @matrix.respond_to? :fields_x @rotated.fields_x = @matrix.fields_x else @rotated.fields_x = @n.times.map {|i| "var_#{i+1}"} end if @matrix.respond_to? :fields_y @rotated.fields_y = @matrix.fields_y else @rotated.fields_y = @m.times.map {|i| "var_#{i+1}"} end @component_transformation_matrix=t @component_transformation_matrix.extend CovariateMatrix @component_transformation_matrix.name=_("Component transformation matrix") if @matrix.respond_to? 
:fields_y @component_transformation_matrix.fields = @matrix.fields_y else @component_transformation_matrix.fields = @m.times.map {|i| "var_#{i+1}"} end @rotated end end class Varimax < Rotation def x(a,b,c,d) d-(2*a*b / @n.to_f) end def y(a,b,c,d) c-((a**2-b**2) / @n.to_f) end def rotation_name "Varimax" end end class Equimax < Rotation def x(a,b,c,d) d-(@m*a*b / @n.to_f) end def y(a,b,c,d) c-@m*((a**2-b**2) / (2*@n.to_f)) end def rotation_name "Equimax" end end class Quartimax < Rotation def x(a,b,c,d) d end def y(a,b,c,d) c end def rotation_name "Quartimax" end end end end ================================================ FILE: lib/statsample/factor.rb ================================================ require 'statsample/factor/rotation' require 'statsample/factor/pca' require 'statsample/factor/principalaxis' require 'statsample/factor/parallelanalysis' require 'statsample/factor/map' module Statsample # Factor Analysis toolbox. # * Classes for Extraction of factors: # * Statsample::Factor::PCA # * Statsample::Factor::PrincipalAxis # * Classes for Rotation of factors: # * Statsample::Factor::Varimax # * Statsample::Factor::Equimax # * Statsample::Factor::Quartimax # * Classes for determining the number of components # * Statsample::Factor::MAP # * Statsample::Factor::ParallelAnalysis # # About number of components, O'Connor(2000) said: # The two procedures [PA and MAP ] complement each other nicely, # in that the MAP tends to err (when it does err) in the direction # of underextraction, whereas parallel analysis tends to err # (when it does err) in the direction of overextraction. # Optimal decisions are thus likely to be made after considering # the results of both analytic procedures. (p.10) module Factor # Anti-image covariance matrix. # Useful for inspection of desireability of data for factor analysis. 
# According to Dziuban & Shirkey (1974, p.359): # "If this matrix does not exhibit many zero off-diagonal elements, # the investigator has evidence that the correlation # matrix is not appropriate for factor analysis." # def self.anti_image_covariance_matrix(matrix) s2=Matrix.diag(*(matrix.inverse.diagonal)).inverse aicm=(s2)*matrix.inverse*(s2) aicm.extend(Statsample::CovariateMatrix) aicm.fields=matrix.fields if matrix.respond_to? :fields aicm end def self.anti_image_correlation_matrix(matrix) matrix=matrix.to_matrix s=Matrix.diag(*(matrix.inverse.diagonal)).sqrt.inverse aicm=s*matrix.inverse*s aicm.extend(Statsample::CovariateMatrix) aicm.fields=matrix.fields if matrix.respond_to? :fields aicm end # Kaiser-Meyer-Olkin measure of sampling adequacy for correlation matrix. # # Kaiser's (1974, cited on Dziuban & Shirkey, 1974) present calibration of the index is as follows : # * .90s—marvelous # * .80s— meritorious # * .70s—middling # * .60s—mediocre # * .50s—miserable # * .50 •—unacceptable def self.kmo(matrix) q=anti_image_correlation_matrix(matrix) n=matrix.row_size sum_r,sum_q=0,0 n.times do |j| n.times do |k| if j!=k sum_r+=matrix[j,k]**2 sum_q+=q[j,k]**2 end end end sum_r.quo(sum_r+sum_q) end # Kaiser-Meyer-Olkin measure of sampling adequacy for one variable. # def self.kmo_univariate(matrix, var) if var.is_a? String if matrix.respond_to? :fields j=matrix.fields.index(var) raise "Matrix doesn't have field #{var}" if j.nil? 
else raise "Matrix doesn't respond to fields" end else j=var end q=anti_image_correlation_matrix(matrix) n=matrix.row_size sum_r,sum_q=0,0 n.times do |k| if j!=k sum_r+=matrix[j,k]**2 sum_q+=q[j,k]**2 end end sum_r.quo(sum_r+sum_q) end end end ================================================ FILE: lib/statsample/graph/boxplot.rb ================================================ require 'rubyvis' module Statsample module Graph # = Boxplot # # From Wikipedia: # In descriptive statistics, a box plot or boxplot (also known as a box-and-whisker diagram or plot) is a convenient way of graphically depicting groups of numerical data through their five-number summaries: the smallest observation (sample minimum), lower quartile (Q1), median (Q2), upper quartile (Q3), and largest observation (sample maximum). A boxplot may also indicate which observations, if any, might be considered outliers. # # == Usage # === Svg output # a=[1,2,3,4].to_scale # b=[3,4,5,6].to_scale # puts Statsample::Graph::Boxplot.new(:vectors=>[a,b]).to_svg # === Using ReportBuilder # a=[1,2,3,4].to_scale # b=[3,4,5,6].to_scale # rb=ReportBuilder.new # rb.add(Statsample::Graph::Boxplot.new(:vectors=>[a,b])) # rb.save_html('boxplot.html') class Boxplot include Summarizable attr_accessor :name # Total width of Boxplot attr_accessor :width # Total height of Boxplot attr_accessor :height # Top margin attr_accessor :margin_top # Bottom margin attr_accessor :margin_bottom # Left margin attr_accessor :margin_left # Right margin attr_accessor :margin_right # Array with assignation to groups of bars # For example, for four vectors, # boxplot.groups=[1,2,1,3] # Assign same color to first and third element, and different to # second and fourth attr_accessor :groups # Minimum value on y-axis. Automaticly defined from data attr_accessor :minimum # Maximum value on y-axis. Automaticly defined from data attr_accessor :maximum # Vectors to box-ploting attr_accessor :vectors # The rotation angle, in radians. 
Text is rotated clockwise relative # to the anchor location. For example, with the default left alignment, # an angle of Math.PI / 2 causes text to proceed downwards. The default angle is zero. attr_accessor :label_angle attr_reader :x_scale, :y_scale # Create a new Boxplot. # Parameters: Hash of options # * :vectors: Array of vectors # * :groups: Array of same size as :vectors:, with name of groups # to colorize vectors def initialize(opts=Hash.new) @vectors=opts.delete :vectors raise "You should define vectors" if @vectors.nil? opts_default={ :name=>_("Boxplot"), :groups=>nil, :width=>400, :height=>300, :margin_top=>10, :margin_bottom=>20, :margin_left=>20, :margin_right=>20, :minimum=>nil, :maximum=>nil, :label_angle=>0 } @opts=opts_default.merge(opts) opts_default.keys.each {|k| send("#{k}=", @opts[k]) } end # Returns a Rubyvis panel with scatterplot def rubyvis_panel # :nodoc: that=self min,max=@minimum, @maximum min||=@vectors.map {|v| v.min}.min max||=@vectors.map {|v| v.max}.max margin_hor=margin_left + margin_right margin_vert=margin_top + margin_bottom x_scale = pv.Scale.ordinal(@vectors.size.times.map.to_a).split_banded(0, width-margin_hor, 4.0/5) y_scale=Rubyvis::Scale.linear(min,max).range(0,height-margin_vert) y_scale.nice # cache data colors=Rubyvis::Colors.category10 data=@vectors.map {|v| out={:percentil_25=>v.percentil(25), :median=>v.median, :percentil_75=>v.percentil(75), :name=>v.name} out[:iqr]=out[:percentil_75] - out[:percentil_25] irq_max=out[:percentil_75] + out[:iqr] irq_min=out[:percentil_25] - out[:iqr] # Find the last data inside the margin min = out[:percentil_25] max = out[:percentil_75] v.each {|d| min=d if d < min and d > irq_min max=d if d > max and d < irq_max } # Whiskers! 
out[:low_whisker]=min out[:high_whisker]=max # And now, data outside whiskers out[:outliers]=v.data_with_nils.find_all {|d| d < min or d > max } out } vis=Rubyvis::Panel.new do |pan| pan.width width - margin_hor pan.height height - margin_vert pan.bottom margin_bottom pan.left margin_left pan.right margin_right pan.top margin_top # Y axis pan.rule do data y_scale.ticks bottom y_scale stroke_style {|d| d!=0 ? "#eee" : "#000"} label(:anchor=>'left') do text y_scale.tick_format end end pan.rule do bottom 0 stroke_style 'black' end # Labels pan.label do |l| l.data data l.text_angle that.label_angle l.left {|v| x_scale[index] } l.bottom(-15) l.text {|v,x| v[:name]} end pan.panel do |bp| bp.data data bp.left {|v| x_scale[index]} bp.width x_scale.range_band # Bar bp.bar do |b| b.bottom {|v| y_scale[v[:percentil_25]]} b.height {|v| y_scale[v[:percentil_75]] - y_scale[v[:percentil_25]] } b.line_width 1 b.stroke_style {|v| if that.groups colors.scale(that.groups[parent.index]).darker else colors.scale(index).darker end } b.fill_style {|v| if that.groups colors.scale(that.groups[parent.index]) else colors.scale(index) end } end # Median bp.rule do |r| r.bottom {|v| y_scale[v[:median]]} r.width x_scale.range_band r.line_width 2 end ## # Whiskeys ## # Low whiskey bp.rule do |r| r.visible {|v| v[:percentil_25] > v[:low_whisker]} r.bottom {|v| y_scale[v[:low_whisker]]} end bp.rule do |r| r.visible {|v| v[:percentil_25] > v[:low_whisker]} r.bottom {|v| y_scale[v[:low_whisker]]} r.left {|v| x_scale.range_band / 2.0} r.height {|v| y_scale.scale(v[:percentil_25]) - y_scale.scale(v[:low_whisker])} end # High whiskey bp.rule do |r| r.visible {|v| v[:percentil_75] < v[:high_whisker]} r.bottom {|v| y_scale.scale(v[:high_whisker])} end bp.rule do |r| r.visible {|v| v[:percentil_75] < v[:high_whisker]} r.bottom {|v| y_scale.scale(v[:percentil_75])} r.left {|v| x_scale.range_band / 2.0} r.height {|v| y_scale.scale(v[:high_whisker]) - y_scale.scale(v[:percentil_75])} end # Outliers bp.dot do 
|dot| dot.shape_size 4 dot.data {|v| v[:outliers]} dot.left {|v| x_scale.range_band / 2.0} dot.bottom {|v| y_scale.scale(v)} dot.title {|v| v} end end end vis end # Returns SVG with scatterplot def to_svg rp=rubyvis_panel rp.render rp.to_svg end def report_building(builder) # :nodoc: builder.section(:name=>name) do |b| b.image(to_svg, :type=>'svg', :width=>width, :height=>height) end end end end end ================================================ FILE: lib/statsample/graph/histogram.rb ================================================ require 'rubyvis' module Statsample module Graph # In statistics, a histogram is a graphical representation, showing a visual impression of the distribution of experimental data. It is an estimate of the probability distribution of a continuous variable and was first introduced by Karl Pearson [1]. A histogram consists of tabular frequencies, shown as adjacent rectangles, erected over discrete intervals (bins), with an area equal to the frequency of the observations in the interval. The height of a rectangle is also equal to the frequency density of the interval, i.e., the frequency divided by the width of the interval. The total area of the histogram is equal to the number of data. # # == Usage # === Svg output # a=[1,2,3,4].to_scale # puts Statsample::Graph::Histogram.new(a).to_svg # === Using ReportBuilder # a=[1,2,3,4].to_scale # rb=ReportBuilder.new # rb.add(Statsample::Graph::Histogram.new(a)) # rb.save_html('histogram.html') class Histogram include Summarizable # Histogram name attr_accessor :name # Total width attr_accessor :width # Total height attr_accessor :height # Top margin attr_accessor :margin_top # Bottom margin attr_accessor :margin_bottom # Left margin attr_accessor :margin_left # Right margin attr_accessor :margin_right attr_reader :hist # Could be an array of ranges or number of bins attr_accessor :bins # Minimum value on x axis. 
Calculated automaticly from data if not set attr_accessor :minimum_x # Maximum value on x axis. Calculated automaticly from data if not set attr_accessor :maximum_x # Minimum value on y axis. Set to 0 if not set attr_accessor :minimum_y # Maximum value on y axis. Calculated automaticly from data if not set. attr_accessor :maximum_y # Add a line showing normal distribution attr_accessor :line_normal_distribution # data could be a vector or a histogram def initialize(data, opts=Hash.new) prov_name=(data.respond_to?(:name)) ? data.name : "" opts_default={ :name=>_("Histograma (%s)") % prov_name, :width=>400, :height=>300, :margin_top=>10, :margin_bottom=>20, :margin_left=>30, :margin_right=>20, :minimum_x=>nil, :maximum_x=>nil, :minimum_y=>nil, :maximum_y=>nil, :bins=>nil, :line_normal_distribution=>false } @opts=opts_default.merge(opts) opts_default.keys.each {|k| send("#{k}=", @opts[k]) } @data=data end def pre_vis # :nodoc: if @data.is_a? Statsample::Histogram @hist=@data @mean=@hist.estimated_mean @sd=@hist.estimated_standard_deviation elsif @data.is_a? 
Statsample::Vector @mean=@data.mean @sd=@data.sd @bins||=Math::sqrt(@data.size).floor @hist=@data.histogram(@bins) end end def rubyvis_normal_distribution(pan) x_scale=@x_scale y_scale=@y_scale wob = @hist.get_range(0)[1] - @hist.get_range(0)[0] nob = ((@maximum_x-@minimum_x) / wob.to_f).floor sum=@hist.sum data=nob.times.map {|i| l=@minimum_x+i*wob r=@minimum_x+(i+1)*wob middle=(l+r) / 2.0 pi=Distribution::Normal.cdf((r-@mean) / @sd) - Distribution::Normal.cdf((l-@mean) / @sd) {:x=>middle, :y=>pi*sum} } pan.line do |l| l.data data l.interpolate "cardinal" l.stroke_style "black" l.bottom {|d| y_scale[d[:y]]} l.left {|d| x_scale[d[:x]]} end end # Returns a Rubyvis panel with scatterplot def rubyvis_panel # :nodoc: pre_vis #that=self @minimum_x||=@hist.min @maximum_x||=@hist.max @minimum_y||=0 @maximum_y||=@hist.max_val margin_hor=margin_left + margin_right margin_vert=margin_top + margin_bottom x_scale = pv.Scale.linear(@minimum_x, @maximum_x).range(0, width - margin_hor) y_scale=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert) y_scale.nice bins=@hist.bins.times.map {|i| { :low =>@hist.get_range(i)[0], :high=>@hist.get_range(i)[1], :value=>@hist.bin[i] } } @x_scale=x_scale @y_scale=y_scale # cache data vis=Rubyvis::Panel.new do |pan| pan.width width - margin_hor pan.height height - margin_vert pan.bottom margin_bottom pan.left margin_left pan.right margin_right pan.top margin_top # Y axis pan.rule do data y_scale.ticks bottom y_scale stroke_style {|d| d!=0 ? 
"#eee" : "#000"} label(:anchor=>'left') do text y_scale.tick_format end end # X axis pan.rule do data x_scale.ticks left x_scale stroke_style "black" height 5 bottom(-5) label(:anchor=>'bottom') do text x_scale.tick_format end end pan.bar do |bar| bar.data(bins) bar.left {|v| x_scale[v[:low]]} bar.width {|v| x_scale[v[:high]] - x_scale[v[:low]]} bar.bottom 0 bar.height {|v| y_scale[v[:value]]} bar.stroke_style "black" bar.line_width 1 end rubyvis_normal_distribution(pan) if @line_normal_distribution end vis end # Returns SVG with scatterplot def to_svg rp=rubyvis_panel rp.render rp.to_svg end def report_building(builder) # :nodoc: builder.section(:name=>name) do |b| b.image(to_svg, :type=>'svg', :width=>width, :height=>height) end end def report_building_text(generator) pre_vis #anchor=generator.toc_entry(_("Histogram %s") % [@name]) step= @hist.max_val > 40 ? ( @hist.max_val / 40).ceil : 1 @hist.range.each_with_index do |r,i| next if i==@hist.bins generator.text(sprintf("%5.2f : %s", r, "*" * (@hist.bin[i] / step).floor )) end end end end end ================================================ FILE: lib/statsample/graph/scatterplot.rb ================================================ require 'rubyvis' module Statsample module Graph # = Scatterplot # # From Wikipedia: # A scatter plot or scattergraph is a type of mathematical diagram using # Cartesian coordinates to display values for two variables for a set of data. # # The data is displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis.[2] This kind of plot is also called a scatter chart, scatter diagram and scatter graph. 
# == Usage # === Svg output # a=[1,2,3,4].to_scale # b=[3,4,5,6].to_scale # puts Statsample::Graph::Scatterplot.new(a,b).to_svg # === Using ReportBuilder # a=[1,2,3,4].to_scale # b=[3,4,5,6].to_scale # rb=ReportBuilder.new # rb.add(Statsample::Graph::Scatterplot.new(a,b)) # rb.save_html('scatter.html') class Scatterplot include Summarizable attr_accessor :name # Total width of Scatterplot attr_accessor :width # Total height of Scatterplot attr_accessor :height attr_accessor :dot_alpha # Add a line on median of x and y axis attr_accessor :line_median # Top margin attr_accessor :margin_top # Bottom margin attr_accessor :margin_bottom # Left margin attr_accessor :margin_left # Right margin attr_accessor :margin_right attr_reader :data attr_reader :v1,:v2 # Array with assignation to groups of bars # For example, for four vectors, # boxplot.groups=[1,2,1,3] # Assign same color to first and third element, and different to # second and fourth attr_accessor :groups attr_reader :x_scale, :y_scale # Minimum value on x axis. Calculated automaticly from data if not set attr_accessor :minimum_x # Maximum value on x axis. Calculated automaticly from data if not set attr_accessor :maximum_x # Minimum value on y axis. Set to 0 if not set attr_accessor :minimum_y # Maximum value on y axis. Calculated automaticly from data if not set. attr_accessor :maximum_y # Create a new Scatterplot. # Params: # * v1: Vector on X axis # * v2: Vector on Y axis # * opts: Hash of options. 
See attributes of Scatterplot def initialize(v1,v2,opts=Hash.new) @v1_name,@v2_name = v1.name,v2.name @v1,@v2 = Statsample.only_valid_clone(v1,v2) opts_default={ :name=>_("Scatterplot (%s - %s)") % [@v1_name, @v2_name], :width=>400, :height=>300, :dot_alpha=>0.5, :line_median=>false, :margin_top=>10, :margin_bottom=>20, :margin_left=>20, :margin_right=>20, :minimum_x=>nil, :maximum_x=>nil, :minimum_y=>nil, :maximum_y=>nil, :groups=>nil } @opts=opts_default.merge(opts) opts_default.keys.each {|k| send("#{k}=", @opts[k]) } @data=[] @v1.each_with_index {|d1,i| @data.push({:x=>d1, :y=>@v2[i]}) } end # Add a rule on median of X and Y axis def add_line_median(vis) # :nodoc: that=self x=@x_scale y=@y_scale vis.execute { rule do data [that.v1.median] left x stroke_style Rubyvis.color("#933").alpha(0.5) label(:anchor=>"top") do text x.tick_format end end rule do data [that.v2.median] bottom y stroke_style Rubyvis.color("#933").alpha(0.5) label(:anchor=>"right") do text y.tick_format end end } end # Returns a Rubyvis panel with scatterplot def rubyvis_panel # :nodoc: that=self #p @v1.map {|v| v} @minimum_x||=@v1.min @maximum_x||=@v1.max @minimum_y||=@v2.min @maximum_y||=@v2.max colors=Rubyvis::Colors.category10 margin_hor=margin_left + margin_right margin_vert=margin_top + margin_bottom x=Rubyvis::Scale.linear(@minimum_x, @maximum_x).range(0, width - margin_hor) y=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert) @x_scale=x @y_scale=y vis=Rubyvis::Panel.new do |pan| pan.width width - margin_hor pan.height height - margin_vert pan.bottom margin_bottom pan.left margin_left pan.right margin_right pan.top margin_top # X axis pan.rule do data y.ticks bottom y stroke_style {|d| d!=0 ? "#eee" : "#000"} label(:anchor=>'left') do visible {|d| d!=0 and d < that.width} text y.tick_format end end # Y axis pan.rule do data x.ticks left x stroke_style {|d| d!=0 ? 
"#eee" : "#000"} label(:anchor=>'bottom') do visible {|d| d>0 and d < that.height} text x.tick_format end end # Add lines on median add_line_median(pan) if line_median pan.panel do data(that.data) dot do left {|d| x[d[:x]]} bottom {|d| y[d[:y]]} fill_style {|v| alpha=(that.dot_alpha-0.3<=0) ? 0.1 : that.dot_alpha-0.3 if that.groups colors.scale(that.groups[index]).alpha(alpha) else colors.scale(0).alpha(alpha) end } stroke_style {|v| if that.groups colors.scale(that.groups[parent.index]).alpha(that.dot_alpha) else colors.scale(0).alpha(that.dot_alpha) end } shape_radius 2 end end end vis end # Returns SVG with scatterplot def to_svg rp=rubyvis_panel rp.render rp.to_svg end def report_building(builder) # :nodoc: builder.section(:name=>name) do |b| b.image(to_svg, :type=>'svg', :width=>width, :height=>height) end end end end end ================================================ FILE: lib/statsample/graph.rb ================================================ require 'statsample/graph/scatterplot' require 'statsample/graph/boxplot' require 'statsample/graph/histogram' module Statsample # Several Graph, based on Rubyvis # * Statsample::Graph::Boxplot # * Statsample::Graph::Histogram # * Statsample::Graph::Scatterplot module Graph end end ================================================ FILE: lib/statsample/histogram.rb ================================================ module Statsample # A histogram consists of a set of bins which count the # number of events falling into a given range of a continuous variable x. # # This implementations follows convention of GSL # for specification. # # * Verbatim: * # # The range for bin[i] is given by range[i] to range[i+1]. # For n bins there are n+1 entries in the array range. # Each bin is inclusive at the lower end and exclusive at the upper end. 
# Mathematically this means that the bins are defined # by the following inequality, # # bin[i] corresponds to range[i] <= x < range[i+1] # # Here is a diagram of the correspondence between ranges and bins # on the number-line for x, # # # [ bin[0] )[ bin[1] )[ bin[2] )[ bin[3] )[ bin[4] ) # ---|---------|---------|---------|---------|---------|--- x # r[0] r[1] r[2] r[3] r[4] r[5] # # # In this picture the values of the range array are denoted by r. # On the left-hand side of each bin the square bracket ‘[’ denotes # an inclusive lower bound ( r <= x), and the round parentheses ‘)’ # on the right-hand side denote an exclusive upper bound (x < r). # Thus any samples which fall on the upper end of the histogram are # excluded. # If you want to include this value for the last bin you will need to # add an extra bin to your histogram. # # # == Reference: # * http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html class Histogram include Enumerable class << self # Alloc +n_bins+, using +range+ as ranges of bins def alloc(n_bins, range=nil, opts=Hash.new) Histogram.new(n_bins, range, opts) end # Alloc +n_bins+ bins, using +p1+ as minimum and +p2+ # as maximum def alloc_uniform(n_bins, p1=nil,p2=nil) if p1.is_a? Array min,max=p1 else min,max=p1,p2 end range=max - min step=range / n_bins.to_f range=(n_bins+1).times.map {|i| min + (step*i)} Histogram.new(range) end end attr_accessor :name attr_reader :bin attr_reader :range include GetText bindtextdomain("statsample") def initialize(p1, min_max=false, opts=Hash.new) if p1.is_a? Array range=p1 @n_bins=p1.size-1 elsif p1.is_a? Integer @n_bins=p1 end @bin=[0.0]*(@n_bins) if(min_max) min, max=min_max[0], min_max[1] range=Array.new(@n_bins+1) (@n_bins+1).times {|i| range[i]=min+(i*(max-min).quo(@n_bins)) } end range||=[0.0]*(@n_bins+1) set_ranges(range) @name="" opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } end # Number of bins def bins @n_bins end # def increment(x, w=1) if x.respond_to? 
:each x.each{|y| increment(y,w) } elsif x.is_a? Numeric (range.size-1).times do |i| if x>=range[i] and xi, :low=>r[0],:high=>r[1], :middle=>(r[0]+r[1]) / 2.0, :value=>@bin[i]} yield arg end end def estimated_variance sum,n=0,0 mean=estimated_mean each do |v| sum+=v[:value]*(v[:middle]-mean)**2 n+=v[:value] end sum / (n-1) end def estimated_standard_deviation Math::sqrt(estimated_variance) end def estimated_mean sum,n=0,0 each do |v| sum+= v[:value]* v[:middle] n+=v[:value] end sum / n end alias :mean :estimated_mean alias :sigma :estimated_standard_deviation def sum(start=nil,_end=nil) start||=0 _end||=@n_bins-1 (start.._end).inject(0) {|ac,i| ac+@bin[i]} end def report_building(generator) hg=Statsample::Graph::Histogram.new(self) generator.parse_element(hg) end def report_building_text(generator) @range.each_with_index do |r,i| next if i==@bin.size generator.text(sprintf("%5.2f : %d", r, @bin[i])) end end end end ================================================ FILE: lib/statsample/matrix.rb ================================================ class ::Vector def to_matrix ::Matrix.columns([self.to_a]) end def to_vector self end end class ::Matrix def to_matrix self end def to_dataset f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) } ds=Statsample::Dataset.new(f) f.each do |ff| ds[ff].type=:scale ds[ff].name=ff end row_size.times {|i| ds.add_case_array(self.row(i).to_a) } ds.update_valid_data ds.name=self.name if self.respond_to? :name ds end if defined? :eigenpairs alias_method :eigenpairs_ruby, :eigenpairs end if Statsample.has_gsl? 
# Optimize eigenpairs of extendmatrix module using gsl def eigenpairs to_gsl.eigenpairs end end def eigenvalues eigenpairs.collect {|v| v[0]} end def eigenvectors eigenpairs.collect {|v| v[1]} end def eigenvectors_matrix Matrix.columns(eigenvectors) end def to_gsl out=[] self.row_size.times{|i| out[i]=self.row(i).to_a } GSL::Matrix[*out] end end module GSL class Vector class Col def to_matrix ::Matrix.columns([self.size.times.map {|i| self[i]}]) end def to_ary to_a end def to_gsl self end end end class Matrix def to_gsl self end def to_dataset f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) } ds=Statsample::Dataset.new(f) f.each do |ff| ds[ff].type=:scale ds[ff].name=ff end row_size.times {|i| ds.add_case_array(self.row(i).to_a) } ds.update_valid_data ds.name=self.name if self.respond_to? :name ds end def row_size size1 end def column_size size2 end def determinant det end def inverse GSL::Linalg::LU.invert(self) end def eigenvalues eigenpairs.collect {|v| v[0]} end def eigenvectors eigenpairs.collect {|v| v[1]} end # Matrix sum of squares def mssq sum=0 to_v.each {|i| sum+=i**2} sum end def eigenvectors_matrix eigval, eigvec= GSL::Eigen.symmv(self) GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC) eigvec end def eigenpairs eigval, eigvec= GSL::Eigen.symmv(self) GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC) @eigenpairs=eigval.size.times.map {|i| [eigval[i],eigvec.get_col(i)] } end #def eigenpairs_ruby # self.to_matrix.eigenpairs_ruby #end def square? size1==size2 end def to_matrix rows=self.size1 cols=self.size2 out=(0...rows).collect{|i| (0...cols).collect {|j| self[i,j]} } ::Matrix.rows(out) end def total_sum sum=0 size1.times {|i| size2.times {|j| sum+=self[i,j] } } sum end end end module Statsample # Module to add names to X and Y fields module NamedMatrix include Summarizable def fields raise "Should be square" if !square? 
fields_x end def fields=(v) raise "Matrix should be square" if !square? @fields_x=v @fields_y=v end def fields_x=(v) raise "Size of fields != row_size" if v.size!=row_size @fields_x=v end def fields_y=(v) raise "Size of fields != column_size" if v.size!=column_size @fields_y=v end def fields_x @fields_x||=row_size.times.collect {|i| _("X%d") % i} end def fields_y @fields_y||=column_size.times.collect {|i| _("Y%d") % i} end def name @name||=get_new_name end def name=(v) @name=v end def get_new_name @@named_matrix||=0 @@named_matrix+=1 _("Matrix %d") % @@named_matrix end end # Module to add method for variance/covariance and correlation matrices # == Usage # matrix=Matrix[[1,2],[2,3]] # matrix.extend CovariateMatrix # module CovariateMatrix include NamedMatrix @@covariatematrix=0 # Get type of covariate matrix. Could be :covariance or :correlation def _type if row_size==column_size if row_size.times.find {|i| self[i,i]!=1.0} :covariance else :correlation end else @type end end def _type=(t) @type=t end def correlation if(_type==:covariance) matrix=Matrix.rows(row_size.times.collect { |i| column_size.times.collect { |j| if i==j 1.0 else self[i,j].quo(Math::sqrt(self[i,i])*Math::sqrt(self[j,j])) end } }) matrix.extend CovariateMatrix matrix.fields_x=fields_x matrix.fields_y=fields_y matrix._type=:correlation matrix else self end end # Get variance for field k # def variance(k) submatrix([k])[0,0] end def get_new_name @@covariatematrix+=1 _("Covariate matrix %d") % @@covariatematrix end # Select a submatrix of factors. If you have a correlation matrix # with a, b and c, you could obtain a submatrix of correlations of # a and b, b and c or a and b # # You could use labels or index to select the factors. # If you don't specify columns, its will be equal to rows. 
# # Example: # a=Matrix[[1.0, 0.3, 0.2], # [0.3, 1.0, 0.5], # [0.2, 0.5, 1.0]] # a.extend CovariateMatrix # a.fields=%w{a b c} # a.submatrix(%w{c a}, %w{b}) # => Matrix[[0.5],[0.3]] # a.submatrix(%w{c a}) # => Matrix[[1.0, 0.2] , [0.2, 1.0]] def submatrix(rows,columns=nil) raise ArgumentError, "rows shouldn't be empty" if rows.respond_to? :size and rows.size==0 columns||=rows # Convert all fields on index row_index=rows.collect {|v| r=v.is_a?(Numeric) ? v : fields_x.index(v) raise "Index #{v} doesn't exists on matrix" if r.nil? r } column_index=columns.collect {|v| r=v.is_a?(Numeric) ? v : fields_y.index(v) raise "Index #{v} doesn't exists on matrix" if r.nil? r } fx=row_index.collect {|v| fields_x[v]} fy=column_index.collect {|v| fields_y[v]} matrix= Matrix.rows(row_index.collect {|i| row=column_index.collect {|j| self[i,j]}}) matrix.extend CovariateMatrix matrix.fields_x=fx matrix.fields_y=fy matrix._type=_type matrix end def report_building(generator) @name||= (_type==:correlation ? _("Correlation"):_("Covariance"))+_(" Matrix") generator.table(:name=>@name, :header=>[""]+fields_y) do |t| row_size.times {|i| t.row([fields_x[i]]+row(i).to_a.collect {|i1| i1.nil? ? "--" : sprintf("%0.3f",i1).gsub("0.",".") }) } end end end end ================================================ FILE: lib/statsample/multiset.rb ================================================ module Statsample # Multiset joins multiple dataset with the same fields and vectors # but with different number of cases. 
# This is the base class for stratified and cluster sampling estimation class Multiset # Name of fields attr_reader :fields # Array with Statsample::Dataset attr_reader :datasets # To create a multiset # * Multiset.new(%w{f1 f2 f3}) # define only fields def initialize(fields) @fields=fields @datasets={} end def self.new_empty_vectors(fields,ds_names) ms=Multiset.new(fields) ds_names.each{|d| ms.add_dataset(d,Dataset.new(fields)) } ms end # Generate a new dataset as a union of partial dataset # If block given, this is applied to each dataset before union def union(&block) union_field={} types={} names={} labels={} each do |k,ds| if block ds=ds.dup yield k,ds end @fields.each do |f| union_field[f]||=Array.new union_field[f].concat(ds[f].data) types[f]||=ds[f].type names[f]||=ds[f].name labels[f]||=ds[f].labels end end @fields.each do |f| union_field[f]=union_field[f].to_vector(types[f]) union_field[f].name=names[f] union_field[f].labels=labels[f] end ds_union=union_field.to_dataset ds_union.fields=@fields ds_union end def datasets_names @datasets.keys.sort end def n_datasets @datasets.size end def add_dataset(key,ds) if(ds.fields!=@fields) raise ArgumentError, "Dataset(#{ds.fields.to_s})must have the same fields of the Multiset(#{@fields})" else @datasets[key]=ds end end def sum_field(field) @datasets.inject(0) {|a,da| stratum_name=da[0] vector=da[1][field] val=yield stratum_name,vector a+val } end def collect_vector(field) @datasets.collect {|k,v| yield k, v[field] } end def each_vector(field) @datasets.each {|k,v| yield k, v[field] } end def[](i) @datasets[i] end def each(&block) @datasets.each {|k,ds| next if ds.cases==0 block.call(k,ds) } end end class StratifiedSample class << self # mean for an array of vectors def mean(*vectors) n_total=0 means=vectors.inject(0){|a,v| n_total+=v.size a+v.sum } means.to_f/n_total end def standard_error_ksd_wr(es) n_total=0 sum=es.inject(0){|a,h| n_total+=h['N'] a+((h['N']**2 * h['s']**2) / h['n'].to_f) } (1.to_f / 
n_total)*Math::sqrt(sum) end def variance_ksd_wr(es) standard_error_ksd_wr(es)**2 end def calculate_n_total(es) es.inject(0) {|a,h| a+h['N'] } end # Source : Cochran (1972) def variance_ksd_wor(es) n_total=calculate_n_total(es) es.inject(0){|a,h| val=((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N'])) a+val } end def standard_error_ksd_wor(es) Math::sqrt(variance_ksd_wor(es)) end def variance_esd_wor(es) n_total=calculate_n_total(es) sum=es.inject(0){|a,h| val=h['N']*(h['N']-h['n'])*(h['s']**2 / h['n'].to_f) a+val } (1.0/(n_total**2))*sum end def standard_error_esd_wor(es) Math::sqrt(variance_ksd_wor(es)) end # Based on http://stattrek.com/Lesson6/STRAnalysis.aspx def variance_esd_wr(es) n_total=calculate_n_total(es) sum=es.inject(0){|a,h| val= ((h['s']**2 * h['N']**2) / h['n'].to_f) a+val } (1.0/(n_total**2))*sum end def standard_error_esd_wr(es) Math::sqrt(variance_esd_wr(es)) end def proportion_variance_ksd_wor(es) n_total=calculate_n_total(es) es.inject(0){|a,h| val= (((h['N'].to_f / n_total)**2 * h['p']*(1-h['p'])) / (h['n'])) * (1- (h['n'].to_f / h['N'])) a+val } end def proportion_sd_ksd_wor(es) Math::sqrt(proportion_variance_ksd_wor(es)) end def proportion_sd_ksd_wr(es) n_total=calculate_n_total(es) sum=es.inject(0){|a,h| val= (h['N']**2 * h['p']*(1-h['p'])) / h['n'].to_f a+val } Math::sqrt(sum) * (1.0/n_total) end def proportion_variance_ksd_wr(es) proportion_variance_ksd_wor(es)**2 end def proportion_variance_esd_wor(es) n_total=n_total=calculate_n_total(es) sum=es.inject(0){|a,h| a=(h['N']**2 * (h['N']-h['n']) * h['p']*(1.0-h['p'])) / ((h['n']-1)*(h['N']-1)) a+val } Math::sqrt(sum) * (1.0/n_total**2) end def proportion_sd_esd_wor(es) Math::sqrt(proportion_variance_ksd_wor(es)) end end def initialize(ms,strata_sizes) raise TypeError,"ms should be a Multiset" unless ms.is_a? 
Statsample::Multiset @ms=ms raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names @strata_sizes=strata_sizes @population_size=@strata_sizes.inject(0) {|a,x| a+x[1]} @strata_number=@ms.n_datasets @sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases} end # Number of strata def strata_number @strata_number end # Population size. Equal to sum of strata sizes # Symbol: Nh def population_size @population_size end # Sample size. Equal to sum of sample of each stratum def sample_size @sample_size end # Size of stratum x def stratum_size(h) @strata_sizes[h] end def vectors_by_field(field) @ms.datasets.collect{|k,ds| ds[field] } end # Population proportion based on strata def proportion(field, v=1) @ms.sum_field(field) {|s_name,vector| stratum_ponderation(s_name)*vector.proportion(v) } end # Stratum ponderation. # Symbol: W\h\ def stratum_ponderation(h) @strata_sizes[h].to_f / @population_size end alias_method :wh, :stratum_ponderation # Population mean based on strata def mean(field) @ms.sum_field(field) {|s_name,vector| stratum_ponderation(s_name)*vector.mean } end # Standard error with estimated population variance and without replacement. # Source: Cochran (1972) def standard_error_wor(field) es=@ms.collect_vector(field) {|s_n, vector| {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds} } StratifiedSample.standard_error_esd_wor(es) end # Standard error with estimated population variance and without replacement. 
# Source: http://stattrek.com/Lesson6/STRAnalysis.aspx def standard_error_wor_2(field) sum=@ms.sum_field(field) {|s_name,vector| s_size=@strata_sizes[s_name] (s_size**2 * (1-(vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f) } (1/@population_size.to_f)*Math::sqrt(sum) end def standard_error_wr(field) es=@ms.collect_vector(field) {|s_n, vector| {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds} } StratifiedSample.standard_error_esd_wr(es) end def proportion_sd_esd_wor(field,v=1) es=@ms.collect_vector(field) {|s_n, vector| {'N'=>@strata_sizes[s_n],'n'=>vector.size, 'p'=>vector.proportion(v)} } StratifiedSample.proportion_sd_esd_wor(es) end def proportion_standard_error(field,v=1) prop=proportion(field,v) sum=@ms.sum_field(field) {|s_name,vector| nh=vector.size s_size=@strata_sizes[s_name] (s_size**2 * (1-(nh / s_size)) * prop * (1-prop) / (nh - 1 )) } (1.quo(@population_size)) * Math::sqrt(sum) end # Cochran(1971), p. 150 def variance_pst(field,v=1) sum=@ms.datasets.inject(0) {|a,da| stratum_name=da[0] ds=da[1] nh=ds.cases.to_f s_size=@strata_sizes[stratum_name] prop=ds[field].proportion(v) a + (((s_size**2 * (s_size-nh)) / (s_size-1))*(prop*(1-prop) / (nh-1))) } (1/@population_size.to_f ** 2)*sum end end end ================================================ FILE: lib/statsample/regression/multiple/alglibengine.rb ================================================ if HAS_ALGIB module Statsample module Regression module Multiple # Class for Multiple Regression Analysis # Requires Alglib gem and uses a listwise aproach. # Faster than GslEngine on massive prediction use, because process is c-based. # Prefer GslEngine if you need good memory use. 
# If you need pairwise, use RubyEngine # Example: # # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset # lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,'y') # class AlglibEngine < BaseEngine def initialize(ds,y_var, opts=Hash.new) super @ds=ds.dup_only_valid @ds_valid=@ds @dy=@ds[@y_var] @ds_indep=ds.dup(ds.fields-[y_var]) # Create a custom matrix columns=[] @fields=[] @ds.fields.each{|f| if f!=@y_var columns.push(@ds[f].to_a) @fields.push(f) end } @dep_columns=columns.dup columns.push(@ds[@y_var]) matrix=Matrix.columns(columns) @lr_s=nil @lr=::Alglib::LinearRegression.build_from_matrix(matrix) @coeffs=assign_names(@lr.coeffs) end def _dump(i) Marshal.dump({'ds'=>@ds,'y_var'=>@y_var}) end def self._load(data) h=Marshal.load(data) self.new(h['ds'], h['y_var']) end def coeffs @coeffs end # Coefficients using a constant # Based on http://www.xycoon.com/ols1.htm def matrix_resolution mse_p=mse columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}} columns.unshift([1.0]*@ds.cases) y=Matrix.columns([@dy.data.map {|i| i.to_f}]) x=Matrix.columns(columns) xt=x.t matrix=((xt*x)).inverse*xt matrix*y end def r2 r**2 end def r Bivariate::pearson(@dy,predicted) end def sst @dy.ss end def constant @lr.constant end def standarized_coeffs l=lr_s assign_names(l.coeffs) end def lr_s if @lr_s.nil? build_standarized end @lr_s end def build_standarized @ds_s=@ds.standarize columns=[] @ds_s.fields.each{|f| columns.push(@ds_s[f].to_a) unless f==@y_var } @dep_columns_s=columns.dup columns.push(@ds_s[@y_var]) matrix=Matrix.columns(columns) @lr_s=Alglib::LinearRegression.build_from_matrix(matrix) end def process(v) @lr.process(v) end def process_s(v) lr_s.process(v) end # ???? 
Not equal to SPSS output def standarized_residuals res=residuals red_sd=residuals.sds res.collect {|v| v.quo(red_sd) }.to_vector(:scale) end end end end end # for Statsample end # for if ================================================ FILE: lib/statsample/regression/multiple/baseengine.rb ================================================ module Statsample module Regression module Multiple # Base class for Multiple Regression Engines class BaseEngine include Statsample::Summarizable # Name of analysis attr_accessor :name # Minimum number of valid case for pairs of correlation attr_reader :cases # Number of valid cases (listwise) attr_reader :valid_cases # Number of total cases (dataset.cases) attr_reader :total_cases attr_accessor :digits def self.univariate? true end def initialize(ds, y_var, opts = Hash.new) @ds=ds @predictors_n=@ds.fields.size-1 @total_cases=@ds.cases @cases=@ds.cases @y_var=y_var @r2=nil @name=_("Multiple Regression: %s over %s") % [ ds.fields.join(",") , @y_var] opts_default={:digits=>3} @opts=opts_default.merge opts @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } end # Calculate F Test def anova @anova||=Statsample::Anova::OneWay.new(:ss_num=>ssr, :ss_den=>sse, :df_num=>df_r, :df_den=>df_e, :name_numerator=>_("Regression"), :name_denominator=>_("Error"), :name=>"ANOVA") end # Standard error of estimate def se_estimate Math::sqrt(sse.quo(df_e)) end # Retrieves a vector with predicted values for y def predicted @total_cases.times.collect { |i| invalid=false vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]} if invalid nil else process(vect) end }.to_vector(:scale) end # Retrieves a vector with standarized values for y def standarized_predicted predicted.standarized end # Retrieves a vector with residuals values for y def residuals (0...@total_cases).collect{|i| invalid=false vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]} if invalid or @ds[@y_var][i].nil? 
nil else @ds[@y_var][i] - process(vect) end }.to_vector(:scale) end # R Multiple def r raise "You should implement this" end # Sum of squares Total def sst raise "You should implement this" end # R^2 Adjusted. # Estimate Population R^2 usign Ezequiel formula. # Always lower than sample R^2 # == Reference: # * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11. def r2_adjusted r2-((1-r2)*@predictors_n).quo(df_e) end # Sum of squares (regression) def ssr r2*sst end # Sum of squares (Error) def sse sst - ssr end # T values for coeffs def coeffs_t out={} se=coeffs_se coeffs.each do |k,v| out[k]=v / se[k] end out end # Mean square Regression def msr ssr.quo(df_r) end # Mean Square Error def mse sse.quo(df_e) end # Degrees of freedom for regression def df_r @predictors_n end # Degrees of freedom for error def df_e @valid_cases-@predictors_n-1 end # Fisher for Anova def f anova.f end # p-value of Fisher def probability anova.probability end # Tolerance for a given variable # http://talkstats.com/showthread.php?t=5056 def tolerance(var) ds=assign_names(@dep_columns) ds.each{|k,v| ds[k]=v.to_vector(:scale) } lr=self.class.new(ds.to_dataset,var) 1-lr.r2 end # Tolerances for each coefficient def coeffs_tolerances @fields.inject({}) {|a,f| a[f]=tolerance(f); a } end # Standard Error for coefficients def coeffs_se out={} mse=sse.quo(df_e) coeffs.each {|k,v| out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares * tolerance(k))) } out end # Estandar error of R^2 # ???? 
def se_r2 Math::sqrt((4*r2*(1-r2)**2*(df_e)**2).quo((@cases**2-1)*(@cases+3))) end # Estimated Variance-Covariance Matrix # Used for calculation of se of constant def estimated_variance_covariance_matrix #mse_p=mse columns=[] @ds_valid.fields.each{|k| v=@ds_valid[k] columns.push(v.data) unless k==@y_var } columns.unshift([1.0]*@valid_cases) x=Matrix.columns(columns) matrix=((x.t*x)).inverse * mse matrix.collect {|i| Math::sqrt(i) if i>=0 } end # T for constant def constant_t constant.to_f/constant_se end # Standard error for constant def constant_se estimated_variance_covariance_matrix[0,0] end def report_building(b) di="%0.#{digits}f" b.section(:name=>@name) do |g| c=coeffs g.text _("Engine: %s") % self.class g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases]) g.text _("R=")+(di % r) g.text _("R^2=")+(di % r2) g.text _("R^2 Adj=")+(di % r2_adjusted) g.text _("Std.Error R=")+ (di % se_estimate) g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') ) g.parse_element(anova) sc=standarized_coeffs cse=coeffs_se g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t| t.row([_("Constant"), sprintf(di, constant), "-", constant_se.nil? ? "": sprintf(di, constant_se), constant_t.nil? ? 
"" : sprintf(di, constant_t)]) @fields.each do |f| t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))]) end end end end def assign_names(c) a={} @fields.each_index {|i| a[@fields[i]]=c[i] } a end # Sum of squares of regression # using the predicted value minus y mean def ssr_direct mean=@dy.mean cases=0 ssr=(0...@ds.cases).inject(0) {|a,i| invalid=false v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]} if !invalid cases+=1 a+((process(v)-mean)**2) else a end } ssr end def sse_direct sst-ssr end def process(v) c=coeffs total=constant @fields.each_index{|i| total+=c[@fields[i]]*v[i] } total end end end end end ================================================ FILE: lib/statsample/regression/multiple/gslengine.rb ================================================ if Statsample.has_gsl? module Statsample module Regression module Multiple # Class for Multiple Regression Analysis # Requires rbgsl and uses a listwise aproach. # Slower on prediction of values than Alglib, because predict is ruby based. # Better memory management on multiple (+1000) series of regression. 
# If you need pairwise, use RubyEngine # Example: # # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset # lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y') # class GslEngine < BaseEngine def initialize(ds,y_var, opts=Hash.new) super @ds=ds.dup_only_valid @ds_valid=@ds @valid_cases=@ds_valid.cases @dy=@ds[@y_var] @ds_indep=ds.dup(ds.fields-[y_var]) # Create a custom matrix columns=[] @fields=[] max_deps = GSL::Matrix.alloc(@ds.cases, @ds.fields.size) constant_col=@ds.fields.size-1 for i in 0...@ds.cases max_deps.set(i,constant_col,1) end j=0 @ds.fields.each{|f| if f!=@y_var @ds[f].each_index{|i1| max_deps.set(i1,j,@ds[f][i1]) } columns.push(@ds[f].to_a) @fields.push(f) j+=1 end } @dep_columns=columns.dup @lr_s=nil c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl) @constant=c[constant_col] @coeffs_a=c.to_a.slice(0...constant_col) @coeffs=assign_names(@coeffs_a) c=nil end def _dump(i) Marshal.dump({'ds'=>@ds,'y_var'=>@y_var}) end def self._load(data) h=Marshal.load(data) self.new(h['ds'], h['y_var']) end def coeffs @coeffs end # Coefficients using a constant # Based on http://www.xycoon.com/ols1.htm def matrix_resolution columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}} columns.unshift([1.0]*@ds.cases) y=Matrix.columns([@dy.data.map {|i| i.to_f}]) x=Matrix.columns(columns) xt=x.t matrix=((xt*x)).inverse*xt matrix*y end def r2 r**2 end def r Bivariate::pearson(@dy, predicted) end def sst @dy.ss end def constant @constant end def standarized_coeffs l=lr_s l.coeffs end def lr_s if @lr_s.nil? build_standarized end @lr_s end def build_standarized @ds_s=@ds.standarize @lr_s=GslEngine.new(@ds_s,@y_var) end def process_s(v) lr_s.process(v) end # ???? 
Not equal to SPSS output def standarized_residuals res=residuals red_sd=residuals.sds res.collect {|v| v.quo(red_sd) }.to_vector(:scale) end # Standard error for coeffs def coeffs_se out={} evcm=estimated_variance_covariance_matrix @ds_valid.fields.each_with_index do |f,i| mi=i+1 next if f==@y_var out[f]=evcm[mi,mi] end out end end end end end # for Statsample end # for if ================================================ FILE: lib/statsample/regression/multiple/matrixengine.rb ================================================ module Statsample module Regression module Multiple # Pure Ruby Class for Multiple Regression Analysis, based on a covariance or correlation matrix. # # Use Statsample::Regression::Multiple::RubyEngine if you have a # Dataset, to avoid setting all details. # # Remember: NEVER use a Covariance data if you have missing data. Use only correlation matrix on that case. # # # Example: # # matrix=[[1.0, 0.5, 0.2], [0.5, 1.0, 0.7], [0.2, 0.7, 1.0]] # # lr=Statsample::Regression::Multiple::MatrixEngine.new(matrix,2) class MatrixEngine < BaseEngine # Hash of standard deviation of predictors. # Only useful for Correlation Matrix, because by default is set to 1 attr_accessor :x_sd # Standard deviation of criterion # Only useful for Correlation Matrix, because by default is set to 1 attr_accessor :y_sd # Hash of mean for predictors. By default, set to 0 attr_accessor :x_mean # Mean for criteria. By default, set to 0 attr_accessor :y_mean # Number of cases attr_writer :cases attr_writer :digits # Create object # def initialize(matrix,y_var, opts=Hash.new) matrix.extend Statsample::CovariateMatrix raise "#{y_var} variable should be on data" unless matrix.fields.include? 
y_var if matrix._type==:covariance @matrix_cov=matrix @matrix_cor=matrix.correlation @no_covariance=false else @matrix_cor=matrix @matrix_cov=matrix @no_covariance=true end @y_var=y_var @fields=matrix.fields-[y_var] @n_predictors=@fields.size @predictors_n=@n_predictors @matrix_x= @matrix_cor.submatrix(@fields) @matrix_x_cov= @matrix_cov.submatrix(@fields) raise LinearDependency, "Regressors are linearly dependent" if @matrix_x.determinant<1e-15 @matrix_y = @matrix_cor.submatrix(@fields, [y_var]) @matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var]) @y_sd=Math::sqrt(@matrix_cov.submatrix([y_var])[0,0]) @x_sd=@n_predictors.times.inject({}) {|ac,i| ac[@matrix_x_cov.fields[i]]=Math::sqrt(@matrix_x_cov[i,i]) ac; } @cases=nil @x_mean=@fields.inject({}) {|ac,f| ac[f]=0.0 ac; } @y_mean=0.0 @name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var] opts_default={:digits=>3} opts=opts_default.merge opts opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } result_matrix=@matrix_x_cov.inverse * @matrix_y_cov if matrix._type==:covariance @coeffs=result_matrix.column(0).to_a @coeffs_stan=coeffs.collect {|k,v| coeffs[k]*@x_sd[k].quo(@y_sd) } else @coeffs_stan=result_matrix.column(0).to_a @coeffs=standarized_coeffs.collect {|k,v| standarized_coeffs[k]*@y_sd.quo(@x_sd[k]) } end @total_cases=@valid_cases=@cases end def cases raise "You should define the number of valid cases first" if @cases.nil? @cases end # Get R^2 for the regression # For fixed models is the coefficient of determination. # On random models, is the 'squared-multiple correlation' # Equal to # * 1-(|R| / |R_x|) or # * Sum(b_i*r_yi) <- used def r2 @n_predictors.times.inject(0) {|ac,i| ac+@coeffs_stan[i]* @matrix_y[i,0]} end # Multiple correlation, on random models. 
def r
  Math::sqrt(r2)
end

# Value of constant: criterion mean minus the coefficient-weighted
# predictor means.
def constant
  raw = coeffs
  weighted_means = 0
  @fields.each { |name| weighted_means = weighted_means + (raw[name] * @x_mean[name]) }
  @y_mean - weighted_means
end

# Hash of b or raw coefficients
def coeffs
  assign_names(@coeffs)
end

# Hash of beta or standarized coefficients
def standarized_coeffs
  assign_names(@coeffs_stan)
end

# Total sum of squares
def sst
  @y_sd**2 * (cases - 1.0)
end

# Degrees of freedom for regression
def df_r
  @n_predictors
end

# Degrees of freedom for error
def df_e
  cases - @n_predictors - 1
end

# Tolerance for a given variable
# defined as (1-R^2) of regression of other independent variables
# over the selected.
# With a single predictor the tolerance is 1.
# == Reference:
# * http://talkstats.com/showthread.php?t=5056
def tolerance(var)
  if @matrix_x.column_size == 1
    1
  else
    auxiliary = Statsample::Regression::Multiple::MatrixEngine.new(@matrix_x, var)
    1 - auxiliary.r2
  end
end

# Standard Error for coefficients.
# As computed below, the standard error of a coefficient
# * grows with the square root of 1/tolerance: lower tolerance implies higher error
# * shrinks as r2 grows
# == Reference:
# * Cohen et al. (2003). Applied Multiple Regression / Correlation Analysis for the Behavioral Sciences
#
def coeffs_se
  errors = {}
  #mse=sse.quo(df_e)
  coeffs.each do |name, _b|
    scale = @y_sd.quo(@x_sd[name])
    inflation = Math::sqrt(1.quo(tolerance(name)))
    residual = Math::sqrt((1 - r2).quo(df_e))
    errors[name] = scale * inflation * residual
  end
  errors
end

# t value for constant; nil when the standard error is unavailable
def constant_t
  se = constant_se
  return nil if se.nil?
  constant.to_f / se
end
# Standard error for constant.
# This method recreates the estimated variance-covariance matrix
# using means, standard deviation and covariance matrix.
# So, needs the covariance matrix.
def constant_se return nil if @no_covariance means=@x_mean #means[@y_var]=@y_mean means[:constant]=1 sd=@x_sd #sd[@y_var]=@y_sd sd[:constant]=0 fields=[:constant]+@matrix_cov.fields-[@y_var] # Recreate X'X using the variance-covariance matrix xt_x=Matrix.rows(fields.collect {|i| fields.collect {|j| if i==:constant or j==:constant cov=0 elsif i==j cov=sd[i]**2 else cov=@matrix_cov.submatrix(i..i,j..j)[0,0] end cov*(@cases-1)+@cases*means[i]*means[j] } }) matrix=xt_x.inverse * mse matrix.collect {|i| Math::sqrt(i) if i>0 }[0,0] end end end end end ================================================ FILE: lib/statsample/regression/multiple/rubyengine.rb ================================================ module Statsample module Regression module Multiple # Pure Ruby Class for Multiple Regression Analysis. # Slower than AlglibEngine, but is pure ruby and can use a pairwise aproach for missing values. # Coeffient calculation uses correlation matrix between the vectors # If you need listwise aproach for missing values, use AlglibEngine, because is faster. 
# # Example: # # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset # lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y') class RubyEngine < MatrixEngine def initialize(ds,y_var, opts=Hash.new) matrix=ds.correlation_matrix fields_indep=ds.fields-[y_var] default={ :y_mean=>ds[y_var].mean, :x_mean=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].mean; ac}, :y_sd=>ds[y_var].sd, :x_sd=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].sd; ac}, :cases=>Statsample::Bivariate.min_n_valid(ds) } opts=opts.merge(default) super(matrix, y_var, opts) @ds=ds @dy=ds[@y_var] @ds_valid=ds.dup_only_valid @total_cases=@ds.cases @valid_cases=@ds_valid.cases @ds_indep = ds.dup(ds.fields-[y_var]) set_dep_columns end def set_dep_columns @dep_columns=[] @ds_indep.each_vector{|k,v| @dep_columns.push(v.data_with_nils) } end def fix_with_mean i=0 @ds_indep.each do |row| empty=[] row.each do |k,v| empty.push(k) if v.nil? end if empty.size==1 @ds_indep[empty[0]][i]=@ds[empty[0]].mean end i+=1 end @ds_indep.update_valid_data set_dep_columns end def fix_with_regression i=0 @ds_indep.each{|row| empty=[] row.each{|k,v| empty.push(k) if v.nil? } if empty.size==1 field=empty[0] lr=MultipleRegression.new(@ds_indep,field) fields=[] @ds_indep.fields.each{|f| fields.push(row[f]) unless f==field } @ds_indep[field][i]=lr.process(fields) end i+=1 } @ds_indep.update_valid_data set_dep_columns end # Standard error for constant def constant_se estimated_variance_covariance_matrix[0,0] end end end end end ================================================ FILE: lib/statsample/regression/multiple.rb ================================================ require 'statsample/regression/multiple/baseengine' module Statsample module Regression # Module for OLS Multiple Regression Analysis. # # Use:. 
# # require 'statsample' # a=1000.times.collect {rand}.to_scale # b=1000.times.collect {rand}.to_scale # c=1000.times.collect {rand}.to_scale # ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset # ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()} # lr=Statsample::Regression.multiple(ds,'y') # puts lr.summary # Summary for regression of a,b,c over y # ************************************************************* # Engine: Statsample::Regression::Multiple::AlglibEngine # Cases(listwise)=1000(1000) # r=0.986 # r2=0.973 # Equation=0.504+5.011a + 2.995b + 1.988c # ---------------------------- # ANOVA TABLE # -------------------------------------------------------------- # | source | ss | df | ms | f | s | # -------------------------------------------------------------- # | Regression | 2979.321 | 3 | 993.107 | 12040.067 | 0.000 | # | Error | 82.154 | 996 | 0.082 | | | # | Total | 3061.475 | 999 | | | | # -------------------------------------------------------------- # Beta coefficientes # ----------------------------------------------- # | coeff | b | beta | se | t | # ----------------------------------------------- # | Constant | 0.504 | - | 0.030 | 16.968 | # | a | 5.011 | 0.832 | 0.031 | 159.486 | # | b | 2.995 | 0.492 | 0.032 | 94.367 | # | c | 1.988 | 0.323 | 0.032 | 62.132 | # ----------------------------------------------- # module Multiple # Obtain r2 for regressors def self.r2_from_matrices(rxx,rxy) matrix=(rxy.transpose*rxx.inverse*rxy) matrix[0,0] end class MultipleDependent def significance 0.0 end def initialize(matrix,y_var, opts=Hash.new) matrix.extend Statsample::CovariateMatrix @matrix=matrix @fields=matrix.fields-y_var @y_var=y_var @q=@y_var.size @matrix_cor=matrix.correlation @matrix_cor_xx = @matrix_cor.submatrix(@fields) @matrix_cor_yy = @matrix_cor.submatrix(y_var, y_var) @sxx = @matrix.submatrix(@fields) @syy = @matrix.submatrix(y_var, y_var) @sxy = @matrix.submatrix(@fields, y_var) @syx = @sxy.t end def r2yx 1- 
(@matrix_cor.determinant.quo(@matrix_cor_yy.determinant * @matrix_cor_xx.determinant)) end # Residual covariance of Y after accountin with lineal relation with x def syyx @syy-@syx*@sxx.inverse*@sxy end def r2yx_covariance 1-(syyx.determinant.quo(@syy.determinant)) end def vxy @q-(@syy.inverse*syyx).trace end def p2yx vxy.quo(@q) end end end end end ================================================ FILE: lib/statsample/regression/simple.rb ================================================ module Statsample module Regression # Class for calculation of linear regressions with form # y = a+bx # To create a Statsample::Regression::Simple object: # * Statsample::Regression::Simple.new_from_dataset(ds,x,y) # * Statsample::Regression::Simple.new_from_vectors(vx,vy) # * Statsample::Regression::Simple.new_from_gsl(gsl) # class Simple include Summarizable attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status attr_accessor :name attr_accessor :digits def initialize(init_method, *argv) self.send(init_method, *argv) end private_class_method :new # Obtain y value given x value # x=a+bx def y(val_x) @a+@b*val_x end # Obtain x value given y value # x=(y-a)/b def x(val_y) (val_y-@a) / @b.to_f end # Sum of square error def sse (0...@vx.size).inject(0) {|acum,i| acum+((@vy[i]-y(@vx[i]))**2) } end def standard_error Math::sqrt(sse / (@vx.size-2).to_f) end # Sum of square regression def ssr vy_mean=@vy.mean (0...@vx.size).inject(0) {|a,i| a+((y(@vx[i])-vy_mean)**2) } end # Sum of square total def sst @vy.sum_of_squared_deviation end # Value of r def r @b * (@vx.sds / @vy.sds) end # Value of r^2 def r2 r**2 end class << self # Create a regression object giving an array with following parameters: # a,b,cov00, cov01, covx1, chisq, status # Useful to obtain x and y values with a and b values. 
def new_from_gsl(ar) new(:init_gsl, *ar) end # Create a simple regression using two vectors def new_from_vectors(vx,vy, opts=Hash.new) new(:init_vectors,vx,vy, opts) end # Create a simple regression using a dataset and two vector names. def new_from_dataset(ds,x,y, opts=Hash.new) new(:init_vectors,ds[x],ds[y], opts) end end def init_vectors(vx,vy, opts=Hash.new) @vx,@vy=Statsample.only_valid_clone(vx,vy) x_m=@vx.mean y_m=@vy.mean num=den=0 (0...@vx.size).each {|i| num+=(@vx[i]-x_m)*(@vy[i]-y_m) den+=(@vx[i]-x_m)**2 } @b=num.to_f/den @a=y_m - @b*x_m opts_default={ :digits=>3, :name=>_("Regression of %s over %s") % [@vx.name, @vy.name] } @opts=opts_default.merge opts @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } end def init_gsl(a,b,cov00, cov01, covx1, chisq, status) @a=a @b=b @cov00=cov00 @cov01=cov01 @covx1=covx1 @chisq=chisq @status=status end def report_building(gen) f="%0.#{digits}f" gen.section(:name=>name) do |s| s.table(:header=>[_("Variable"), _("Value")]) do |t| t.row [_("r"), f % r] t.row [_("r^2"), f % r2] t.row [_("a"), f % a] t.row [_("b"), f % b] t.row [_("s.e"), f % standard_error] end end end private :init_vectors, :init_gsl end end end ================================================ FILE: lib/statsample/regression.rb ================================================ require 'statsample/regression/simple' require 'statsample/regression/multiple' require 'statsample/regression/multiple/matrixengine' require 'statsample/regression/multiple/rubyengine' require 'statsample/regression/multiple/gslengine' module Statsample # = Module for regression procedures. # Use the method on this class to generate # analysis. # If you need more control, you can # create and control directly the objects who computes # the regressions. 
# # * Simple Regression : Statsample::Regression::Simple # * Multiple Regression: Statsample::Regression::Multiple # * Logit Regression: Statsample::Regression::Binomial::Logit # * Probit Regression: Statsample::Regression::Binomial::Probit module Regression LinearDependency=Class.new(Exception) # Create a Statsample::Regression::Simple object, for simple regression # * x: independent Vector # * y: dependent Vector # Usage: # x=100.times.collect {|i| rand(100)}.to_scale # y=100.times.collect {|i| 2+x[i]*2+rand()}.to_scale # sr=Statsample::Regression.simple(x,y) # sr.a # => 2.51763295177808 # sr.b # => 1.99973746599856 # sr.r # => 0.999987881153254 def self.simple(x,y) Statsample::Regression::Simple.new_from_vectors(x,y) end # Creates one of the Statsample::Regression::Multiple object, # for OLS multiple regression. # Parameters: # * ds: Dataset. # * y: Name of dependent variable. # * opts: A hash with options # * missing_data: Could be # * :listwise: delete cases with one or more empty data (default). # * :pairwise: uses correlation matrix. Use with caution. # # Usage: # lr=Statsample::Regression::multiple(ds,'y') def self.multiple(ds,y_var, opts=Hash.new) missing_data= (opts[:missing_data].nil? ) ? :listwise : opts.delete(:missing_data) if missing_data==:pairwise Statsample::Regression::Multiple::RubyEngine.new(ds,y_var, opts) else if Statsample.has_gsl? 
and false Statsample::Regression::Multiple::GslEngine.new(ds, y_var, opts) else ds2=ds.dup_only_valid Statsample::Regression::Multiple::RubyEngine.new(ds2,y_var, opts) end end end end end ================================================ FILE: lib/statsample/reliability/icc.rb ================================================ module Statsample module Reliability # = Intra-class correlation # According to Shrout & Fleiss (1979, p.422): "ICC is the correlation # between one measurement (either a single rating or a mean of # several ratings) on a target and another measurement obtained on that target" # == Usage # require 'statsample' # size=1000 # a = size.times.map {rand(10)}.to_scale # b = a.recode{|i|i+rand(4)-2} # c =a.recode{|i|i+rand(4)-2} # d = a.recode{|i|i+rand(4)-2} # ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset # # Use :type attribute to set type to summarize # icc=Statsample::Reliability::ICC.new(ds, :type=>:icc_1_k) # puts icc.summary # # == Reference # * Shrout,P. & Fleiss, J. (1979). Intraclass Correlation: Uses in assessing rater reliability. Psychological Bulletin, 86(2), 420-428 # * McGraw, K. & Wong, S.P. (1996). Forming Inferences About Some Intraclass Correlation Coefficients. Psychological methods, 1(1), 30-46. class ICC include Summarizable # Create a ICC analysis for a given dataset # Each vector is a different measurement. Only uses complete data # (listwise deletion). 
#
attr_reader :df_bt
attr_reader :df_wt
attr_reader :df_bj
attr_reader :df_residual

attr_reader :ms_bt
attr_reader :ms_wt
attr_reader :ms_bj
attr_reader :ms_residual

alias :bms :ms_bt
alias :wms :ms_wt
alias :jms :ms_bj
alias :ems :ms_residual

alias :msr :ms_bt
alias :msw :ms_wt
alias :msc :ms_bj
alias :mse :ms_residual

# :section: Shrout and Fleiss ICC denominations
attr_reader :icc_1_1
attr_reader :icc_2_1
attr_reader :icc_3_1
attr_reader :icc_1_k
attr_reader :icc_2_k
attr_reader :icc_3_k

# :section: McGraw and Wong ICC denominations

attr_reader :icc_1
attr_reader :icc_c_1
attr_reader :icc_a_1
attr_reader :icc_k
attr_reader :icc_c_k
attr_reader :icc_a_k

attr_reader :n, :k
attr_reader :total_mean
# Type of analysis, for easy summarization
# By default, set to :icc_1
# * Shrout & Fleiss(1979) denominations
#   * :icc_1_1
#   * :icc_2_1
#   * :icc_3_1
#   * :icc_1_k
#   * :icc_2_k
#   * :icc_3_k
# * McGraw & Wong (1996) denominations
#   * :icc_1
#   * :icc_k
#   * :icc_c_1
#   * :icc_c_k
#   * :icc_a_1
#   * :icc_a_k

attr_reader :type
# ICC value, set with :type
attr_reader :r
attr_reader :f
attr_reader :lbound
attr_reader :ubound

attr_accessor :g_rho
attr_accessor :alpha
attr_accessor :name

# * ds: dataset with one vector per measurement; only complete cases are used
# * opts: :name, :type, :alpha and :g_rho
#
# :type is always applied after the other options, because selecting the
# type computes the F test and the confidence interval from @g_rho and
# @alpha. Previously :type was applied first, so :alpha and :g_rho given
# in +opts+ had no effect on the reported values.
def initialize(ds, opts = Hash.new)
  @ds = ds.dup_only_valid
  @vectors = @ds.vectors.values
  @n = @ds.cases
  @k = @ds.fields.size
  compute
  @g_rho = 0
  @alpha = 0.05
  @icc_name = nil
  opts_default = {:name => "Intra-class correlation", :type => :icc_1}
  @opts = opts_default.merge(opts)
  selected_type = nil
  @opts.each do |key, value|
    if key.to_s == 'type'
      # Deferred; the last one seen wins, as before
      selected_type = value
    elsif respond_to? key
      send("#{key}=", value)
    end
  end
  self.type = selected_type
end

# Selects the coefficient to report and computes its name, value, F test
# and confidence interval with the current @g_rho and @alpha.
# Raises RuntimeError for an unknown type.
def type=(v)
  case v
  when :icc_1_1
    @icc_name = _("Shrout & Fleiss ICC(1,1)")
    @r = @icc_1_1
    @f = icc_1_f
    @lbound, @ubound = icc_1_1_ci(@alpha)
  when :icc_2_1
    @icc_name = _("Shrout & Fleiss ICC(2,1)")
    @r = @icc_2_1
    @f = icc_2_f
    @lbound, @ubound = icc_2_1_ci(@alpha)
  when :icc_3_1
    @icc_name = _("Shrout & Fleiss ICC(3,1)")
    @r = @icc_3_1
    @f = icc_3_f
    @lbound, @ubound = icc_3_1_ci(@alpha)
  when :icc_1_k
    @icc_name = _("Shrout & Fleiss ICC(1,k)")
    @r = @icc_1_k
    @f = icc_1_k_f
    @lbound, @ubound = icc_1_k_ci(@alpha)
  when :icc_2_k
    @icc_name = _("Shrout & Fleiss ICC(2,k)")
    @r = @icc_2_k
    @f = icc_2_k_f
    @lbound, @ubound = icc_2_k_ci(@alpha)
  when :icc_3_k
    @icc_name = _("Shrout & Fleiss ICC(3,k)")
    @r = @icc_3_k
    @f = icc_3_k_f
    @lbound, @ubound = icc_3_k_ci(@alpha)
  when :icc_1
    @icc_name = _("McGraw & Wong ICC(1)")
    @r = @icc_1_1
    @f = icc_1_f(@g_rho)
    @lbound, @ubound = icc_1_1_ci(@alpha)
  when :icc_k
    @icc_name = _("McGraw & Wong ICC(K)")
    @r = @icc_1_k
    @f = icc_1_k_f(@g_rho)
    @lbound, @ubound = icc_1_k_ci(@alpha)
  when :icc_c_1
    @icc_name = _("McGraw & Wong ICC(C,1)")
    @r = @icc_3_1
    @f = icc_c_1_f(@g_rho)
    @lbound, @ubound = icc_3_1_ci(@alpha)
  when :icc_c_k
    @icc_name = _("McGraw & Wong ICC(C,K)")
    @r = @icc_3_k
    @f = icc_c_k_f(@g_rho)
    @lbound, @ubound = icc_c_k_ci(@alpha)
  when :icc_a_1
    @icc_name = _("McGraw & Wong ICC(A,1)")
    @r = @icc_2_1
    @f = icc_a_1_f(@g_rho)
    @lbound, @ubound = icc_2_1_ci(@alpha)
  when :icc_a_k
    @icc_name = _("McGraw & Wong ICC(A,K)")
    @r = @icc_2_k
    @f = icc_a_k_f(@g_rho)
    @lbound, @ubound = icc_2_k_ci(@alpha)
  else
    raise "Type #{v} doesn't exists"
  end
  # Remember the selection so the +type+ reader reports it
  @type = v
end

# Computes degrees of freedom, mean squares and every ICC coefficient.
def compute
  @df_bt = n - 1
  @df_wt = n * (k - 1)
  @df_bj = k - 1
  @df_residual = (n - 1) * (k - 1)
  @total_mean = @vectors.inject(0) { |ac, v| ac + v.sum }.quo(n * k)
  vm = @ds.vector_mean

  @ss_bt = k * vm.ss(@total_mean)
  @ms_bt = @ss_bt.quo(@df_bt)

  @ss_bj = n * @vectors.inject(0) { |ac, v| ac + (v.mean - @total_mean).square }
  @ms_bj = @ss_bj.quo(@df_bj)

  @ss_wt = @vectors.inject(0) { |ac, v| ac + (v - vm).ss(0) }
  @ms_wt = @ss_wt.quo(@df_wt)

  @ss_residual = @ss_wt - @ss_bj
  @ms_residual = @ss_residual.quo(@df_residual)
  ###
  # Shrout and Fleiss denomination
  ###
  # ICC(1,1) / ICC(1)
  @icc_1_1 = (bms - wms).quo(bms + (k - 1) * wms)
  # ICC(2,1) / ICC(A,1)
  @icc_2_1 = (bms - ems).quo(bms + (k - 1) * ems + k * (jms - ems).quo(n))
  # ICC(3,1) / ICC(C,1)
  @icc_3_1 = (bms - ems).quo(bms + (k - 1) * ems)

  # ICC(1,K) / ICC(K)
  @icc_1_k = (bms - wms).quo(bms)
  # ICC(2,K) / ICC(A,k)
  @icc_2_k = (bms - ems).quo(bms + (jms - ems).quo(n))
  # ICC(3,K) / ICC(C,k) = Cronbach's alpha
  @icc_3_k = (bms - ems).quo(bms)

  ###
  # McGraw and Wong
  ###
  # Same coefficients under the McGraw & Wong names (pairs as listed above),
  # so that the matching readers no longer return nil.
  @icc_1 = @icc_1_1
  @icc_a_1 = @icc_2_1
  @icc_c_1 = @icc_3_1
  @icc_k = @icc_1_k
  @icc_a_k = @icc_2_k
  @icc_c_k = @icc_3_k
end

# One way random F, single measure, tested against +rho+
def icc_1_f(rho = 0.0)
  num = msr * (1 - rho)
  den = msw * (1 + (k - 1) * rho)
  Statsample::Test::F.new(num, den, @df_bt, @df_wt)
end

# One way random F, type k
def icc_1_k_f(rho = 0)
  num = msr * (1 - rho)
  den = msw
  Statsample::Test::F.new(num, den, @df_bt, @df_wt)
end

def icc_c_1_f(rho = 0)
  num = msr * (1 - rho)
  den = mse * (1 + (k - 1) * rho)
  Statsample::Test::F.new(num, den, @df_bt, @df_residual)
end

def icc_c_k_f(rho = 0)
  num = (1 - rho)
  den = 1 - @icc_3_k
  Statsample::Test::F.new(num, den, @df_bt, @df_residual)
end

# (a*MSC + b*MSE)^2 / ( (a*MSC)^2/(k-1) + (b*MSE)^2/((n-1)(k-1)) )
#
# The squares are parenthesized on purpose: "x**2.quo(y)" is parsed as
# "x**(2.quo(y))" because a method call binds tighter than **.
def v(a, b)
  msc_term = ((a * msc)**2).quo(k - 1)
  mse_term = ((b * mse)**2).quo((n - 1) * (k - 1))
  ((a * msc + b * mse)**2).quo(msc_term + mse_term)
end

def a(rho)
  (k * rho).quo(n * (1 - rho))
end

def b(rho)
  1 + ((k * rho * (n - 1)).quo(n * (1 - rho)))
end

def c(rho)
  rho.quo(n * (1 - rho))
end

def d(rho)
  1 + ((rho * (n - 1)).quo(n * (1 - rho)))
end
private :v, :a, :b, :c, :d

# F test for ICC(2,k). type= calls this method for :icc_2_k but it was
# never defined; it returns the same BMS/EMS test as icc_2_f.
def icc_2_k_f
  icc_2_f
end

# F test for ICC(3,k). type= calls this method for :icc_3_k but it was
# never defined; it returns the same BMS/EMS test as icc_3_f.
def icc_3_k_f
  icc_3_f
end

def icc_a_1_f(rho = 0)
  fj = jms.quo(ems)
  num = msr
  den = a(rho) * msc + b(rho) * mse
  pp = @icc_2_1
  vn = (k - 1) * (n - 1) * ((k * pp * fj + n * (1 + (k - 1) * pp) - k * pp)**2)
  vd = (n - 1) * (k**2) * (pp**2) * (fj**2) + ((n * (1 + (k - 1) * pp) - k * pp)**2)
  v = vn.quo(vd)
  Statsample::Test::F.new(num, den, @df_bt, v)
end

def icc_a_k_f(rho = 0)
  num = msr
  den = c(rho) * msc + d(rho) * mse
  fj = jms.quo(ems)
  pp = @icc_2_k
  vn = (k - 1) * (n - 1) * ((k * pp * fj + n * (1 + (k - 1) * pp) - k * pp)**2)
  vd = (n - 1) * (k**2) * (pp**2) * (fj**2) + ((n * (1 + (k - 1) * pp) - k * pp)**2)
  v = vn.quo(vd)
  Statsample::Test::F.new(num, den, @df_bt, v)
end
# F test for ICC Case 1.
Shrout and Fleiss def icc_1_f_shrout Statsample::Test::F.new(bms, wms, @df_bt, @df_wt) end # Intervale of confidence for ICC (1,1) def icc_1_1_ci(alpha=0.05) per=1-(0.5*alpha) fu=icc_1_f.f*Distribution::F.p_value(per, @df_wt, @df_bt) fl=icc_1_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_wt)) [(fl-1).quo(fl+k-1), (fu-1).quo(fu+k-1)] end # Intervale of confidence for ICC (1,k) def icc_1_k_ci(alpha=0.05) per=1-(0.5*alpha) fu=icc_1_f.f*Distribution::F.p_value(per, @df_wt, @df_bt) fl=icc_1_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_wt)) [1-1.quo(fl), 1-1.quo(fu)] end # F test for ICC Case 2 def icc_2_f Statsample::Test::F.new(bms, ems, @df_bt, @df_residual) end # # F* for ICC(2,1) and ICC(2,k) # def icc_2_1_fs(pp,alpha=0.05) fj=jms.quo(ems) per=1-(0.5*alpha) vn=(k-1)*(n-1)*((k*pp*fj+n*(1+(k-1)*pp)-k*pp)**2) vd=(n-1)*(k**2)*(pp**2)*(fj**2)+((n*(1+(k-1)*pp)-k*pp)**2) v=vn.quo(vd) f1=Distribution::F.p_value(per, n-1,v) f2=Distribution::F.p_value(per, v, n-1) [f1,f2] end def icc_2_1_ci(alpha=0.05) icc_2_1_ci_mcgraw end # Confidence interval ICC(A,1), McGawn def icc_2_1_ci_mcgraw(alpha=0.05) fd,fu=icc_2_1_fs(icc_2_1,alpha) cl=(n*(msr-fd*mse)).quo(fd*(k*msc+(k*n-k-n)*mse)+n*msr) cu=(n*(fu*msr-mse)).quo(k*msc+(k*n-k-n)*mse+n*fu*msr) [cl,cu] end def icc_2_k_ci(alpha=0.05) icc_2_k_ci_mcgraw(alpha) end def icc_2_k_ci_mcgraw(alpha=0.05) f1,f2=icc_2_1_fs(icc_2_k,alpha) [ (n*(msr-f1*mse)).quo(f1*(msc-mse)+n*msr), (n*(f2*msr-mse)).quo(msc-mse+n*f2*msr) ] end def icc_2_k_ci_shrout(alpha=0.05) ci=icc_2_1_ci(alpha) [(ci[0]*k).quo(1+(k-1)*ci[0]), (ci[1]*k).quo(1+(k-1)*ci[1])] end def icc_3_f Statsample::Test::F.new(bms, ems, @df_bt, @df_residual) end def icc_3_1_ci(alpha=0.05) per=1-(0.5*alpha) fl=icc_3_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_residual)) fu=icc_3_f.f*Distribution::F.p_value(per, @df_residual, @df_bt) [(fl-1).quo(fl+k-1), (fu-1).quo(fu+k-1)] end def icc_3_k_ci(alpha=0.05) per=1-(0.5*alpha) fl=icc_3_f.f.quo(Distribution::F.p_value(per, @df_bt, 
@df_residual)) fu=icc_3_f.f*Distribution::F.p_value(per, @df_residual, @df_bt) [1-1.quo(fl),1-1.quo(fu)] end def icc_c_k_ci(alpha=0.05) per=1-(0.5*alpha) fl=icc_c_k_f.f.quo(Distribution::F.p_value(per, @df_bt, @df_residual)) fu=icc_c_k_f.f*Distribution::F.p_value(per, @df_residual, @df_bt) [1-1.quo(fl),1-1.quo(fu)] end def report_building(b) b.section(:name=>name) do |s| s.text @icc_name s.text _("ICC: %0.4f") % @r s.parse_element(@f) s.text _("CI (%0.2f): [%0.4f - %0.4f]") % [(1-@alpha)*100, @lbound, @ubound] end end end end end ================================================ FILE: lib/statsample/reliability/multiscaleanalysis.rb ================================================ module Statsample module Reliability # DSL for analysis of multiple scales analysis. # Retrieves reliability analysis for each scale and # provides fast accessors to correlations matrix, # PCA and Factor Analysis. # # == Usage # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale) # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale) # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale) # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale) # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset # opts={:name=>"Scales", # Name of analysis # :summary_correlation_matrix=>true, # Add correlation matrix # :summary_pca } # Add PCA between scales # msa=Statsample::Reliability::MultiScaleAnalysis.new(opts) do |m| # m.scale :s1, ds.clone(%w{x1 x2}) # m.scale :s2, ds.clone(%w{x3 x4}), {:name=>"Scale 2"} # end # # Retrieve summary # puts msa.summary class MultiScaleAnalysis include Statsample::Summarizable # Hash with scales attr_reader :scales # Name of analysis attr_accessor :name # Add a correlation matrix on summary attr_accessor :summary_correlation_matrix # Add PCA to summary attr_accessor :summary_pca # Add Principal Axis to summary attr_accessor :summary_principal_axis # Options for Factor::PCA object attr_accessor :pca_options # Options for Factor::PrincipalAxis attr_accessor :principal_axis_options # 
# Add Parallel Analysis to summary
attr_accessor :summary_parallel_analysis
# Options for Parallel Analysis
attr_accessor :parallel_analysis_options
# Add MAP to summary
attr_accessor :summary_map
# Options for MAP
attr_accessor :map_options

# Generates a new MultiScaleAnalysis
# Opts could be any accessor of the class
# * :name,
# * :summary_correlation_matrix
# * :summary_pca
# * :summary_principal_axis
# * :summary_parallel_analysis
# * :summary_map
# * :pca_options
# * :principal_axis_options
# * :parallel_analysis_options
# * :map_options
# If a block without parameters is given, it is evaluated inside the
# object environment; a block with a parameter receives the object.
#
def initialize(opts = Hash.new, &block)
  @scales = Hash.new
  @scales_keys = Array.new
  opts_default = {
    :name => _("Multiple Scale analysis"),
    :summary_correlation_matrix => false,
    :summary_pca => false,
    :summary_principal_axis => false,
    :summary_parallel_analysis => false,
    :summary_map => false,
    :pca_options => Hash.new,
    :principal_axis_options => Hash.new,
    :parallel_analysis_options => Hash.new,
    :map_options => Hash.new
  }
  @opts = opts_default.merge(opts)
  # Check for the writer itself, so that a read-only key such as :scales
  # is skipped instead of raising NoMethodError on "scales=".
  @opts.each do |key, value|
    writer = "#{key}="
    send(writer, value) if respond_to?(writer)
  end
  if block
    block.arity < 1 ? instance_eval(&block) : block.call(self)
  end
end

# Add or retrieve a scale to analysis.
# If second parameters is a dataset, generates a ScaleAnalysis
# for <tt>ds</tt>, named <tt>code</tt> with options <tt>opts</tt>.
#
# If second parameters is empty, returns the ScaleAnalysis
# <tt>code</tt>.
def scale(code, ds = nil, opts = nil)
  return @scales[code] if ds.nil?

  opts = {:name => _("Scale %s") % code} if opts.nil?
  # Build first and register afterwards: a failing ScaleAnalysis.new no
  # longer leaves an orphan key, and replacing an existing scale no longer
  # stores its code twice in @scales_keys.
  analysis = ScaleAnalysis.new(ds, opts)
  @scales_keys.push(code) unless @scales_keys.include?(code)
  @scales[code] = analysis
end

# Delete ScaleAnalysis named <tt>code</tt>
def delete_scale(code)
  @scales_keys.delete code
  @scales.delete code
end

# Retrieves a Principal Component Analysis (Factor::PCA)
# using all scales, using <tt>opts</tt> as options.
def pca(opts = nil)
  opts ||= pca_options
  Statsample::Factor::PCA.new(correlation_matrix, opts)
end
# Retrieve Velicer's MAP
# using all scales.
def map(opts=nil) opts||=map_options Statsample::Factor::MAP.new(correlation_matrix, opts) end # Retrieves a PrincipalAxis Analysis (Factor::PrincipalAxis) # using all scales, using opts a options. def principal_axis_analysis(opts=nil) opts||=principal_axis_options Statsample::Factor::PrincipalAxis.new(correlation_matrix, opts) end def dataset_from_scales ds=Dataset.new(@scales_keys) @scales.each_pair do |code,scale| ds[code.to_s]=scale.ds.vector_sum ds[code.to_s].name=scale.name end ds.update_valid_data ds end def parallel_analysis(opts=nil) opts||=parallel_analysis_options Statsample::Factor::ParallelAnalysis.new(dataset_from_scales, opts) end # Retrieves a Correlation Matrix between scales. # def correlation_matrix Statsample::Bivariate.correlation_matrix(dataset_from_scales) end def report_building(b) # :nodoc: b.section(:name=>name) do |s| s.section(:name=>_("Reliability analysis of scales")) do |s2| @scales.each_pair do |k, scale| s2.parse_element(scale) end end if summary_correlation_matrix s.section(:name=>_("Correlation matrix for %s") % name) do |s2| s2.parse_element(correlation_matrix) end end if summary_pca s.section(:name=>_("PCA for %s") % name) do |s2| s2.parse_element(pca) end end if summary_principal_axis s.section(:name=>_("Principal Axis for %s") % name) do |s2| s2.parse_element(principal_axis_analysis) end end if summary_parallel_analysis s.section(:name=>_("Parallel Analysis for %s") % name) do |s2| s2.parse_element(parallel_analysis) end end if summary_map s.section(:name=>_("MAP for %s") % name) do |s2| s2.parse_element(map) end end end end end end end ================================================ FILE: lib/statsample/reliability/scaleanalysis.rb ================================================ module Statsample module Reliability # Analysis of a Scale. Analoge of Scale Reliability analysis on SPSS. 
# Returns several statistics for complete scale and each item # == Usage # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale) # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale) # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale) # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale) # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset # ia=Statsample::Reliability::ScaleAnalysis.new(ds) # puts ia.summary class ScaleAnalysis include Summarizable attr_reader :ds,:mean, :sd,:valid_n, :alpha , :alpha_standarized, :variances_mean, :covariances_mean, :cov_m attr_accessor :name attr_accessor :summary_histogram def initialize(ds, opts=Hash.new) @dumped=ds.fields.find_all {|f| ds[f].variance==0 } @ods=ds @ds=ds.dup_only_valid(ds.fields - @dumped) @ds.name=ds.name @k=@ds.fields.size @total=@ds.vector_sum @o_total=@dumped.size > 0 ? @ods.vector_sum : nil @vector_mean=@ds.vector_mean @item_mean=@vector_mean.mean @item_sd=@vector_mean.sd @mean=@total.mean @median=@total.median @skew=@total.skew @kurtosis=@total.kurtosis @sd = @total.sd @variance=@total.variance @valid_n = @total.size opts_default={ :name=>_("Reliability Analysis"), :summary_histogram=>true } @opts=opts_default.merge(opts) @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? 
k } @cov_m=Statsample::Bivariate.covariance_matrix(@ds) # Mean for covariances and variances @variances=@k.times.map {|i| @cov_m[i,i]}.to_scale @variances_mean=@variances.mean @covariances_mean=(@variance-@variances.sum).quo(@k**2-@k) #begin @alpha = Statsample::Reliability.cronbach_alpha(@ds) @alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(@ds) #rescue => e # raise DatasetException.new(@ds,e), "Error calculating alpha" #end end # Returns a hash with structure def item_characteristic_curve i=0 out={} total={} @ds.each do |row| tot=@total[i] @ds.fields.each do |f| out[f]||= {} total[f]||={} out[f][tot]||= 0 total[f][tot]||=0 out[f][tot]+= row[f] total[f][tot]+=1 end i+=1 end total.each do |f,var| var.each do |tot,v| out[f][tot]=out[f][tot].quo(total[f][tot]) end end out end # =Adjusted R.P.B. for each item # Adjusted RPB(Point biserial-correlation) for each item # def item_total_correlation @itc||=@ds.fields.inject({}) do |a,v| vector=@ds[v].clone ds2=@ds.clone ds2.delete_vector(v) total=ds2.vector_sum a[v]=Statsample::Bivariate.pearson(vector,total) a end end def mean_rpb item_total_correlation.values.to_scale.mean end def item_statistics @is||=@ds.fields.inject({}) do |a,v| a[v]={:mean=>@ds[v].mean, :sds=>Math::sqrt(@cov_m.variance(v))} a end end # Returns a dataset with cases ordered by score # and variables ordered by difficulty def item_difficulty_analysis dif={} @ds.fields.each{|f| dif[f]=@ds[f].mean } dif_sort=dif.sort{|a,b| -(a[1]<=>b[1])} scores_sort={} scores=@ds.vector_mean scores.each_index{|i| scores_sort[i]=scores[i] } scores_sort=scores_sort.sort{|a,b| a[1]<=>b[1]} ds_new=Statsample::Dataset.new(['case','score'] + dif_sort.collect{|a,b| a}) scores_sort.each do |i,score| row=[i, score] case_row=@ds.case_as_hash(i) dif_sort.each{|variable,dif_value| row.push(case_row[variable]) } ds_new.add_case_array(row) end ds_new.update_valid_data ds_new end def stats_if_deleted @sif||=stats_if_deleted_intern end def stats_if_deleted_intern # 
:nodoc: return Hash.new if @ds.fields.size==1 @ds.fields.inject({}) do |a,v| cov_2=@cov_m.submatrix(@ds.fields-[v]) #ds2=@ds.clone #ds2.delete_vector(v) #total=ds2.vector_sum a[v]={} #a[v][:mean]=total.mean a[v][:mean]=@mean-item_statistics[v][:mean] a[v][:variance_sample]=cov_2.total_sum a[v][:sds]=Math::sqrt(a[v][:variance_sample]) n=cov_2.row_size a[v][:alpha] = (n>=2) ? Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov_2) : nil a end end def report_building(builder) #:nodoc: builder.section(:name=>@name) do |s| if @dumped.size>0 s.section(:name=>"Items with variance=0") do |s1| s.table(:name=>_("Summary for %s with all items") % @name) do |t| t.row [_("Items"), @ods.fields.size] t.row [_("Sum mean"), "%0.4f" % @o_total.mean] t.row [_("S.d. mean"), "%0.4f" % @o_total.sd] end s.table(:name=>_("Deleted items"), :header=>['item','mean']) do |t| @dumped.each do |f| t.row(["#{@ods[f].name}(#{f})", "%0.5f" % @ods[f].mean]) end end s.parse_element(Statsample::Graph::Histogram.new(@o_total, :name=>"Histogram (complete data) for %s" % @name)) if @summary_histogram end end s.table(:name=>_("Summary for %s") % @name) do |t| t.row [_("Valid Items"), @ds.fields.size] t.row [_("Valid cases"), @valid_n] t.row [_("Sum mean"), "%0.4f" % @mean] t.row [_("Sum sd"), "%0.4f" % @sd ] # t.row [_("Sum variance"), "%0.4f" % @variance] t.row [_("Sum median"), @median] t.hr t.row [_("Item mean"), "%0.4f" % @item_mean] t.row [_("Item sd"), "%0.4f" % @item_sd] t.hr t.row [_("Skewness"), "%0.4f" % @skew] t.row [_("Kurtosis"), "%0.4f" % @kurtosis] t.hr t.row [_("Cronbach's alpha"), @alpha ? ("%0.4f" % @alpha) : "--"] t.row [_("Standarized Cronbach's alpha"), @alpha_standarized ? 
("%0.4f" % @alpha_standarized) : "--" ] t.row [_("Mean rpb"), "%0.4f" % mean_rpb] t.row [_("Variances mean"), "%g" % @variances_mean] t.row [_("Covariances mean") , "%g" % @covariances_mean] end if (@alpha) s.text _("Items for obtain alpha(0.8) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.8, @ds.fields.size)) s.text _("Items for obtain alpha(0.9) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.9, @ds.fields.size)) end sid=stats_if_deleted is=item_statistics itc=item_total_correlation s.table(:name=>_("Items report for %s") % @name, :header=>["item","mean","sd", "mean if deleted", "var if deleted", "sd if deleted"," item-total correl.", "alpha if deleted"]) do |t| @ds.fields.each do |f| row=["#{@ds[f].name}(#{f})"] if is[f] row+=[sprintf("%0.5f",is[f][:mean]), sprintf("%0.5f", is[f][:sds])] else row+=["-","-"] end if sid[f] row+= [sprintf("%0.5f",sid[f][:mean]), sprintf("%0.5f",sid[f][:variance_sample]), sprintf("%0.5f",sid[f][:sds])] else row+=%w{- - -} end if itc[f] row+= [sprintf("%0.5f",itc[f])] else row+=['-'] end if sid[f] and !sid[f][:alpha].nil? 
row+=[sprintf("%0.5f",sid[f][:alpha])] else row+=["-"] end t.row row end # end each end # table s.parse_element(Statsample::Graph::Histogram.new(@total, :name=>"Histogram (valid data) for %s" % @name)) if @summary_histogram end # section end # def end # class end # module end # module ================================================ FILE: lib/statsample/reliability/skillscaleanalysis.rb ================================================ module Statsample module Reliability # Analysis of a Skill Scale # Given a dataset with results and a correct answers hash, # generates a ScaleAnalysis # == Usage # x1=%{a b b c}.to_vector # x2=%{b a b c}.to_vector # x3=%{a c b a}.to_vector # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3}.to_dataset # key={'x1'=>'a','x2'=>'b','x3'=>'a'} # ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds,key) # puts ssa.summary class SkillScaleAnalysis include Summarizable attr_accessor :name attr_accessor :summary_minimal_item_correlation attr_accessor :summary_show_problematic_items def initialize(ds,key,opts=Hash.new) opts_default={ :name=>_("Skill Scale Reliability Analysis (%s)") % ds.name, :summary_minimal_item_correlation=>0.10, :summary_show_problematic_items=>true } @ds=ds @key=key @opts=opts_default.merge(opts) @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } @cds=nil end # Dataset only corrected vectors def corrected_dataset_minimal cds=corrected_dataset dsm=@key.keys.inject({}) {|ac,v| ac[v]=cds[v];ac}.to_dataset @key.keys.each do |k| dsm[k].name=_("%s(corrected)") % @ds[k].name dsm[k].labels=@ds[k].labels end dsm.name=_("Corrected dataset from %s") % @ds.name dsm end def vector_sum corrected_dataset_minimal.vector_sum end def vector_mean corrected_dataset_minimal.vector_mean end def scale_analysis sa=ScaleAnalysis.new(corrected_dataset_minimal) sa.name=_("%s (Scale Analysis)") % @name sa end def corrected_dataset if @cds.nil? 
@cds=@ds.dup_empty @key.keys.each {|k| @cds[k].type=:scale; @cds[k].name=@ds[k].name} @ds.each do |row| out={} row.each do |k,v| if @key.keys.include? k if @ds[k].is_valid? v out[k]= @key[k]==v ? 1 : 0 else out[k]=nil end else out[k]=v end end @cds.add_case(out,false) end @cds.update_valid_data end @cds end def report_building(builder) builder.section(:name=>@name) do |s| sa=scale_analysis s.parse_element(sa) if summary_show_problematic_items s.section(:name=>_("Problematic Items")) do |spi| count=0 sa.item_total_correlation.each do |k,v| if v < summary_minimal_item_correlation count+=1 spi.section(:name=>_("Item: %s") % @ds[k].name) do |spii| spii.text _("Correct answer: %s") % @key[k] spii.text _("p: %0.3f") % corrected_dataset[k].mean props=@ds[k].proportions.inject({}) {|ac,v| ac[v[0]] = v[1].to_f;ac} spi.table(:name=>"Proportions",:header=>[_("Value"), _("%")]) do |table| props.each do |k1,v| table.row [ @ds[k].labeling(k1), "%0.3f" % v] end end end end end spi.text _("No problematic items") if count==0 end end end end end end end ================================================ FILE: lib/statsample/reliability.rb ================================================ module Statsample module Reliability class << self # Calculate Chonbach's alpha for a given dataset. # only uses tuples without missing data def cronbach_alpha(ods) ds=ods.dup_only_valid return nil if ds.vectors.any? {|k,v| v.variance==0} n_items=ds.fields.size return nil if n_items<=1 s2_items=ds.vectors.inject(0) {|ac,v| ac+v[1].variance } total=ds.vector_sum (n_items.quo(n_items-1)) * (1-(s2_items.quo(total.variance))) end # Calculate Chonbach's alpha for a given dataset # using standarized values for every vector. # Only uses tuples without missing data # Return nil if one or more vectors has 0 variance def cronbach_alpha_standarized(ods) ds=ods.dup_only_valid return nil if ds.vectors.any? 
{|k,v| v.variance==0} ds=ds.fields.inject({}){|a,f| a[f]=ods[f].standarized; a }.to_dataset cronbach_alpha(ds) end # Predicted reliability of a test by replicating # +n+ times the number of items def spearman_brown_prophecy(r,n) (n*r).quo(1+(n-1)*r) end alias :sbp :spearman_brown_prophecy # Returns the number of items # to obtain +r_d+ desired reliability # from +r+ current reliability, achieved with # +n+ items def n_for_desired_reliability(r,r_d,n=1) return nil if r.nil? (r_d*(1-r)).quo(r*(1-r_d))*n end # Get Cronbach alpha from n cases, # s2 mean variance and cov # mean covariance def cronbach_alpha_from_n_s2_cov(n,s2,cov) (n.quo(n-1)) * (1-(s2.quo(s2+(n-1)*cov))) end # Get Cronbach's alpha from a covariance matrix def cronbach_alpha_from_covariance_matrix(cov) n=cov.row_size raise "covariance matrix should have at least 2 variables" if n < 2 s2=n.times.inject(0) {|ac,i| ac+cov[i,i]} (n.quo(n-1))*(1-(s2.quo(cov.total_sum))) end # Returns n necessary to obtain specific alpha # given variance and covariance mean of items def n_for_desired_alpha(alpha,s2,cov) # Start with a regular test : 50 items min=2 max=1000 n=50 prev_n=0 epsilon=0.0001 dif=1000 c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov) dif=c_a - alpha while(dif.abs>epsilon and n!=prev_n) prev_n=n if dif<0 min=n n=(n+(max-min).quo(2)).to_i else max=n n=(n-(max-min).quo(2)).to_i end c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov) dif=c_a - alpha #puts "#{n} , #{c_a}" end n end # First derivative for alfa # Parameters # n: Number of items # sx: mean of variances # sxy: mean of covariances def alpha_first_derivative(n,sx,sxy) (sxy*(sx-sxy)).quo(((sxy*(n-1))+sx)**2) end # Second derivative for alfa # Parameters # n: Number of items # sx: mean of variances # sxy: mean of covariances def alfa_second_derivative(n,sx,sxy) (2*(sxy**2)*(sxy-sx)).quo(((sxy*(n-1))+sx)**3) end end class ItemCharacteristicCurve attr_reader :totals, :counts, :vector_total def initialize (ds, vector_total=nil) vector_total||=ds.vector_sum raise 
ArgumentError, "Total size != Dataset size" if vector_total.size!=ds.cases @vector_total=vector_total @ds=ds @totals={} @counts=@ds.fields.inject({}) {|a,v| a[v]={};a} process end def process i=0 @ds.each do |row| tot=@vector_total[i] @totals[tot]||=0 @totals[tot]+=1 @ds.fields.each do |f| item=row[f].to_s @counts[f][tot]||={} @counts[f][tot][item]||=0 @counts[f][tot][item] += 1 end i+=1 end end # Return a hash with p for each different value on a vector def curve_field(field, item) out={} item=item.to_s @totals.each do |value,n| count_value= @counts[field][value][item].nil? ? 0 : @counts[field][value][item] out[value]=count_value.quo(n) end out end # def end # self end # Reliability end # Statsample require 'statsample/reliability/icc.rb' require 'statsample/reliability/scaleanalysis.rb' require 'statsample/reliability/skillscaleanalysis.rb' require 'statsample/reliability/multiscaleanalysis.rb' ================================================ FILE: lib/statsample/resample.rb ================================================ module Statsample module Resample class << self def repeat_and_save(times,&action) (1..times).inject([]) {|a,x| a.push(action.call); a} end def generate (size,low,upper) range=upper-low+1 Vector.new((0...size).collect {|x| rand(range)+low },:scale) end end end end ================================================ FILE: lib/statsample/rserve_extension.rb ================================================ # Several additions to Statsample objects, to support # rserve-client module Statsample class Vector def to_REXP Rserve::REXP::Wrapper.wrap(data_with_nils) end end class Dataset def to_REXP names=@fields data=@fields.map {|f| Rserve::REXP::Wrapper.wrap(@vectors[f].data_with_nils) } l=Rserve::Rlist.new(data,names) Rserve::REXP.create_data_frame(l) end end end ================================================ FILE: lib/statsample/shorthand.rb ================================================ class Object # Shorthand for 
Statsample::Analysis.store(*args,&block) def ss_analysis(*args,&block) Statsample::Analysis.store(*args,&block) end end module Statsample # Module which provide shorthands for many methods. module Shorthand ### # :section: R like methods ### def read_with_cache(klass, filename,opts=Hash.new, cache=true) file_ds=filename+".ds" if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename)) ds=Statsample.load(file_ds) else ds=klass.read(filename) ds.save(file_ds) if cache end ds end # Import an Excel file. Cache result by default def read_excel(filename, opts=Hash.new, cache=true) read_with_cache(Statsample::Excel, filename, opts, cache) end # Import an CSV file. Cache result by default def read_csv read_with_cache(Statsample::CSV, filename, opts, cache) end # Retrieve names (fields) from dataset def names(ds) ds.fields end # Create a correlation matrix from a dataset def cor(ds) Statsample::Bivariate.correlation_matrix(ds) end # Create a variance/covariance matrix from a dataset def cov(ds) Statsample::Bivariate.covariate_matrix(ds) end # Create a Statsample::Vector # Analog to R's c def vector(*args) Statsample::Vector[*args] end # Random generation for the normal distribution def rnorm(n,mean=0,sd=1) rng=Distribution::Normal.rng(mean,sd) Statsample::Vector.new_scale(n) { rng.call} end # Creates a new Statsample::Dataset # Each key is transformed into string def dataset(vectors=Hash.new) vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac} Statsample::Dataset.new(vectors) end alias :data_frame :dataset # Returns a Statsample::Graph::Boxplot def boxplot(*args) Statsample::Graph::Boxplot.new(*args) end # Returns a Statsample::Graph::Histogram def histogram(*args) Statsample::Graph::Histogram.new(*args) end # Returns a Statsample::Graph::Scatterplot def scatterplot(*args) Statsample::Graph::Scatterplot.new(*args) end # Returns a Statsample::Test::Levene def levene(*args) Statsample::Test::Levene.new(*args) end def principal_axis(*args) 
Statsample::Factor::PrincipalAxis.new(*args) end def polychoric(*args) Statsample::Bivariate::Polychoric.new(*args) end def tetrachoric(*args) Statsample::Bivariate::Tetrachoric.new(*args) end ### # Other Shortcuts ### def lr(*args) Statsample::Regression.multiple(*args) end def pca(ds,opts=Hash.new) Statsample::Factor::PCA.new(ds,opts) end def dominance_analysis(*args) Statsample::DominanceAnalysis.new(*args) end def dominance_analysis_bootstrap(*args) Statsample::DominanceAnalysis::Bootstrap.new(*args) end def scale_analysis(*args) Statsample::Reliability::ScaleAnalysis.new(*args) end def skill_scale_analysis(*args) Statsample::Reliability::SkillScaleAnalysis.new(*args) end def multiscale_analysis(*args,&block) Statsample::Reliability::MultiScaleAnalysis.new(*args,&block) end def test_u(*args) Statsample::Test::UMannWhitney.new(*args) end module_function :test_u, :rnorm end end ================================================ FILE: lib/statsample/srs.rb ================================================ module Statsample # Several methods to estimate parameters for simple random sampling # == Reference: # * Cochran, W.(1972). Sampling Techniques [spanish edition]. # * http://stattrek.com/Lesson6/SRS.aspx module SRS class << self ######################## # # :SECTION: Proportion estimation # # Function for estimation of proportions ######################## # # Finite population correction (over variance) # Source: Cochran(1972) def fpc_var(sam,pop) (pop - sam).quo(pop - 1) end # Finite population correction (over standard deviation) def fpc(sam,pop) Math::sqrt((pop-sam).quo(pop-1)) end # Non sample fraction. # # 1 - sample fraction def qf(sam , pop) 1-(sam.quo(pop)) end # Sample size estimation for proportions, infinite poblation def estimation_n0(d,prop,margin=0.95) t=Distribution::Normal.p_value(1-(1-margin).quo(2)) var=prop*(1-prop) t**2*var.quo(d**2) end # Sample size estimation for proportions, finite poblation. 
def estimation_n(d,prop,n_pobl,margin=0.95)
  n0=estimation_n0(d,prop,margin)
  n0.quo( 1 + ((n0 - 1).quo(n_pobl)))
end

# Proportion confidence interval with t values
# Uses estimated proportion, sample without replacement.
def proportion_confidence_interval_t(prop, n_sample, n_population, margin=0.95)
  t = Distribution::T.p_value(1-((1-margin).quo(2)) , n_sample-1)
  proportion_confidence_interval(prop,n_sample,n_population, t)
end

# Proportion confidence interval with z values
# Uses estimated proportion, sample without replacement.
def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
  z=Distribution::Normal.p_value(1-((1-margin).quo(2)))
  proportion_confidence_interval(p,n_sample,n_population, z)
end

# Proportion confidence interval with x value
# Uses estimated proportion, sample without replacement.
# Returns [lower, upper]. The half width adds 1/(2*sam)
# to x times the standard error.
def proportion_confidence_interval(p, sam,pop , x)
  #f=sam.quo(pop)
  one_range=x * Math::sqrt((qf(sam, pop) * p * (1-p)).quo(sam-1)) + (1.quo(sam * 2.0))
  [p-one_range, p+one_range]
end

# Standard deviation for sample distribution of a proportion
# Known proportion, sample with replacement.
# Based on http://stattrek.com/Lesson6/SRS.aspx
def proportion_sd_kp_wr(p, n_sample)
  Math::sqrt(p*(1-p).quo(n_sample))
end

# Standard deviation for sample distribution of a proportion
# Known proportion, sample without replacement.
#
# Sources:
# * Cochran(1972)
def proportion_sd_kp_wor(p, sam, pop)
  fpc(sam,pop)*Math::sqrt(p*(1-p).quo(sam))
end

# Standard deviation for sample distribution of a proportion
# Estimated proportion, sample with replacement
# Based on http://stattrek.com/Lesson6/SRS.aspx.
def proportion_sd_ep_wr(p, n_sample)
  Math::sqrt(p*(1-p).quo(n_sample-1))
end

# Standard deviation for sample distribution of a proportion.
# Estimated proportion, sample without replacement.
# Reference:
# * Cochran, 1972, Técnicas de muestreo
def proportion_sd_ep_wor(p, sam,pop)
  fsc=(pop-sam).quo((sam-1)*pop)
  Math::sqrt(fsc*p*(1-p))
end

# Total estimation sd based on sample.
# Known proportion, sample without replacement
# Reference:
# * Cochran(1972)
def proportion_total_sd_kp_wor(prop, sam, pop)
  # Population size times the sd of the proportion.
  pop * proportion_sd_kp_wor(prop, sam, pop)
end

# Total estimation sd based on sample.
# Estimated proportion, sample without replacement
# Source: Cochran(1972)
def proportion_total_sd_ep_wor(prop, sam, pop)
  fsc=((pop - sam).to_f / ( sam - 1))
  Math::sqrt(fsc*pop*prop*(1-prop))
end

########################
#
# :SECTION: Mean estimation
#
########################

# Standard error. Known variance, sample with replacement.
def standard_error_ksd_wr(s, sam, pop)
  s.quo(Math::sqrt(sam)) * Math::sqrt((pop-1).quo(pop))
end

# Standard error of the mean. Known variance, sample w/o replacement
def standard_error_ksd_wor(s,sam,pop)
  s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam,pop))
end

alias standard_error_esd_wr standard_error_ksd_wr

# Standard error of the mean.
# Estimated variance, without replacement
# Cochran (1972) p.47
def standard_error_esd_wor(s,sam,pop)
  s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam,pop))
end

alias standard_error standard_error_esd_wor
alias se standard_error_esd_wor

# Standard error of total estimation
def standard_error_total(s,sam,pop)
  pop*se(s,sam,pop)
end

# Confidence Interval using T-Student
# Use with n < 60
def mean_confidence_interval_t(mean,s,n_sample,n_population,margin=0.95)
  t=Distribution::T.p_value(1-((1-margin) / 2),n_sample-1)
  mean_confidence_interval(mean,s,n_sample,n_population,t)
end

# Confidence Interval using Z
# Use with n > 60
def mean_confidence_interval_z(mean,s,n_sample,n_population,margin=0.95)
  z=Distribution::Normal.p_value(1-((1-margin) / 2))
  mean_confidence_interval(mean,s,n_sample,n_population, z)
end

# Confidence interval using X.
# # Better use mean_confidence_interval_z or mean_confidence_interval_t def mean_confidence_interval(mean,s,n_sample,n_population,x) range=x*se(s,n_sample,n_population) [mean-range,mean+range] end end end end ================================================ FILE: lib/statsample/test/bartlettsphericity.rb ================================================ module Statsample module Test # == Bartlett's test of Sphericity. # Test the hyphotesis that the sample correlation matrix # comes from a multivariate normal population where variables # are independent. In other words, the population correlation # matrix is the identity matrix. # == Reference # * Dziuban, C., & Shirkey E. (1974). When is a correlation matrix appropriate for factor analysis? Some decision rules. Psychological Bulletin, 81(6), 358-361. class BartlettSphericity include Statsample::Test include Summarizable attr_accessor :name attr_reader :ncases attr_reader :nvars attr_reader :value attr_reader :df # Args # * _matrix_: correlation matrix # * _ncases_: number of cases def initialize(matrix,ncases) @matrix=matrix @ncases=ncases @nvars=@matrix.row_size @name=_("Bartlett's test of sphericity") compute end # Uses SPSS formula. # On Dziuban & Shirkey, the minus between the first and second # statement is a *!!! 
# def compute @value=-((@ncases-1)-(2*@nvars+5).quo(6))*Math::log(@matrix.determinant) @df=(@nvars*(@nvars-1)).quo(2) end def probability 1-Distribution::ChiSquare.cdf(@value,@df) end def report_building(builder) # :nodoc: builder.text "%s : X(%d) = %0.4f , p = %0.4f" % [@name, @df, @value, probability] end end end end ================================================ FILE: lib/statsample/test/chisquare.rb ================================================ module Statsample module Test module ChiSquare class WithMatrix attr_reader :df attr_reader :value def initialize(observed, expected=nil) @observed=observed @expected=expected or calculate_expected raise "Observed size!=expected size" if @observed.row_size!=@expected.row_size or @observed.column_size!=@expected.column_size @df=(@observed.row_size-1)*(@observed.column_size-1) @value=compute_chi end def calculate_expected sum=@observed.total_sum @expected=Matrix.rows( @observed.row_size.times.map {|i| @observed.column_size.times.map {|j| (@observed.row_sum[i].quo(sum) * @observed.column_sum[j].quo(sum))*sum } }) end def to_f @value end def chi_square @value end def probability 1-Distribution::ChiSquare.cdf(@value.to_f,@df) end def compute_chi sum=0 (0...@observed.row_size).each {|i| (0...@observed.column_size).each {|j| sum+=((@observed[i, j] - @expected[i,j])**2).quo(@expected[i,j]) } } sum end end end end end ================================================ FILE: lib/statsample/test/f.rb ================================================ module Statsample module Test # From Wikipedia: # An F-test is any statistical test in which the test statistic has an F-distribution under the null hypothesis. It is most often used when comparing statistical models that have been fit to a data set, in order to identify the model that best fits the population from which the data were sampled. 
class F include Statsample::Test include Summarizable attr_reader :var_num, :var_den, :df_num, :df_den, :var_total, :df_total # Tails for probability (:both, :left or :right) attr_accessor :tails # Name of F analysis attr_accessor :name # Parameters: # * var_num: variance numerator # * var_den: variance denominator # * df_num: degrees of freedom numerator # * df_den: degrees of freedom denominator def initialize(var_num, var_den, df_num, df_den, opts=Hash.new) @var_num=var_num @var_den=var_den @df_num=df_num @df_den=df_den @var_total=var_num+var_den @df_total=df_num+df_den opts_default={:tails=>:right, :name=>_("F Test")} @opts=opts_default.merge(opts) raise "Tails should be right or left, not both" if @opts[:tails]==:both opts_default.keys.each {|k| send("#{k}=", @opts[k]) } end def f @var_num.quo(@var_den) end def to_f f end # probability def probability p_using_cdf(Distribution::F.cdf(f, @df_num, @df_den), tails) end def report_building(builder) #:nodoc: if @df_num.is_a? Integer and @df_den.is_a? Integer builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @df_num, @df_den, f, probability] else builder.text "%s : F(%0.2f, %0.2f) = %0.4f , p = %0.4f" % [@name, @df_num, @df_den, f, probability] end end end end end ================================================ FILE: lib/statsample/test/kolmogorovsmirnov.rb ================================================ module Statsample module Test # == Kolmogorov-Smirnov's test of equality of distributions. class KolmogorovSmirnov attr_reader :d include Statsample::Test include Summarizable # Creates a new Kolmogorov-Smirnov test # d1 should have each method # d2 could be a Distribution class, with a cdf method, # a vector or a lambda def initialize(d1,d2) raise "First argument should have each method" unless d1.respond_to? :each @d1=make_cdf(d1) if d2.respond_to? :cdf or d2.is_a? Proc @d2=d2 elsif d2.respond_to? 
:each @d2=make_cdf(d2) else raise "Second argument should respond to cdf or each" end calculate end def calculate d=0 @d1.each {|x| v1=@d1.cdf(x); v2=@d2.is_a?(Proc) ? @d2.call(x) : @d2.cdf(x) d=(v1-v2).to_f.abs if (v1-v2).abs>d } @d=d end # Make a wrapper EmpiricDistribution to any method which implements # each # On Statsample::Vector, only uses #valid_data def make_cdf(v) v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v) end class EmpiricDistribution def initialize(data) @min=data.min @max=data.max @data=data.sort @n=data.size end def each @data.each {|x| yield x } end def cdf(x) return 0 if x<@min return 1 if x>=@max v=@data.index{|v1| v1>=x} v.nil? ? 0 : (v+(x==@data[v]? 1 : 0)).quo(@n) end end # End EmpiricDistribution end end end ================================================ FILE: lib/statsample/test/levene.rb ================================================ module Statsample module Test # = Levene Test for Equality of Variances # From NIST/SEMATECH: #
Levene's test ( Levene, 1960) is used to test if k samples have equal variances. Equal variances across samples is called homogeneity of variance. Some statistical tests, for example the analysis of variance, assume that variances are equal across groups or samples. The Levene test can be used to verify that assumption.
# Use: # require 'statsample' # a=[1,2,3,4,5,6,7,8,100,10].to_scale # b=[30,40,50,60,70,80,90,100,110,120].to_scale # # levene=Statsample::Test::Levene.new([a,b]) # puts levene.summary # # Output: # Levene Test # F: 0.778121319848449 # p: 0.389344552595791 # # Reference: # * NIST/SEMATECH e-Handbook of Statistical Methods. Available on http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm class Levene include Statsample::Test include Summarizable # Degrees of freedom 1 (k-1) attr_reader :d1 # Degrees of freedom 2 (n-k) attr_reader :d2 # Name of test attr_accessor :name # Input could be an array of vectors or a dataset def initialize(input, opts=Hash.new()) if input.is_a? Statsample::Dataset @vectors=input.vectors.values else @vectors=input end @name=_("Levene Test") opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } compute end # Value of the test def f @w end def report_building(builder) # :nodoc: builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability] end def compute n=@vectors.inject(0) {|ac,v| ac+v.n_valid} zi=@vectors.collect {|vector| mean=vector.mean vector.collect {|v| (v-mean).abs }.to_scale } total_mean=zi.inject([]) {|ac,vector| ac+vector.valid_data }.to_scale.mean k=@vectors.size sum_num=zi.inject(0) {|ac,vector| ac+(vector.size*(vector.mean-total_mean)**2) } sum_den=zi.inject(0) {|ac,vector| z_mean=vector.mean ac+vector.valid_data.inject(0) {|acp,zij| acp+(zij-z_mean)**2 } } @w=((n-k)*sum_num).quo((k-1)*sum_den) @d1=k-1 @d2=n-k end private :compute # Probability. 
# With H_0 = Sum(s2)=0, probability of getting a value of the test upper or equal to the obtained on the sample def probability p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right) end end end end ================================================ FILE: lib/statsample/test/t.rb ================================================ module Statsample module Test # A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported class T class << self include Math # Test the null hypothesis that the population mean is equal to a specified value u, one uses the statistic. # Is the same formula used on t-test for paired sample. # * x: sample/differences mean # * u: population mean # * s: sample/differences standard deviation # * n: sample size def one_sample(x,u,s,n) (x-u)*Math::sqrt(n).quo(s) end # Test if means of two samples are different. # * x1: sample 1 mean # * x2: sample 2 mean # * s1: sample 1 standard deviation # * s2: sample 2 standard deviation # * n1: sample 1 size # * n2: sample 2 size # * equal_variance: true if equal_variance assumed # def two_sample_independent(x1, x2, s1, s2, n1, n2, equal_variance = false) num=x1-x2 if equal_variance sx1x2 = sqrt(((n1-1)*s1**2 + (n2-1)*s2**2).quo(n1+n2-2)) den = sx1x2*sqrt(1.quo(n1)+1.quo(n2)) else den=sqrt((s1**2).quo(n1) + (s2**2).quo(n2)) end num.quo(den) end # Degrees of freedom for equal variance on t test def df_equal_variance(n1,n2) n1+n2-2 end # Degrees of freedom for unequal variance # * s1: sample 1 standard deviation # * s2: sample 2 standard deviation # * n1: sample 1 size # * n2: sample 2 size # == Reference # * http://en.wikipedia.org/wiki/Welch-Satterthwaite_equation def df_not_equal_variance(s1,s2,n1,n2) s2_1=s1**2 s2_2=s2**2 num=(s2_1.quo(n1)+s2_2.quo(n2))**2 den=(s2_1.quo(n1)**2).quo(n1-1) + (s2_2.quo(n2)**2).quo(n2-1) num.quo(den) end end include Statsample::Test include Summarizable attr_reader :standard_error, :estimate, :df # 
# Writes the t statistic line and the confidence interval line to the
# report section +s+ (any object responding to +text+).
#
# Integer degrees of freedom are printed as whole numbers. Fractional
# degrees of freedom are printed with four decimals instead of being
# truncated by a %d directive.
def report_building_t(s)
  df_text = (@df.is_a?(Integer) ? "%d" : "%0.4f") % df
  # The message id handed to gettext is left untouched so existing
  # translations still resolve; only the df directive of the resulting
  # template is switched to %s to accept the pre-formatted text.
  t_template = _("t(%d) = %0.4f, p=%0.4f (%s tails)").sub("%d", "%s")
  s.text(t_template % [df_text, t, probability, tails])
  s.text(_("CI(%d%%): %0.4f - %0.4f") % [confidence_level*100, ci[0], ci[1]])
end
# # == Usage # a=1000.times.map {rand(100)}.to_scale # b=1000.times.map {rand(100)}.to_scale # t_2=Statsample::Test::T::TwoSamplesIndependent.new(a,b) # t_2.summary # === Output # = Two Sample T Test # Mean and standard deviation # +----------+---------+---------+------+ # | Variable | m | sd | n | # +----------+---------+---------+------+ # | 1 | 49.3310 | 29.3042 | 1000 | # | 2 | 47.8180 | 28.8640 | 1000 | # +----------+---------+---------+------+ # # == Levene Test # Levene Test # F: 0.3596 # p: 0.5488 # T statistics # +--------------------+--------+-----------+----------------+ # | Type | t | df | p (both tails) | # +--------------------+--------+-----------+----------------+ # | Equal variance | 1.1632 | 1998 | 0.2449 | # | Non equal variance | 1.1632 | 1997.5424 | 0.1362 | # +--------------------+--------+-----------+----------------+ class TwoSamplesIndependent include Math include Statsample::Test include DirtyMemoize include Summarizable # Options attr_accessor :opts # Name of test attr_accessor :name # Degress of freedom (equal variance) attr_reader :df_equal_variance # Degress of freedom (not equal variance) attr_reader :df_not_equal_variance # Value of t for equal_variance attr_reader :t_equal_variance # Value of t for non-equal_variance attr_reader :t_not_equal_variance # Probability(equal variance) attr_reader :probability_equal_variance # Probability(unequal variance) attr_reader :probability_not_equal_variance # Tails for probability (:both, :left or :right) attr_accessor :tails # Create the object dirty_writer :tails dirty_memoize :t_equal_variance, :t_not_equal_variance, :probability_equal_variance, :probability_not_equal_variance, :df_equal_variance, :df_not_equal_variance # Create a Two Independent T Test # Options: # * :name = Name of the analysis # * :tails = Tail for probability. 
# Cohen's d, a measure of effect size: the difference between the two
# sample means divided by the pooled standard deviation.
#
# The pooled standard deviation is the square root of the pooled
# variance, ((n1-1)*s1^2 + (n2-1)*s2^2) / (n1+n2-2), which is the same
# pooled estimate T.two_sample_independent uses for the equal-variance t.
def d
  n1 = @v1.n_valid
  n2 = @v2.n_valid
  mean_difference = @v1.mean - @v2.mean
  # Pool variances (sd squared), not standard deviations.
  pooled_variance = ((n1-1)*@v1.sd**2 + (n2-1)*@v2.sd**2).quo(n1+n2-2)
  mean_difference.quo(Math::sqrt(pooled_variance))
end
# U sampling distribution, based on the Dinneen & Blakesley (1973)
# algorithm AS 62.
#
# Parameters:
# * n1: group 1 size
# * n2: group 2 size
#
# Returns an array indexed by u (0 up to max_u/2) holding the
# proportion of arrangements whose smaller U statistic equals u.
#
# == Reference:
# * Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for
#   the Sampling Distribution of the Mann-Whitney U Statistic.
#   Journal of the Royal Statistical Society, 22(2), 269-273
def self.u_sampling_distribution_as62(n1,n2)
  freq=[]
  work=[]
  mn1=n1*n2+1
  max_u=n1*n2
  # Smaller and larger group sizes drive the recurrence below.
  minmn = n1 < n2 ? n1 : n2
  maxmn = n1 > n2 ? n1 : n2
  # From here on n1 is reused as a scratch index into freq (1-based).
  n1=maxmn+1
  (1..n1).each{|i| freq[i]=1}
  n1+=1
  (n1..mn1).each{|i| freq[i]=0}
  work[1]=0
  xin=maxmn
  (2..minmn).each do |i|
    work[i]=0
    xin=xin+maxmn
    n1=xin+2
    # Only the first half is generated; freq[n1] mirrors it from the top.
    l=1+xin/2
    k=i
    (1..l).each do |j|
      k=k+1
      n1=n1-1
      sum=freq[j]+work[j]
      freq[j]=sum
      work[k]=sum-freq[n1]
      freq[n1]=sum
    end
  end
  # Generate percentages for normal U
  dist=(1+max_u/2).to_i
  # Drop the unused slot 0 so that freq[u] is the frequency of U == u.
  freq.shift
  total=freq.inject(0) {|a,v| a+v }
  (0...dist).collect {|i|
    # u and max_u-u describe the same smaller-U outcome, so count both
    # unless they coincide at the centre of the distribution.
    ues = (i != max_u-i) ? freq[i]*2 : freq[i]
    ues.quo(total)
  }
end
# Builds the U distribution by visiting every permutation of n1 zeros
# (group 1) and n2 ones (group 2) yielded by Statsample::Permutation.
# Returns a hash mapping each permutation to the smaller of its two U values.
# Very expensive, but useful for demonstrations.
def self.distribution_permutations(n1,n2)
  group_labels = [0]*n1+[1]*n2
  max_u = n1*n2
  u_by_permutation = {}
  Statsample::Permutation.new(group_labels).each do |arrangement|
    rank_sum = 0
    members = 0
    arrangement.each_index do |position|
      next unless arrangement[position]==0
      # Positions are 0-based, ranks are 1-based.
      rank_sum += position+1
      members += 1
    end
    u_first = rank_sum-((members*(members+1)).quo(2))
    u_second = max_u-u_first
    u_by_permutation[arrangement] = (u_first <= u_second) ? u_first : u_second
  end
  u_by_permutation
end
# Z value for U, adjusted for ties when ties were found.
# For large samples U is approximately normally distributed, so z can be
# used to obtain a probability for U.
# == Reference:
# * SPSS Manual
def z
  expected_u = (@n1*@n2).quo(2)
  sigma_u =
    if @ties
      pooled_n = @n1+@n2
      scale = (@n1*@n2).quo(pooled_n*(pooled_n-1))
      # @t holds the tie compensation term.
      spread = ((pooled_n**3-pooled_n).quo(12))-@t
      Math::sqrt(scale*spread)
    else
      Math::sqrt(((@n1*@n2)*(@n1+@n2+1)).quo(12))
    end
  (@u-expected_u).quo(sigma_u)
end
# Probability of the observed z under the normal approximation of W,
# honouring the configured tails:
# * :both  - twice the upper tail beyond |z| (never exceeds 1)
# * :left  - lower tail, P(Z <= z)
# * any other value - upper tail, P(Z >= z)
def probability_z
  score = z
  case @tails
  when :both
    # Use |z| so a negative statistic does not yield a probability above 1.
    2*(1-Distribution::Normal.cdf(score.abs))
  when :left
    Distribution::Normal.cdf(score)
  else
    1-Distribution::Normal.cdf(score)
  end
end
# Converts a cdf value into a probability for the requested tails.
#
# * :left (or :negative)    : returns the cdf
# * :right (or 1, :positive): returns 1-cdf
# * :both (or 2, :two)      : returns twice the smaller tail
#
# Any other value for +tails+ returns nil.
def p_using_cdf(cdf, tails=:both)
  side =
    case tails
    when 2, :two then :both
    when 1, :positive then :right
    when :negative then :left
    else tails
    end
  case side
  when :left
    cdf
  when :right
    1-cdf
  when :both
    smaller_tail = cdf>=0.5 ? 1-cdf : cdf
    2*smaller_tail
  end
end
# Shorthand for Statsample::Test::UMannWhitney.new
#
# * v1 and v2 are the two samples handed to the constructor.
# * opts is an optional hash forwarded unchanged to the constructor,
#   matching the other shorthands of this module.
def u_mannwhitney(v1, v2, opts=Hash.new)
  Statsample::Test::UMannWhitney.new(v1, v2, opts)
end
# Sample variance computed through the GSL vector.
#
# * m: mean to use; defaults to the vector mean when nil.
#
# The resolved mean is passed on to the GSL routine, as the sibling
# standard deviation and population variance methods do.
# Returns nil when no GSL vector is available.
# NOTE(review): assumes GSL::Vector#variance_m accepts the mean as its
# argument, as gsl.sd(m) does in this module - confirm against rb-gsl.
def variance_sample(m=nil)
  check_type :scale
  m ||= mean
  gsl.nil? ? nil : gsl.variance_m(m)
end
module GSL class Vector include Statsample::VectorShorthands end end end module Statsample # Collection of values on one dimension. Works as a column on a Spreadsheet. # # == Usage # The fast way to create a vector uses Array.to_vector or Array.to_scale. # # v=[1,2,3,4].to_vector(:scale) # v=[1,2,3,4].to_scale # class Vector include Enumerable include Writable include Summarizable include Statsample::VectorShorthands # Level of measurement. Could be :nominal, :ordinal or :scale attr_reader :type # Original data. attr_reader :data # Valid data. Equal to data, minus values assigned as missing values attr_reader :valid_data # Array of values considered as missing. Nil is a missing value, by default attr_reader :missing_values # Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default attr_reader :today_values # Missing values array attr_reader :missing_data # Original data, with all missing values replaced by nils attr_reader :data_with_nils # Date date, with all missing values replaced by nils attr_reader :date_data_with_nils # Change label for specific values attr_accessor :labels # Name of vector. Should be used for output by many classes attr_accessor :name # Creates a new Vector object. # * data Any data which can be converted on Array # * type Level of meausurement. See Vector#type # * opts Hash of options # * :missing_values Array of missing values. See Vector#missing_values # * :today_values Array of 'today' values. See Vector#today_values # * :labels Labels for data values # * :name Name of vector def initialize(data=[], type=:nominal, opts=Hash.new) @data=data.is_a?(Array) ? data : data.to_a @type=type opts_default={ :missing_values=>[], :today_values=>['NOW','TODAY', :NOW, :TODAY], :labels=>{}, :name=>nil } @opts=opts_default.merge(opts) if @opts[:name].nil? 
# Creates a duplicate of the Vector.
# Note: data, missing_values, today_values and labels are duplicated, so
# changes on the original vector do not propagate to the copy.
# today_values is carried over so the copy is built with the same
# 'today' tokens as the original rather than the constructor defaults.
def dup
  Vector.new(@data.dup, @type,
             :missing_values => @missing_values.dup,
             :today_values => @today_values.dup,
             :labels => @labels.dup,
             :name => @name)
end
# Returns an empty duplicate of the vector. Maintains the type,
# missing values, today values, labels and name.
# today_values is carried over so values pushed into the copy later are
# interpreted with the same 'today' tokens as in the original.
def dup_empty
  Vector.new([], @type,
             :missing_values => @missing_values.dup,
             :today_values => @today_values.dup,
             :labels => @labels.dup,
             :name => @name)
end
# Dichotomizes the vector into 0 and 1.
# Values greater than +low+ become 1, all other valid values become 0 and
# missing values stay nil. When +low+ is not given, the lowest factor is
# used; the factors are only computed in that case.
def dichotomize(low=nil)
  low ||= factors.min
  @data_with_nils.collect { |x|
    if x.nil?
      nil
    elsif x > low
      1
    else
      0
    end
  }.to_scale
end
# Distributes every element of @data over the derived arrays:
# valid elements go to @valid_data and @data_with_nils, invalid ones go
# to @missing_data and leave a nil placeholder in @data_with_nils.
# Finally records whether any missing value was found.
def _set_valid_data_intern #:nodoc:
  @data.each do |value|
    usable = is_valid?(value)
    (usable ? @valid_data : @missing_data) << value
    @data_with_nils << (usable ? value : nil)
  end
  @has_missing_data = !@missing_data.empty?
end
# Applies the binary operator +method+ ("+", "-", "*") element-wise.
#
# * If v is a Vector or an Array it must have the same size as this
#   vector (ArgumentError otherwise); each position combines this
#   vector's element with the element at the same position of v.
# * Otherwise, if v responds to +method+, it is treated as a scalar and
#   combined with every element.
# * Anything else raises TypeError.
#
# A position yields nil when this vector's element is not valid (nil or
# listed in missing_values) or when the other operand is missing there
# (invalid for a Vector, nil for an Array). The validity test on this
# vector's own data is the same in all three branches.
# Returns a new :scale Statsample::Vector.
def _vector_ari(method,v) # :nodoc:
  if v.is_a?(Vector) || v.is_a?(Array)
    raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
    other_is_vector = v.is_a?(Vector)
    result = (0...v.size).collect do |i|
      other_present = other_is_vector ? v.is_valid?(v[i]) : !v[i].nil?
      if other_present && is_valid?(@data[i])
        @data[i].send(method, v[i])
      else
        nil
      end
    end
    Statsample::Vector.new(result, :scale)
  elsif v.respond_to?(method)
    Statsample::Vector.new(
      @data.collect { |x| is_valid?(x) ? x.send(method, v) : nil },
      :scale)
  else
    raise TypeError,"You should pass a scalar or a array/vector"
  end
end
# Returns a hash with one entry per distinct token found by
# split_by_separator, holding how many cases include that token.
# Missing cases (nil indicators) count as 0. The sum starts from an
# explicit 0 so a nil in first position is converted like any other
# element instead of being used as the initial accumulator.
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
  split_by_separator(sep).inject({}) do |counts, (token, indicators)|
    counts[token] = indicators.inject(0) { |sum, flag| sum + flag.to_i }
    counts
  end
end
def bootstrap(estimators, nr, s=nil) s||=n h_est, es, bss= prepare_bootstrap(estimators) nr.times do |i| bs=sample_with_replacement(s) es.each do |estimator| # Add bootstrap bss[estimator].push(h_est[estimator].call(bs)) end end es.each do |est| bss[est]=bss[est].to_scale bss[est].type=:scale end bss.to_dataset end # == Jacknife # Returns a dataset with jacknife delete-+k+ +estimators+ # +estimators+ could be: # a) Hash with variable names as keys and lambdas as values # a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)}) # b) Array with method names to jacknife # a.jacknife([:mean, :sd]) # c) A single method to jacknife # a.jacknife(:mean) # +k+ represent the block size for block jacknife. By default # is set to 1, for classic delete-one jacknife. # # Returns a dataset where each vector is an vector # of length +cases+/+k+ containing the computed jacknife estimates. # # == Reference: # * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife. def jacknife(estimators, k=1) raise "n should be divisible by k:#{k}" unless n%k==0 nb=(n / k).to_i h_est, es, ps= prepare_bootstrap(estimators) est_n=es.inject({}) {|h,v| h[v]=h_est[v].call(self) h } nb.times do |i| other=@data_with_nils.dup other.slice!(i*k,k) other=other.to_scale es.each do |estimator| # Add pseudovalue ps[estimator].push( nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other)) end end es.each do |est| ps[est]=ps[est].to_scale ps[est].type=:scale end ps.to_dataset end # For an array or hash of estimators methods, returns # an array with three elements # 1.- A hash with estimators names as keys and lambdas as values # 2.- An array with estimators names # 3.- A Hash with estimators names as keys and empty arrays as values def prepare_bootstrap(estimators) h_est=estimators h_est=[h_est] unless h_est.is_a? Array or h_est.is_a? Hash if h_est.is_a? 
Array h_est=h_est.inject({}) {|h,est| h[est]=lambda {|v| v.send(est)} h } end bss=h_est.keys.inject({}) {|h,v| h[v]=[];h} [h_est,h_est.keys, bss] end private :prepare_bootstrap # Returns an random sample of size n, with replacement, # only with valid data. # # In all the trails, every item have the same probability # of been selected. def sample_with_replacement(sample=1) vds=@valid_data.size (0...sample).collect{ @valid_data[rand(vds)] } end # Returns an random sample of size n, without replacement, # only with valid data. # # Every element could only be selected once. # # A sample of the same size of the vector is the vector itself. def sample_without_replacement(sample=1) raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size out=[] size=@valid_data.size while out.sizedir could be :horizontal or :vertical def to_matrix(dir=:horizontal) case dir when :horizontal Matrix[@data] when :vertical Matrix.columns([@data]) end end def inspect self.to_s end # Retrieves uniques values for data. def factors if @type==:scale @scale_data.uniq.sort elsif @type==:date @date_data_with_nils.uniq.sort else @valid_data.uniq.sort end end if Statsample::STATSAMPLE__.respond_to?(:frequencies) # Returns a hash with the distribution of frecuencies for # the sample def frequencies Statsample::STATSAMPLE__.frequencies(@valid_data) end else def frequencies #:nodoc: _frequencies end end def _frequencies #:nodoc: @valid_data.inject(Hash.new) {|a,x| a[x]||=0 a[x]=a[x]+1 a } end # Returns the most frequent item. def mode frequencies.max{|a,b| a[1]<=>b[1]}.first end # The numbers of item with valid data. def n_valid @valid_data.size end # Returns a hash with the distribution of proportions of # the sample. def proportions frequencies.inject({}){|a,v| a[v[0]] = v[1].quo(n_valid) a } end # Proportion of a given value. 
# Returns the share of valid cases whose value is +v+ (1 by default).
# A value that never occurs has proportion 0.
def proportion(v=1)
  (frequencies[v] || 0).quo(@valid_data.size)
end

# Adds a summary section for this vector to the report builder +b+:
# n and valid n always; factors, mode and a frequency table for
# :nominal vectors; the median for :ordinal and :scale vectors; and
# mean, standard deviation, standard error, skew and kurtosis for
# :scale vectors.
def report_building(b)
  b.section(:name=>name) do |s|
    s.text(_("n :%d") % n)
    s.text(_("n valid:%d") % n_valid)

    if @type == :nominal
      s.text(_("factors:%s") % factors.join(","))
      s.text(_("mode: %s") % mode)

      s.table(:name=>_("Distribution")) do |t|
        frequencies.sort.each do |value, count|
          # Show the label when one is defined, the raw value otherwise.
          key = labels.has_key?(value) ? labels[value] : value
          t.row [key, count, ("%0.2f%%" % (count.quo(n_valid)*100))]
        end
      end
    end

    s.text(_("median: %s") % median.to_s) if @type == :ordinal || @type == :scale

    if @type == :scale
      s.text(_("mean: %0.4f") % mean)
      if sd
        s.text(_("std.dev.: %0.4f") % sd)
        s.text(_("std.err.: %0.4f") % se)
        s.text(_("skew: %0.4f") % skew)
        s.text(_("kurtosis: %0.4f") % kurtosis)
      end
    end
  end
end

# Variance of p, according to poblation size
# NOTE(review): delegates to Statsample::proportion_variance_sample, which
# is not defined in the visible part of the library - confirm it exists.
def variance_proportion(n_poblation, v=1)
  Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
end

# Variance of the total, according to poblation size
# NOTE(review): delegates to Statsample::total_variance_sample, which
# is not defined in the visible part of the library - confirm it exists.
def variance_total(n_poblation, v=1)
  Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
end

# Confidence interval for the proportion of +v+, delegated to
# Statsample::proportion_confidence_interval_t with the valid n,
# the poblation size and +margin+.
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
  Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
end

# Confidence interval for the proportion of +v+, delegated to
# Statsample::proportion_confidence_interval_z with the valid n,
# the poblation size and +margin+.
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
  Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
end

# Expose every *_slow method under its plain name, unless a method with
# that name is already defined. Names are converted with to_s because
# instance_methods returns Symbols, and only the suffix is removed.
self.instance_methods.find_all { |met| met =~ /_slow$/ }.each do |met|
  met_or = met.to_s.sub(/_slow$/, "")
  alias_method met_or, met unless self.method_defined?(met_or)
end

######
### Ordinal Methods
######

# == Percentil
# Returns the value of the percentile q
#
# Accepts an optional second argument specifying the strategy to interpolate
# when the requested percentile lies between two data points a and b
# Valid strategies are:
# * :midpoint (Default): (a + b) / 2
# * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
# This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
#
# Returns nil when there is no valid data. With :midpoint, q=100 yields
# the maximum value. Raises NotImplementedError on an unknown strategy.
def percentil(q, strategy = :midpoint)
  check_type :ordinal
  sorted = @valid_data.sort

  case strategy
  when :midpoint
    return nil if sorted.empty?
    v = (n_valid * q).quo(100)
    if v.to_i != v
      sorted[v.to_i]
    else
      # v falls exactly between two positions: average both neighbours.
      # The upper index is clamped because for q=100 it would point one
      # past the last element.
      lower = sorted[(v - 0.5).to_i]
      upper = sorted[[(v + 0.5).to_i, sorted.size - 1].min]
      (lower.to_f + upper).quo(2)
    end
  when :linear
    index = (q / 100.0) * (n_valid + 1)

    k = index.truncate
    d = index % 1

    if k == 0
      sorted[0]
    elsif k >= sorted.size
      sorted[-1]
    else
      sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
    end
  else
    raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
  end
end

# Returns a ranked vector.
# Tied values share the average of the positions they occupy; data
# without a frequency entry is ranked nil.
def ranked(type=:ordinal)
  check_type :ordinal
  already_ranked = 0
  ranks = {}
  frequencies.sort.each do |value, count|
    ranks[value] = (already_ranked + 1 + already_ranked + count).quo(2)
    already_ranked += count
  end
  @data.collect { |c| ranks[c] }.to_vector(type)
end

# Return the median (percentil 50)
def median
  check_type :ordinal
  percentil(50)
end

# Minimun value
def min
  check_type :ordinal
  @valid_data.min
end

# Maximum value
def max
  check_type :ordinal
  @valid_data.max
end

# Rebuilds @date_data_with_nils from @data: Date objects are kept,
# Time objects and "yyyy-mm-dd" / "yyyy/mm/dd" strings are converted,
# values listed on @today_values become today's date and everything
# else (missing values included) becomes nil.
def set_date_data
  @date_data_with_nils = @data.collect do |x|
    if x.is_a?(Date)
      x
    elsif x.is_a?(Time)
      Date.new(x.year, x.month, x.day)
    elsif x.is_a?(String) && x =~ /(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/
      Date.new($1.to_i, $2.to_i, $3.to_i)
    elsif @today_values.include?(x)
      Date.today()
    elsif @missing_values.include?(x) || x.nil?
      nil
    end
  end
end

# Rebuilds @scale_data from @valid_data: numerics are kept, strings with
# an integer value become Integer and anything else goes through to_f.
def set_scale_data
  @scale_data = @valid_data.collect do |x|
    if x.is_a?(Numeric)
      x
    elsif x.is_a?(String) && x.to_i == x.to_f
      x.to_i
    else
      x.to_f
    end
  end
end

private :set_date_data, :set_scale_data

# The range of the data (max - min)
def range
  check_type :scale
  @scale_data.max - @scale_data.min
end

# The sum of values for the data
def sum
  check_type :scale
  @scale_data.inject(0) { |a, x| x + a }
end

# The arithmetical mean of data
def mean
  check_type :scale
  sum.to_f.quo(n_valid)
end
# Sum of squares for the data around a value.
# By default, this value is the mean # ss= sum{(xi-m)^2} # def sum_of_squares(m=nil) check_type :scale m||=mean @scale_data.inject(0){|a,x| a+(x-m).square} end # Sum of squared deviation def sum_of_squared_deviation check_type :scale @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid)) end # Population variance (denominator N) def variance_population(m=nil) check_type :scale m||=mean squares=@scale_data.inject(0){|a,x| x.square+a} squares.quo(n_valid) - m.square end # Population Standard deviation (denominator N) def standard_deviation_population(m=nil) check_type :scale Math::sqrt( variance_population(m) ) end # Population average deviation (denominator N) # author: Al Chou def average_deviation_population( m = nil ) check_type :scale m ||= mean ( @scale_data.inject( 0 ) { |a, x| ( x - m ).abs + a } ).quo( n_valid ) end def median_absolute_deviation med=median recode {|x| (x-med).abs}.median end alias :mad :median_absolute_deviation # Sample Variance (denominator n-1) def variance_sample(m=nil) check_type :scale m||=mean sum_of_squares(m).quo(n_valid - 1) end # Sample Standard deviation (denominator n-1) def standard_deviation_sample(m=nil) check_type :scale m||=mean Math::sqrt(variance_sample(m)) end # Skewness of the sample def skew(m=nil) check_type :scale m||=mean th=@scale_data.inject(0){|a,x| a+((x-m)**3)} th.quo((@scale_data.size)*sd(m)**3) end # Kurtosis of the sample def kurtosis(m=nil) check_type :scale m||=mean fo=@scale_data.inject(0){|a,x| a+((x-m)**4)} fo.quo((@scale_data.size)*sd(m)**4)-3 end # Product of all values on the sample # def product check_type :scale @scale_data.inject(1){|a,x| a*x } end # With a fixnum, creates X bins within the range of data # With an Array, each value will be a cut point def histogram(bins=10) check_type :scale if bins.is_a? Array #h=Statsample::Histogram.new(self, bins) h=Statsample::Histogram.alloc(bins) else # ugly patch. 
The upper limit for a bin has the form # x < range #h=Statsample::Histogram.new(self, bins) min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max) # fix last data if max==@valid_data.max max+=1e-10 end h=Statsample::Histogram.alloc(bins,[min,max]) # Fix last bin end h.increment(@valid_data) h end # Coefficient of variation # Calculed with the sample standard deviation def coefficient_of_variation check_type :scale standard_deviation_sample.quo(mean) end # Standard error of the distribution mean # Calculated using sd/sqrt(n) def standard_error standard_deviation_sample.quo(Math.sqrt(valid_data.size)) end alias :se :standard_error alias_method :sdp, :standard_deviation_population alias_method :sds, :standard_deviation_sample alias_method :adp, :average_deviation_population alias_method :cov, :coefficient_of_variation alias_method :variance, :variance_sample alias_method :sd, :standard_deviation_sample alias_method :ss, :sum_of_squares include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl? end end ================================================ FILE: lib/statsample/version.rb ================================================ module Statsample VERSION = '1.4.0' end ================================================ FILE: lib/statsample.rb ================================================ # = statsample.rb - # Statsample - Statistic package for Ruby # Copyright (C) 2008-2014 Claudio Bustos # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#

#$:.unshift(File.dirname(__FILE__))
require 'matrix'
require 'extendmatrix'
require 'distribution'
require 'dirty-memoize'
require 'reportbuilder'

class Numeric
  # Returns the receiver multiplied by itself.
  def square
    self * self
  end
end

class String
  # Returns true when the whole string is an optionally signed number:
  # digits, an optional "," or "." followed by more digits, and an
  # optional exponent such as "e-3".
  # The pattern is anchored to the string (a single trailing newline is
  # tolerated), so a numeric line inside a multi-line text doesn't count.
  def is_number?
    (self =~ /\A-?\d+[,.]?\d*(e-?\d+)?\Z/) ? true : false
  end
end

class Module
  # Includes module +m+, keeping the current implementations reachable.
  # Every instance method of +m+ that the receiver already responds to is
  # first aliased as "<name>_<suffix>"; the receiver's own definition is
  # then removed so that the version from +m+ is the one called.
  # Inherited methods are aliased but not removed: remove_method only
  # works on methods defined by the receiver itself.
  def include_aliasing(m, suffix="ruby")
    m.instance_methods.each do |f|
      next unless method_defined?(f)
      alias_method("#{f}_#{suffix}", f)
      remove_method(f) if instance_methods(false).include?(f)
    end
    include m
  end
end

class Array
  # Recode repeated values on an array, adding the number of repetition
  # at the end
  # Returns the array itself when no value is repeated.
  # Example:
  #   a=%w{a b c c d d d e}
  #   a.recode_repeated
  #   => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
  def recode_repeated
    counts = Hash.new(0)
    each { |value| counts[value] += 1 }
    return self if counts.size == size

    seen = Hash.new(0)
    collect do |value|
      if counts[value] > 1
        seen[value] += 1
        sprintf("%s_%d", value, seen[value])
      else
        value
      end
    end
  end
end

# Builds a test description: the first argument is the description, the
# remaining ones are the fields and the block is the test itself.
# Returns [description, fields, block].
# Raises ArgumentError when no block is given.
def create_test(*args,&proc)
  # Proc.new without a block no longer captures the caller's block on
  # current Ruby versions, so the explicit block parameter is used.
  raise ArgumentError, "tried to create Proc object without a block" if proc.nil?
  description = args.shift
  fields = args
  [description, fields, proc]
end
#--
# Test extensions
begin
  require 'gettext'
rescue LoadError
  # gettext is optional: without it, messages are returned untranslated.
  def bindtextdomain(d) #:nodoc:
    d
  end

  # Bored module
  module GetText #:nodoc:
    def _(t)
      t
    end
  end
end
# Library for statistical analysis on Ruby
#
# * Classes for manipulation and storage of data:
# * Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
# * Multiple types of regression on Statsample::Regression
# * Factorial Analysis algorithms on Statsample::Factor module.
# * Dominance Analysis.
Based on Budescu and Azen papers.link[http://psycnet.apa.org/journals/met/8/2/129/]. # * Module Statsample::Codification, to help to codify open questions # * Converters to import and export data from databases, csv and excel files. # * Module Statsample::Crosstab provides function to create crosstab for categorical data # * Reliability analysis provides functions to analyze scales. # * Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several type of samples # * Interfaces to gdchart, gnuplot and SVG::Graph # module Statsample def self.create_has_library(library) define_singleton_method("has_#{library}?") do cv="@@#{library}" if !class_variable_defined? cv begin require library.to_s class_variable_set(cv,true) rescue LoadError class_variable_set(cv,false) end end class_variable_get(cv) end end create_has_library :gsl SPLIT_TOKEN = "," autoload(:Analysis, 'statsample/analysis') autoload(:Database, 'statsample/converters') autoload(:Anova, 'statsample/anova') autoload(:CSV, 'statsample/converters') autoload(:PlainText, 'statsample/converters') autoload(:Excel, 'statsample/converters') autoload(:GGobi, 'statsample/converters') autoload(:SPSS, 'statsample/converter/spss') autoload(:Histogram, 'statsample/histogram') autoload(:DominanceAnalysis, 'statsample/dominanceanalysis') autoload(:HtmlReport, 'statsample/htmlreport') autoload(:Mx, 'statsample/converters') autoload(:Resample, 'statsample/resample') autoload(:SRS, 'statsample/srs') autoload(:Codification, 'statsample/codification') autoload(:Reliability, 'statsample/reliability') autoload(:Bivariate, 'statsample/bivariate') autoload(:Multivariate, 'statsample/multivariate') autoload(:Multiset, 'statsample/multiset') autoload(:StratifiedSample, 'statsample/multiset') autoload(:MLE, 'statsample/mle') autoload(:Regression, 'statsample/regression') autoload(:Test, 'statsample/test') autoload(:Factor, 'statsample/factor') autoload(:Graph, 'statsample/graph') class 
<< self # Load a object saved on a file. def load(filename) if File.exist? filename o=false File.open(filename,"r") {|fp| o=Marshal.load(fp) } o else false end end # Create a matrix using vectors as columns. # Use: # # matrix=Statsample.vector_cols_matrix(v1,v2) def vector_cols_matrix(*vs) # test size=vs[0].size vs.each{|v| raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector raise ArgumentError,"Vectors size should be the same" if v.size!=size } Matrix.rows((0...size).to_a.collect() {|i| vs.collect{|v| v[i]} }) end # Returns a duplicate of the input vectors, without missing data # for any of the vectors. # # a=[1,2,3,6,7,nil,3,5].to_scale # b=[nil,nil,5,6,4,5,10,2].to_scale # c=[2,4,6,7,4,5,6,7].to_scale # a2,b2,c2=Statsample.only_valid(a,b,c) # => [#, # #, # #] # def only_valid(*vs) i=1 h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a} ds=Statsample::Dataset.new(h).dup_only_valid ds.vectors.values end # Cheap version of #only_valid. # If any vectors have missing_values, return only valid. # If not, return the vectors itself def only_valid_clone(*vs) if vs.any? 
{|v| v.flawed?} only_valid(*vs) else vs end end end module Util # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm def normal_order_statistic_medians(i,n) if i==1 u= 1.0 - normal_order_statistic_medians(n,n) elsif i==n u=0.5**(1 / n.to_f) else u= (i - 0.3175) / (n + 0.365) end u end def self.nice(s,e) # :nodoc: reverse = etrue).add(self).send(method) end end module STATSAMPLE__ #:nodoc: end end #-- begin require 'statsamplert' rescue LoadError module Statsample OPTIMIZED=false end end require 'statsample/vector' require 'statsample/dataset' require 'statsample/crosstab' require 'statsample/matrix' require 'statsample/shorthand' require 'statsample/version' ================================================ FILE: po/es/statsample.po ================================================ msgid "" msgstr "" "Project-Id-Version: statsample 1.0.1\n" "POT-Creation-Date: 2011-03-03 12:03-0300\n" "PO-Revision-Date: 2011-03-03 12:05-0300\n" "Last-Translator: Claudio Bustos \n" "Language-Team: Desarrollador\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "X-Poedit-Language: Spanish\n" "X-Poedit-SourceCharset: utf-8\n" #: lib/statsample/test/f.rb:26 msgid "F Test" msgstr "Prueba F" #: lib/statsample/test/t.rb:82 msgid "T Test" msgstr "Prueba T" #: lib/statsample/test/t.rb:83 msgid "Estimate" msgstr "Estimado" #: lib/statsample/test/t.rb:84 msgid "Std.Err.of Estimate" msgstr "Err.Est. 
del Estimado" #: lib/statsample/test/t.rb:114 msgid "%s: %0.4f | %s: %0.4f" msgstr "%s: %0.4f | %s: %0.4f" #: lib/statsample/test/t.rb:120 msgid "t(%d) = %0.4f, p=%0.4f (%s tails)" msgstr "t(%d) = %0.4f, p=%0.4f (%s colas)" #: lib/statsample/test/t.rb:121 msgid "CI(%d%%): %0.4f - %0.4f" msgstr "IC(%d%%): %0.4f - %0.4f" #: lib/statsample/test/t.rb:190 msgid "Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f" msgstr "Media de la muestra: %0.4f | DE de la muestra: %0.4f | EE : %0.4f" #: lib/statsample/test/t.rb:191 msgid "Population mean: %0.4f" msgstr "Promedio población: %0.4f" #: lib/statsample/test/t.rb:292 msgid "Mean and standard deviation" msgstr "Promedio y desviación estándar" #: lib/statsample/test/t.rb:292 #: lib/statsample/regression/simple.rb:109 #: lib/statsample/factor/pca.rb:216 #: lib/statsample/factor/principalaxis.rb:202 msgid "Variable" msgstr "Variable" #: lib/statsample/test/t.rb:292 #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "mean" msgstr "promedio" #: lib/statsample/test/t.rb:292 msgid "sd" msgstr "de" #: lib/statsample/test/t.rb:292 #: lib/statsample/factor/parallelanalysis.rb:103 #: lib/statsample/factor/parallelanalysis.rb:111 msgid "n" msgstr "n" #: lib/statsample/test/t.rb:296 msgid "Levene test for equality of variances" msgstr "Test de Levene para igualdad de variancas" #: lib/statsample/test/t.rb:298 msgid "T statistics" msgstr "Estadístico T" #: lib/statsample/test/t.rb:299 msgid "Equal variance" msgstr "Varianza Igual" #: lib/statsample/test/t.rb:300 msgid "Non equal variance" msgstr "Varianza Desigual" #: lib/statsample/test/t.rb:302 msgid "Effect size" msgstr "Tamaño del efecto" #: lib/statsample/test/umannwhitney.rb:140 msgid "Mann-Whitney's U" msgstr "U de Mann-Whitney" #: lib/statsample/test/umannwhitney.rb:149 msgid "%s results" msgstr "resultados de %s" #: lib/statsample/test/umannwhitney.rb:150 #: lib/statsample/test/umannwhitney.rb:151 msgid "Sum of ranks %s" msgstr "Suma de rangos %s" #: 
lib/statsample/test/umannwhitney.rb:152 msgid "U Value" msgstr "Valor de U" #: lib/statsample/test/umannwhitney.rb:153 msgid "Z" msgstr "Z" #: lib/statsample/test/umannwhitney.rb:155 msgid "Exact p (Dinneen & Blakesley, 1973):" msgstr "p exacto (Dinneen & Blakesley, 1973):" #: lib/statsample/test/levene.rb:37 msgid "Levene Test" msgstr "Test de Levene" #: lib/statsample/test/bartlettsphericity.rb:25 msgid "Bartlett's test of sphericity" msgstr "Test de esfericidad de Bartlett" #: lib/statsample/regression/multiple/baseengine.rb:27 msgid "Multiple Regression: %s over %s" msgstr "Regresión Múltiple: %s sobre %s" #: lib/statsample/regression/multiple/baseengine.rb:40 msgid "Regression" msgstr "Regresión" #: lib/statsample/regression/multiple/baseengine.rb:40 msgid "Error" msgstr "Error" #: lib/statsample/regression/multiple/baseengine.rb:184 msgid "Engine: %s" msgstr "Motor: %s" #: lib/statsample/regression/multiple/baseengine.rb:185 msgid "Cases(listwise)=%d(%d)" msgstr "Casos (sólo válidos)=%d(%d)" #: lib/statsample/regression/multiple/baseengine.rb:186 msgid "R=" msgstr "R=" #: lib/statsample/regression/multiple/baseengine.rb:187 msgid "R^2=" msgstr "R^2=" #: lib/statsample/regression/multiple/baseengine.rb:188 msgid "R^2 Adj=" msgstr "R^2 Adj=" #: lib/statsample/regression/multiple/baseengine.rb:189 msgid "Std.Error R=" msgstr "Error estándar R=" #: lib/statsample/regression/multiple/baseengine.rb:191 msgid "Equation" msgstr "Ecuación" #: lib/statsample/regression/multiple/baseengine.rb:197 msgid "Beta coefficients" msgstr "Coeficientes beta" #: lib/statsample/regression/multiple/baseengine.rb:198 msgid "Constant" msgstr "Constante" #: lib/statsample/regression/multiple/matrixengine.rb:78 msgid "Multiple reggresion of %s on %s" msgstr "Regresión Múltiple de %s en %s" #: lib/statsample/regression/simple.rb:88 msgid "Regression of %s over %s" msgstr "Regresión de %s sobre %s" #: lib/statsample/regression/simple.rb:109 #: lib/statsample/factor/map.rb:105 #: 
lib/statsample/reliability/skillscaleanalysis.rb:92 msgid "Value" msgstr "Valor" #: lib/statsample/regression/simple.rb:110 msgid "r" msgstr "r" #: lib/statsample/regression/simple.rb:111 msgid "r^2" msgstr "r^2" #: lib/statsample/regression/simple.rb:112 msgid "a" msgstr "a" #: lib/statsample/regression/simple.rb:113 msgid "b" msgstr "b" #: lib/statsample/regression/simple.rb:114 msgid "s.e" msgstr "e.e." #: lib/statsample/dominanceanalysis/bootstrap.rb:115 msgid "Bootstrap dominance Analysis: %s over %s" msgstr "Resultados del Análisis de Dominancia Bootstrap: %s en %s" #: lib/statsample/dominanceanalysis/bootstrap.rb:138 msgid "Bootstrap %d of %d" msgstr "Bootstrap: %d de %d" #: lib/statsample/dominanceanalysis/bootstrap.rb:177 msgid "Sample size: %d\n" msgstr "Tamaño de muestra: %d\n" #: lib/statsample/dominanceanalysis/bootstrap.rb:179 msgid "Linear Regression Engine: %s" msgstr "Motor de Regresión Linear: %s" #: lib/statsample/dominanceanalysis/bootstrap.rb:181 msgid "pairs" msgstr "pares" #: lib/statsample/dominanceanalysis/bootstrap.rb:181 msgid "SE(Dij)" msgstr "EE(Dij)" #: lib/statsample/dominanceanalysis/bootstrap.rb:181 msgid "Reproducibility" msgstr "Reproducibilidad" #: lib/statsample/dominanceanalysis/bootstrap.rb:182 msgid "Complete dominance" msgstr "Dominancia Completa" #: lib/statsample/dominanceanalysis/bootstrap.rb:190 msgid "Conditional dominance" msgstr "Dominancia Condicional" #: lib/statsample/dominanceanalysis/bootstrap.rb:199 msgid "General Dominance" msgstr "Dominancia General" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "General averages" msgstr "Promedios generales" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "var" msgstr "var" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "se" msgstr "de" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "p.5" msgstr "p.5" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "p.95" msgstr "p.95" #: lib/statsample/anova/twoway.rb:59 msgid "ANOVA 
Two-Way" msgstr "Anova de dos vías" #: lib/statsample/anova/twoway.rb:60 msgid "A" msgstr "A" #: lib/statsample/anova/twoway.rb:61 msgid "B" msgstr "B" #: lib/statsample/anova/twoway.rb:62 msgid "Within" msgstr "Dentro" #: lib/statsample/anova/twoway.rb:98 #: lib/statsample/anova/oneway.rb:57 msgid "%s Table" msgstr "Tabla %s" #: lib/statsample/anova/twoway.rb:103 #: lib/statsample/anova/oneway.rb:60 #: lib/statsample/crosstab.rb:101 #: lib/statsample/crosstab.rb:116 #: lib/statsample/crosstab.rb:151 #: lib/statsample/crosstab.rb:173 #: lib/statsample/dominanceanalysis.rb:354 msgid "Total" msgstr "Total" #: lib/statsample/anova/twoway.rb:172 msgid "Anova Two-Way on %s" msgstr "Anova de dos vías en %s" #: lib/statsample/anova/twoway.rb:184 #: lib/statsample/anova/oneway.rb:127 msgid "Test of Homogeneity of variances (Levene)" msgstr "Test de homogeneidad de varianza (Levene)" #: lib/statsample/anova/twoway.rb:189 #: lib/statsample/anova/twoway.rb:193 msgid "%s Mean" msgstr "Promedio %s" #: lib/statsample/anova/oneway.rb:35 msgid "Explained variance" msgstr "Varianza explicada" #: lib/statsample/anova/oneway.rb:36 msgid "Unexplained variance" msgstr "Varianza sin explicar" #: lib/statsample/anova/oneway.rb:97 msgid "Anova One-Way" msgstr "Anova de una vía" #: lib/statsample/anova/oneway.rb:98 msgid "Between Groups" msgstr "Entre grupos" #: lib/statsample/anova/oneway.rb:99 msgid "Within Groups" msgstr "Dentro de grupos" #: lib/statsample/anova/oneway.rb:119 msgid "Contrast for %s" msgstr "Contraste para %s" #: lib/statsample/anova/oneway.rb:163 msgid "Descriptives" msgstr "Descriptivos" #: lib/statsample/anova/contrast.rb:13 msgid "Psi estimate" msgstr "Psi Estimado" #: lib/statsample/anova/contrast.rb:14 msgid "Contrast" msgstr "Contraste" #: lib/statsample/anova/contrast.rb:73 msgid "Contrast:%s" msgstr "Contraste: %s" #: lib/statsample/graph/scatterplot.rb:72 msgid "Scatterplot (%s - %s)" msgstr "Diagrama de dispersión (%s - %s)" #: 
lib/statsample/graph/histogram.rb:50 msgid "Histograma (%s)" msgstr "Histograma (%s)" #: lib/statsample/graph/boxplot.rb:63 msgid "Boxplot" msgstr "Diagrama de caja" #: lib/statsample/bivariate/pearson.rb:32 msgid "Correlation (%s - %s)" msgstr "Correlación (%s - %s)" #: lib/statsample/bivariate/pearson.rb:50 msgid "%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s tails)" msgstr "%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s colas)" #: lib/statsample/factor/parallelanalysis.rb:68 msgid "Parallel Analysis" msgstr "Análisis Paralelo" #: lib/statsample/factor/parallelanalysis.rb:96 msgid "Bootstrap Method: %s" msgstr "Método de Remuestreo: %s" #: lib/statsample/factor/parallelanalysis.rb:97 msgid "Uses SMC: %s" msgstr "Usa SMC: %s" #: lib/statsample/factor/parallelanalysis.rb:97 msgid "Yes" msgstr "Sí" #: lib/statsample/factor/parallelanalysis.rb:97 msgid "No" msgstr "No" #: lib/statsample/factor/parallelanalysis.rb:98 msgid "Correlation Matrix type : %s" msgstr "Tipo de matriz de correlacion : %s" #: lib/statsample/factor/parallelanalysis.rb:99 msgid "Number of variables: %d" msgstr "Número de variables: %d" #: lib/statsample/factor/parallelanalysis.rb:100 msgid "Number of cases: %d" msgstr "Número de casos: %d" #: lib/statsample/factor/parallelanalysis.rb:101 msgid "Number of iterations: %d" msgstr "Número de iteraciones: %d" #: lib/statsample/factor/parallelanalysis.rb:103 #: lib/statsample/factor/parallelanalysis.rb:111 #: lib/statsample/factor/map.rb:105 msgid "Eigenvalues" msgstr "Eigenvalues" #: lib/statsample/factor/parallelanalysis.rb:103 #: lib/statsample/factor/parallelanalysis.rb:111 msgid "generated eigenvalue" msgstr "eigenvalue generado" #: lib/statsample/factor/parallelanalysis.rb:110 msgid "Number or factors to preserve: %d" msgstr "Número de factores a preservar: %d" #: lib/statsample/factor/parallelanalysis.rb:111 msgid "data eigenvalue" msgstr "eigenvalue de los datos" #: lib/statsample/factor/parallelanalysis.rb:111 msgid "preserve?" 
msgstr "¿preservar?" #: lib/statsample/factor/map.rb:60 msgid "Velicer's MAP" msgstr "PPM de Velicer" #: lib/statsample/factor/map.rb:110 msgid "Velicer's Average Squared Correlations" msgstr "Correlaciones Cuadradas Promedio de Velicer " #: lib/statsample/factor/map.rb:110 msgid "number of components" msgstr "número de componentes" #: lib/statsample/factor/map.rb:110 msgid "average square correlation" msgstr "correlación cuadrada promedio" #: lib/statsample/factor/map.rb:115 msgid "The smallest average squared correlation is : %0.6f" msgstr "La correlación cuadrada promedio más pequeña es: %0.6f" #: lib/statsample/factor/map.rb:116 msgid "The number of components is : %d" msgstr "El número de componentes es: %d" #: lib/statsample/factor/pca.rb:56 msgid "Principal Component Analysis" msgstr "Análisis de componentes principales" #: lib/statsample/factor/pca.rb:59 #: lib/statsample/matrix.rb:14 #: lib/statsample/matrix.rb:81 msgid "VAR_%d" msgstr "VAR_%d" #: lib/statsample/factor/pca.rb:160 msgid "Component matrix (from covariance)" msgstr "Matriz de componentes (desde covarianza)" #: lib/statsample/factor/pca.rb:181 msgid "Component matrix" msgstr "Matriz de componentes" #: lib/statsample/factor/pca.rb:215 #: lib/statsample/factor/principalaxis.rb:200 msgid "Number of factors: %d" msgstr "Número de factores: %d" #: lib/statsample/factor/pca.rb:216 #: lib/statsample/factor/principalaxis.rb:202 msgid "Communalities" msgstr "Comunalidades" #: lib/statsample/factor/pca.rb:216 #: lib/statsample/factor/principalaxis.rb:202 msgid "Initial" msgstr "Inicial" #: lib/statsample/factor/pca.rb:216 #: lib/statsample/factor/principalaxis.rb:202 msgid "Extraction" msgstr "Extracción" #: lib/statsample/factor/pca.rb:216 #: lib/statsample/factor/pca.rb:223 #: lib/statsample/reliability/skillscaleanalysis.rb:92 msgid "%" msgstr "%" #: lib/statsample/factor/pca.rb:223 msgid "Total Variance Explained" msgstr "Varianza Total Explicada" #: lib/statsample/factor/pca.rb:223 msgid 
"Component" msgstr "Componente" #: lib/statsample/factor/pca.rb:223 msgid "E.Total" msgstr "E. Total" #: lib/statsample/factor/pca.rb:223 msgid "Cum. %" msgstr "% Acum." #: lib/statsample/factor/pca.rb:227 msgid "Component %d" msgstr "Componente %d" #: lib/statsample/factor/principalaxis.rb:70 msgid "Variable %d" msgstr "Variable %d" #: lib/statsample/factor/principalaxis.rb:147 msgid "Factor Matrix" msgstr "Matriz de Factores" #: lib/statsample/factor/principalaxis.rb:201 msgid "Iterations: %d" msgstr "Iteraciones: %d" #: lib/statsample/factor/principalaxis.rb:207 msgid "Total Variance" msgstr "Varianza Total" #: lib/statsample/factor/principalaxis.rb:207 msgid "Factor" msgstr "Factor" #: lib/statsample/factor/principalaxis.rb:207 msgid "I.E.Total" msgstr "E.I. Total" #: lib/statsample/factor/principalaxis.rb:207 msgid "I.E. %" msgstr "E.I. %" #: lib/statsample/factor/principalaxis.rb:207 msgid "I.E.Cum. %" msgstr "E.I. Acum. %" #: lib/statsample/factor/principalaxis.rb:208 msgid "S.L.Total" msgstr "C.C. Total" #: lib/statsample/factor/principalaxis.rb:208 msgid "S.L. %" msgstr "C.C. %" #: lib/statsample/factor/principalaxis.rb:208 msgid "S.L.Cum. %" msgstr "C.C. 
Acum %" #: lib/statsample/factor/principalaxis.rb:215 msgid "Factor %d" msgstr "Factor %d" #: lib/statsample/factor/rotation.rb:35 msgid "%s rotation" msgstr "rotación %s" #: lib/statsample/factor/rotation.rb:132 msgid "Rotated Component matrix" msgstr "Matriz de componentes rotada" #: lib/statsample/factor/rotation.rb:149 msgid "Component transformation matrix" msgstr "Matriz de transformación de componentes" #: lib/statsample/reliability/multiscaleanalysis.rb:67 msgid "Multiple Scale analysis" msgstr "Análisis de múltiples escalas" #: lib/statsample/reliability/multiscaleanalysis.rb:97 msgid "Scale %s" msgstr "Escala %s" #: lib/statsample/reliability/multiscaleanalysis.rb:145 msgid "Reliability analysis of scales" msgstr "Análisis de confiabilidad de escalas" #: lib/statsample/reliability/multiscaleanalysis.rb:151 msgid "Correlation matrix for %s" msgstr "Matriz de correlaciones para %s" #: lib/statsample/reliability/multiscaleanalysis.rb:156 msgid "PCA for %s" msgstr "ACP para %s" #: lib/statsample/reliability/multiscaleanalysis.rb:161 msgid "Principal Axis for %s" msgstr "Ejes principales para %s" #: lib/statsample/reliability/multiscaleanalysis.rb:167 msgid "Parallel Analysis for %s" msgstr "Análisis Paralelo para %s" #: lib/statsample/reliability/multiscaleanalysis.rb:172 msgid "MAP for %s" msgstr "MAP para %s" #: lib/statsample/reliability/skillscaleanalysis.rb:21 msgid "Skill Scale Reliability Analysis (%s)" msgstr "Análisis de confiabilidad de escalas de habilidad" #: lib/statsample/reliability/skillscaleanalysis.rb:36 msgid "%s(corrected)" msgstr "%s(corregido)" #: lib/statsample/reliability/skillscaleanalysis.rb:40 msgid "Corrected dataset from %s" msgstr "Grupo de datos corregido desde %s" #: lib/statsample/reliability/skillscaleanalysis.rb:51 msgid "%s (Scale Analysis)" msgstr "%s (Análisis de Escala)" #: lib/statsample/reliability/skillscaleanalysis.rb:82 msgid "Problematic Items" msgstr "Ítems problemáticos" #: 
lib/statsample/reliability/skillscaleanalysis.rb:87 msgid "Item: %s" msgstr "Ítem: %s" #: lib/statsample/reliability/skillscaleanalysis.rb:88 msgid "Correct answer: %s" msgstr "Respuesta correcta: %s" #: lib/statsample/reliability/skillscaleanalysis.rb:89 msgid "p: %0.3f" msgstr "p: %0.3f" #: lib/statsample/reliability/skillscaleanalysis.rb:101 msgid "No problematic items" msgstr "Sin ítems problemáticos" #: lib/statsample/reliability/scaleanalysis.rb:44 msgid "Reliability Analisis" msgstr "Análisis de confiabilidad" #: lib/statsample/reliability/scaleanalysis.rb:157 msgid "Summary for %s with all items" msgstr "Sumario para %s con todos los ítems" #: lib/statsample/reliability/scaleanalysis.rb:158 msgid "Items" msgstr "Ítems" #: lib/statsample/reliability/scaleanalysis.rb:159 #: lib/statsample/reliability/scaleanalysis.rb:176 msgid "Sum mean" msgstr "Promedio de suma" #: lib/statsample/reliability/scaleanalysis.rb:160 msgid "S.d. mean" msgstr "Promedio de d.e." #: lib/statsample/reliability/scaleanalysis.rb:162 msgid "Deleted items" msgstr "Ítems eliminados" #: lib/statsample/reliability/scaleanalysis.rb:172 msgid "Summary for %s" msgstr "Sumario para %s" #: lib/statsample/reliability/scaleanalysis.rb:173 msgid "Valid Items" msgstr "Ítems Válidos" #: lib/statsample/reliability/scaleanalysis.rb:175 msgid "Valid cases" msgstr "casos válidos" #: lib/statsample/reliability/scaleanalysis.rb:177 msgid "Sum sd" msgstr "d.e. 
de suma" #: lib/statsample/reliability/scaleanalysis.rb:179 msgid "Sum median" msgstr "Mediana de suma" #: lib/statsample/reliability/scaleanalysis.rb:181 msgid "Item mean" msgstr "Promedio de los ítems" #: lib/statsample/reliability/scaleanalysis.rb:182 msgid "Item sd" msgstr "d.e. de los ítems" #: lib/statsample/reliability/scaleanalysis.rb:184 msgid "Skewness" msgstr "Sesgo" #: lib/statsample/reliability/scaleanalysis.rb:185 msgid "Kurtosis" msgstr "Curtosis" #: lib/statsample/reliability/scaleanalysis.rb:187 msgid "Cronbach's alpha" msgstr "Alfa de Cronbach" #: lib/statsample/reliability/scaleanalysis.rb:188 msgid "Standarized Cronbach's alpha" msgstr "Alfa de Cronbach estandarizado" #: lib/statsample/reliability/scaleanalysis.rb:189 msgid "Mean rpb" msgstr "rpb medio" #: lib/statsample/reliability/scaleanalysis.rb:191 msgid "Variances mean" msgstr "Promedio de las varianzas" #: lib/statsample/reliability/scaleanalysis.rb:192 msgid "Covariances mean" msgstr "Promedio de las covarianzas" #: lib/statsample/reliability/scaleanalysis.rb:196 msgid "Items for obtain alpha(0.8) : %d" msgstr "Ítems para obtener alfa(0,8): %d" #: lib/statsample/reliability/scaleanalysis.rb:197 msgid "Items for obtain alpha(0.9) : %d" msgstr "Ítems para obtener alfa(0,9): %d" #: lib/statsample/reliability/scaleanalysis.rb:205 msgid "Items report for %s" msgstr "Reporte de ítems para %s" #: lib/statsample/reliability/icc.rb:114 msgid "Shrout & Fleiss ICC(1,1)" msgstr "Shrout & Fleiss ICC(1,1)" #: lib/statsample/reliability/icc.rb:119 msgid "Shrout & Fleiss ICC(2,1)" msgstr "Shrout & Fleiss ICC(2,1)" #: lib/statsample/reliability/icc.rb:125 msgid "Shrout & Fleiss ICC(3,1)" msgstr "Shrout & Fleiss ICC(3,1)" #: lib/statsample/reliability/icc.rb:132 msgid "Shrout & Fleiss ICC(1,k)" msgstr "Shrout & Fleiss ICC(1,k)" #: lib/statsample/reliability/icc.rb:138 msgid "Shrout & Fleiss ICC(2,k)" msgstr "Shrout & Fleiss ICC(2,k)" #: lib/statsample/reliability/icc.rb:145 msgid "Shrout & Fleiss ICC(3,k)"
msgstr "Shrout & Fleiss ICC(3,k)" #: lib/statsample/reliability/icc.rb:153 msgid "McGraw & Wong ICC(1)" msgstr "McGraw & Wong ICC(1)" #: lib/statsample/reliability/icc.rb:159 msgid "McGraw & Wong ICC(K)" msgstr "McGraw & Wong ICC(K)" #: lib/statsample/reliability/icc.rb:165 msgid "McGraw & Wong ICC(C,1)" msgstr "McGraw & Wong ICC(C,1)" #: lib/statsample/reliability/icc.rb:172 msgid "McGraw & Wong ICC(C,K)" msgstr "McGraw & Wong ICC(C,K)" #: lib/statsample/reliability/icc.rb:179 msgid "McGraw & Wong ICC(A,1)" msgstr "McGraw & Wong ICC(A,1)" #: lib/statsample/reliability/icc.rb:186 msgid "McGraw & Wong ICC(A,K)" msgstr "McGraw & Wong ICC(A,K)" #: lib/statsample/reliability/icc.rb:408 msgid "ICC: %0.4f" msgstr "CIC: %0.4f" #: lib/statsample/reliability/icc.rb:410 msgid "CI (%0.2f): [%0.4f - %0.4f]" msgstr "IC (%0.2f): [%0.4f - %0.4f]" #: lib/statsample/crosstab.rb:22 msgid "Crosstab %s - %s" msgstr "Tabulación cruzada %s - %s" #: lib/statsample/crosstab.rb:98 msgid "Rows: %s" msgstr "Filas: %s" #: lib/statsample/crosstab.rb:99 msgid "Columns: %s" msgstr "Columnas: %s" #: lib/statsample/crosstab.rb:101 msgid "Raw" msgstr "En Bruto" #: lib/statsample/crosstab.rb:146 msgid "% Row" msgstr "% Fila" #: lib/statsample/crosstab.rb:147 msgid "% Column" msgstr "% Columna" #: lib/statsample/crosstab.rb:148 msgid "% Total" msgstr "% Total" #: lib/statsample/dominanceanalysis.rb:121 msgid "Dominance Analysis: %s over %s" msgstr "Análisis de dominancia: %s en %s" #: lib/statsample/dominanceanalysis.rb:315 msgid "sign" msgstr "signo" #: lib/statsample/dominanceanalysis.rb:317 msgid "Dominance Analysis result" msgstr "Resultados del análisis de dominancia" #: lib/statsample/dominanceanalysis.rb:318 msgid "Model 0" msgstr "Modelo 0" #: lib/statsample/dominanceanalysis.rb:333 msgid "k=%d Average" msgstr "k=%d Promedio" #: lib/statsample/dominanceanalysis.rb:345 msgid "Overall averages" msgstr "Promedios generales" #: lib/statsample/dominanceanalysis.rb:354 msgid "Pairwise dominance"
msgstr "Dominancia en pares" #: lib/statsample/dominanceanalysis.rb:354 msgid "Pairs" msgstr "Pares" #: lib/statsample/dominanceanalysis.rb:354 msgid "Conditional" msgstr "Condicional" #: lib/statsample/dominanceanalysis.rb:354 msgid "General" msgstr "General" #: lib/statsample/matrix.rb:181 msgid "X%d" msgstr "X%d" #: lib/statsample/matrix.rb:184 msgid "Y%d" msgstr "Y%d" #: lib/statsample/matrix.rb:196 msgid "Matrix %d" msgstr "Matriz %d" #: lib/statsample/matrix.rb:255 msgid "Covariate matrix %d" msgstr "Matriz de Covarianza %d" #: lib/statsample/matrix.rb:303 msgid "Correlation" msgstr "Correlación" #: lib/statsample/matrix.rb:303 msgid "Covariance" msgstr "Covarianza" #: lib/statsample/matrix.rb:303 msgid " Matrix" msgstr "Matriz" #: lib/statsample/vector.rb:177 msgid "%s(standarized)" msgstr "%s(estandarizado)" #: lib/statsample/vector.rb:189 msgid "%s(centered)" msgstr "%s(centrado)" #: lib/statsample/vector.rb:201 msgid "%s(percentil)" msgstr "%s(percentil)" #: lib/statsample/vector.rb:778 msgid "n :%d" msgstr "n: %d" #: lib/statsample/vector.rb:779 msgid "n valid:%d" msgstr "n válido: %d" #: lib/statsample/vector.rb:780 msgid "factors:%s" msgstr "factores:%s" #: lib/statsample/vector.rb:781 msgid "mode: %s" msgstr "moda: %s" #: lib/statsample/vector.rb:782 msgid "Distribution" msgstr "Distribución" #: lib/statsample/vector.rb:788 msgid "median: %s" msgstr "Mediana: %s" #: lib/statsample/vector.rb:790 msgid "mean: %0.4f" msgstr "promedio: %0.4f" #: lib/statsample/vector.rb:791 msgid "sd: %0.4f" msgstr "d.e.: %0.4f" #: lib/statsample/dataset.rb:161 msgid "Dataset %d" msgstr "Dataset %d" #: lib/statsample/dataset.rb:457 msgid "Sum from %s" msgstr "Suma para %s" #: lib/statsample/dataset.rb:510 msgid "Means from %s" msgstr "Media desde %s" #: lib/statsample/dataset.rb:734 msgid "%s(filtered)" msgstr "%s(filtrado)" #: lib/statsample/dataset.rb:956 msgid "Cases: %d" msgstr "Casos: %d" ================================================ FILE: po/statsample.pot
================================================ # Statsample po template. # Copyright (C) 2009-2009 Claudio Bustos # This file is distributed under the same license as the Statsample package. # Claudio Bustos # #, fuzzy msgid "" msgstr "" "Project-Id-Version: statsample 1.0.1\n" "POT-Creation-Date: 2011-03-03 12:03-0300\n" "PO-Revision-Date: 2009-08-04 15:36-0400\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n" #: lib/statsample/test/f.rb:26 msgid "F Test" msgstr "" #: lib/statsample/test/t.rb:82 msgid "T Test" msgstr "" #: lib/statsample/test/t.rb:83 msgid "Estimate" msgstr "" #: lib/statsample/test/t.rb:84 msgid "Std.Err.of Estimate" msgstr "" #: lib/statsample/test/t.rb:114 msgid "%s: %0.4f | %s: %0.4f" msgstr "" #: lib/statsample/test/t.rb:120 msgid "t(%d) = %0.4f, p=%0.4f (%s tails)" msgstr "" #: lib/statsample/test/t.rb:121 msgid "CI(%d%%): %0.4f - %0.4f" msgstr "" #: lib/statsample/test/t.rb:190 msgid "Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f" msgstr "" #: lib/statsample/test/t.rb:191 msgid "Population mean: %0.4f" msgstr "" #: lib/statsample/test/t.rb:292 msgid "Mean and standard deviation" msgstr "" #: lib/statsample/test/t.rb:292 lib/statsample/regression/simple.rb:109 #: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202 msgid "Variable" msgstr "" #: lib/statsample/test/t.rb:292 #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "mean" msgstr "" #: lib/statsample/test/t.rb:292 msgid "sd" msgstr "" #: lib/statsample/test/t.rb:292 lib/statsample/factor/parallelanalysis.rb:103 #: lib/statsample/factor/parallelanalysis.rb:111 msgid "n" msgstr "" #: lib/statsample/test/t.rb:296 msgid "Levene test for equality of variances" msgstr "" #: lib/statsample/test/t.rb:298 msgid "T statistics" msgstr "" #: lib/statsample/test/t.rb:299 msgid "Equal 
variance" msgstr "" #: lib/statsample/test/t.rb:300 msgid "Non equal variance" msgstr "" #: lib/statsample/test/t.rb:302 msgid "Effect size" msgstr "" #: lib/statsample/test/umannwhitney.rb:140 msgid "Mann-Whitney's U" msgstr "" #: lib/statsample/test/umannwhitney.rb:149 msgid "%s results" msgstr "" #: lib/statsample/test/umannwhitney.rb:150 #: lib/statsample/test/umannwhitney.rb:151 msgid "Sum of ranks %s" msgstr "" #: lib/statsample/test/umannwhitney.rb:152 msgid "U Value" msgstr "" #: lib/statsample/test/umannwhitney.rb:153 msgid "Z" msgstr "" #: lib/statsample/test/umannwhitney.rb:155 msgid "Exact p (Dinneen & Blakesley, 1973):" msgstr "" #: lib/statsample/test/levene.rb:37 msgid "Levene Test" msgstr "" #: lib/statsample/test/bartlettsphericity.rb:25 msgid "Bartlett's test of sphericity" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:27 msgid "Multiple Regression: %s over %s" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:40 msgid "Regression" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:40 msgid "Error" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:184 msgid "Engine: %s" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:185 msgid "Cases(listwise)=%d(%d)" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:186 msgid "R=" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:187 msgid "R^2=" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:188 msgid "R^2 Adj=" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:189 msgid "Std.Error R=" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:191 msgid "Equation" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:197 msgid "Beta coefficients" msgstr "" #: lib/statsample/regression/multiple/baseengine.rb:198 msgid "Constant" msgstr "" #: lib/statsample/regression/multiple/matrixengine.rb:78 msgid "Multiple reggresion of %s on %s" msgstr "" #: lib/statsample/regression/simple.rb:88 
msgid "Regression of %s over %s" msgstr "" #: lib/statsample/regression/simple.rb:109 lib/statsample/factor/map.rb:105 #: lib/statsample/reliability/skillscaleanalysis.rb:92 msgid "Value" msgstr "" #: lib/statsample/regression/simple.rb:110 msgid "r" msgstr "" #: lib/statsample/regression/simple.rb:111 msgid "r^2" msgstr "" #: lib/statsample/regression/simple.rb:112 msgid "a" msgstr "" #: lib/statsample/regression/simple.rb:113 msgid "b" msgstr "" #: lib/statsample/regression/simple.rb:114 msgid "s.e" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:115 msgid "Bootstrap dominance Analysis: %s over %s" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:138 msgid "Bootstrap %d of %d" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:177 msgid "Sample size: %d\n" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:179 msgid "Linear Regression Engine: %s" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:181 msgid "pairs" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:181 msgid "SE(Dij)" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:181 msgid "Reproducibility" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:182 msgid "Complete dominance" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:190 msgid "Conditional dominance" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:199 msgid "General Dominance" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "General averages" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "var" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "se" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "p.5" msgstr "" #: lib/statsample/dominanceanalysis/bootstrap.rb:208 msgid "p.95" msgstr "" #: lib/statsample/anova/twoway.rb:59 msgid "ANOVA Two-Way" msgstr "" #: lib/statsample/anova/twoway.rb:60 msgid "A" msgstr "" #: lib/statsample/anova/twoway.rb:61 msgid "B" msgstr "" #: 
lib/statsample/anova/twoway.rb:62 msgid "Within" msgstr "" #: lib/statsample/anova/twoway.rb:98 lib/statsample/anova/oneway.rb:57 msgid "%s Table" msgstr "" #: lib/statsample/anova/twoway.rb:103 lib/statsample/anova/oneway.rb:60 #: lib/statsample/crosstab.rb:101 lib/statsample/crosstab.rb:116 #: lib/statsample/crosstab.rb:151 lib/statsample/crosstab.rb:173 #: lib/statsample/dominanceanalysis.rb:354 msgid "Total" msgstr "" #: lib/statsample/anova/twoway.rb:172 msgid "Anova Two-Way on %s" msgstr "" #: lib/statsample/anova/twoway.rb:184 lib/statsample/anova/oneway.rb:127 msgid "Test of Homogeneity of variances (Levene)" msgstr "" #: lib/statsample/anova/twoway.rb:189 lib/statsample/anova/twoway.rb:193 msgid "%s Mean" msgstr "" #: lib/statsample/anova/oneway.rb:35 msgid "Explained variance" msgstr "" #: lib/statsample/anova/oneway.rb:36 msgid "Unexplained variance" msgstr "" #: lib/statsample/anova/oneway.rb:97 msgid "Anova One-Way" msgstr "" #: lib/statsample/anova/oneway.rb:98 msgid "Between Groups" msgstr "" #: lib/statsample/anova/oneway.rb:99 msgid "Within Groups" msgstr "" #: lib/statsample/anova/oneway.rb:119 msgid "Contrast for %s" msgstr "" #: lib/statsample/anova/oneway.rb:163 msgid "Descriptives" msgstr "" #: lib/statsample/anova/contrast.rb:13 msgid "Psi estimate" msgstr "" #: lib/statsample/anova/contrast.rb:14 msgid "Contrast" msgstr "" #: lib/statsample/anova/contrast.rb:73 msgid "Contrast:%s" msgstr "" #: lib/statsample/graph/scatterplot.rb:72 msgid "Scatterplot (%s - %s)" msgstr "" #: lib/statsample/graph/histogram.rb:50 msgid "Histograma (%s)" msgstr "" #: lib/statsample/graph/boxplot.rb:63 msgid "Boxplot" msgstr "" #: lib/statsample/bivariate/pearson.rb:32 msgid "Correlation (%s - %s)" msgstr "" #: lib/statsample/bivariate/pearson.rb:50 msgid "%s : r=%0.3f (t:%0.3f, g.l.=%d, p:%0.3f / %s tails)" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:68 msgid "Parallel Analysis" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:96 msgid 
"Bootstrap Method: %s" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:97 msgid "Uses SMC: %s" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:97 msgid "Yes" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:97 msgid "No" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:98 msgid "Correlation Matrix type : %s" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:99 msgid "Number of variables: %d" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:100 msgid "Number of cases: %d" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:101 msgid "Number of iterations: %d" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:103 #: lib/statsample/factor/parallelanalysis.rb:111 #: lib/statsample/factor/map.rb:105 msgid "Eigenvalues" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:103 #: lib/statsample/factor/parallelanalysis.rb:111 msgid "generated eigenvalue" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:110 msgid "Number or factors to preserve: %d" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:111 msgid "data eigenvalue" msgstr "" #: lib/statsample/factor/parallelanalysis.rb:111 msgid "preserve?" 
msgstr "" #: lib/statsample/factor/map.rb:60 msgid "Velicer's MAP" msgstr "" #: lib/statsample/factor/map.rb:110 msgid "Velicer's Average Squared Correlations" msgstr "" #: lib/statsample/factor/map.rb:110 msgid "number of components" msgstr "" #: lib/statsample/factor/map.rb:110 msgid "average square correlation" msgstr "" #: lib/statsample/factor/map.rb:115 msgid "The smallest average squared correlation is : %0.6f" msgstr "" #: lib/statsample/factor/map.rb:116 msgid "The number of components is : %d" msgstr "" #: lib/statsample/factor/pca.rb:56 msgid "Principal Component Analysis" msgstr "" #: lib/statsample/factor/pca.rb:59 lib/statsample/matrix.rb:14 #: lib/statsample/matrix.rb:81 msgid "VAR_%d" msgstr "" #: lib/statsample/factor/pca.rb:160 msgid "Component matrix (from covariance)" msgstr "" #: lib/statsample/factor/pca.rb:181 msgid "Component matrix" msgstr "" #: lib/statsample/factor/pca.rb:215 lib/statsample/factor/principalaxis.rb:200 msgid "Number of factors: %d" msgstr "" #: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202 msgid "Communalities" msgstr "" #: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202 msgid "Initial" msgstr "" #: lib/statsample/factor/pca.rb:216 lib/statsample/factor/principalaxis.rb:202 msgid "Extraction" msgstr "" #: lib/statsample/factor/pca.rb:216 lib/statsample/factor/pca.rb:223 #: lib/statsample/reliability/skillscaleanalysis.rb:92 msgid "%" msgstr "" #: lib/statsample/factor/pca.rb:223 msgid "Total Variance Explained" msgstr "" #: lib/statsample/factor/pca.rb:223 msgid "Component" msgstr "" #: lib/statsample/factor/pca.rb:223 msgid "E.Total" msgstr "" #: lib/statsample/factor/pca.rb:223 msgid "Cum. 
%" msgstr "" #: lib/statsample/factor/pca.rb:227 msgid "Component %d" msgstr "" #: lib/statsample/factor/principalaxis.rb:70 msgid "Variable %d" msgstr "" #: lib/statsample/factor/principalaxis.rb:147 msgid "Factor Matrix" msgstr "" #: lib/statsample/factor/principalaxis.rb:201 msgid "Iterations: %d" msgstr "" #: lib/statsample/factor/principalaxis.rb:207 msgid "Total Variance" msgstr "" #: lib/statsample/factor/principalaxis.rb:207 msgid "Factor" msgstr "" #: lib/statsample/factor/principalaxis.rb:207 msgid "I.E.Total" msgstr "" #: lib/statsample/factor/principalaxis.rb:207 msgid "I.E. %" msgstr "" #: lib/statsample/factor/principalaxis.rb:207 msgid "I.E.Cum. %" msgstr "" #: lib/statsample/factor/principalaxis.rb:208 msgid "S.L.Total" msgstr "" #: lib/statsample/factor/principalaxis.rb:208 msgid "S.L. %" msgstr "" #: lib/statsample/factor/principalaxis.rb:208 msgid "S.L.Cum. %" msgstr "" #: lib/statsample/factor/principalaxis.rb:215 msgid "Factor %d" msgstr "" #: lib/statsample/factor/rotation.rb:35 msgid "%s rotation" msgstr "" #: lib/statsample/factor/rotation.rb:132 msgid "Rotated Component matrix" msgstr "" #: lib/statsample/factor/rotation.rb:149 msgid "Component transformation matrix" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:67 msgid "Multiple Scale analysis" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:97 msgid "Scale %s" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:145 msgid "Reliability analysis of scales" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:151 msgid "Correlation matrix for %s" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:156 msgid "PCA for %s" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:161 msgid "Principal Axis for %s" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:167 msgid "Parallel Analysis for %s" msgstr "" #: lib/statsample/reliability/multiscaleanalysis.rb:172 msgid "MAP for %s" msgstr "" #: 
lib/statsample/reliability/skillscaleanalysis.rb:21 msgid "Skill Scale Reliability Analysis (%s)" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:36 msgid "%s(corrected)" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:40 msgid "Corrected dataset from %s" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:51 msgid "%s (Scale Analysis)" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:82 msgid "Problematic Items" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:87 msgid "Item: %s" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:88 msgid "Correct answer: %s" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:89 msgid "p: %0.3f" msgstr "" #: lib/statsample/reliability/skillscaleanalysis.rb:101 msgid "No problematic items" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:44 msgid "Reliability Analisis" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:157 msgid "Summary for %s with all items" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:158 msgid "Items" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:159 #: lib/statsample/reliability/scaleanalysis.rb:176 msgid "Sum mean" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:160 msgid "S.d. 
mean" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:162 msgid "Deleted items" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:172 msgid "Summary for %s" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:173 msgid "Valid Items" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:175 msgid "Valid cases" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:177 msgid "Sum sd" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:179 msgid "Sum median" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:181 msgid "Item mean" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:182 msgid "Item sd" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:184 msgid "Skewness" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:185 msgid "Kurtosis" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:187 msgid "Cronbach's alpha" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:188 msgid "Standarized Cronbach's alpha" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:189 msgid "Mean rpb" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:191 msgid "Variances mean" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:192 msgid "Covariances mean" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:196 msgid "Items for obtain alpha(0.8) : %d" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:197 msgid "Items for obtain alpha(0.9) : %d" msgstr "" #: lib/statsample/reliability/scaleanalysis.rb:205 msgid "Items report for %s" msgstr "" #: lib/statsample/reliability/icc.rb:114 msgid "Shrout & Fleiss ICC(1,1)" msgstr "" #: lib/statsample/reliability/icc.rb:119 msgid "Shrout & Fleiss ICC(2,1)" msgstr "" #: lib/statsample/reliability/icc.rb:125 msgid "Shrout & Fleiss ICC(3,1)" msgstr "" #: lib/statsample/reliability/icc.rb:132 msgid "Shrout & Fleiss ICC(1,k)" msgstr "" #: lib/statsample/reliability/icc.rb:138 msgid "Shrout & Fleiss ICC(2,k)" msgstr "" #: lib/statsample/reliability/icc.rb:145 
msgid "Shrout & Fleiss ICC(3,k)" msgstr "" #: lib/statsample/reliability/icc.rb:153 msgid "McGraw & Wong ICC(1)" msgstr "" #: lib/statsample/reliability/icc.rb:159 msgid "McGraw & Wong ICC(K)" msgstr "" #: lib/statsample/reliability/icc.rb:165 msgid "McGraw & Wong ICC(C,1)" msgstr "" #: lib/statsample/reliability/icc.rb:172 msgid "McGraw & Wong ICC(C,K)" msgstr "" #: lib/statsample/reliability/icc.rb:179 msgid "McGraw & Wong ICC(A,1)" msgstr "" #: lib/statsample/reliability/icc.rb:186 msgid "McGraw & Wong ICC(A,K)" msgstr "" #: lib/statsample/reliability/icc.rb:408 msgid "ICC: %0.4f" msgstr "" #: lib/statsample/reliability/icc.rb:410 msgid "CI (%0.2f): [%0.4f - %0.4f]" msgstr "" #: lib/statsample/crosstab.rb:22 msgid "Crosstab %s - %s" msgstr "" #: lib/statsample/crosstab.rb:98 msgid "Rows: %s" msgstr "" #: lib/statsample/crosstab.rb:99 msgid "Columns: %s" msgstr "" #: lib/statsample/crosstab.rb:101 msgid "Raw" msgstr "" #: lib/statsample/crosstab.rb:146 msgid "% Row" msgstr "" #: lib/statsample/crosstab.rb:147 msgid "% Column" msgstr "" #: lib/statsample/crosstab.rb:148 msgid "% Total" msgstr "" #: lib/statsample/dominanceanalysis.rb:121 msgid "Dominance Analysis: %s over %s" msgstr "" #: lib/statsample/dominanceanalysis.rb:315 msgid "sign" msgstr "" #: lib/statsample/dominanceanalysis.rb:317 msgid "Dominance Analysis result" msgstr "" #: lib/statsample/dominanceanalysis.rb:318 msgid "Model 0" msgstr "" #: lib/statsample/dominanceanalysis.rb:333 msgid "k=%d Average" msgstr "" #: lib/statsample/dominanceanalysis.rb:345 msgid "Overall averages" msgstr "" #: lib/statsample/dominanceanalysis.rb:354 msgid "Pairwise dominance" msgstr "" #: lib/statsample/dominanceanalysis.rb:354 msgid "Pairs" msgstr "" #: lib/statsample/dominanceanalysis.rb:354 msgid "Conditional" msgstr "" #: lib/statsample/dominanceanalysis.rb:354 msgid "General" msgstr "" #: lib/statsample/matrix.rb:181 msgid "X%d" msgstr "" #: lib/statsample/matrix.rb:184 msgid "Y%d" msgstr "" #: 
lib/statsample/matrix.rb:196 msgid "Matrix %d" msgstr "" #: lib/statsample/matrix.rb:255 msgid "Covariate matrix %d" msgstr "" #: lib/statsample/matrix.rb:303 msgid "Correlation" msgstr "" #: lib/statsample/matrix.rb:303 msgid "Covariance" msgstr "" #: lib/statsample/matrix.rb:303 msgid " Matrix" msgstr "" #: lib/statsample/vector.rb:177 msgid "%s(standarized)" msgstr "" #: lib/statsample/vector.rb:189 msgid "%s(centered)" msgstr "" #: lib/statsample/vector.rb:201 msgid "%s(percentil)" msgstr "" #: lib/statsample/vector.rb:778 msgid "n :%d" msgstr "" #: lib/statsample/vector.rb:779 msgid "n valid:%d" msgstr "" #: lib/statsample/vector.rb:780 msgid "factors:%s" msgstr "" #: lib/statsample/vector.rb:781 msgid "mode: %s" msgstr "" #: lib/statsample/vector.rb:782 msgid "Distribution" msgstr "" #: lib/statsample/vector.rb:788 msgid "median: %s" msgstr "" #: lib/statsample/vector.rb:790 msgid "mean: %0.4f" msgstr "" #: lib/statsample/vector.rb:791 msgid "sd: %0.4f" msgstr "" #: lib/statsample/dataset.rb:161 msgid "Dataset %d" msgstr "" #: lib/statsample/dataset.rb:457 msgid "Sum from %s" msgstr "" #: lib/statsample/dataset.rb:510 msgid "Means from %s" msgstr "" #: lib/statsample/dataset.rb:734 msgid "%s(filtered)" msgstr "" #: lib/statsample/dataset.rb:956 msgid "Cases: %d" msgstr "" ================================================ FILE: references.txt ================================================ References * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. Psychological Methods, 8(2), 129-148. * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. Journal of Educational and Behavioral Statistics, 31(2), 157-180. * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. Psychological Bulletin, 114, 542-551. * Cochran, W.(1972). 
Sampling Techniques [Spanish edition]. * Cohen et al. (2003). Applied Multiple Regression / Correlation Analysis for the Behavioral Sciences * Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann-Whitney U Statistic. Journal of the Royal Statistical Society, 22(2), 269-273 * Dziuban, C., & Shirkey E. (1974). When is a correlation matrix appropriate for factor analysis? Some decision rules. Psychological Bulletin, 81(6), 358-361. * Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. Organizational Research Methods, 7 (2), 191-205. * Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11. * Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro] * Liu, O., & Rijmen, F. (2008). A modified procedure for parallel analysis of ordered categorical data. Behavior Research Methods, 40(2), 556-562. * McGraw, K. & Wong, S.P. (1996). Forming Inferences About Some Intraclass Correlation Coefficients. Psychological Methods, 1(1), 30-46. * O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402. * SPSS Manual * Sawyer, S. (2005). Resampling Data: Using a Statistical Jackknife. * Shrout, P. & Fleiss, J. (1979). Intraclass Correlation: Uses in assessing rater reliability. Psychological Bulletin, 86(2), 420-428 * Smith, L. (2002). A tutorial on Principal Component Analysis.
Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf * http://en.wikipedia.org/wiki/Welch-Satterthwaite_equation * http://europe.isixsigma.com/library/content/c080806a.asp * http://stattrek.com/Lesson6/SRS.aspx * http://talkstats.com/showthread.php?t=5056 * http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html ================================================ FILE: setup.rb ================================================ # # setup.rb # # Copyright (c) 2000-2005 Minero Aoki # # This program is free software. # You can distribute/modify this program under the terms of # the GNU LGPL, Lesser General Public License version 2.1. # unless Enumerable.method_defined?(:map) # Ruby 1.4.6 module Enumerable alias map collect end end unless File.respond_to?(:read) # Ruby 1.6 def File.read(fname) open(fname) {|f| return f.read } end end unless Errno.const_defined?(:ENOTEMPTY) # Windows? module Errno class ENOTEMPTY # We do not raise this exception, implementation is not needed. end end end def File.binread(fname) open(fname, 'rb') {|f| return f.read } end # for corrupted Windows' stat(2) def File.dir?(path) File.directory?((path[-1,1] == '/') ? path : path + '/') end class ConfigTable include Enumerable def initialize(rbconfig) @rbconfig = rbconfig @items = [] @table = {} # options @install_prefix = nil @config_opt = nil @verbose = true @no_harm = false end attr_accessor :install_prefix attr_accessor :config_opt attr_writer :verbose def verbose? @verbose end attr_writer :no_harm def no_harm? 
# Unregisters the config item called +name+.
#
# name:: String name of a registered item (lookup raises via
#        setup_rb_error when it is unknown).
#
# Every entry that refers to the item is dropped from both the ordered
# item list and the lookup table.  Returns the removed item.
def remove(name)
  item = lookup(name)
  @items.delete_if { |entry| entry.name == name }
  # The block parameters must not be called +name+: shadowing the method
  # argument made the test compare each entry with its own key, which
  # deleted every regular entry instead of only the requested one.
  @table.delete_if { |_key, entry| entry.name == name }
  item
end
newpath_p = ((major >= 2) or ((major == 1) and ((minor >= 5) or ((minor == 4) and (teeny >= 4))))) if c['rubylibdir'] # V > 1.6.3 libruby = "#{c['prefix']}/lib/ruby" librubyver = c['rubylibdir'] librubyverarch = c['archdir'] siteruby = c['sitedir'] siterubyver = c['sitelibdir'] siterubyverarch = c['sitearchdir'] elsif newpath_p # 1.4.4 <= V <= 1.6.3 libruby = "#{c['prefix']}/lib/ruby" librubyver = "#{c['prefix']}/lib/ruby/#{version}" librubyverarch = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}" siteruby = c['sitedir'] siterubyver = "$siteruby/#{version}" siterubyverarch = "$siterubyver/#{c['arch']}" else # V < 1.4.4 libruby = "#{c['prefix']}/lib/ruby" librubyver = "#{c['prefix']}/lib/ruby/#{version}" librubyverarch = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}" siteruby = "#{c['prefix']}/lib/ruby/#{version}/site_ruby" siterubyver = siteruby siterubyverarch = "$siterubyver/#{c['arch']}" end parameterize = lambda {|path| path.sub(/\A#{Regexp.quote(c['prefix'])}/, '$prefix') } if arg = c['configure_args'].split.detect {|arg| /--with-make-prog=/ =~ arg } makeprog = arg.sub(/'/, '').split(/=/, 2)[1] else makeprog = 'make' end [ ExecItem.new('installdirs', 'std/site/home', 'std: install under libruby; site: install under site_ruby; home: install under $HOME')\ {|val, table| case val when 'std' table['rbdir'] = '$librubyver' table['sodir'] = '$librubyverarch' when 'site' table['rbdir'] = '$siterubyver' table['sodir'] = '$siterubyverarch' when 'home' setup_rb_error '$HOME was not set' unless ENV['HOME'] table['prefix'] = ENV['HOME'] table['rbdir'] = '$libdir/ruby' table['sodir'] = '$libdir/ruby' end }, PathItem.new('prefix', 'path', c['prefix'], 'path prefix of target environment'), PathItem.new('bindir', 'path', parameterize.call(c['bindir']), 'the directory for commands'), PathItem.new('libdir', 'path', parameterize.call(c['libdir']), 'the directory for libraries'), PathItem.new('datadir', 'path', parameterize.call(c['datadir']), 'the directory for shared 
data'), PathItem.new('mandir', 'path', parameterize.call(c['mandir']), 'the directory for man pages'), PathItem.new('sysconfdir', 'path', parameterize.call(c['sysconfdir']), 'the directory for system configuration files'), PathItem.new('localstatedir', 'path', parameterize.call(c['localstatedir']), 'the directory for local state data'), PathItem.new('libruby', 'path', libruby, 'the directory for ruby libraries'), PathItem.new('librubyver', 'path', librubyver, 'the directory for standard ruby libraries'), PathItem.new('librubyverarch', 'path', librubyverarch, 'the directory for standard ruby extensions'), PathItem.new('siteruby', 'path', siteruby, 'the directory for version-independent aux ruby libraries'), PathItem.new('siterubyver', 'path', siterubyver, 'the directory for aux ruby libraries'), PathItem.new('siterubyverarch', 'path', siterubyverarch, 'the directory for aux ruby binaries'), PathItem.new('rbdir', 'path', '$siterubyver', 'the directory for ruby scripts'), PathItem.new('sodir', 'path', '$siterubyverarch', 'the directory for ruby extentions'), PathItem.new('rubypath', 'path', rubypath, 'the path to set to #! line'), ProgramItem.new('rubyprog', 'name', rubypath, 'the ruby program using for installation'), ProgramItem.new('makeprog', 'name', makeprog, 'the make program to compile ruby extentions'), SelectItem.new('shebang', 'all/ruby/never', 'ruby', 'shebang line (#!) 
# Alternative option spellings mapped to the canonical config item name.
# fixup registers each alias in the lookup table.
ALIASES = {
  'std-ruby'         => 'librubyver',
  'stdruby'          => 'librubyver',
  'rubylibdir'       => 'librubyver',
  'archdir'          => 'librubyverarch',
  'site-ruby-common' => 'siteruby',     # For backward compatibility
  'site-ruby'        => 'siterubyver',  # For backward compatibility
  'bin-dir'          => 'bindir',
  'rb-dir'           => 'rbdir',
  'so-dir'           => 'sodir',
  'data-dir'         => 'datadir',
  'ruby-path'        => 'rubypath',
  'ruby-prog'        => 'rubyprog',
  'ruby'             => 'rubyprog',
  'make-prog'        => 'makeprog',
  'make'             => 'makeprog'
}.freeze

# Finalizes the table: registers the aliases, freezes the item list and
# the lookup table, and builds the regexp parse_opt uses to recognise
# "--name" / "--name=value" options.
def fixup
  ALIASES.each do |ali, name|
    @table[ali] = @table[name]
  end
  @items.freeze
  @table.freeze
  # Names are escaped so that an item whose name contains a regexp
  # metacharacter (e.g. "c++") is matched literally.
  names = @table.keys.map { |key| Regexp.escape(key) }
  @options_re = /\A--(#{names.join('|')})(?:=(.*))?\z/
end
# Normalizes the argument of a boolean option to 'yes' or 'no'.
#
# val:: nil (option given without "=value") or the String value.
#
# A missing value means the flag was simply switched on.  Accepted
# spellings, case-insensitively: y/yes/t/true and n/no/f/false.  Anything
# else is reported through setup_rb_error.
def check(val)
  return 'yes' unless val
  case val
  when /\Ay(es)?\z/i, /\At(rue)?\z/i then 'yes'
  # "(alse)?" is optional so the short form "f" works like "t", "y", "n".
  when /\An(o)?\z/i, /\Af(alse)?\z/i then 'no'
  else
    setup_rb_error "config: --#{@name} accepts only yes/no for argument"
  end
end
# Validates the value of a package selection option (--with/--without).
#
# val:: String holding a comma separated list of package names, as
#       advertised by the "name,name..." option template.
#
# Each listed name must be a directory under packages/.  The value is
# returned unchanged; a missing value or an unknown package is reported
# through setup_rb_error.
def check(val)
  setup_rb_error "config: --#{@name} requires argument" unless val
  # Check every name on its own: testing the whole string as a single
  # directory rejected any list with more than one package.
  val.split(/,/).each do |pack|
    unless File.dir?("packages/#{pack}")
      setup_rb_error "config: no such package: #{pack}"
    end
  end
  val
end
# Moves +src+ to +dest+, replacing any existing +dest+.
#
# src::  path of the file to move.
# dest:: destination path; removed first (best effort).
#
# A plain rename is tried first.  When the rename fails at the system
# call level the file is copied instead, its mode is carried over and the
# source is unlinked.  Errors from the copy itself propagate.
def move_file(src, dest)
  force_remove_file dest
  begin
    File.rename src, dest
  rescue SystemCallError
    # Read the source before creating the destination, so an unreadable
    # or missing source cannot leave an empty destination file behind.
    content = File.binread(src)
    File.open(dest, 'wb') {|f| f.write content }
    File.chmod File.stat(src).mode, dest
    File.unlink src
  end
end
# Tells whether writing +new_content+ to +path+ would change the file.
#
# new_content:: String with the bytes about to be written.
# path::        path of the file that may already exist.
#
# Returns true when the file does not exist or its bytes differ from
# +new_content+, false when they are identical.
def diff?(new_content, path)
  if File.exist?(path)
    new_content != File.binread(path)
  else
    true
  end
end
# Lists the regular files found in +path+, taken relative to the current
# source directory.
#
# path:: directory relative to curr_srcdir (defaults to '.').
#
# Returns the entry names (not full paths) reported by srcentries that
# are plain files.
def srcfiles(path = '.')
  dir = File.join(curr_srcdir(), path)
  srcentries(path).select { |entry| File.file?(File.join(dir, entry)) }
end
# Loads the Ruby build configuration and returns its CONFIG hash.
#
# When the command line carries --rbconfig=PATH the option is removed
# from ARGV and that file is loaded in place of the interpreter's own
# rbconfig.rb (which is then marked as already required); otherwise the
# standard 'rbconfig' library is required.
#
# Returns the CONFIG hash of the loaded configuration module.
def self.load_rbconfig
  if arg = ARGV.detect {|a| /\A--rbconfig=/ =~ a }
    ARGV.delete(arg)
    load File.expand_path(arg.split(/=/, 2)[1])
    $".push 'rbconfig.rb'
  else
    require 'rbconfig'
  end
  # The top-level ::Config alias no longer exists on current Rubies;
  # RbConfig is the supported name.  Keep ::Config only as a fallback for
  # configuration files that still define the legacy module.
  defined?(::RbConfig) ? ::RbConfig::CONFIG : ::Config::CONFIG
end
# Consumes the options of the "install" task from ARGV.
#
# Resets no_harm and install_prefix on the config, then handles:
#   --no-harm       turn on no_harm (only display what would be done)
#   --prefix=PATH   install path prefix; expanded unless it starts with /
# Any other argument is reported through setup_rb_error.
def parsearg_install
  @config.no_harm = false
  @config.install_prefix = ''
  while (opt = ARGV.shift)
    if opt == '--no-harm'
      @config.no_harm = true
    elsif /\A--prefix=/ =~ opt
      prefix = opt.split(/=/, 2)[1]
      prefix = File.expand_path(prefix) unless prefix[0, 1] == '/'
      @config.install_prefix = prefix
    else
      setup_rb_error "install: unknown option #{opt}"
    end
  end
end
# Replaces the list of packages handled by the multi-package installer.
#
# list:: Array of package names; each must be a directory under
#        "#{@ardir}/packages".
#
# Raises RuntimeError when the list is empty or names a package whose
# directory does not exist.  Stores the list in @packages.
def packages=(list)
  raise 'package list is empty' if list.empty?
  list.each do |pack|
    next if File.dir?("#{@ardir}/packages/#{pack}")
    raise "directory packages/#{pack} does not exist"
  end
  @packages = list
end
# Yields the installer of every selected package, with the process
# working directory set to that package's object directory
# (packages/NAME below the current directory, created on demand).
#
# The starting directory is restored after each package, also when the
# block raises, so a failing task cannot leave the process inside a
# package directory.
def each_selected_installers
  Dir.mkdir 'packages' unless File.dir?('packages')
  @selected.each do |pack|
    $stderr.puts "Processing the package `#{pack}' ..." if verbose?
    Dir.mkdir "packages/#{pack}" unless File.dir?("packages/#{pack}")
    topdir = Dir.pwd
    Dir.chdir "packages/#{pack}"
    begin
      yield @installers[pack]
    ensure
      # Return to the remembered directory rather than a relative
      # '../..', which is wrong if the block changed directory itself.
      Dir.chdir topdir
    end
  end
end
# Runs the given block with verbose output switched off on the config and
# puts the previous verbose setting back afterwards, also when the block
# raises.  Returns the value of the block.
def verbose_off
  previous = @config.verbose?
  @config.verbose = false
  yield
ensure
  @config.verbose = previous
end
# Builds the replacement for an existing shebang line.
#
# old:: the Shebang parsed from the script (responds to cmd and args).
#
# * interpreter basename starts with "ruby": configured rubypath, same args
# * "env ruby ...": configured rubypath, args after "ruby"
# * anything else: configured rubypath without args when the 'shebang'
#   config is 'all', otherwise +old+ itself is returned unchanged
def new_shebang(old)
  interpreter = File.basename(old.cmd)
  if /\Aruby/ =~ interpreter
    return Shebang.new(config('rubypath'), old.args)
  end
  if interpreter == 'env' && old.args.first == 'ruby'
    return Shebang.new(config('rubypath'), old.args[1..-1])
  end
  config('shebang') == 'all' ? Shebang.new(config('rubypath')) : old
end
# Installs every file of +list+ into the directory +dest+.
#
# list:: Array of source file paths.
# dest:: destination directory (created first via mkdir_p).
# mode:: permission bits passed on to install for each file.
#
# Both the directory creation and each install receive the configured
# install prefix.
def install_files(list, dest, mode)
  mkdir_p(dest, @config.install_prefix)
  list.each { |source| install(source, dest, mode, @config.install_prefix) }
end
# Names of the hook scripts that may sit next to installable files:
# pre-TASK, post-TASK and their ".rb" variants for every task that is run
# through the directory traversal.  targetfiles subtracts this list so
# hook scripts are never treated as files to install.
#
# 'distclean' is included because that task runs pre-/post-distclean
# hooks in each traversed directory just like the other tasks.
def hookfiles
  %w( pre-%s post-%s pre-%s.rb post-%s.rb ).map {|fmt|
    %w( config setup install clean distclean ).map {|t| sprintf(fmt, t) }
  }.flatten
end
# Runs +task+ over every file type directory listed in FILETYPES.
#
# task:: task name such as 'config', 'setup' or 'install'.
#
# The pre-TASK hook runs first, then each type is traversed with the
# handler method "TASK_dir_TYPE", and finally the post-TASK hook runs.
# The 'ext' type is skipped when the 'without-ext' config is 'yes'.
def exec_task_traverse(task)
  run_hook "pre-#{task}"
  FILETYPES.each do |type|
    if (type == 'ext') && (config('without-ext') == 'yes')
      $stderr.puts 'skipping ext/* by user option' if verbose?
    else
      traverse(task, type, "#{task}_dir_#{type}")
    end
  end
  run_hook "post-#{task}"
end
@currdir = File.dirname(rel) end def run_hook(id) path = [ "#{curr_srcdir()}/#{id}", "#{curr_srcdir()}/#{id}.rb" ].detect {|cand| File.file?(cand) } return unless path begin instance_eval File.read(path), path, 1 rescue raise if $DEBUG setup_rb_error "hook #{path} failed:\n" + $!.message end end end # class Installer class SetupError < StandardError; end def setup_rb_error(msg) raise SetupError, msg end if $0 == __FILE__ begin ToplevelInstaller.invoke rescue SetupError raise if $DEBUG $stderr.puts $!.message $stderr.puts "Try 'ruby #{$0} --help' for detailed usage." exit 1 end end ================================================ FILE: test/fixtures/correlation_matrix.rb ================================================ # Retrieve Correlation matrix for eigth variables module Statsample module Fixtures def harman_817 Matrix[ [1.0, 0.84, 0.62, -0.53, 0.03, 0.57, -0.33, -0.63], [0.84, 1.00, 0.84, -0.68, -0.05, 0.76, -0.35, -0.73], [0.62, 0.84, 1.00, -0.76, 0.08, 0.81, -0.51, -0.81], [-0.53, -0.68, -0.76, 1.00, -0.25, -0.80, 0.62, 0.88], [0.03, -0.05, 0.08, -0.25, 1.00, 0.25, -0.72, -0.36], [0.57, 0.76, 0.81, -0.80, 0.25, 1.00, -0.58, -0.84], [-0.33, -0.35, -0.51, 0.62, -0.72, -0.58, 1.00, 0.68], [-0.63, -0.73, -0.81, 0.88, -0.36, -0.84, 0.68, 1.00] ].extend(Statsample::CovariateMatrix) end end end ================================================ FILE: test/fixtures/hartman_23.matrix ================================================ "height" "arm.span" "forearm" "lower.leg" "weight" "bitro.diameter" "chest.girth" "chest.width" "height" 1 0.846 0.805 0.859 0.473 0.398 0.301 0.382 "arm.span" 0.846 1 0.881 0.826 0.376 0.326 0.277 0.415 "forearm" 0.805 0.881 1 0.801 0.38 0.319 0.237 0.345 "lower.leg" 0.859 0.826 0.801 1 0.436 0.329 0.327 0.365 "weight" 0.473 0.376 0.38 0.436 1 0.762 0.73 0.629 "bitro.diameter" 0.398 0.326 0.319 0.329 0.762 1 0.583 0.577 "chest.girth" 0.301 0.277 0.237 0.327 0.73 0.583 1 0.539 "chest.width" 0.382 0.415 0.345 0.365 0.629 0.577 0.539 1 
================================================ FILE: test/fixtures/repeated_fields.csv ================================================ "id","name","age","city","a1","name","age" 1,"Alex",20,"New York","a,b","a",3 2,"Claude",23,"London","b,c","b",4 3,"Peter",25,"London","a","c",5 4,"Franz",27,"Paris",,"d",6 5,"George","5,5","Tome","a,b,c","f", 6,"Fernand",20,"London","c,b","f",8 ================================================ FILE: test/fixtures/stock_data.csv ================================================ 17.66 17.65 17.68 17.66 17.68 17.67 17.68 17.68 17.67 17.67 17.68 17.71 17.74 17.72 17.73 17.76 17.74 17.69 17.69 17.67 17.66 17.67 17.69 17.69 17.68 17.65 17.65 17.64 17.63 17.64 17.67 17.68 17.7 17.68 17.69 17.69 17.72 17.71 17.71 17.71 17.69 17.69 17.71 17.72 17.71 17.68 17.68 17.68 17.69 17.68 17.68 17.69 17.67 17.69 17.71 17.7 17.7 17.71 17.73 17.74 17.74 17.74 17.76 17.77 17.55 17.55 17.5 17.46 17.49 17.54 17.51 17.54 17.57 17.54 17.52 17.53 17.56 17.55 17.55 17.54 17.55 17.55 17.55 17.54 17.52 17.53 17.51 17.52 17.5 17.5 17.5 17.49 17.46 17.47 17.48 17.45 17.41 17.39 17.38 17.43 17.44 17.43 17.43 17.46 17.46 17.47 17.47 17.45 17.48 17.49 17.5 17.49 17.48 17.49 17.47 17.47 17.44 17.44 17.43 17.45 17.42 17.43 17.43 17.44 17.44 17.43 17.41 17.41 17.38 17.38 17.37 17.37 17.37 17.3 17.28 17.27 17.19 16.41 16.44 16.48 16.53 16.51 16.57 16.54 16.59 16.64 16.6 16.65 16.69 16.69 16.68 16.64 16.65 16.66 16.64 16.61 16.65 16.67 16.66 16.65 16.61 16.59 16.57 16.55 16.55 16.57 16.54 16.6 16.62 16.6 16.59 16.61 16.66 16.69 16.67 16.65 16.66 16.65 16.65 16.68 16.68 16.67 16.64 16.73 16.76 16.75 16.79 16.8 16.77 16.74 16.76 16.83 16.84 16.82 16.89 16.93 16.94 16.9 16.92 16.88 16.85 16.87 16.8 16.79 16.85 16.85 16.8 16.82 16.85 16.9 16.86 16.79 16.75 16.78 17.06 17.05 17.04 17.02 17.01 17.02 17.05 17.07 17.08 17.09 17.1 17.11 17.09 17.1 17.1 17.12 17.17 17.16 17.17 17.18 17.18 17.18 17.17 17.15 17.14 17.13 17.14 17.13 17.12 17.12 17.09 17.09 17.11 17.06 17.07 17.06 
17.07 17.06 17.09 17.05 17.04 17.04 16.99 17 17.03 17 16.97 16.96 16.98 16.98 16.98 17.03 17 17 17 17.02 17 17.02 17.01 17.02 17.03 17.03 17.01 17.03 17.03 17.03 17.01 17.03 17.05 17.05 17.08 17.04 17.01 17.03 17.02 17.03 17.04 17.05 17.37 17.35 17.34 17.32 17.29 17.29 17.22 17.26 17.3 17.34 17.33 17.39 17.4 17.39 17.48 17.5 17.47 17.43 17.4 17.42 17.46 17.48 17.48 17.46 17.46 17.45 17.43 17.44 17.48 17.43 17.45 17.47 17.46 17.46 17.48 17.48 17.48 17.46 17.5 17.55 17.58 17.57 17.56 17.59 17.61 17.62 17.63 17.62 17.61 17.61 17.62 17.64 17.65 17.61 17.62 17.66 17.65 17.64 17.63 17.64 17.64 17.64 17.63 17.61 17.61 17.62 17.63 17.64 17.65 17.66 17.68 17.69 17.69 17.69 17.66 17.69 17.69 17.62 17.68 17.64 17.65 17.61 17.52 17.56 17.55 17.55 17.48 17.45 17.46 17.46 17.44 17.47 17.5 17.49 17.5 17.53 17.53 17.54 17.51 17.51 17.53 17.53 17.53 17.55 17.55 17.54 17.56 17.59 17.57 17.58 17.58 17.57 17.59 17.57 17.55 17.51 17.51 17.52 17.52 17.53 17.55 17.59 17.61 17.61 17.6 17.6 17.62 17.65 17.62 17.6 17.6 17.62 17.61 17.62 17.63 17.64 17.65 17.61 17.62 17.64 17.63 17.62 17.6 17.57 17.57 17.6 17.59 17.6 17.61 17.61 17.63 17.63 17.59 17.58 17.76 17.79 17.76 17.73 17.74 17.73 17.67 17.66 17.66 17.64 17.63 17.62 17.61 17.6 17.61 17.61 17.6 17.6 17.64 17.65 17.65 17.63 17.61 17.6 17.63 17.63 17.62 17.63 17.64 17.62 17.63 17.65 17.64 17.6 17.59 17.59 17.58 17.58 17.6 17.6 17.6 17.6 17.6 17.58 17.59 17.6 17.6 17.6 17.59 17.59 17.58 17.58 17.65 17.65 ================================================ FILE: test/fixtures/test_csv.csv ================================================ "id","name","age","city","a1" 1,"Alex",20,"New York","a,b" 2,"Claude",23,"London","b,c" 3,"Peter",25,"London","a" 4,"Franz",27,"Paris", 5,"George","5,5","Tome","a,b,c" 6,"Fernand",,, ================================================ FILE: test/fixtures/tetmat_matrix.txt ================================================ 1.0000000 0.1703164 0.2275128 0.1071861 0.0665047 0.1703164 1.0000000 0.1890911 0.1111471 
0.1724219 0.2275128 0.1890911 1.0000000 0.1866805 0.1055028 0.1071861 0.1111471 0.1866805 1.0000000 0.2009241 0.0665047 0.1724219 0.1055028 0.2009241 1.0000000 ================================================ FILE: test/fixtures/tetmat_test.txt ================================================ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 2 1 1 1 1 2 1 2 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 2 1 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 2 1 1 2 2 2 1 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 
2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 
1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 
2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
# Test-suite extensions: shoulda-context support for every MiniTest::Test
# plus a few numeric/compatibility assertions used across the test files.
module MiniTest
  class Test
    include Shoulda::Context::Assertions
    include Shoulda::Context::InstanceMethods
    extend Shoulda::Context::ClassMethods

    # Declares a shoulda test named +name+ whose body only runs when
    # Statsample.has_gsl? is true; otherwise the test is skipped with the
    # message "Requires GSL".
    def self.should_with_gsl(name, &block)
      should(name) do
        if Statsample.has_gsl?
          instance_eval(&block)
        else
          skip("Requires GSL")
        end
      end
    end
  end

  module Assertions
    # Asserts that +exp+ and +obs+ have the same size and that every value
    # of exp.data_with_nils is within +delta+ of the element of +obs+ at
    # the same index.
    #
    # +msg+ (or the default "Different vectors ..." text) is now handed to
    # the underlying assertions; previously it was built and then dropped.
    def assert_similar_vector(exp, obs, delta=1e-10, msg=nil)
      msg ||= "Different vectors #{exp} - #{obs}"
      assert_equal(exp.size, obs.size, msg)
      exp.data_with_nils.each_with_index {|v, i|
        assert_in_delta(v, obs[i], delta, msg)
      }
    end

    # Asserts that +exp+ and +obs+ have the same size and that exp[i] is
    # within +delta+ of obs[i] for every index. +msg+ is appended to the
    # generated failure text.
    def assert_equal_vector(exp, obs, delta=1e-10, msg=nil)
      assert_equal(exp.size, obs.size, "Different size.#{msg}")
      exp.size.times {|i|
        assert_in_delta(exp[i], obs[i], delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
      }
    end

    # Asserts that two matrices have the same row_size and column_size and
    # that every cell exp[i,j] is within +delta+ of obs[i,j]. +msg+ is
    # appended to the generated failure text.
    def assert_equal_matrix(exp, obs, delta=1e-10, msg=nil)
      assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}")
      assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}")
      exp.row_size.times {|i|
        exp.column_size.times {|j|
          assert_in_delta(exp[i,j], obs[i,j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
        }
      }
    end

    # Test::Unit style names, defined only when nothing provides them yet.
    alias :assert_raise :assert_raises unless method_defined? :assert_raise
    alias :assert_not_equal :refute_equal unless method_defined? :assert_not_equal
    alias :assert_not_same :refute_same unless method_defined? :assert_not_same

    unless method_defined? :assert_nothing_raised
      # Fails when the given block raises. +msg+ is used as a sprintf
      # template that receives the raised exception.
      def assert_nothing_raised(msg=nil)
        msg ||= "Nothing should be raised, but raised %s"
        begin
          yield
          not_raised = true
        # NOTE(review): Exception (not StandardError) is rescued here, so
        # every exception class raised inside the block is reported as a
        # failure of this assertion -- confirm this breadth is intended.
        rescue Exception => e
          not_raised = false
          msg = sprintf(msg, e)
        end
        assert(not_raised, msg)
      end
    end
  end
end
ReportBuilder::Section and value.name==:second} Statsample::Analysis.store(:first) do echo "first","second" end Statsample::Analysis.store(:second) do echo "third" end Statsample::Analysis.add_to_reportbuilder(rb,:first,:second) end should "to_text returns the same as a normal ReportBuilder object" do rb=ReportBuilder.new(:name=>:test) section=ReportBuilder::Section.new(:name=>"first") a=[1,2,3].to_scale section.add("first") section.add(a) rb.add(section) exp=rb.to_text an=ss_analysis(:first) { echo 'first' summary(a) } obs=Statsample::Analysis.to_text(:first) assert_equal(exp.split("\n")[1,exp.size], obs.split("\n")[1,obs.size]) end should "run() execute all analysis by default" do m1=mock() m1.expects(:run).once m1.expects(:hide).once Statsample::Analysis.store(:first) do m1.run end Statsample::Analysis.store(:second) do m1.hide end # Should run all test Statsample::Analysis.run end should "run() execute blocks specificed on parameters" do m1=mock() m1.expects(:run).once m1.expects(:hide).never Statsample::Analysis.store(:first) do m1.run end Statsample::Analysis.store(:second) do m1.hide end # Should run all test Statsample::Analysis.run(:first) end context(Statsample::Analysis::Suite) do should "echo() uses output#puts with same arguments" do an=Statsample::Analysis::Suite.new(:output) obj=mock() obj.expects(:puts).with(:first,:second).once an.output=obj an.echo(:first,:second) end should "summary() should call object.summary" do an=Statsample::Analysis::Suite.new(:summary) obj=stub('summarizable',:summary=>'summary') assert_equal(obj.summary,an.summary(obj)) end should "attach() allows to call objects on objects which respond to fields" do an=Statsample::Analysis::Suite.new(:summary) ds={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)} ds.expects(:fields).returns(%w{x y}).at_least_once an.attach(ds) assert_equal(10,an.x.mean) assert_equal(12,an.y.mean) assert_raise(RuntimeError) { an.z } end should "attached objects should be called LIFO" do 
an=Statsample::Analysis::Suite.new(:summary) ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)} ds1.expects(:fields).returns(%w{x y z}).at_least_once ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)} ds2.expects(:fields).returns(%w{x y}).at_least_once an.attach(ds1) an.attach(ds2) assert_equal(10,an.x.mean) assert_equal(12,an.y.mean) assert_equal(13,an.z.mean) end should "detach() without arguments drop latest object" do an=Statsample::Analysis::Suite.new(:summary) ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)} ds1.expects(:fields).returns(%w{x y z}).at_least_once ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)} ds2.expects(:fields).returns(%w{x y}).at_least_once an.attach(ds1) an.attach(ds2) assert_equal(10,an.x.mean) an.detach assert_equal(100, an.x.mean) end should "detach() with argument drop select object" do an=Statsample::Analysis::Suite.new(:summary) ds1={'x'=>1} ds1.expects(:fields).returns(%w{x}).at_least_once ds2={'x'=>2,'y'=>3} ds2.expects(:fields).returns(%w{x y}).at_least_once ds3={'y'=>4} ds3.expects(:fields).returns(%w{y}).at_least_once an.attach(ds3) an.attach(ds2) an.attach(ds1) assert_equal(1,an.x) assert_equal(3,an.y) an.detach(ds2) assert_equal(4,an.y) end should "perform a simple analysis" do output=mock() output.expects(:puts).with(5.5) an=Statsample::Analysis.store(:simple, :output=>output) do ds=data_frame(:x=>vector(1..10),:y=>vector(1..10)) attach(ds) echo x.mean end an.run end end context(Statsample::Analysis::SuiteReportBuilder) do should "echo() use add on rb object" do an=Statsample::Analysis::SuiteReportBuilder.new(:puts_to_add) an.rb.expects(:add).with(:first).twice an.echo(:first, :first) end should "summary() uses add on rb object" do an=Statsample::Analysis::SuiteReportBuilder.new(:summary_to_add) an.rb.expects(:add).with(:first).once an.summary(:first) end end end end ================================================ FILE: test/test_anova_contrast.rb 
# Tests for Statsample::Anova::Contrast on four small groups of scores.
#
# Inherits from MiniTest::Test -- the class the test helpers extend with
# the shoulda-context DSL and the one the sibling test cases use -- instead
# of the MiniTest::Unit::TestCase compatibility name.
class StatsampleAnovaContrastTestCase < MiniTest::Test
  context(Statsample::Anova::Contrast) do
    setup do
      constant = [12,13,11,12,12].to_scale
      frequent = [9,10,9,13,14].to_scale
      infrequent = [15,16,17,16,16].to_scale
      never = [17,18,12,18,20].to_scale
      @vectors = [constant, frequent, infrequent, never]
      @c = Statsample::Anova::Contrast.new(:vectors=>@vectors)
    end

    # First group against the mean of the other three, using explicit
    # (exact, rational) coefficients.
    should "return correct value using c" do
      @c.c([1,-1.quo(3),-1.quo(3),-1.quo(3)])
      #@c.c([1,-0.333,-0.333,-0.333])
      assert_in_delta(-2.6667, @c.psi, 0.0001)
      assert_in_delta(1.0165, @c.se, 0.0001)
      assert_in_delta(-2.623, @c.t, 0.001)
      assert_in_delta(-4.82, @c.confidence_interval[0], 0.01)
      assert_in_delta(-0.51, @c.confidence_interval[1], 0.01)
      assert(@c.summary.size > 0)
    end

    # Same contrast, expressed as two lists of group indexes; the expected
    # values are the ones asserted in the previous test.
    should "return correct values using c_by_index" do
      @c.c_by_index([0],[1,2,3])
      assert_in_delta(-2.6667, @c.psi, 0.0001)
      assert_in_delta(1.0165, @c.se, 0.0001)
      assert_in_delta(-2.623, @c.t, 0.001)
    end

    # Index lists that leave one group out must give the same statistics as
    # the explicit coefficient vector with a 0 for that group.
    should "return correct values using incomplete c_by_index" do
      c1 = Statsample::Anova::Contrast.new(:vectors=>@vectors, :c=>[0.5,0.5,-1,0])
      c2 = Statsample::Anova::Contrast.new(:vectors=>@vectors, :c1=>[0,1], :c2=>[2])
      assert_equal(c1.psi, c2.psi)
      assert_equal(c1.se, c2.se)
      assert_equal(c1.t, c2.t)
    end
  end
end
# Tests for Statsample::Anova::TwoWay built directly from sums of squares
# and degrees of freedom.
#
# Inherits from MiniTest::Test -- the class the test helpers extend with
# the shoulda-context DSL and the one the sibling test cases use -- instead
# of the MiniTest::Unit::TestCase compatibility name.
class StatsampleAnovaTwoWayTestCase < MiniTest::Test
  context(Statsample::Anova::TwoWay) do
    setup do
      @ss_a = 192.2
      @ss_b = 57.8
      @ss_axb = 168.2
      @ss_within = 75.6
      @df_a = @df_b = 1
      @df_within = 16
      @anova = Statsample::Anova::TwoWay.new(:ss_a=>@ss_a, :ss_b=>@ss_b, :ss_axb=>@ss_axb, :ss_within=>@ss_within, :df_a=>@df_a, :df_b=>@df_b, :df_within=>@df_within)
    end

    should "Statsample::Anova.twoway respond to #twoway" do
      assert(Statsample::Anova.respond_to? :twoway)
    end

    # Both factors have 1 degree of freedom in the fixture, so each
    # expected mean square equals its sum of squares.
    should "return correct value for ms_a, ms_b and ms_axb" do
      assert_in_delta(192.2, @anova.ms_a, 0.01)
      assert_in_delta(57.8, @anova.ms_b, 0.01)
      assert_in_delta(168.2, @anova.ms_axb, 0.01)
    end

    should "return correct value for f " do
      assert_in_delta(40.68, @anova.f_a, 0.01)
      assert_in_delta(12.23, @anova.f_b, 0.01)
      assert_in_delta(35.60, @anova.f_axb, 0.01)
    end

    should "return correct value for probability for f " do
      assert(@anova.f_a_probability < 0.05)
      assert(@anova.f_b_probability < 0.05)
      assert(@anova.f_axb_probability < 0.05)
    end

    should "respond to summary" do
      assert(@anova.respond_to? :summary)
      assert(@anova.summary.size > 0)
    end
  end
end
# Tests for Statsample::Anova::OneWayWithVectors: argument handling on
# construction, the sums of squares / degrees of freedom / F statistics for
# a fixed three-group example, and the optional summary sections.
#
# Inherits from MiniTest::Test -- the class the test helpers extend with
# the shoulda-context DSL and the one the sibling test cases use -- instead
# of the MiniTest::Unit::TestCase compatibility name.
class StatsampleAnovaOneWayWithVectorsTestCase < MiniTest::Test
  context(Statsample::Anova::OneWayWithVectors) do
    context("when initializing") do
      # Random data is enough here: these tests only compare two ways of
      # building the same analysis.
      setup do
        @v1 = 10.times.map {rand(100)}.to_scale
        @v2 = 10.times.map {rand(100)}.to_scale
        @v3 = 10.times.map {rand(100)}.to_scale
      end

      should "be the same using [] or args*" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3)
        a2 = Statsample::Anova::OneWayWithVectors.new([@v1,@v2,@v3])
        assert_equal(a1.f, a2.f)
      end

      should "be the same using module method or object instantiation" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3)
        a2 = Statsample::Anova.oneway_with_vectors(@v1,@v2,@v3)
        assert_equal(a1.f, a2.f)
      end

      should "detect optional hash" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, {:name=>'aaa'})
        assert_equal('aaa', a1.name)
      end

      should "omit incorrect arguments" do
        a1 = Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, {:name=>'aaa'})
        a2 = Statsample::Anova::OneWayWithVectors.new(@v1,nil,nil,@v2,@v3, {:name=>'aaa'})
        assert_equal(a1.f, a2.f)
      end
    end

    # Fixed three-group example used by all the numeric checks below.
    setup do
      @v1 = [3,3,2,3,6].to_vector(:scale)
      @v2 = [7,6,5,6,7].to_vector(:scale)
      @v3 = [9,8,9,7,8].to_vector(:scale)
      @name = "Anova testing"
      @anova = Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, :name=>@name)
    end

    should "store correctly contrasts" do
      c1 = Statsample::Anova::Contrast.new(:vectors=>[@v1,@v2,@v3], :c=>[1,-0.5, -0.5])
      c2 = @anova.contrast(:c=>[1,-0.5,-0.5])
      assert_equal(c1.t, c2.t)
    end

    should "respond to #summary" do
      assert(@anova.respond_to? :summary)
    end

    should "have correct name of analysis on #summary" do
      assert_match(/#{@name}/, @anova.summary)
    end

    should "returns same levene values as direct Levene creation" do
      assert_equal(@anova.levene.f, Statsample::Test.levene([@v1,@v2,@v3]).f)
    end

    should "have correct value for levene" do
      assert_in_delta(0.604, @anova.levene.f, 0.001)
      assert_in_delta(0.562, @anova.levene.probability, 0.001)
    end

    should "have correct value for sst" do
      assert_in_delta(72.933, @anova.sst, 0.001)
    end

    should "have correct value for sswg" do
      assert_in_delta(14.8, @anova.sswg, 0.001)
    end

    should "have correct value for ssb" do
      assert_in_delta(58.133, @anova.ssbg, 0.001)
    end

    should "sst=sswg+ssbg" do
      assert_in_delta(@anova.sst, @anova.sswg+@anova.ssbg, 0.00001)
    end

    should "df total equal to number of n-1" do
      assert_equal(@v1.n+@v2.n+@v3.n-1, @anova.df_total)
    end

    should "df wg equal to number of n-k" do
      assert_equal(@v1.n+@v2.n+@v3.n-3, @anova.df_wg)
    end

    should "df bg equal to number of k-1" do
      assert_equal(2, @anova.df_bg)
    end

    should "f=(ssbg/df_bg)/(sswt/df_wt)" do
      assert_in_delta((@anova.ssbg.quo(@anova.df_bg)).quo(@anova.sswg.quo(@anova.df_wg)), @anova.f, 0.001)
    end

    should "p be correct" do
      assert(@anova.probability < 0.01)
    end

    should "be correct using different test values" do
      anova2 = Statsample::Anova::OneWayWithVectors.new([@v1,@v1,@v1,@v1,@v2])
      assert_in_delta(3.960, anova2.f, 0.001)
      assert_in_delta(0.016, anova2.probability, 0.001)
    end

    context "with extra information on summary" do
      # Switch on both optional sections before rendering the summary.
      setup do
        @anova.summary_descriptives = true
        @anova.summary_levene = true
        @summary = @anova.summary
      end

      should "have section with levene statistics" do
        assert_match(/Levene/, @summary)
      end

      should "have section with descriptives" do
        assert_match(/Min/, @summary)
      end
    end
  end
end
MiniTest::Test
  context("Awesome Print integration") do
    setup do
      require "awesome_print"
    end
    # Printing a scale vector with awesome_print (`ap`) must not raise.
    should "should be flawless" do
      a=[1,2,3].to_scale
      assert(a!=[1,2,3])
      assert_nothing_raised do
        ap a
      end
    end
  end
end
================================================
FILE: test/test_bartlettsphericity.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests Statsample::Test::BartlettSphericity on the correlation matrix of three
# 14-case vectors.
class StatsampleBartlettSphericityTestCase < MiniTest::Test
  include Statsample::Test
  context Statsample::Test::BartlettSphericity do
    setup do
      @v1=[1 ,2 ,3 ,4 ,7 ,8 ,9 ,10,14,15,20,50,60,70].to_scale
      @v2=[5 ,6 ,11,12,13,16,17,18,19,20,30,0,0,0].to_scale
      @v3=[10,3 ,20,30,40,50,80,10,20,30,40,2,3,4].to_scale
      # KMO: 0.490
      ds={'v1'=>@v1,'v2'=>@v2,'v3'=>@v3}.to_dataset
      cor=Statsample::Bivariate.correlation_matrix(ds)
      # Second argument is 14, the length of each vector above.
      @bs=Statsample::Test::BartlettSphericity.new(cor, 14)
    end
    should "have correct value for chi" do
      assert_in_delta(9.477, @bs.value,0.001)
    end
    should "have correct value for df" do
      assert_equal(3, @bs.df)
    end
    should "have correct value for probability" do
      assert_in_delta(0.024,@bs.probability,0.001)
    end
  end
end
================================================
FILE: test/test_bivariate.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for the Statsample::Bivariate module functions.
# NOTE(review): should_with_gsl is defined outside this chunk; presumably it
# only runs the block when GSL is available -- confirm in helpers_tests.rb.
class StatsampleBivariateTestCase < MiniTest::Test
  should "method sum of squares should be correct" do
    v1=[1,2,3,4,5,6].to_vector(:scale)
    v2=[6,2,4,10,12,8].to_vector(:scale)
    assert_equal(23.0, Statsample::Bivariate.sum_of_squares(v1,v2))
  end
  # covariance and covariance_slow must agree on random data.
  should_with_gsl "return same covariance with ruby and gls implementation" do
    v1=20.times.collect {|a| rand()}.to_scale
    v2=20.times.collect {|a| rand()}.to_scale
    assert_in_delta(Statsample::Bivariate.covariance(v1,v2), Statsample::Bivariate.covariance_slow(v1,v2), 0.001)
  end
  should_with_gsl "return same correlation with ruby and gls implementation" do
    v1=20.times.collect {|a| rand()}.to_scale
    v2=20.times.collect {|a|
rand()}.to_scale
    # GSL's correlation is the reference for the pure-Ruby pearson_slow.
    assert_in_delta(GSL::Stats::correlation(v1.gsl, v2.gsl), Statsample::Bivariate.pearson_slow(v1,v2), 1e-10)
  end
  should "return correct pearson correlation" do
    v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
    v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
    assert_in_delta(0.525,Statsample::Bivariate.pearson(v1,v2), 0.001)
    assert_in_delta(0.525,Statsample::Bivariate.pearson_slow(v1,v2), 0.001)

    # v3/v4 carry the same pairs as v1/v2 once every position where either
    # side is nil is dropped, so the expected coefficient is unchanged.
    v3=[6,2, 1000,1000,5,4,7,8,4,3,2,nil].to_vector(:scale)
    v4=[2,nil,nil,nil, 3,7,8,6,4,3,2,500].to_vector(:scale)
    assert_in_delta(0.525,Statsample::Bivariate.pearson(v3,v4),0.001)
    # Test ruby method
    v3a,v4a=Statsample.only_valid v3, v4
    assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v3a,v4a),0.001)
  end
  # The Pearson object must agree with the module-level t_pearson/prop_pearson.
  should "return correct values for t_pearson and prop_pearson" do
    v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
    v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
    r=Statsample::Bivariate::Pearson.new(v1,v2)
    assert_in_delta(0.525,r.r, 0.001)
    assert_in_delta(Statsample::Bivariate.t_pearson(v1,v2), r.t, 0.001)
    assert_in_delta(Statsample::Bivariate.prop_pearson(r.t,8,:both), r.probability, 0.001)
    assert(r.summary.size>0)
  end
  # Each cell of the dataset's correlation matrix is compared with the pairwise
  # pearson of the corresponding vectors; v4 contains nils.
  should "return correct correlation_matrix with nils values" do
    v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
    v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
    v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale)
    v4=[2,nil,nil,nil, 3,7,8,6].to_vector(:scale)
    ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4}.to_dataset
    c=Proc.new {|n1,n2|Statsample::Bivariate.pearson(n1,n2)}
    expected=Matrix[ [c.call(v1,v1),c.call(v1,v2),c.call(v1,v3),c.call(v1,v4)], [c.call(v2,v1),c.call(v2,v2),c.call(v2,v3),c.call(v2,v4)], [c.call(v3,v1),c.call(v3,v2),c.call(v3,v3),c.call(v3,v4)],
    [c.call(v4,v1),c.call(v4,v2),c.call(v4,v3),c.call(v4,v4)]
    ]
    obt=Statsample::Bivariate.correlation_matrix(ds)
    for i in 0...expected.row_size
      for j in 0...expected.column_size
        #puts expected[i,j].inspect
        #puts obt[i,j].inspect
        assert_in_delta(expected[i,j], obt[i,j],0.0001, "#{expected[i,j].class}!=#{obt[i,j].class} ")
      end
    end
    #assert_equal(expected,obt)
end
  # Optimized and pairwise covariance matrices must match on five random
  # 100-case vectors.
  should_with_gsl "return same values for optimized and pairwise covariance matrix" do
    cases=100
    v1=Statsample::Vector.new_scale(cases) {rand()}
    v2=Statsample::Vector.new_scale(cases) {rand()}
    v3=Statsample::Vector.new_scale(cases) {rand()}
    v4=Statsample::Vector.new_scale(cases) {rand()}
    v5=Statsample::Vector.new_scale(cases) {rand()}

    ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset

    cor_opt=Statsample::Bivariate.covariance_matrix_optimized(ds)

    cor_pw =Statsample::Bivariate.covariance_matrix_pairwise(ds)
    assert_equal_matrix(cor_opt,cor_pw,1e-15)
  end
  # Same comparison for the correlation matrix implementations.
  should_with_gsl "return same values for optimized and pairwise correlation matrix" do
    cases=100
    v1=Statsample::Vector.new_scale(cases) {rand()}
    v2=Statsample::Vector.new_scale(cases) {rand()}
    v3=Statsample::Vector.new_scale(cases) {rand()}
    v4=Statsample::Vector.new_scale(cases) {rand()}
    v5=Statsample::Vector.new_scale(cases) {rand()}

    ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset

    cor_opt=Statsample::Bivariate.correlation_matrix_optimized(ds)

    cor_pw =Statsample::Bivariate.correlation_matrix_pairwise(ds)
    assert_equal_matrix(cor_opt,cor_pw,1e-15)
  end
  # Cell-by-cell check of correlation_matrix against pairwise pearson; none of
  # the vectors contains nil here.
  should "return correct correlation_matrix without nils values" do
    v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
    v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
    v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale)
    v4=[2,4,6,7, 3,7,8,6].to_vector(:scale)
    ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4}.to_dataset
    c=Proc.new {|n1,n2|Statsample::Bivariate.pearson(n1,n2)}
    expected=Matrix[ [c.call(v1,v1),c.call(v1,v2),c.call(v1,v3),c.call(v1,v4)], [c.call(v2,v1),c.call(v2,v2),c.call(v2,v3),c.call(v2,v4)], [c.call(v3,v1),c.call(v3,v2),c.call(v3,v3),c.call(v3,v4)],
    [c.call(v4,v1),c.call(v4,v2),c.call(v4,v3),c.call(v4,v4)]
    ]
    obt=Statsample::Bivariate.correlation_matrix(ds)
    for i in 0...expected.row_size
      for j in 0...expected.column_size
        #puts expected[i,j].inspect
        #puts obt[i,j].inspect
        assert_in_delta(expected[i,j], obt[i,j],0.0001, "#{expected[i,j].class}!=#{obt[i,j].class} ")
      end
    end
#assert_equal(expected,obt) end should "return correct value for prop pearson" do assert_in_delta(0.42, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.084,94), 94),0.01) assert_in_delta(0.65, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.046,95), 95),0.01) r=0.9 n=100 t=Statsample::Bivariate.t_r(r,n) assert(Statsample::Bivariate.prop_pearson(t,n,:both)<0.05) assert(Statsample::Bivariate.prop_pearson(t,n,:right)<0.05) assert(Statsample::Bivariate.prop_pearson(t,n,:left)>0.05) r=-0.9 n=100 t=Statsample::Bivariate.t_r(r,n) assert(Statsample::Bivariate.prop_pearson(t,n,:both)<0.05) assert(Statsample::Bivariate.prop_pearson(t,n,:right)>0.05) assert(Statsample::Bivariate.prop_pearson(t,n,:left)<0.05) end should "return correct value for Spearman's rho" do v1=[86,97,99,100,101,103,106,110,112,113].to_vector(:scale) v2=[0,20,28,27,50,29,7,17,6,12].to_vector(:scale) assert_in_delta(-0.175758,Statsample::Bivariate.spearman(v1,v2),0.0001) end should "return correct value for point_biserial correlation" do c=[1,3,5,6,7,100,200,300,400,300].to_vector(:scale) d=[1,1,1,1,1,0,0,0,0,0].to_vector(:scale) assert_raises TypeError do Statsample::Bivariate.point_biserial(c,d) end assert_in_delta(Statsample::Bivariate.point_biserial(d,c), Statsample::Bivariate.pearson(d,c), 0.0001) end should "return correct value for tau_a and tau_b" do v1=[1,2,3,4,5,6,7,8,9,10,11].to_vector(:ordinal) v2=[1,3,4,5,7,8,2,9,10,6,11].to_vector(:ordinal) assert_in_delta(0.6727,Statsample::Bivariate.tau_a(v1,v2),0.001) assert_in_delta(0.6727,Statsample::Bivariate.tau_b((Statsample::Crosstab.new(v1,v2).to_matrix)),0.001) v1=[12,14,14,17,19,19,19,19,19,20,21,21,21,21,21,22,23,24,24,24,26,26,27].to_vector(:ordinal) v2=[11,4,4,2,0,0,0,0,0,0,4,0,4,0,0,0,0,4,0,0,0,0,0].to_vector(:ordinal) assert_in_delta(-0.376201540231705, Statsample::Bivariate.tau_b(Statsample::Crosstab.new(v1,v2).to_matrix),0.001) end should "return correct value for gamma correlation" do 
m=Matrix[[10,5,2],[10,15,20]]
    assert_in_delta(0.636,Statsample::Bivariate.gamma(m),0.001)
    m2=Matrix[[15,12,6,5],[12,8,10,8],[4,6,9,10]]
    assert_in_delta(0.349,Statsample::Bivariate.gamma(m2),0.001)
  end
end
================================================
FILE: test/test_codification.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Codification: building recoding dictionaries (hash,
# Excel, YAML) from a multi-valued text vector and applying them.
class StatsampleCodificationTestCase < MiniTest::Unit::TestCase

  # Builds the fixtures in the constructor rather than in setup: a one-field
  # dataset whose cells hold comma-separated values, and the dictionary that
  # maps each value to a one-letter code.
  def initialize(*args)
    v1=%w{run walk,run walking running sleep sleeping,dreaming sleep,dream}.to_vector
    @dict={'run'=>'r','walk'=>'w','walking'=>'w','running'=>'r','sleep'=>'s', 'sleeping'=>'s', 'dream'=>'d', 'dreaming'=>'d'}
    @ds={"v1"=>v1}.to_dataset
    super
  end
  # create_hash must map every distinct value of 'v1' to itself.
  def test_create_hash
    expected_keys_v1=%w{run walk walking running sleep sleeping dream dreaming}.sort
    hash=Statsample::Codification.create_hash(@ds,['v1'])
    assert_equal(['v1'],hash.keys)
    assert_equal(expected_keys_v1,hash['v1'].keys.sort)
    assert_equal(expected_keys_v1,hash['v1'].values.sort)
  end
  # Writes the dictionary to an .xls file under the temp dir, then reads it back
  # both as a dataset and as a recoding hash.
  # NOTE(review): nothing in this method deletes the file it creates.
  def test_create_excel
    filename=Dir::tmpdir+"/test_excel"+Time.now().to_s+".xls"
    #filename = Tempfile.new("test_codification_"+Time.now().to_s)
    Statsample::Codification.create_excel(@ds, ['v1'], filename)
    field=(["v1"]*8).to_vector
    keys=%w{dream dreaming run running sleep sleeping walk walking}.to_vector
    ds=Statsample::Excel.read(filename)
    assert_equal(field, ds['field'])
    assert_equal(keys, ds['original'])
    assert_equal(keys, ds['recoded'])
    hash=Statsample::Codification.excel_to_recoded_hash(filename)
    assert_equal(keys.data, hash['v1'].keys.sort)
    assert_equal(keys.data, hash['v1'].values.sort)
  end
  # create_yaml must reject an empty field list; the result is checked first as
  # a returned string and then as written to a Tempfile.
  def test_create_yaml
    assert_raise ArgumentError do
      Statsample::Codification.create_yaml(@ds,[])
    end
    expected_keys_v1=%w{run walk walking running sleep sleeping dream dreaming}.sort
    yaml_hash=Statsample::Codification.create_yaml(@ds,['v1'])
    h=YAML::load(yaml_hash)
    assert_equal(['v1'],h.keys)
    assert_equal(expected_keys_v1,h['v1'].keys.sort)
    tf =
Tempfile.new("test_codification")
    yaml_hash=Statsample::Codification.create_yaml(@ds,['v1'],tf, Statsample::SPLIT_TOKEN)
    # Close and reopen the tempfile before loading the YAML back from it.
    tf.close
    tf.open
    h=YAML::load(tf)
    assert_equal(['v1'],h.keys)
    assert_equal(expected_keys_v1,h['v1'].keys.sort)
    tf.close(true)
  end
  # recode_vector returns one array of codes per case and nil for a nil case;
  # the second expectation shows repeated codes within a case collapsing.
  def test_recodification
    expected=[['r'],['w','r'],['w'],['r'],['s'],['s','d'], ['s','d']]
    assert_equal(expected,Statsample::Codification.recode_vector(@ds['v1'],@dict))
    v2=['run','walk,dreaming',nil,'walk,dream,dreaming,walking'].to_vector
    expected=[['r'],['w','d'],nil,['w','d']]
    assert_equal(expected,Statsample::Codification.recode_vector(v2,@dict))
  end
  # The simple recode adds 'v1_recoded' and leaves 'v1' different from it.
  def test_recode_dataset_simple
    Statsample::Codification.recode_dataset_simple!(@ds,{'v1'=>@dict})
    expected_vector=['r','w,r','w','r','s','s,d', 's,d'].to_vector
    assert_not_equal(expected_vector,@ds['v1'])
    assert_equal(expected_vector,@ds['v1_recoded'])
  end
  # The split recode adds one 0/1 vector per code, named 'v1_<code>'.
  def test_recode_dataset_split
    Statsample::Codification.recode_dataset_split!(@ds,{'v1'=>@dict})
    e={}
    e['r']=[1,1,0,1,0,0,0].to_vector
    e['w']=[0,1,1,0,0,0,0].to_vector
    e['s']=[0,0,0,0,1,1,1].to_vector
    e['d']=[0,0,0,0,0,1,1].to_vector
    e.each{|k,expected|
      assert_equal(expected,@ds['v1_'+k],"Error on key #{k}")
    }
  end

end
================================================
FILE: test/test_crosstab.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Crosstab on a 13-case hair-colour by sex sample.
class StatsampleCrosstabTestCase < MiniTest::Unit::TestCase

  # Fixtures are built in the constructor rather than in setup.
  def initialize(*args)
    @v1=%w{black blonde black black red black brown black blonde black red black blonde}.to_vector
    @v2=%w{woman man man woman man man man woman man woman woman man man}.to_vector
    @ct=Statsample::Crosstab.new(@v1,@v2)
    super
  end
  # e1 is a plain Array (no to_vector); e2 is a vector with 11 elements against
  # @v2's 13. Both must be rejected with ArgumentError.
  def test_crosstab_errors
    e1=%w{black blonde black black red black brown black blonde black}
    assert_raise ArgumentError do
      Statsample::Crosstab.new(e1,@v2)
    end
    e2=%w{black blonde black black red black brown black blonde black black}.to_vector

    assert_raise ArgumentError do
      Statsample::Crosstab.new(e2,@v2)
    end
assert_nothing_raised do
      Statsample::Crosstab.new(@v1,@v2)
    end
  end
  # Row/column names come back sorted; totals are hashes keyed by category.
  def test_crosstab_basic
    assert_equal(%w{black blonde brown red}, @ct.rows_names)
    assert_equal(%w{man woman}, @ct.cols_names)
    assert_equal({'black'=>7,'blonde'=>3,'red'=>2,'brown'=>1}, @ct.rows_total)
    assert_equal({'man'=>8,'woman'=>5}, @ct.cols_total)
  end
  # 4 row categories x 2 column categories = 8 cells, summing to the 13 cases.
  def test_crosstab_frequencies
    fq=@ct.frequencies
    assert_equal(8,fq.size)
    sum=fq.inject(0) {|s,x| s+x[1]}
    assert_equal(13,sum)
    fr=@ct.frequencies_by_row
    assert_equal(4,fr.size)
    assert_equal(%w{black blonde brown red},fr.keys.sort)
    fc=@ct.frequencies_by_col
    assert_equal(2,fc.size)
    assert_equal(%w{man woman},fc.keys.sort)
    assert_equal(Matrix.rows([[3,4],[3,0],[1,0],[1,1]]),@ct.to_matrix)
  end
  # Summary must render with all three percentage options enabled.
  def test_summary
    @ct.percentage_row=true
    @ct.percentage_column=true
    @ct.percentage_total=true
    assert(@ct.summary.size>0)
  end
  # Marginals are 5/5 on both axes over 10 cases, so each expected cell is 2.5.
  def test_expected
    v1=%w{1 1 1 1 1 0 0 0 0 0}.to_vector
    v2=%w{0 0 0 0 0 1 1 1 1 1}.to_vector
    ct=Statsample::Crosstab.new(v1,v2)
    assert_equal(Matrix[[2.5,2.5],[2.5,2.5]],ct.matrix_expected)
  end
  # Same data as scale vectors: the table must build and summarise.
  def test_crosstab_with_scale
    v1=%w{1 1 1 1 1 0 0 0 0 0}.to_scale
    v2=%w{0 0 0 0 0 1 1 1 1 1}.to_scale
    ct=Statsample::Crosstab.new(v1,v2)
    assert_equal(Matrix[[0,5],[5,0]],ct.to_matrix)
    assert_nothing_raised { ct.summary }
  end
end
================================================
FILE: test/test_csv.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::CSV reading and writing against the CSV fixtures.
class StatsampleCSVTestCase < MiniTest::Unit::TestCase
  def setup
    @ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/test_csv.csv")
  end
  # The fixture must load as 6 cases over 5 fields, with the last case's age
  # and city read as nil.
  def test_read
    assert_equal(6,@ds.cases)
    assert_equal(%w{id name age city a1}, @ds.fields)
    id=[1,2,3,4,5,6].to_vector(:scale)
    name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal)
    age=[20,23,25,27,5.5,nil].to_vector(:scale)
    city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal)
    a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal)
ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1}) ds_exp.fields.each{|f| assert_equal(ds_exp[f],@ds[f]) } assert_equal(ds_exp,@ds) end def test_nil assert_equal(nil,@ds['age'][5]) end def test_repeated ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv") assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields) age=[3,4,5,6,nil,8].to_vector(:scale) assert_equal(age,ds['age_2']) end def test_write filename=Tempfile.new("afile") # filename=Dir::tmpdir+"/test_write.csv" Statsample::CSV.write(@ds, filename.path) ds2=Statsample::CSV.read(filename.path) i=0 ds2.each_array{|row| assert_equal(@ds.case_as_array(i),row) i+=1 } end end =begin class StatsampleCSVTestCase2 < MiniTest::Unit::TestCase def setup @ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/test_csv.csv") end def test_read assert_equal(6,@ds.cases) assert_equal(%w{id name age city a1}, @ds.fields) id=[1,2,3,4,5,6].to_vector(:scale) name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal) age=[20,23,25,27,5.5,nil].to_vector(:scale) city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal) a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal) ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1}) ds_exp.fields.each{|f| assert_equal(ds_exp[f],@ds[f]) } assert_equal(ds_exp,@ds) end def test_nil assert_equal(nil,@ds['age'][5]) end def test_repeated ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv") assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields) age=[3,4,5,6,nil,8].to_vector(:scale) assert_equal(age,ds['age_2']) end def test_write filename=Tempfile.new("afile") # filename=Dir::tmpdir+"/test_write.csv" Statsample::CSV.write(@ds, filename.path) ds2=Statsample::CSV.read19(filename.path) i=0 ds2.each_array{|row| assert_equal(@ds.case_as_array(i),row) i+=1 } end end 
=end
================================================
FILE: test/test_dataset.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::Dataset: construction, field ordering, row access,
# derived vectors, copying and filtering.
class StatsampleDatasetTestCase < MiniTest::Unit::TestCase
  # Shared five-case dataset with an explicit field order; 'a1' holds
  # comma-separated values and one nil.
  def setup
    @ds=Statsample::Dataset.new({'id' => Statsample::Vector.new([1,2,3,4,5]), 'name'=>Statsample::Vector.new(%w{Alex Claude Peter Franz George}), 'age'=>Statsample::Vector.new([20,23,25,27,5]),
    'city'=>Statsample::Vector.new(['New York','London','London','Paris','Tome']),
    'a1'=>Statsample::Vector.new(['a,b','b,c','a',nil,'a,b,c'])}, ['id','name','age','city','a1'])
  end
  # nest('a','b') groups cases by 'a' then 'b'; each leaf is an array of hashes
  # holding the remaining field.
  def test_nest
    ds={
      'a'=>%w{a a a b b b}.to_vector,
      'b'=>%w{c c d d e e}.to_vector,
      'c'=>%w{f g h i j k}.to_vector
    }.to_dataset
    nest=ds.nest('a','b')
    assert_equal([{'c'=>'f'},{'c'=>'g'}], nest['a']['c'])
    assert_equal([{'c'=>'h'}], nest['a']['d'])
    assert_equal([{'c'=>'j'},{'c'=>'k'}], nest['b']['e'])
  end
  def test_should_have_summary
    assert(@ds.summary.size>0)
  end
  def test_basic
    assert_equal(5,@ds.cases)
    assert_equal(%w{id name age city a1}, @ds.fields)
  end
  # Save to a temp file and load it back; the result must equal the original.
  def test_saveload
    outfile=Tempfile.new("dataset.ds")
    @ds.save(outfile.path)
    a=Statsample.load(outfile.path)
    assert_equal(@ds,a)
  end
  # Conversion to a GSL matrix; skipped when GSL is not available.
  def test_gsl
    if Statsample.has_gsl?
matrix=GSL::Matrix[[1,2],[3,4],[5,6]]
      ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
      assert_equal(matrix,ds.to_gsl)
    else
      skip("Gsl needed")
    end
  end
  # to_matrix: one row per case, one column per vector.
  def test_matrix
    matrix=Matrix[[1,2],[3,4],[5,6]]
    ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
    assert_equal(matrix,ds.to_matrix)
  end

  # Assigning a partial field list keeps the unnamed fields, appended after the
  # named ones (a1, city in the second expectation).
  def test_fields
    @ds.fields=%w{name a1 id age city}
    assert_equal(%w{name a1 id age city}, @ds.fields)
    @ds.fields=%w{id name age}
    assert_equal(%w{id name age a1 city}, @ds.fields)
  end
  # merge concatenates fields in receiver-then-argument order; a field name
  # present in both datasets ('a') is renamed a_1 / a_2.
  def test_merge
    a=[1,2,3].to_scale
    b=[3,4,5].to_vector
    c=[4,5,6].to_scale
    d=[7,8,9].to_vector
    e=[10,20,30].to_vector
    ds1={'a'=>a,'b'=>b}.to_dataset
    ds2={'c'=>c,'d'=>d}.to_dataset
    exp={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset

    assert_equal(exp,ds1.merge(ds2))
    exp.fields=%w{c d a b}
    assert_equal(exp,ds2.merge(ds1))
    ds3={'a'=>e}.to_dataset
    exp={'a_1'=>a,'b'=>b,'a_2'=>e}.to_dataset
    exp.fields=%w{a_1 b a_2}
    assert_equal(exp,ds1.merge(ds3))
  end
  # each_vector yields [name, vector] pairs in the current field order.
  def test_each_vector
    a=[1,2,3].to_vector
    b=[3,4,5].to_vector
    fields=["a","b"]
    ds=Statsample::Dataset.new({'a'=>a,'b'=>b},fields)
    res=[]
    ds.each_vector{|k,v|
      res.push([k,v])
    }
    assert_equal([["a",a],["b",b]],res)
    ds.fields=["b","a"]
    res=[]
    ds.each_vector{|k,v|
      res.push([k,v])
    }
    assert_equal([["b",b],["a",a]],res)
  end
  # Equality depends on field order as well as on vector contents.
  def test_equality
    v1=[1,2,3,4].to_vector
    v2=[5,6,7,8].to_vector
    ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1})
    v3=[1,2,3,4].to_vector
    v4=[5,6,7,8].to_vector
    ds2=Statsample::Dataset.new({'v1'=>v3,'v2'=>v4}, %w{v2 v1})
    assert_equal(ds1,ds2)
    ds2.fields=%w{v1 v2}
    assert_not_equal(ds1,ds2)
  end
  # add_vector appends a field; a 7-element vector on this 5-case dataset must
  # raise ArgumentError.
  def test_add_vector
    v=Statsample::Vector.new(%w{a b c d e})
    @ds.add_vector('new',v)
    assert_equal(%w{id name age city a1 new},@ds.fields)
    x=Statsample::Vector.new(%w{a b c d e f g})
    assert_raise ArgumentError do
      @ds.add_vector('new2',x)
    end
  end
  def test_vector_by_calculation
    a1=[1,2,3,4,5,6,7].to_vector(:scale)
    a2=[10,20,30,40,50,60,70].to_vector(:scale)
    a3=[100,200,300,400,500,600,700].to_vector(:scale)
ds={'a1'=>a1,'a2'=>a2,'a3'=>a3}.to_dataset
    # The block receives each case as a hash and returns that row's value.
    total=ds.vector_by_calculation() {|row|
      row['a1']+row['a2']+row['a3']
    }
    expected=[111,222,333,444,555,666,777].to_vector(:scale)
    assert_equal(expected,total)
  end
  # vector_sum over all fields or a subset; a row is nil in the result whenever
  # any of the summed fields is nil for that row.
  def test_vector_sum
    a1=[1  ,2 ,3 ,4  , 5,nil].to_vector(:scale)
    a2=[10 ,10,20,20 ,20,30].to_vector(:scale)
    b1=[nil,1 ,1 ,1  ,1 ,2].to_vector(:scale)
    b2=[2  ,2 ,2 ,nil,2 ,3].to_vector(:scale)
    ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2}.to_dataset
    total=ds.vector_sum
    a=ds.vector_sum(['a1','a2'])
    b=ds.vector_sum(['b1','b2'])
    expected_a=[11,12,23,24,25,nil].to_vector(:scale)
    expected_b=[nil,3,3,nil,3,5].to_vector(:scale)
    expected_total=[nil,15,26,nil,28,nil].to_vector(:scale)
    assert_equal(expected_a, a)
    assert_equal(expected_b, b)
    assert_equal(expected_total, total)
  end
  # vector_missing_values counts the nil cells in each row.
  def test_vector_missing_values
    a1=[1  ,nil ,3 ,4  , 5,nil].to_vector(:scale)
    a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale)
    b1=[nil,nil ,1 ,1  ,1 ,2].to_vector(:scale)
    b2=[2  ,2   ,2 ,nil,2 ,3].to_vector(:scale)
    c= [nil,2   , 4,2   ,2 ,2].to_vector(:scale)
    ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
    mva=[2,3,0,1,0,1].to_vector(:scale)
    assert_equal(mva,ds.vector_missing_values)
  end

  # A dataset with nils reports missing data; its dup_only_valid copy does not.
  def test_has_missing_values
    a1=[1  ,nil ,3 ,4  , 5,nil].to_vector(:scale)
    a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale)
    b1=[nil,nil ,1 ,1  ,1 ,2].to_vector(:scale)
    b2=[2  ,2   ,2 ,nil,2 ,3].to_vector(:scale)
    c= [nil,2   , 4,2   ,2 ,2].to_vector(:scale)
    ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
    assert(ds.has_missing_data?)
    clean=ds.dup_only_valid
    assert(!clean.has_missing_data?)
end

  # vector_count_characters: expected values equal the summed length of each
  # cell's text form per row, with nil contributing 0 (e.g. row 0: 1+2+0+1+0 = 4).
  def test_vector_count_characters
    a1=[1  ,"abcde"  ,3  ,4  , 5,nil].to_vector(:scale)
    a2=[10 ,20.3     ,20 ,20 ,20,30].to_vector(:scale)
    b1=[nil,"343434" ,1  ,1  ,1 ,2].to_vector(:scale)
    b2=[2  ,2        ,2  ,nil,2 ,3].to_vector(:scale)
    c= [nil,2        ,"This is a nice example",2 ,2 ,2].to_vector(:scale)
    ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
    exp=[4,17,27,5,6,5].to_vector(:scale)
    assert_equal(exp,ds.vector_count_characters)

  end
  # vector_mean over all fields, and over subsets with a second argument of 1.
  # NOTE(review): judging by the expectations, that argument is the number of
  # nils tolerated per row before the mean becomes nil -- confirm against
  # Dataset#vector_mean.
  def test_vector_mean
    a1=[1  ,2 ,3 ,4  , 5,nil].to_vector(:scale)
    a2=[10 ,10,20,20 ,20,30].to_vector(:scale)
    b1=[nil,1 ,1 ,1  ,1 ,2].to_vector(:scale)
    b2=[2  ,2 ,2 ,nil,2 ,3].to_vector(:scale)
    c= [nil,2, 4,2   ,2 ,2].to_vector(:scale)
    ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
    total=ds.vector_mean
    a=ds.vector_mean(['a1','a2'],1)
    b=ds.vector_mean(['b1','b2'],1)
    # From here on the local c holds the computed mean, no longer the input vector.
    c=ds.vector_mean(['b1','b2','c'],1)
    expected_a=[5.5,6,11.5,12,12.5,30].to_vector(:scale)
    expected_b=[2,1.5,1.5,1,1.5,2.5].to_vector(:scale)
    expected_c=[nil, 5.0/3,7.0/3,1.5,5.0/3,7.0/3].to_vector(:scale)
    expected_total=[nil,3.4,6,nil,6.0,nil].to_vector(:scale)
    assert_equal(expected_a, a)
    assert_equal(expected_b, b)
    assert_equal(expected_c, c)
    assert_equal(expected_total, total)
  end

  # each_array yields every case as an array in field order.
  def test_each_array
    expected=[[1,'Alex',20,'New York','a,b'], [2,'Claude',23,'London','b,c'], [3,'Peter',25,'London','a'],[4,'Franz', 27,'Paris',nil],[5,'George',5,'Tome','a,b,c']]
    out=[]
    @ds.each_array{ |a|
      out.push(a)
    }
    assert_equal(expected,out)
  end
  # recode! replaces a field in place with the block's value for each case.
  def test_recode
    @ds['age'].type=:scale
    @ds.recode!("age") {|c| c['id']*2}
    expected=[2,4,6,8,10].to_vector(:scale)
    assert_equal(expected,@ds['age'])
  end
  # Row access as hash and as array, through both the public methods and the
  # underscore-prefixed variants.
  def test_case_as
    assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds.case_as_hash(0))
    assert_equal([5,'George',5,'Tome','a,b,c'],@ds.case_as_array(4))
    # Native methods
    assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds._case_as_hash(0))
    assert_equal([5,'George',5,'Tome','a,b,c'],@ds._case_as_array(4))



  end
  def test_delete_vector
@ds.delete_vector('name')
    # The field must disappear from both the ordered list and the vector hash.
    assert_equal(%w{id age city a1},@ds.fields)
    assert_equal(%w{a1 age city id},@ds.vectors.keys.sort)
  end
  def test_change_type
    @ds.col('age').type=:scale
    assert_equal(:scale,@ds.col('age').type)
  end
  # Splitting 'a1' adds one 0/1 vector per distinct value, named with numeric
  # suffixes; each new vector's name is 'a1:<value>' and a nil case stays nil.
  def test_split_by_separator_recode
    @ds.add_vectors_by_split_recode("a1","_")
    assert_equal(%w{id name age city a1 a1_1 a1_2 a1_3},@ds.fields)
    assert_equal([1,0,1,nil,1],@ds.col('a1_1').to_a)
    assert_equal([1,1,0,nil,1],@ds.col('a1_2').to_a)
    assert_equal([0,1,0,nil,1],@ds.col('a1_3').to_a)
    {'a1_1'=>'a1:a', 'a1_2'=>'a1:b', 'a1_3'=>'a1:c'}.each do |k,v|
      assert_equal(v, @ds[k].name)
    end
  end
  # Same split, but the new fields are suffixed with the value itself.
  def test_split_by_separator
    @ds.add_vectors_by_split("a1","_")
    assert_equal(%w{id name age city a1 a1_a a1_b a1_c},@ds.fields)
    assert_equal([1,0,1,nil,1],@ds.col('a1_a').to_a)
    assert_equal([1,1,0,nil,1],@ds.col('a1_b').to_a)
    assert_equal([0,1,0,nil,1],@ds.col('a1_c').to_a)
  end
  # Median and 25th percentile for ranges of 100, 99 and 50 consecutive integers.
  def test_percentiles
    v1=(1..100).to_a.to_scale
    assert_equal(50.5,v1.median)
    assert_equal(25.5, v1.percentil(25))
    v2=(1..99).to_a.to_scale
    assert_equal(50,v2.median)
    assert_equal(25,v2.percentil(25))
    v3=(1..50).to_a.to_scale
    assert_equal(25.5, v3.median)
    assert_equal(13, v3.percentil(25))

  end
  # add_case accepts an array, a hash, or an array of arrays (which adds two
  # cases here); add_case_array is followed by update_valid_data before the
  # new case is read back.
  def test_add_case
    ds=Statsample::Dataset.new({'a'=>[].to_vector, 'b'=>[].to_vector, 'c'=>[].to_vector})
    ds.add_case([1,2,3])
    ds.add_case({'a'=>4,'b'=>5,'c'=>6})
    ds.add_case([[7,8,9],%w{a b c}])
    assert_equal({'a'=>1,'b'=>2,'c'=>3},ds.case_as_hash(0))
    assert_equal([4,5,6],ds.case_as_array(1))
    assert_equal([7,8,9],ds.case_as_array(2))
    assert_equal(['a','b','c'],ds.case_as_array(3))
    ds.add_case_array([6,7,1])
    ds.update_valid_data
    assert_equal([6,7,1],ds.case_as_array(4))

  end
  # A Marshal dump/load round trip must give back an equal dataset.
  def test_marshaling
    ds_marshal=Marshal.load(Marshal.dump(@ds))
    assert_equal(ds_marshal,@ds)
  end
  # Indexing with a Range of field names returns a sub-dataset that shares the
  # very same vector objects (assert_same).
  def test_range
    v1=[1,2,3,4].to_vector
    v2=[5,6,7,8].to_vector
    v3=[9,10,11,12].to_vector
    ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3}, %w{v3 v2 v1})
    assert_same(v1,ds1['v1'])
    ds2=ds1["v2".."v1"]
    assert_equal(%w{v2 v1},ds2.fields)
assert_same(ds1['v1'],ds2['v1']) assert_same(ds1['v2'],ds2['v2']) end def test_clone v1=[1,2,3,4].to_vector v2=[5,6,7,8].to_vector ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1}) ds2=ds1.clone assert_equal(ds1,ds2) assert_not_same(ds1,ds2) assert_equal(ds1['v1'],ds2['v1']) assert_same(ds1['v1'], ds2['v1']) assert_equal(ds1.fields,ds2.fields) assert_not_same(ds1.fields,ds2.fields) assert_equal(ds1.cases,ds2.cases) # partial clone ds3=ds1.clone('v1') ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1}) assert_equal(ds_exp,ds3) assert_not_same(ds_exp,ds3) assert_equal(ds3['v1'],ds_exp['v1']) assert_same(ds3['v1'],ds_exp['v1']) assert_equal(ds3.fields,ds_exp.fields) assert_equal(ds3.cases,ds_exp.cases) assert_not_same(ds3.fields,ds_exp.fields) end def test_dup v1=[1,2,3,4].to_vector v2=[5,6,7,8].to_vector ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1}) ds2=ds1.dup assert_equal(ds1,ds2) assert_not_same(ds1,ds2) assert_equal(ds1['v1'],ds2['v1']) assert_not_same(ds1['v1'],ds2['v1']) assert_equal(ds1.cases,ds2.cases) assert_equal(ds1.fields,ds2.fields) assert_not_same(ds1.fields,ds2.fields) ds1['v1'].type=:scale # dup partial ds3=ds1.dup('v1') ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1}) assert_equal(ds_exp,ds3) assert_not_same(ds_exp,ds3) assert_equal(ds3['v1'],ds_exp['v1']) assert_not_same(ds3['v1'],ds_exp['v1']) assert_equal(ds3.fields,ds_exp.fields) assert_equal(ds3.cases,ds_exp.cases) assert_not_same(ds3.fields,ds_exp.fields) # empty ds3=ds1.dup_empty assert_not_equal(ds1,ds3) assert_not_equal(ds1['v1'],ds3['v1']) assert_equal([],ds3['v1'].data) assert_equal([],ds3['v2'].data) assert_equal(:scale,ds3['v1'].type) assert_equal(ds1.fields,ds2.fields) assert_not_same(ds1.fields,ds2.fields) end def test_from_to assert_equal(%w{name age city}, @ds.from_to("name","city")) assert_raise ArgumentError do @ds.from_to("name","a2") end end def test_each_array_with_nils v1=[1,-99,3,4,"na"].to_vector(:scale,:missing_values=>[-99,"na"]) 
v2=[5,6,-99,8,20].to_vector(:scale,:missing_values=>[-99])
    v3=[9,10,11,12,20].to_vector(:scale,:missing_values=>[-99])
    ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3})
    ds2=ds1.dup_empty
    # Rows copied through each_array_with_nils must carry nil wherever the
    # source held one of its declared missing values.
    ds1.each_array_with_nils {|row|
      ds2.add_case_array(row)
    }
    ds2.update_valid_data
    assert_equal([1,nil,3,4,nil],ds2['v1'].data)
    assert_equal([5,6,nil,8,20],ds2['v2'].data)
  end
  # dup_only_valid keeps only the rows with no nil in the considered fields
  # (all fields by default, or the subset passed in).
  def test_dup_only_valid
    v1=[1,nil,3,4].to_vector(:scale)
    v2=[5,6,nil,8].to_vector(:scale)
    v3=[9,10,11,12].to_vector(:scale)
    ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3})
    ds2=ds1.dup_only_valid
    expected=Statsample::Dataset.new({'v1'=>[1,4].to_vector(:scale), 'v2'=> [5,8].to_vector(:scale), 'v3'=>[9, 12].to_vector(:scale)})
    assert_equal(expected,ds2)
    assert_equal(expected.vectors.values,Statsample::only_valid(v1,v2,v3))
    expected_partial=Statsample::Dataset.new({'v1'=>[1,3,4].to_vector(:scale), 'v3'=>[9, 11,12].to_vector(:scale)})
    assert_equal(expected_partial, ds1.dup_only_valid(%w{v1 v3}))


  end
  # filter returns a dataset holding only the cases the block accepts; the
  # expected 'age' vector is :scale, matching the type set on the source.
  def test_filter
    @ds['age'].type=:scale
    filtered=@ds.filter{|c| c['id']==2 or c['id']==4}
    expected=Statsample::Dataset.new({'id' => Statsample::Vector.new([2,4]), 'name'=>Statsample::Vector.new(%w{Claude Franz}), 'age'=>Statsample::Vector.new([23,27],:scale),
    'city'=>Statsample::Vector.new(['London','Paris']),
    'a1'=>Statsample::Vector.new(['b,c',nil,])}, ['id','name','age','city','a1'])
    assert_equal(expected,filtered)
  end
  # filter_field returns one field's values for the accepted cases, as a vector.
  def test_filter_field
    @ds['age'].type=:scale
    filtered=@ds.filter_field('id') {|c| c['id']==2 or c['id']==4}
    expected=[2,4].to_vector
    assert_equal(expected,filtered)

  end
  # verify runs the given tests over every case and returns one message per
  # failure.
  # NOTE(review): create_test is not defined in this chunk; presumably a helper
  # from helpers_tests.rb or the Statsample namespace -- confirm.
  def test_verify
    name=%w{r1 r2 r3 r4}.to_vector(:nominal)
    v1=[1,2,3,4].to_vector(:scale)
    v2=[4,3,2,1].to_vector(:scale)
    v3=[10,20,30,40].to_vector(:scale)
    v4=%w{a b a b}.to_vector(:nominal)
    ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'id'=>name}.to_dataset
    ds.fields=%w{v1 v2 v3 v4 id}
    #Correct
    t1=create_test("If v4=a, v1 odd") {|r| r['v4']=='b' or (r['v4']=='a' and r['v1']%2==1)}
    t2=create_test("v3=v1*10") {|r|
r['v3']==r['v1']*10}
    # Fail!
    t3=create_test("v4='b'") {|r| r['v4']=='b'}
    # Failure messages read "<row number> [<id>]: <test name>". Without an id
    # field the bracket repeats the row number; with 'id' as the first argument
    # it shows that field's value.
    exp1=["1 [1]: v4='b'", "3 [3]: v4='b'"]
    exp2=["1 [r1]: v4='b'", "3 [r3]: v4='b'"]
    res=ds.verify(t3,t1,t2)
    assert_equal(exp1,res)
    res=ds.verify('id',t1,t2,t3)
    assert_equal(exp2,res)
  end
  # compute evaluates an expression given as a string over the fields; the
  # expectation for "v1/2" is built with quo, i.e. without integer truncation.
  def test_compute_operation
    v1=[1,2,3,4].to_vector(:scale)
    v2=[4,3,2,1].to_vector(:scale)
    v3=[10,20,30,40].to_vector(:scale)
    vscale=[1.quo(2),1,3.quo(2),2].to_vector(:scale)
    vsum=[1+4+10.0,2+3+20.0,3+2+30.0,4+1+40.0].to_vector(:scale)
    vmult=[1*4,2*3,3*2,4*1].to_vector(:scale)
    ds={'v1'=>v1,'v2'=>v2,'v3'=>v3}.to_dataset
    assert_equal(vscale,ds.compute("v1/2"))
    assert_equal(vsum,ds.compute("v1+v2+v3"))
    assert_equal(vmult,ds.compute("v1*v2"))

  end
  # crosstab_by_asignation pivots (row id, column id, value) triples into a
  # dataset with a nominal '_id' field and one scale field per column id.
  def test_crosstab_with_asignation
    v1=%w{a a a b b b c c c}.to_vector
    v2=%w{a b c a b c a b c}.to_vector
    v3=%w{0 1 0 0 1 1 0 0 1}.to_scale
    ds=Statsample::Dataset.crosstab_by_asignation(v1,v2,v3)
    assert_equal(:nominal, ds['_id'].type)
    assert_equal(:scale, ds['a'].type)
    assert_equal(:scale, ds['b'].type)
    ev_id=%w{a b c}.to_vector
    ev_a =%w{0 0 0}.to_scale
    ev_b =%w{1 1 0}.to_scale
    ev_c =%w{0 1 1}.to_scale
    ds2={'_id'=>ev_id, 'a'=>ev_a, 'b'=>ev_b, 'c'=>ev_c}.to_dataset
    assert_equal(ds, ds2)
  end
  # one_to_many turns the repeated car_color<n>/car_value<n> field groups into
  # one row per non-empty group; '_col_id' carries the group number n.
  def test_one_to_many
    cases=[
      ['1','george','red',10,'blue',20,nil,nil],
      ['2','fred','green',15,'orange',30,'white',20],
      ['3','alfred',nil,nil,nil,nil,nil,nil]
    ]
    ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
    cases.each {|c| ds.add_case_array c }
    ds.update_valid_data
    ids=%w{1 1 2 2 2}.to_vector
    colors=%w{red blue green orange white}.to_vector
    values=[10,20,15,30,20].to_vector
    col_ids=[1,2,1,2,3].to_scale
    ds_expected={'id'=>ids, '_col_id'=>col_ids, 'color'=>colors, 'value'=>values}.to_dataset(['id','_col_id', 'color','value'])
    assert_equal(ds_expected, ds.one_to_many(%w{id}, "car_%v%n"))

  end

end
================================================
FILE: test/test_dominance_analysis.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
# Tests for Statsample::DominanceAnalysis driven by correlation matrices
# (extended with Statsample::CovariateMatrix) rather than by raw datasets.
class StatsampleDominanceAnalysisTestCase < MiniTest::Unit::TestCase
  # One dependent variable ('y') and four predictors.
  def test_dominance_univariate
    # Example from Budescu (1993)
    m=Matrix[[1, 0.683, 0.154, 0.460, 0.618],[0.683, 1, -0.050, 0.297, 0.461], [0.154, -0.050, 1, 0.006, 0.262],[0.460, 0.297, 0.006, 1, 0.507],[0.618, 0.461, 0.262, 0.507, 1]]
    m.extend Statsample::CovariateMatrix
    m.fields=%w{x1 x2 x3 x4 y}
    da=Statsample::DominanceAnalysis.new(m,'y')

    # Contribution of each other predictor when added to the model holding x1.
    contr_x1={'x2'=>0.003, 'x3'=>0.028, 'x4'=>0.063}
    contr_x1.each do |k,v|
      assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.001)
    end
    assert_in_delta(0.052, da.models_data[['x2','x3','x4']].contributions['x1'], 0.001)
    # Expected pairwise dominance values, in the order of da.pairs. Total and
    # conditional dominance share one expectation list; general dominance has
    # its own.
    expected_dominances=[1, 1, 0.5, 0.5, 0,0]
    expected_g_dominances=[1, 1, 1, 1, 0,0]

    da.pairs.each_with_index do |a,i|
      assert_equal(expected_dominances[i], da.total_dominance_pairwise(a[0],a[1]))
      assert_equal(expected_dominances[i], da.conditional_dominance_pairwise(a[0],a[1]))
      assert_equal(expected_g_dominances[i], da.general_dominance_pairwise(a[0],a[1]))
    end
    assert(da.summary.size>0)
  end
  # Two dependent variables ('y1','y2') with :method_association=>:p2yx.
  def test_dominance_multivariate
    m=Matrix[[1.0, -0.19, -0.358, -0.343, 0.359, 0.257], [-0.19, 1.0, 0.26, 0.29, -0.11, -0.11], [-0.358, 0.26, 1.0, 0.54, -0.49, -0.23], [-0.343, 0.29, 0.54, 1.0, -0.22, -0.41], [0.359, -0.11, -0.49, -0.22, 1.0, 0.62], [0.257, -0.11, -0.23, -0.41, 0.62, 1]]
    m.extend Statsample::CovariateMatrix
    m.fields=%w{y1 y2 x1 x2 x3 x4}
    # NOTE(review): m2 is never used below; it only exercises submatrix.
    m2=m.submatrix(%w{y1 x1 x2 x3 x4})


    da=Statsample::DominanceAnalysis.new(m, ['y1','y2'], :cases=>683, :method_association=>:p2yx)

    contr_x1={'x2'=>0.027, 'x3'=>0.024, 'x4'=>0.017}
    contr_x1.each do |k,v|
      assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.003)
    end


  end
end
================================================
FILE: test/test_factor.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
#require 'rserve'
#require 'statsample/rserve_extension'
class
StatsampleFactorTestCase < MiniTest::Unit::TestCase include Statsample::Fixtures # Based on Hardle and Simar def setup @fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures") end # Based on Hurdle example def test_covariance_matrix ds=Statsample::PlainText.read(@fixtures_dir+"/bank2.dat", %w{v1 v2 v3 v4 v5 v6}) ds.fields.each {|f| ds[f]=ds[f].centered } cm=ds.covariance_matrix pca =Statsample::Factor::PCA.new( cm, :m=>6) #puts pca.summary #puts pca.feature_matrix exp_eig=[2.985, 0.931,0.242, 0.194, 0.085, 0.035].to_scale assert_similar_vector(exp_eig, pca.eigenvalues.to_scale, 0.1) pcs=pca.principal_components(ds) k=6 comp_matrix=pca.component_matrix() k.times {|i| pc_id="PC_#{i+1}" k.times {|j| # variable ds_id="v#{j+1}" r= Statsample::Bivariate.correlation(ds[ds_id], pcs[pc_id]) assert_in_delta( r, comp_matrix[j,i]) } } end def test_principalcomponents_ruby_gsl ran=Distribution::Normal.rng # @r=::Rserve::Connection.new samples=20 [3,5,7].each {|k| v={} v["x0"]=samples.times.map { ran.call()}.to_scale.centered (1...k).each {|i| v["x#{i}"]=samples.times.map {|ii| ran.call()*0.5+v["x#{i-1}"][ii]*0.5}.to_scale.centered } ds=v.to_dataset cm=ds.covariance_matrix # @r.assign('ds',ds) # @r.eval('cm<-cor(ds);sm<-eigen(cm, sym=TRUE);v<-sm$vectors') # puts "eigenvalues" # puts @r.eval('v').to_ruby.to_s pca_ruby=Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>false ) pca_gsl =Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>true ) pc_ruby = pca_ruby.principal_components(ds) pc_gsl = pca_gsl.principal_components(ds) # Test component matrix correlation! cm_ruby=pca_ruby.component_matrix #puts cm_ruby.summary k.times {|i| pc_id="PC_#{i+1}" assert_in_delta(pca_ruby.eigenvalues[i], pca_gsl.eigenvalues[i],1e-10) # Revert gsl component values pc_gsl_data= (pc_gsl[pc_id][0]-pc_ruby[pc_id][0]).abs>1e-6 ? 
pc_gsl[pc_id].recode {|v| -v} : pc_gsl[pc_id] assert_similar_vector(pc_gsl_data, pc_ruby[pc_id], 1e-6,"PC for #{k} variables") if false k.times {|j| # variable ds_id="x#{j}" r= Statsample::Bivariate.correlation(ds[ds_id],pc_ruby[pc_id]) puts "#{pc_id}-#{ds_id}:#{r}" } end } } #@r.close end def test_principalcomponents() principalcomponents(true) principalcomponents(false) end def principalcomponents(gsl) ran=Distribution::Normal.rng samples=50 x1=samples.times.map { ran.call()}.to_scale x2=samples.times.map {|i| ran.call()*0.5+x1[i]*0.5}.to_scale ds={'x1'=>x1,'x2'=>x2}.to_dataset cm=ds.correlation_matrix r=cm[0,1] pca=Statsample::Factor::PCA.new(cm,:m=>2,:use_gsl=>gsl) assert_in_delta(1+r,pca.eigenvalues[0],1e-10) assert_in_delta(1-r,pca.eigenvalues[1],1e-10) hs=1.0 / Math.sqrt(2) assert_equal_vector(Vector[1, 1]*hs, pca.eigenvectors[0]) m_1=gsl ? Vector[-1,1] : Vector[1,-1] assert_equal_vector(hs*m_1, pca.eigenvectors[1]) pcs=pca.principal_components(ds) exp_pc_1=ds.collect_with_index {|row,i| hs*(row['x1']+row['x2']) } exp_pc_2=ds.collect_with_index {|row,i| gsl ? 
hs*(row['x2']-row['x1']) : hs*(row['x1']-row['x2']) } assert_similar_vector(exp_pc_1, pcs["PC_1"]) assert_similar_vector(exp_pc_2, pcs["PC_2"]) end def test_antiimage cor=Matrix[[1,0.964, 0.312],[0.964,1,0.411],[0.312,0.411,1]] expected=Matrix[[0.062,-0.057, 0.074],[-0.057, 0.057, -0.089], [0.074, -0.089, 0.729]] ai=Statsample::Factor.anti_image_covariance_matrix(cor) assert(Matrix.equal_in_delta?(expected, ai, 0.01), "#{expected.to_s} not equal to #{ai.to_s}") end def test_kmo @v1=[1 ,2 ,3 ,4 ,7 ,8 ,9 ,10,14,15,20,50,60,70].to_scale @v2=[5 ,6 ,11,12,13,16,17,18,19,20,30,0,0,0].to_scale @v3=[10,3 ,20,30,40,50,80,10,20,30,40,2,3,4].to_scale # KMO: 0.490 ds={'v1'=>@v1,'v2'=>@v2,'v3'=>@v3}.to_dataset cor=Statsample::Bivariate.correlation_matrix(ds) kmo=Statsample::Factor.kmo(cor) assert_in_delta(0.667, kmo,0.001) assert_in_delta(0.81, Statsample::Factor.kmo(harman_817),0.01) end def test_kmo_univariate m=harman_817 expected=[0.73,0.76,0.84,0.87,0.53,0.93,0.78,0.86] m.row_size.times.map {|i| assert_in_delta(expected[i], Statsample::Factor.kmo_univariate(m,i),0.01) } end # Tested with SPSS and R def test_pca a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale b=[2.4, 0.7, 2.9, 2.2, 3.0, 2.7, 1.6, 1.1, 1.6, 0.9].to_scale a.recode! {|c| c-a.mean} b.recode! {|c| c-b.mean} ds={'a'=>a,'b'=>b}.to_dataset cov_matrix=Statsample::Bivariate.covariance_matrix(ds) if Statsample.has_gsl? 
pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>true) pca_set(pca,"gsl") else skip("Eigenvalues could be calculated with GSL (requires gsl)") end pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>false) pca_set(pca,"ruby") end def pca_set(pca,type) expected_eigenvalues=[1.284, 0.0490] expected_eigenvalues.each_with_index{|ev,i| assert_in_delta(ev,pca.eigenvalues[i],0.001) } expected_communality=[0.590, 0.694] expected_communality.each_with_index{|ev,i| assert_in_delta(ev,pca.communalities[i],0.001) } expected_cm=[0.768, 0.833] obs=pca.component_matrix_correlation(1).column(0).to_a expected_cm.each_with_index{|ev,i| assert_in_delta(ev,obs[i],0.001) } assert(pca.summary) end # Tested with R def test_principalaxis matrix=::Matrix[ [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]] fa=Statsample::Factor::PrincipalAxis.new(matrix,:m=>1, :max_iterations=>50) cm=::Matrix[[0.923],[0.912],[0.507],[0.483]] assert_equal_matrix(cm,fa.component_matrix,0.001) h2=[0.852,0.832,0.257,0.233] h2.each_with_index{|ev,i| assert_in_delta(ev,fa.communalities[i],0.001) } eigen1=2.175 assert_in_delta(eigen1, fa.eigenvalues[0],0.001) assert(fa.summary.size>0) fa=Statsample::Factor::PrincipalAxis.new(matrix,:smc=>false) assert_raise RuntimeError do fa.iterate end end def test_rotation_varimax a = Matrix[ [ 0.4320, 0.8129, 0.3872] , [0.7950, -0.5416, 0.2565] , [0.5944, 0.7234, -0.3441], [0.8945, -0.3921, -0.1863] ] expected= Matrix[[-0.0204423, 0.938674, -0.340334], [0.983662, 0.0730206, 0.134997], [0.0826106, 0.435975, -0.893379], [0.939901, -0.0965213, -0.309596]] varimax=Statsample::Factor::Varimax.new(a) assert(!varimax.rotated.nil?, "Rotated shouldn't be empty") assert(!varimax.component_transformation_matrix.nil?, "Component matrix shouldn't be empty") assert(!varimax.h2.nil?, 
"H2 shouldn't be empty") assert_equal_matrix(expected,varimax.rotated,1e-6) assert(varimax.summary.size>0) end end ================================================ FILE: test/test_factor_map.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) #require 'rserve' #require 'statsample/rserve_extension' class StatsampleFactorMpaTestCase < MiniTest::Unit::TestCase context Statsample::Factor::MAP do setup do m=Matrix[ [ 1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382], [ 0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415], [ 0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345], [ 0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365], [ 0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629], [ 0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577], [ 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539], [ 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1] ] @map=Statsample::Factor::MAP.new(m) end should "return correct values with pure ruby" do @map.use_gsl=false map_assertions(@map) end should_with_gsl "return correct values with gsl" do #require 'ruby-prof' @map.use_gsl=true map_assertions(@map) end end def map_assertions(map) assert_in_delta(map.minfm, 0.066445,0.00001) assert_equal(map.number_of_factors, 2) assert_in_delta(map.fm[0], 0.312475,0.00001) assert_in_delta(map.fm[1], 0.245121,0.00001) end end ================================================ FILE: test/test_factor_pa.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) #require 'rserve' #require 'statsample/rserve_extension' class StatsampleFactorTestCase < MiniTest::Unit::TestCase include Statsample::Fixtures # Based on Hardle and Simar def setup @fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures") end def test_parallelanalysis_with_data if Statsample.has_gsl? 
samples=100 variables=10 iterations=50 rng = Distribution::Normal.rng f1=samples.times.collect {rng.call}.to_scale f2=samples.times.collect {rng.call}.to_scale vectors={} variables.times do |i| if i<5 vectors["v#{i}"]=samples.times.collect {|nv| f1[nv]*5+f2[nv]*2+rng.call }.to_scale else vectors["v#{i}"]=samples.times.collect {|nv| f2[nv]*5+f1[nv]*2+rng.call }.to_scale end end ds=vectors.to_dataset pa1=Statsample::Factor::ParallelAnalysis.new(ds, :bootstrap_method=>:data, :iterations=>iterations) pa2=Statsample::Factor::ParallelAnalysis.with_random_data(samples,variables,:iterations=>iterations,:percentil=>95) 3.times do |n| var="ev_0000#{n+1}" assert_in_delta(pa1.ds_eigenvalues[var].mean, pa2.ds_eigenvalues[var].mean,0.05) end else skip("Too slow without GSL") end end def test_parallelanalysis pa=Statsample::Factor::ParallelAnalysis.with_random_data(305,8,:iterations=>100,:percentil=>95) assert_in_delta(1.2454, pa.ds_eigenvalues['ev_00001'].mean, 0.01) assert_in_delta(1.1542, pa.ds_eigenvalues['ev_00002'].mean, 0.01) assert_in_delta(1.0836, pa.ds_eigenvalues['ev_00003'].mean, 0.01) assert(pa.summary.size>0) end end ================================================ FILE: test/test_ggobi.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) require 'ostruct' class StatsampleGGobiTestCase < MiniTest::Unit::TestCase def setup v1=([10.2,20.3,10,20,30,40,30,20,30,40]*10).to_vector(:scale) @v2=(%w{a b c a a a b b c d}*10).to_vector(:nominal) @v2.labels={"a"=>"letter a","d"=>"letter d"} v3=([1,2,3,4,5,4,3,2,1,2]*10).to_vector(:ordinal) @ds={'v1'=>v1,'v2'=>@v2,'v3'=>v3}.to_dataset end def test_values_definition a=[1.0,2,"a",nil] assert_equal("1.0 2 a NA", Statsample::GGobi.values_definition(a,"NA")) end def test_variable_definition carrier=OpenStruct.new carrier.categorials=[] carrier.conversions={} real_var_definition=Statsample::GGobi.variable_definition(carrier,@v2,'variable 2',"v2") expected=<<-EOS 
letter a b c letter d EOS assert_equal(expected.gsub(/\s/," "),real_var_definition.gsub(/\s/," ")) assert_equal({'variable 2'=>{'a'=>1,'b'=>2,'c'=>3,'d'=>4}},carrier.conversions) assert_equal(['variable 2'],carrier.categorials) end end ================================================ FILE: test/test_gsl.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleGSLTestCase < MiniTest::Unit::TestCase should_with_gsl "matrix with gsl" do a=[1,2,3,4,20].to_vector(:scale) b=[3,2,3,4,50].to_vector(:scale) c=[6,2,3,4,3].to_vector(:scale) ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset gsl=ds.to_matrix.to_gsl assert_equal(5,gsl.size1) assert_equal(3,gsl.size2) matrix=gsl.to_matrix assert_equal(5,matrix.row_size) assert_equal(3,matrix.column_size) end end ================================================ FILE: test/test_histogram.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleHistogramTestCase < MiniTest::Unit::TestCase context Statsample::Histogram do should "alloc correctly with integer" do h = Statsample::Histogram.alloc(4) assert_equal([0.0]*4, h.bin) assert_equal([0.0]*5, h.range) end should "alloc correctly with array" do h = Statsample::Histogram.alloc([1, 3, 7, 9, 20]) assert_equal([0.0]*4, h.bin) assert_equal([1,3,7,9,20], h.range) end should "alloc correctly with integer and min, max array" do h = Statsample::Histogram.alloc(5, [0, 5]) assert_equal([0.0,1.0,2.0,3.0,4.0,5.0], h.range) assert_equal([0.0]*5,h.bin) end should "bin() method return correct number of bins" do h = Statsample::Histogram.alloc(4) assert_equal(4,h.bins) end should "increment correctly" do h = Statsample::Histogram.alloc(5, [0, 5]) h.increment 2.5 assert_equal([0.0,0.0,1.0,0.0,0.0], h.bin) h.increment [0.5,0.5,3.5,3.5] assert_equal([2.0,0.0,1.0,2.0,0.0], h.bin) h.increment 0 assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin) h.increment 5 
assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin) end should "alloc_uniform correctly with n, min,max" do h = Statsample::Histogram.alloc_uniform(5,0,10) assert_equal(5,h.bins) assert_equal([0.0]*5,h.bin) assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range) end should "alloc_uniform correctly with n, [min,max]" do h = Statsample::Histogram.alloc_uniform(5, [0, 10]) assert_equal(5,h.bins) assert_equal([0.0]*5,h.bin) assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range) end should "get_range()" do h = Statsample::Histogram.alloc_uniform(5,2,12) 5.times {|i| assert_equal([2+i*2, 4+i*2], h.get_range(i)) } end should "min() and max()" do h=Statsample::Histogram.alloc_uniform(5,2,12) assert_equal(2,h.min) assert_equal(12,h.max) end should "max_val()" do h = Statsample::Histogram.alloc(5, [0, 5]) 100.times {h.increment(rand*5)} max=h.bin[0] (1..4).each {|i| max = h.bin[i] if h.bin[i] > max } assert_equal(max,h.max_val) end should "min_val()" do h = Statsample::Histogram.alloc(5, [0, 5]) 100.times {h.increment(rand*5)} min=h.bin[0] (1..4).each {|i| min = h.bin[i] if h.bin[i]x1,'x2'=>x2}.to_dataset ds.name="test" obs=m.to_dataset assert_equal(ds['x1'],obs['x1']) assert_equal(ds['x2'],obs['x2']) assert_equal(ds['x1'].mean,obs['x1'].mean) end def test_covariate a=Matrix[[1.0, 0.3, 0.2], [0.3, 1.0, 0.5], [0.2, 0.5, 1.0]] a.extend Statsample::CovariateMatrix a.fields=%w{a b c} assert_equal(:correlation, a._type) assert_equal(Matrix[[0.5],[0.3]], a.submatrix(%w{c a}, %w{b})) assert_equal(Matrix[[1.0, 0.2] , [0.2, 1.0]], a.submatrix(%w{c a})) assert_equal(:correlation, a.submatrix(%w{c a})._type) a=Matrix[[20,30,10], [30,60,50], [10,50,50]] a.extend Statsample::CovariateMatrix assert_equal(:covariance, a._type) a=50.times.collect {rand()}.to_scale b=50.times.collect {rand()}.to_scale c=50.times.collect {rand()}.to_scale ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset corr=Statsample::Bivariate.correlation_matrix(ds) real=Statsample::Bivariate.covariance_matrix(ds).correlation corr.row_size.times do |i| 
corr.column_size.times do |j| assert_in_delta(corr[i,j], real[i,j],1e-15) end end end end ================================================ FILE: test/test_multiset.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleMultisetTestCase < MiniTest::Unit::TestCase def setup @x=%w{a a a a b b b b}.to_vector @y=[1,2,3,4,5,6,7,8].to_scale @z=[10,11,12,13,14,15,16,17].to_scale @ds={'x'=>@x,'y'=>@y,'z'=>@z}.to_dataset @ms=@ds.to_multiset_by_split('x') end def test_creation v1a=[1,2,3,4,5].to_vector v2b=[11,21,31,41,51].to_vector v3c=[21,23,34,45,56].to_vector ds1={'v1'=>v1a,'v2'=>v2b,'v3'=>v3c}.to_dataset v1b=[15,25,35,45,55].to_vector v2b=[11,21,31,41,51].to_vector v3b=[21,23,34,45,56].to_vector ds2={'v1'=>v1b,'v2'=>v2b,'v3'=>v3b}.to_dataset ms=Statsample::Multiset.new(['v1','v2','v3']) ms.add_dataset('ds1',ds1) ms.add_dataset('ds2',ds2) assert_equal(ds1,ms['ds1']) assert_equal(ds2,ms['ds2']) assert_equal(v1a,ms['ds1']['v1']) assert_not_equal(v1b,ms['ds1']['v1']) ds3={'v1'=>v1b,'v2'=>v2b}.to_dataset assert_raise ArgumentError do ms.add_dataset(ds3) end end def test_creation_empty ms=Statsample::Multiset.new_empty_vectors(%w{id age name},%w{male female}) ds_male={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name}) ds_female={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name}) ms2=Statsample::Multiset.new(%w{id age name}) ms2.add_dataset('male',ds_male) ms2.add_dataset('female',ds_female) assert_equal(ms2.fields,ms.fields) assert_equal(ms2['male'],ms['male']) assert_equal(ms2['female'],ms['female']) end def test_to_multiset_by_split_one sex=%w{m m m m m f f f f m}.to_vector(:nominal) city=%w{London Paris NY London Paris NY London Paris NY Tome}.to_vector(:nominal) age=[10,10,20,30,34,34,33,35,36,40].to_vector(:scale) ds={'sex'=>sex,'city'=>city,'age'=>age}.to_dataset ms=ds.to_multiset_by_split('sex') 
assert_equal(2,ms.n_datasets) assert_equal(%w{f m},ms.datasets.keys.sort) assert_equal(6,ms['m'].cases) assert_equal(4,ms['f'].cases) assert_equal(%w{London Paris NY London Paris Tome},ms['m']['city'].to_a) assert_equal([34,33,35,36],ms['f']['age'].to_a) end def test_to_multiset_by_split_multiple sex=%w{m m m m m m m m m m f f f f f f f f f f}.to_vector(:nominal) city=%w{London London London Paris Paris London London London Paris Paris London London London Paris Paris London London London Paris Paris}.to_vector(:nominal) hair=%w{blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black}.to_vector(:nominal) age=[10,10,20,30,34,34,33,35,36,40, 10,10,20,30,34,34,33,35,36,40].to_vector(:scale) ds={'sex'=>sex,'city'=>city,'hair'=>hair,'age'=>age}.to_dataset(%w{sex city hair age}) ms=ds.to_multiset_by_split('sex','city','hair') assert_equal(8,ms.n_datasets) assert_equal(3,ms[%w{m London blonde}].cases) assert_equal(3,ms[%w{m London blonde}].cases) assert_equal(1,ms[%w{m Paris black}].cases) end def test_stratum_proportion ds1={'q1'=>[1,1,1,1,1,0,0,0,0,0,0,0].to_vector}.to_dataset ds2={'q1'=>[1,1,1,1,1,1,1,0,0].to_vector}.to_dataset assert_equal(5.0/12, ds1['q1'].proportion ) assert_equal(7.0/9, ds2['q1'].proportion ) ms=Statsample::Multiset.new(['q1']) ms.add_dataset('d1',ds1) ms.add_dataset('d2',ds2) ss=Statsample::StratifiedSample.new(ms,{'d1'=>50,'d2'=>100}) assert_in_delta(0.655, ss.proportion('q1'),0.01) assert_in_delta(0.345, ss.proportion('q1',0),0.01) end def test_stratum_scale boys={'test'=>[50, 55, 60, 62, 62, 65, 67, 67, 70, 70, 73, 73, 75, 78, 78, 80, 85, 90].to_vector(:scale)}.to_dataset girls={'test'=>[70, 70, 72, 72, 75, 75, 78, 78, 80, 80, 82, 82, 85, 85, 88, 88, 90, 90].to_vector(:scale)}.to_dataset ms=Statsample::Multiset.new(['test']) ms.add_dataset('boys',boys) ms.add_dataset('girls',girls) ss=Statsample::StratifiedSample.new(ms,{'boys'=>10000,'girls'=>10000}) 
assert_equal(2,ss.strata_number) assert_equal(20000,ss.population_size) assert_equal(10000,ss.stratum_size('boys')) assert_equal(10000,ss.stratum_size('girls')) assert_equal(36,ss.sample_size) assert_equal(75,ss.mean('test')) assert_in_delta(1.45,ss.standard_error_wor('test'),0.01) assert_in_delta(ss.standard_error_wor('test'), ss.standard_error_wor_2('test'),0.00001) end def test_each xpe={ 'a'=>%w{a a a a}.to_vector, 'b'=>%w{b b b b}.to_vector } ype={ 'a'=>[1,2,3,4].to_scale, 'b'=>[5,6,7,8].to_scale, } zpe={ 'a'=>[10,11,12,13].to_scale, 'b'=>[14,15,16,17].to_scale, } xp,yp,zp=Hash.new(),Hash.new(),Hash.new() @ms.each {|k,ds| xp[k]=ds['x'] yp[k]=ds['y'] zp[k]=ds['z'] } assert_equal(xpe,xp) assert_equal(ype,yp) assert_equal(zpe,zp) end def test_multiset_union_with_block r1=rand() r2=rand() ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale ds2=@ms.union {|k,ds| ds['y'].recode!{|v| k=='a' ? v*r1 : v*r2} ds['z'].recode!{|v| k=='a' ? v*r1 : v*r2} } assert_equal(ye,ds2['y']) assert_equal(ze,ds2['z']) end def test_multiset_union r1=rand() r2=rand() ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale @ms.each {|k,ds| ds['y'].recode!{|v| k=='a' ? v*r1 : v*r2} ds['z'].recode!{|v| k=='a' ? 
v*r1 : v*r2} } ds2=@ms.union assert_equal(ye,ds2['y']) assert_equal(ze,ds2['z']) end end ================================================ FILE: test/test_regression.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleRegressionTestCase < MiniTest::Unit::TestCase context "Example with missing data" do setup do @x=[0.285714285714286, 0.114285714285714, 0.314285714285714, 0.2, 0.2, 0.228571428571429, 0.2, 0.4, 0.714285714285714, 0.285714285714286, 0.285714285714286, 0.228571428571429, 0.485714285714286, 0.457142857142857, 0.257142857142857, 0.228571428571429, 0.285714285714286, 0.285714285714286, 0.285714285714286, 0.142857142857143, 0.285714285714286, 0.514285714285714, 0.485714285714286, 0.228571428571429, 0.285714285714286, 0.342857142857143, 0.285714285714286, 0.0857142857142857].to_scale @y=[nil, 0.233333333333333, nil, 0.266666666666667, 0.366666666666667, nil, 0.333333333333333, 0.3, 0.666666666666667, 0.0333333333333333, 0.333333333333333, nil, nil, 0.533333333333333, 0.433333333333333, 0.4, 0.4, 0.5, 0.4, 0.266666666666667, 0.166666666666667, 0.666666666666667, 0.433333333333333, 0.166666666666667, nil, 0.4, 0.366666666666667, nil].to_scale @ds={'x'=>@x,'y'=>@y}.to_dataset @lr=Statsample::Regression::Multiple::RubyEngine.new(@ds,'y') end should "have correct values" do assert_in_delta(0.455,@lr.r2,0.001) assert_in_delta(0.427,@lr.r2_adjusted, 0.001) assert_in_delta(0.1165,@lr.se_estimate,0.001) assert_in_delta(15.925,@lr.f,0.0001) assert_in_delta(0.675, @lr.standarized_coeffs['x'],0.001) assert_in_delta(0.778, @lr.coeffs['x'],0.001, "coeff x") assert_in_delta(0.132, @lr.constant,0.001,"constant") assert_in_delta(0.195, @lr.coeffs_se['x'],0.001,"coeff x se") assert_in_delta(0.064, @lr.constant_se,0.001,"constant se") end end should "return an error if data is linearly dependent" do samples=100 a,b=rand,rand x1=samples.times.map { rand}.to_scale x2=samples.times.map 
{rand}.to_scale x3=samples.times.map {|i| x1[i]*(1+a)+x2[i]*(1+b)}.to_scale y=samples.times.map {|i| x1[i]+x2[i]+x3[i]+rand}.to_scale ds={'x1'=>x1,'x2'=>x2,'x3'=>x3,'y'=>y}.to_dataset assert_raise(Statsample::Regression::LinearDependency) { Statsample::Regression::Multiple::RubyEngine.new(ds,'y') } end def test_parameters @x=[13,20,10,33,15].to_vector(:scale) @y=[23,18,35,10,27 ].to_vector(:scale) reg=Statsample::Regression::Simple.new_from_vectors(@x,@y) _test_simple_regression(reg) ds={'x'=>@x,'y'=>@y}.to_dataset reg=Statsample::Regression::Simple.new_from_dataset(ds,'x','y') _test_simple_regression(reg) reg=Statsample::Regression.simple(@x,@y) _test_simple_regression(reg) end def _test_simple_regression(reg) assert_in_delta(40.009, reg.a,0.001) assert_in_delta(-0.957, reg.b,0.001) assert_in_delta(4.248,reg.standard_error,0.002) assert(reg.summary) end def test_summaries a=10.times.map{rand(100)}.to_scale b=10.times.map{rand(100)}.to_scale y=10.times.map{rand(100)}.to_scale ds={'a'=>a,'b'=>b,'y'=>y}.to_dataset lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y') assert(lr.summary.size>0) end def test_multiple_dependent complete=Matrix[ [1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08], [0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15], [0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12], [0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02], [-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02], [0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36], [0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05], [-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03], [0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]] complete.extend Statsample::CovariateMatrix complete.fields=%w{adhd cd odd sex age monly mwork mage poverty} lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd}) assert_in_delta(0.197, lr.r2yx,0.001) assert_in_delta(0.197, lr.r2yx_covariance,0.001) assert_in_delta(0.07, lr.p2yx,0.001) end def test_multiple_regression_pairwise_2 @a=[1,3,2,4,3,5,4,6,5,7,3,nil,3,nil,3].to_vector(:scale) 
@b=[3,3,4,4,5,5,6,6,4,4,2,2,nil,6,2].to_vector(:scale) @c=[11,22,30,40,50,65,78,79,99,100,nil,3,7,nil,7].to_vector(:scale) @y=[3,4,5,6,7,8,9,10,20,30,30,40,nil,50,nil].to_vector(:scale) ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y') assert_in_delta(2407.436,lr.sst,0.001) assert_in_delta(0.752,lr.r,0.001, "pairwise r") assert_in_delta(0.565,lr.r2,0.001) assert_in_delta(1361.130,lr.ssr,0.001) assert_in_delta(1046.306,lr.sse,0.001) assert_in_delta(3.035,lr.f,0.001) end def test_multiple_regression_gsl if Statsample.has_gsl? @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y') assert(lr.summary.size>0) model_test(lr,'gsl') predicted=[1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198] c_predicted=lr.predicted predicted.each_index{|i| assert_in_delta(predicted[i],c_predicted[i],0.001) } residuals=[1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801] c_residuals=lr.residuals residuals.each_index{|i| assert_in_delta(residuals[i],c_residuals[i],0.001) } else skip "Regression::Multiple::GslEngine not tested (no Gsl)" end end def model_test_matrix(lr,name='undefined') stan_coeffs={'a'=>0.151,'b'=>-0.547,'c'=>0.997} unstan_coeffs={'a'=>0.695, 'b'=>-4.286, 'c'=>0.266} unstan_coeffs.each_key{|k| assert_in_delta(unstan_coeffs[k], lr.coeffs[k],0.001,"b coeffs - #{name}") } stan_coeffs.each_key{|k| assert_in_delta(stan_coeffs[k], lr.standarized_coeffs[k],0.001, "beta coeffs - #{name}") } assert_in_delta(11.027,lr.constant,0.001) assert_in_delta(0.955,lr.r,0.001) assert_in_delta(0.913,lr.r2,0.001) assert_in_delta(20.908, lr.f,0.001) assert_in_delta(0.001, lr.probability, 0.001) 
assert_in_delta(0.226,lr.tolerance("a"),0.001) coeffs_se={"a"=>1.171,"b"=>1.129,"c"=>0.072} ccoeffs_se=lr.coeffs_se coeffs_se.each_key{|k| assert_in_delta(coeffs_se[k],ccoeffs_se[k],0.001) } coeffs_t={"a"=>0.594,"b"=>-3.796,"c"=>3.703} ccoeffs_t=lr.coeffs_t coeffs_t.each_key{|k| assert_in_delta(coeffs_t[k], ccoeffs_t[k],0.001) } assert_in_delta(639.6,lr.sst,0.001) assert_in_delta(583.76,lr.ssr,0.001) assert_in_delta(55.840,lr.sse,0.001) assert(lr.summary.size>0, "#{name} without summary") end def model_test(lr,name='undefined') model_test_matrix(lr,name) assert_in_delta(4.559, lr.constant_se,0.001) assert_in_delta(2.419, lr.constant_t,0.001) assert_in_delta(1.785,lr.process([1,3,11]),0.001) end def test_regression_matrix @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset cor=Statsample::Bivariate.correlation_matrix(ds) lr=Statsample::Regression::Multiple::MatrixEngine.new(cor,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size, :y_sd=>@y.sd , :x_sd=>{'a' => @a.sd, 'b' => @b.sd, 'c' => @c.sd}) assert_nil(lr.constant_se) assert_nil(lr.constant_t) model_test_matrix(lr, "correlation matrix") covariance=Statsample::Bivariate.covariance_matrix(ds) lr=Statsample::Regression::Multiple::MatrixEngine.new(covariance,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size) assert(lr.summary.size>0) model_test(lr , "covariance matrix") end def test_regression_rubyengine @a=[nil,1,3,2,4,3,5,4,6,5,7].to_vector(:scale) @b=[nil,3,3,4,4,5,5,6,6,4,4].to_vector(:scale) @c=[nil,11,22,30,40,50,65,78,79,99,100].to_vector(:scale) @y=[nil,3,4,5,6,7,8,9,10,20,30].to_vector(:scale) ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y') assert_equal(11, 
lr.total_cases) assert_equal(10, lr.valid_cases) model_test(lr, 'rubyengine with missing data') predicted=[nil,1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198] c_predicted = lr.predicted predicted.each_index do |i| if c_predicted[i].nil? assert(predicted[i].nil?, "Actual #{i} is nil, but expected #{predicted[i]}") else assert_in_delta(predicted[i], c_predicted[i], 0.001) end end residuals=[nil,1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801] c_residuals=lr.residuals residuals.each_index do |i| if c_residuals[i].nil? assert(residuals[i].nil?) else assert_in_delta(residuals[i],c_residuals[i],0.001) end end end end ================================================ FILE: test/test_reliability.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleReliabilityTestCase < MiniTest::Unit::TestCase context Statsample::Reliability do should "return correct r according to Spearman-Brown prophecy" do r=0.6849 n=62.quo(15) assert_in_delta(0.9, Statsample::Reliability.sbp(r,n), 0.001) end should "return correct n for desired realiability" do r=0.6849 r_d=0.9 assert_in_delta(62, Statsample::Reliability.n_for_desired_reliability(r, r_d, 15),0.5) end context "Cronbach's alpha" do setup do @samples=40 @n_variables=rand(10)+2 @ds=Statsample::Dataset.new() base=@samples.times.collect {|a| rand()}.to_scale @n_variables.times do |i| @ds[i]=base.collect {|v| v+rand()}.to_scale end @ds.update_valid_data @k=@ds.fields.size @cm=Statsample::Bivariate.covariance_matrix(@ds) @dse=@ds.dup @dse.fields.each do |f| @dse[f]=@dse[f].standarized end @cme=Statsample::Bivariate.covariance_matrix(@dse) @a=Statsample::Reliability.cronbach_alpha(@ds) @as=Statsample::Reliability.cronbach_alpha_standarized(@ds) end should "alpha will be equal to sum of matrix covariance less the individual variances" do total_sum=@cm.total_sum ind_var=@ds.fields.inject(0) 
{|ac,v| ac+@ds[v].variance} expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum))) assert_in_delta(expected, @a,1e-10) end should "method cronbach_alpha_from_n_s2_cov return correct values" do sa=Statsample::Reliability::ScaleAnalysis.new(@ds) vm, cm = sa.variances_mean, sa.covariances_mean assert_in_delta(sa.alpha, Statsample::Reliability.cronbach_alpha_from_n_s2_cov(@n_variables, vm,cm), 1e-10) end should "method cronbach_alpha_from_covariance_matrix returns correct value" do cov=Statsample::Bivariate.covariance_matrix(@ds) assert_in_delta(@a, Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov),0.0000001) end should "return correct n for desired alpha, covariance and variance" do sa=Statsample::Reliability::ScaleAnalysis.new(@ds) vm, cm = sa.variances_mean, sa.covariances_mean n_obtained=Statsample::Reliability.n_for_desired_alpha(@a, vm,cm) #p n_obtained assert_in_delta(Statsample::Reliability.cronbach_alpha_from_n_s2_cov(n_obtained, vm,cm) ,@a,0.001) end should "standarized alpha will be equal to sum of matrix covariance less the individual variances on standarized values" do total_sum=@cme.total_sum ind_var=@dse.fields.inject(0) {|ac,v| ac+@dse[v].variance} expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum))) assert_in_delta(expected, @as, 1e-10) end end context Statsample::Reliability::ItemCharacteristicCurve do setup do @samples=100 @points=rand(10)+3 @max_point=(@points-1)*3 @x1=@samples.times.map{rand(@points)}.to_scale @x2=@samples.times.map{rand(@points)}.to_scale @x3=@samples.times.map{rand(@points)}.to_scale @ds={'a'=>@x1,'b'=>@x2,'c'=>@x3}.to_dataset @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds) end should "have a correct automatic vector_total" do assert_equal(@ds.vector_sum, @icc.vector_total) end should "have a correct different vector_total" do x2=@samples.times.map{rand(10)}.to_scale @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,x2) assert_equal(x2, @icc.vector_total) assert_raises(ArgumentError) 
do inc=(@samples+10).times.map{rand(10)}.to_scale @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,inc) end end should "have 0% for 0 points on maximum value values" do max=@icc.curve_field('a',0)[@max_point.to_f] max||=0 assert_in_delta(0, max) end should "have 0 for max value on minimum value" do max=@icc.curve_field('a',@max_point)[0.0] max||=0 assert_in_delta(0, max) end should "have correct values of % for any value" do sum=@icc.vector_total total={} total_g=sum.frequencies index=rand(@points) @x1.each_with_index do |v,i| total[sum[i]]||=0 total[sum[i]]+=1 if v==index end expected=total.each {|k,v| total[k]=v.quo(total_g[k]) } assert_equal(expected, @icc.curve_field('a',index)) end end context Statsample::Reliability::MultiScaleAnalysis do setup do size=100 @scales=3 @items_per_scale=10 h={} @scales.times {|s| @items_per_scale.times {|i| h["#{s}_#{i}"] = (size.times.map {(s*2)+rand}).to_scale } } @ds=h.to_dataset @msa=Statsample::Reliability::MultiScaleAnalysis.new(:name=>'Multiple Analysis') do |m| m.scale "complete", @ds @scales.times {|s| m.scale "scale_#{s}", @ds.clone(@items_per_scale.times.map {|i| "#{s}_#{i}"}), {:name=>"Scale #{s}"} } end end should "Retrieve correct ScaleAnalysis for whole scale" do sa=Statsample::Reliability::ScaleAnalysis.new(@ds, :name=>"Scale complete") assert_equal(sa.variances_mean, @msa.scale("complete").variances_mean) end should "Retrieve correct ScaleAnalysis for each scale" do @scales.times {|s| sa=Statsample::Reliability::ScaleAnalysis.new(@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}), :name=>"Scale #{s}") assert_equal(sa.variances_mean,@msa.scale("scale_#{s}").variances_mean) } end should "retrieve correct correlation matrix for each scale" do vectors={'complete' => @ds.vector_sum} @scales.times {|s| vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum } ds2=vectors.to_dataset assert_equal(Statsample::Bivariate.correlation_matrix(ds2), @msa.correlation_matrix) end 
should "delete scale using delete_scale" do
        @msa.delete_scale("complete")
        assert_equal(@msa.scales.keys.sort, @scales.times.map {|s| "scale_#{s}"})
      end
      # PCA of the per-scale sums must match the PCA returned by the
      # multi-scale analysis once the "complete" scale is removed.
      should "retrieve pca for scales" do
        @msa.delete_scale("complete")
        vectors=Hash.new
        @scales.times {|s|
          vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum
        }
        ds2=vectors.to_dataset
        cor_matrix=Statsample::Bivariate.correlation_matrix(ds2)
        m=3
        pca=Statsample::Factor::PCA.new(cor_matrix, :m=>m)
        assert_equal(pca.component_matrix, @msa.pca(:m=>m).component_matrix)
      end
      # Only the "complete" scale is left when the summary is generated.
      should "retrieve acceptable summary" do
        @msa.delete_scale("scale_0")
        @msa.delete_scale("scale_1")
        @msa.delete_scale("scale_2")
        #@msa.summary_correlation_matrix=true
        #@msa.summary_pca=true
        assert(@msa.summary.size>0)
      end
    end
    # Item analysis over four items and twelve cases; the last case holds an
    # extreme value on every item.
    context Statsample::Reliability::ScaleAnalysis do
      setup do
        @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_scale
        @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_scale
        @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_scale
        @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_scale
        @ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
        @ia=Statsample::Reliability::ScaleAnalysis.new(@ds)
        @cov_matrix=@ia.cov_m
      end
      should "return correct values for item analysis" do
        assert_in_delta(0.980,@ia.alpha,0.001)
        assert_in_delta(0.999,@ia.alpha_standarized,0.001)
        # Mean of the diagonal of the covariance matrix.
        var_mean=4.times.map{|m| @cov_matrix[m,m]}.to_scale.mean
        assert_in_delta(var_mean, @ia.variances_mean)
        assert_equal(@x1.mean, @ia.item_statistics['x1'][:mean])
        assert_equal(@x4.mean, @ia.item_statistics['x4'][:mean])
        assert_in_delta(@x1.sds, @ia.item_statistics['x1'][:sds],1e-14)
        assert_in_delta(@x4.sds, @ia.item_statistics['x4'][:sds],1e-14)
        # "If deleted" statistics are compared against a copy without x1.
        ds2=@ds.clone
        ds2.delete_vector('x1')
        vector_sum=ds2.vector_sum
        assert_equal(vector_sum.mean, @ia.stats_if_deleted['x1'][:mean])
        assert_equal(vector_sum.sds, @ia.stats_if_deleted['x1'][:sds])
        assert_in_delta(vector_sum.variance, @ia.stats_if_deleted['x1'][:variance_sample],1e-10)
        assert_equal(Statsample::Reliability.cronbach_alpha(ds2), @ia.stats_if_deleted['x1'][:alpha])
covariances=[] 4.times.each {|i| 4.times.each {|j| if i!=j covariances.push(@cov_matrix[i,j]) end } } assert_in_delta(covariances.to_scale.mean, @ia.covariances_mean) assert_in_delta(0.999,@ia.item_total_correlation()['x1'],0.001) assert_in_delta(1050.455,@ia.stats_if_deleted()['x1'][:variance_sample],0.001) end should "return a summary" do assert(@ia.summary.size>0) end end end end ================================================ FILE: test/test_reliability_icc.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) $reliability_icc=nil class StatsampleReliabilityIccTestCase < MiniTest::Test context Statsample::Reliability::ICC do setup do a=[9,6,8,7,10,6].to_scale b=[2,1,4,1,5,2].to_scale c=[5,3,6,2,6,4].to_scale d=[8,2,8,6,9,7].to_scale @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset @icc=Statsample::Reliability::ICC.new(@ds) end should "basic method be correct" do assert_equal(6,@icc.n) assert_equal(4,@icc.k) end should "total mean be correct" do assert_in_delta(5.291, @icc.total_mean, 0.001) end should "df methods be correct" do assert_equal(5, @icc.df_bt) assert_equal(18, @icc.df_wt) assert_equal(3, @icc.df_bj) assert_equal(15, @icc.df_residual) end should "ms between targets be correct" do assert_in_delta(11.24, @icc.ms_bt, 0.01) end should "ms within targets be correct" do assert_in_delta(6.26, @icc.ms_wt, 0.01) end should "ms between judges be correct" do assert_in_delta(32.49, @icc.ms_bj, 0.01) end should "ms residual be correct" do assert_in_delta(1.02, @icc.ms_residual, 0.01) end context "with McGraw and Wong denominations," do end context "with Shrout & Fleiss denominations, " do should "icc(1,1) method be correct" do assert_in_delta(0.17, @icc.icc_1_1, 0.01) end # Verified on SPSS and R should "icc(2,1) method be correct" do assert_in_delta(0.29, @icc.icc_2_1, 0.01) end should "icc(3,1) method be correct" do assert_in_delta(0.71, @icc.icc_3_1, 0.01) end should "icc(1,k) method be correct" 
do assert_in_delta(0.44, @icc.icc_1_k, 0.01) end # Verified on SPSS and R should "icc(2,k) method be correct" do assert_in_delta(0.62, @icc.icc_2_k, 0.01) end should "icc(3,k) method be correct" do assert_in_delta(0.91, @icc.icc_3_k, 0.01) end should "icc(1,1) F be correct" do assert_in_delta(1.795, @icc.icc_1_f.f) end should "icc(1,1) confidence interval should be correct" do assert_in_delta(-0.133, @icc.icc_1_1_ci[0], 0.001) assert_in_delta(0.723, @icc.icc_1_1_ci[1], 0.001) end should "icc(1,k) confidence interval should be correct" do assert_in_delta(-0.884, @icc.icc_1_k_ci[0], 0.001) assert_in_delta(0.912, @icc.icc_1_k_ci[1], 0.001) end should "icc(2,1) F be correct" do assert_in_delta(11.027, @icc.icc_2_f.f) end should "icc(2,1) confidence interval should be correct" do #skip("Not yet operational") assert_in_delta(0.019, @icc.icc_2_1_ci[0], 0.001) assert_in_delta(0.761, @icc.icc_2_1_ci[1], 0.001) end # Verified on SPSS and R should "icc(2,k) confidence interval should be correct" do #skip("Not yet operational") #p @icc.icc_2_k_ci assert_in_delta(0.039, @icc.icc_2_k_ci[0], 0.001) assert_in_delta(0.929, @icc.icc_2_k_ci[1], 0.001) end #should "Shrout icc(2,k) and McGraw icc(a,k) ci be equal" do # assert_in_delta(@icc.icc_2_k_ci_shrout[0], @icc.icc_2_k_ci_mcgraw[0], 10e-5) #end should "icc(3,1) F be correct" do assert_in_delta(11.027, @icc.icc_3_f.f) end should "icc(3,1) confidence interval should be correct" do assert_in_delta(0.342, @icc.icc_3_1_ci[0], 0.001) assert_in_delta(0.946, @icc.icc_3_1_ci[1], 0.001) end should "icc(3,k) confidence interval should be correct" do assert_in_delta(0.676, @icc.icc_3_k_ci[0], 0.001) assert_in_delta(0.986, @icc.icc_3_k_ci[1], 0.001) end should "incorrect type raises an error" do assert_raise(::RuntimeError) do @icc.type=:nonexistant_type end end end begin require 'rserve' require 'statsample/rserve_extension' context "McGraw and Wong" do teardown do @r=$reliability_icc[:r].close unless $reliability_icc[:r].nil? 
end setup do if($reliability_icc.nil?) size=100 a=size.times.map {rand(10)}.to_scale b=a.recode{|i|i+rand(4)-2} c=a.recode{|i|i+rand(4)-2} d=a.recode{|i|i+rand(4)-2} @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset @icc=Statsample::Reliability::ICC.new(@ds) @r=Rserve::Connection.new @r.assign('ds',@ds) @r.void_eval("library(irr); iccs=list( icc_1=icc(ds,'o','c','s'), icc_k=icc(ds,'o','c','a'), icc_c_1=icc(ds,'t','c','s'), icc_c_k=icc(ds,'t','c','a'), icc_a_1=icc(ds,'t','a','s'), icc_a_k=icc(ds,'t','a','a')) ") @iccs=@r.eval('iccs').to_ruby $reliability_icc={ :icc=>@icc, :iccs=>@iccs, :r=>@r } end @icc=$reliability_icc[:icc] @iccs=$reliability_icc[:iccs] @r=$reliability_icc[:r] end [:icc_1, :icc_k, :icc_c_1, :icc_c_k, :icc_a_1, :icc_a_k].each do |t| context "ICC Type #{t} " do should "value be correct" do @icc.type=t @r_icc=@iccs[t.to_s] assert_in_delta(@r_icc['value'],@icc.r) end should "fvalue be correct" do @icc.type=t @r_icc=@iccs[t.to_s] assert_in_delta(@r_icc['Fvalue'],@icc.f.f) end should "num df be correct" do @icc.type=t @r_icc=@iccs[t.to_s] assert_in_delta(@r_icc['df1'],@icc.f.df_num) end should "den df be correct" do @icc.type=t @r_icc=@iccs[t.to_s] assert_in_delta(@r_icc['df2'],@icc.f.df_den) end should "f probability be correct" do @icc.type=t @r_icc=@iccs[t.to_s] assert_in_delta(@r_icc['p.value'],@icc.f.probability) end should "bounds be equal" do @icc.type=t @r_icc=@iccs[t.to_s] assert_in_delta(@r_icc['lbound'],@icc.lbound) assert_in_delta(@r_icc['ubound'],@icc.ubound) end should "summary generated" do assert(@icc.summary.size>0) end end end end rescue puts "requires rserve" end end end ================================================ FILE: test/test_reliability_skillscale.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleReliabilitySkillScaleTestCase < MiniTest::Unit::TestCase context Statsample::Reliability::SkillScaleAnalysis do setup do options=%w{a b c d e} 
cases=20 @id=cases.times.map {|v| v}.to_scale @a=cases.times.map {options[rand(5)]}.to_vector @b=cases.times.map {options[rand(5)]}.to_vector @c=cases.times.map {options[rand(5)]}.to_vector @d=cases.times.map {options[rand(5)]}.to_vector @e=cases.times.map {|i| i==0 ? options[rand(0)] : rand()>0.8 ? nil : options[rand(5)] }.to_vector @ds={'id'=>@id,'a'=>@a,'b'=>@b,'c'=>@c,'d'=>@d,'e'=>@e}.to_dataset @key={'a'=>"a", 'b'=>options[rand(5)], 'c'=>options[rand(5)], 'd'=>options[rand(5)],'e'=>options[rand(5)]} @ssa=Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key) @ac=@a.map {|v| v==@key['a'] ? 1 : 0}.to_scale @bc=@b.map {|v| v==@key['b'] ? 1 : 0}.to_scale @cc=@c.map {|v| v==@key['c'] ? 1 : 0}.to_scale @dc=@d.map {|v| v==@key['d'] ? 1 : 0}.to_scale @ec=@e.map {|v| v.nil? ? nil : (v==@key['e'] ? 1 : 0)}.to_scale end should "return proper corrected dataset" do cds={'id'=>@id, 'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset assert_equal(cds, @ssa.corrected_dataset) end should "return proper corrected minimal dataset" do cdsm={'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset assert_equal(cdsm, @ssa.corrected_dataset_minimal) end should "return correct vector_sum and vector_sum" do cdsm=@ssa.corrected_dataset_minimal assert_equal(cdsm.vector_sum, @ssa.vector_sum) assert_equal(cdsm.vector_mean, @ssa.vector_mean) end should "not crash on rare case" do a=Statsample::Vector["c","c","a","a","c","a","b","c","c","b","a","d","a","d","a","a","d","e","c","d"] b=Statsample::Vector["e","b","e","b","c","d","a","e","e","c","b","e","e","b","d","c","e","b","b","d"] c=Statsample::Vector["e","b","e","c","e","c","b","d","e","c","a","a","b","d","e","c","b","a","a","e"] d=Statsample::Vector["a","b","d","d","e","b","e","b","d","c","e","a","c","d","c","c","e","d","d","b"] e=Statsample::Vector["a","b",nil,"d","c","c","d",nil,"d","d","e","e",nil,nil,nil,"d","c",nil,"e","d"] key={"a"=>"a", "b"=>"e", "c"=>"d", "d"=>"c", "e"=>"d"} 
ds=Statsample::Dataset.new("a"=>a,"b"=>b,"c"=>c,"d"=>d,"e"=>e) ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds, key) assert(ssa.summary) end should "return valid summary" do assert(@ssa.summary.size>0) end end end ================================================ FILE: test/test_resample.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleResampleTestCase < MiniTest::Unit::TestCase def initialize(*args) super end def test_basic r=Statsample::Resample.generate(20,1,10) assert_equal(20,r.size) assert(r.min>=1) assert(r.max<=10) end def test_repeat_and_save r=Statsample::Resample.repeat_and_save(400) { Statsample::Resample.generate(20,1,10).count(1) } assert_equal(400,r.size) v=Statsample::Vector.new(r,:scale) a=v.count {|x| x > 3} assert(a>=30 && a<=70) end end ================================================ FILE: test/test_rserve_extension.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) begin require 'rserve' require 'statsample/rserve_extension' class StatsampleRserveExtensionTestCase < MiniTest::Unit::TestCase context "Statsample Rserve extensions" do setup do @r=Rserve::Connection.new end teardown do @r.close end should "return a valid rexp for numeric vector" do a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale rexp=a.to_REXP assert(rexp.is_a? Rserve::REXP::Double) assert_equal(rexp.to_ruby,a.data_with_nils) @r.assign 'a',rexp assert_equal(a.data_with_nils, @r.eval('a').to_ruby) end should "return a valid rserve dataframe for statsample datasets" do a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale b=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale c=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset rexp=ds.to_REXP assert(rexp.is_a? 
Rserve::REXP::GenericVector) ret=rexp.to_ruby assert_equal(a.data_with_nils, ret['a']) @r.assign 'df', rexp out_df=@r.eval('df').to_ruby assert_equal('data.frame', out_df.attributes['class']) assert_equal(['a','b','c'], out_df.attributes['names']) assert_equal(a.data_with_nils, out_df['a']) end end end rescue LoadError puts "Require rserve extension" end ================================================ FILE: test/test_srs.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleSrsTestCase < MiniTest::Unit::TestCase def test_std_error assert_equal(384,Statsample::SRS.estimation_n0(0.05,0.5,0.95).to_i) assert_equal(108,Statsample::SRS.estimation_n(0.05,0.5,150,0.95).to_i) assert_in_delta(0.0289,Statsample::SRS.proportion_sd_kp_wor(0.5,100,150),0.001) end end ================================================ FILE: test/test_statistics.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleStatisicsTestCase < MiniTest::Unit::TestCase def initialize(*args) super end def test_p_using_cdf assert_equal(0.25, Statsample::Test.p_using_cdf(0.25, tails=:left)) assert_equal(0.75, Statsample::Test.p_using_cdf(0.25, tails=:right)) assert_equal(0.50, Statsample::Test.p_using_cdf(0.25, tails=:both)) assert_equal(1, Statsample::Test.p_using_cdf(0.50, tails=:both)) assert_equal(0.05, Statsample::Test.p_using_cdf(0.025, tails=:both)) assert_in_delta(0.05, Statsample::Test.p_using_cdf(0.975, tails=:both),0.0001) end def test_recode_repeated a=%w{a b c c d d d e} exp=["a","b","c_1","c_2","d_1","d_2","d_3","e"] assert_equal(exp,a.recode_repeated) end def test_is_number assert("10".is_number?) assert("-10".is_number?) assert("0.1".is_number?) assert("-0.1".is_number?) assert("10e3".is_number?) assert("10e-3".is_number?) assert(!"1212-1212-1".is_number?) assert(!"a10".is_number?) assert(!"".is_number?) 
end
  # Checks only the size and the sum of the sample; the confidence interval
  # call is commented out.
  def test_estimation_mean
    v=([42]*23+[41]*4+[36]*1+[32]*1+[29]*1+[27]*2+[23]*1+[19]*1+[16]*2+[15]*2+[14,11,10,9,7]+ [6]*3+[5]*2+[4,3]).to_vector(:scale)
    assert_equal(50,v.size)
    assert_equal(1471,v.sum())
    #limits=Statsample::SRS.mean_confidence_interval_z(v.mean(), v.sds(), v.size,676,0.80)
  end
  def test_estimation_proportion
    # total
    pop=3042
    sam=200
    prop=0.19
    assert_in_delta(81.8, Statsample::SRS.proportion_total_sd_ep_wor(prop, sam, pop), 0.1)

    # confidence limits
    pop=500
    sam=100
    prop=0.37
    a=0.95
    l= Statsample::SRS.proportion_confidence_interval_z(prop, sam, pop, a)
    assert_in_delta(0.28,l[0],0.01)
    assert_in_delta(0.46,l[1],0.01)
  end
  # Placeholder: the body is entirely commented out, so nothing is asserted.
  def test_ml
    if(true)
      #real=[1,1,1,1].to_vector(:scale)

      #pred=[0.0001,0.0001,0.0001,0.0001].to_vector(:scale)
      # puts Statsample::Bivariate.maximum_likehood_dichotomic(pred,real)

    end
  end


  # Simple regression of b on a: ssr+sse must equal sst, r must equal the
  # Pearson correlation, and the coefficients must match reference values.
  def test_simple_linear_regression
    a=[1,2,3,4,5,6].to_vector(:scale)
    b=[6,2,4,10,12,8].to_vector(:scale)
    reg = Statsample::Regression::Simple.new_from_vectors(a,b)
    assert_in_delta((reg.ssr+reg.sse).to_f,reg.sst,0.001)
    assert_in_delta(Statsample::Bivariate.pearson(a,b),reg.r,0.001)
    assert_in_delta(2.4,reg.a,0.01)
    assert_in_delta(1.314,reg.b,0.001)
    assert_in_delta(0.657,reg.r,0.001)
    assert_in_delta(0.432,reg.r2,0.001)
  end
end

================================================
FILE: test/test_stest.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))

# Tests for the helpers in Statsample::Test (chi square, U Mann-Whitney,
# Levene).
class StatsampleTestTestCase < MiniTest::Unit::TestCase
  def test_chi_square_matrix_with_expected
    real=Matrix[[95,95],[45,155]]
    expected=Matrix[[68,122],[72,128]]
    assert_nothing_raised do
      Statsample::Test.chi_square(real,expected)
    end
    chi=Statsample::Test.chi_square(real,expected).chi_square
    assert_in_delta(32.53,chi,0.1)
  end
  # Only the observed matrix is passed to chi_square here.
  def test_chi_square_matrix_only_observed
    observed=Matrix[[20,30,40],[30,40,50],[60,70,80],[10,20,40]]
    assert_nothing_raised do
      Statsample::Test.chi_square(observed)
    end
    chi=Statsample::Test.chi_square(observed)
    assert_in_delta(9.5602, chi.chi_square, 0.0001)
    assert_in_delta(0.1444, chi.probability, 0.0001)
    assert_equal(6, chi.df)
  end
  # U must be the same whichever sample is passed first.
  def test_u_mannwhitney
    a=[1,2,3,4,5,6].to_scale
    b=[0,5,7,9,10,11].to_scale
    assert_equal(7.5, Statsample::Test.u_mannwhitney(a,b).u)
    assert_equal(7.5, Statsample::Test.u_mannwhitney(b,a).u)
    a=[1, 7,8,9,10,11].to_scale
    b=[2,3,4,5,6,12].to_scale
    assert_equal(11, Statsample::Test.u_mannwhitney(a,b).u)
  end

  # Levene test built from an array of vectors.
  def test_levene
    a=[1,2,3,4,5,6,7,8,100,10].to_scale
    b=[30,40,50,60,70,80,90,100,110,120].to_scale
    levene=Statsample::Test::Levene.new([a,b])
    assert_levene(levene)
  end
  # Levene test built from a dataset holding the same vectors.
  def test_levene_dataset
    a=[1,2,3,4,5,6,7,8,100,10].to_scale
    b=[30,40,50,60,70,80,90,100,110,120].to_scale
    ds={'a'=>a,'b'=>b}.to_dataset
    levene=Statsample::Test::Levene.new(ds)
    assert_levene(levene)
  end
  # Shared expectations for both Levene tests above.
  def assert_levene(levene)
    assert_in_delta(0.778, levene.f, 0.001)
    assert_in_delta(0.389, levene.probability, 0.001)
  end
end

================================================
FILE: test/test_stratified.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))

class StatsampleStratifiedTestCase < MiniTest::Unit::TestCase

  def initialize(*args)
    super
  end
  # The stratified mean of two strata must equal the mean of the pooled data.
  def test_mean
    a=[10,20,30,40,50]
    b=[110,120,130,140]
    pop=a+b
    av=a.to_vector(:scale)
    bv=b.to_vector(:scale)
    popv=pop.to_vector(:scale)
    assert_equal(popv.mean,Statsample::StratifiedSample.mean(av,bv))
  end
end

================================================
FILE: test/test_test_f.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))

class StatsampleTestFTestCase < MiniTest::Unit::TestCase
  context(Statsample::Test::F) do
    # F object built from two mean squares (sum of squares divided by df).
    setup do
      @ssb=84
      @ssw=68
      @df_num=2
      @df_den=15
      @f=Statsample::Test::F.new(@ssb.quo(@df_num),@ssw.quo(@df_den), @df_num, @df_den)
    end
    should "have #f equal to msb/msw" do
      assert_equal((@ssb.quo(@df_num)).quo(@ssw.quo(@df_den)), @f.f)
    end
    should "have df total equal to df_num+df_den" do
assert_equal(@df_num + @df_den, @f.df_total)
    end
    should "have probability near 0.002" do
      assert_in_delta(0.002, @f.probability, 0.0005)
    end
    should "be coerced into float" do
      assert_equal(@f.to_f, @f.f)
    end

    context("method summary") do
      setup do
        @summary=@f.summary
      end
      should "have size > 0" do
        assert(@summary.size>0)
      end
    end
  end
end

================================================
FILE: test/test_test_kolmogorovsmirnov.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))

class StatsampleTestKolmogorovSmirnovTestCase < MiniTest::Unit::TestCase
  context(Statsample::Test::KolmogorovSmirnov) do
    # Two plain arrays as samples.
    should "calculate correctly D for two given samples" do
      a=[1.1,2.5,5.6,9]
      b=[1,2.3,5.8,10]
      ks=Statsample::Test::KolmogorovSmirnov.new(a,b)
      assert_equal(0.25,ks.d)
    end
    # Fixed sample compared against Distribution::Normal.
    should "calculate correctly D for a normal sample and Normal Distribution" do
      a=[0.30022510,-0.36664035,0.08593404,1.29881130,-0.49878633,-0.63056010, 0.28397638, -0.04913700,0.03566644,-1.33414346]
      ks=Statsample::Test::KolmogorovSmirnov.new(a,Distribution::Normal)
      assert_in_delta(0.282, ks.d,0.001)
    end
    # Random sample drawn from Distribution::Normal.rng; only an upper bound
    # on D is asserted.
    should "calculate correctly D for a variable normal and Normal Distribution" do
      rng=Distribution::Normal.rng
      a=100.times.map {rng.call}
      ks=Statsample::Test::KolmogorovSmirnov.new(a,Distribution::Normal)
      assert(ks.d<0.15)
    end

    context(Statsample::Test::KolmogorovSmirnov::EmpiricDistribution) do
      # cdf values at, between, below and above the data points 1..10.
      should "Create a correct empirical distribution for an array" do
        a=[10,9,8,7,6,5,4,3,2,1]
        ed=Statsample::Test::KolmogorovSmirnov::EmpiricDistribution.new(a)
        assert_equal(0, ed.cdf(-2))
        assert_equal(0.5, ed.cdf(5))
        assert_equal(0.5, ed.cdf(5.5))
        assert_equal(0.9, ed.cdf(9))
        assert_equal(1, ed.cdf(11))
      end
    end
  end
end

================================================
FILE: test/test_test_t.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleTestTTestCase < MiniTest::Unit::TestCase
  # Statsample::Test makes the constant T resolvable; Math provides sqrt.
  include Statsample::Test
  include Math
  context T do
    # Two samples of six values plus their means, standard deviations and
    # sizes.
    setup do
      @a=[30.02, 29.99, 30.11, 29.97, 30.01, 29.99].to_scale
      @b=[29.89, 29.93, 29.72, 29.98, 30.02, 29.98].to_scale
      @x1=@a.mean
      @x2=@b.mean
      @s1=@a.sd
      @s2=@b.sd
      @n1=@a.n
      @n2=@b.n
    end
    # T built from an estimate, a standard error and degrees of freedom.
    should "calculate correctly standard t" do
      t=Statsample::Test::T.new(@x1, @s1.quo(Math.sqrt(@a.n)), @a.n-1)
      assert_equal((@x1).quo(@s1.quo(Math.sqrt(@a.n))), t.t)
      assert_equal(@a.n-1, t.df)
      assert(t.summary.size>0)
    end
    # One-sample test applied to the difference of two vectors.
    should "calculate correctly t for one sample" do
      t1=[6, 4, 6, 7, 4,5,5,12,6,1].to_scale
      t2=[9, 6, 5,10,10,8,7,10,6,5].to_scale
      d=t1-t2
      t=Statsample::Test::T::OneSample.new(d)
      assert_in_delta(-2.631, t.t, 0.001)
      assert_in_delta( 0.027, t.probability, 0.001)
      assert_in_delta( 0.76012, t.se, 0.0001)
      assert(t.summary.size>0)
    end
    # Same expected t with and without the trailing `true` argument.
    should "calculate correctly t for two samples" do
      assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2),0.001)
      assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2,true),0.001)
    end
    should "calculate correctly df for equal and unequal variance" do
      assert_equal(10, T.df_equal_variance(@n1,@n2))
      assert_in_delta(7.03, T.df_not_equal_variance(@s1,@s2,@n1,@n2),0.001)
    end
    should "calculate all values for T object" do
      t=Statsample::Test.t_two_samples_independent(@a,@b)
      assert(t.summary.size>0)
      assert_in_delta(1.959, t.t_equal_variance,0.001)
      assert_in_delta(1.959, t.t_not_equal_variance,0.001)
      assert_in_delta(10, t.df_equal_variance,0.001)
      assert_in_delta(7.03, t.df_not_equal_variance,0.001)
      assert_in_delta(0.07856, t.probability_equal_variance,0.001)
      assert_in_delta(0.09095, t.probability_not_equal_variance,0.001)
    end
    should "be the same using shorthand" do
      v=100.times.map {rand(100)}.to_scale
      assert_equal(Statsample::Test.t_one_sample(v).t, T::OneSample.new(v).t)
    end
    # u is a random value within 1 of the mean of @a, passed as the :u option.
    should "calculate all values for one sample T test" do
      u=@a.mean+(1-rand*2)
      tos=T::OneSample.new(@a,{:u=>u})
      assert_equal((@a.mean-u).quo(@a.sd.quo(sqrt(@a.n))), tos.t)
      assert_equal(@a.n-1, tos.df)
      assert(tos.summary.size>0)
    end
  end
end

================================================
FILE: test/test_umannwhitney.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))

class StatsampleUMannWhitneyTestCase < MiniTest::Unit::TestCase
  include Statsample::Test
  context Statsample::Test::UMannWhitney do
    setup do
      @v1=[1,2,3,4,7,8,9,10,14,15].to_scale
      @v2=[5,6,11,12,13,16,17,18,19].to_scale
      @u=Statsample::Test::UMannWhitney.new(@v1,@v2)
    end
    should "have same result using class or Test#u_mannwhitney" do
      assert_equal(Statsample::Test.u_mannwhitney(@v1,@v2).u, @u.u)
    end

    should "have correct U values" do
      assert_equal(73,@u.r1)
      assert_equal(117,@u.r2)
      assert_equal(18,@u.u)
    end
    should "have correct value for z" do
      assert_in_delta(-2.205,@u.z,0.001)
    end
    should "have correct value for z and exact probability" do
      assert_in_delta(0.027,@u.probability_z,0.001)
      assert_in_delta(0.028,@u.probability_exact,0.001)
    end
  end
end

================================================
FILE: test/test_vector.rb
================================================
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))

class StatsampleTestVector < MiniTest::Unit::TestCase
  include Statsample::Shorthand

  # Nominal vector with one nil and -99 declared as a missing value.
  def setup
    @c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
    @c.name="Test Vector"
    @c.missing_values=[-99]
  end
  # Shared expectations for the split_by_separator tests: one 0/1 vector per
  # token, with nil where the source value is nil.
  def assert_counting_tokens(b)
    assert_equal([1,1,0,1,0,nil],b['a'].to_a)
    assert_equal([0,1,0,0,0,nil],b['b'].to_a)
    assert_equal([0,0,1,0,0,nil],b['c'].to_a)
    assert_equal([0,0,1,1,0,nil],b['d'].to_a)
    assert_equal([0,0,0,0,1,nil],b[10].to_a)
  end
  context Statsample do
    # Two random vectors with scattered nils; @correct_a / @correct_b keep
    # only the positions where both vectors have a value.
    setup do
      @sample=100
      @a=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
      @b=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
      @correct_a=Array.new
      @correct_b=Array.new
      @a.each_with_index do |v,i|
        if !@a[i].nil? and !@b[i].nil?
@correct_a.push(@a[i])
          @correct_b.push(@b[i])
        end
      end
      @correct_a=@correct_a.to_scale
      @correct_b=@correct_b.to_scale

      # Shared assertions: both vectors equal the expected ones and contain
      # no missing data.
      @common=lambda  do |av,bv|
        assert_equal(@correct_a, av, "A no es esperado")
        assert_equal(@correct_b, bv, "B no es esperado")
        assert(!av.has_missing_data?, "A tiene datos faltantes")
        assert(!bv.has_missing_data?, "b tiene datos faltantes")
      end
    end
    # only_valid returns new objects even when the input has no missing data.
    should "return correct only_valid" do
      av,bv=Statsample.only_valid @a,@b
      av2,bv2=Statsample.only_valid av,bv
      @common.call(av,bv)
      assert_equal(av,av2)
      assert_not_same(av,av2)
      assert_not_same(bv,bv2)
    end
    # only_valid_clone returns the same objects when nothing is missing.
    should "return correct only_valid_clone" do
      av,bv=Statsample.only_valid_clone @a,@b
      @common.call(av,bv)
      av2,bv2=Statsample.only_valid_clone av,bv
      assert_equal(av,av2)
      assert_same(av,av2)
      assert_same(bv,bv2)
    end
  end
  context Statsample::Vector do
    # Same fixture as the class-level #setup.
    setup do
      @c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
      @c.name="Test Vector"
      @c.missing_values=[-99]
    end
    should_with_gsl "be created with GSL::Vector" do
      gsl=GSL::Vector[1,2,3,4,5]
      v=Statsample::Vector.new(gsl)
      assert_equal([1,2,3,4,5], v.to_a)
      refute(v.flawed?)

    end

    context "using matrix operations" do
      setup do
        @a=[1,2,3,4,5].to_scale
      end
      should "to_matrix returns a matrix with 1 row" do
        mh=Matrix[[1,2,3,4,5]]
        assert_equal(mh,@a.to_matrix)
      end
      should "to_matrix(:vertical) returns a matrix with 1 column" do
        mv=Matrix.columns([[1,2,3,4,5]])
        assert_equal(mv,@a.to_matrix(:vertical))
      end
      should "returns valid submatrixes" do
        # 3*4 + 2*5 = 22
        a=[3,2].to_vector(:scale)
        b=[4,5].to_vector(:scale)
        assert_equal(22,(a.to_matrix*b.to_matrix(:vertical))[0,0])
      end
    end
    context "when initializing" do
      # Ten random values followed by a nil.
      setup do
        @data=(10.times.map{rand(100)})+[nil]
        @original=Statsample::Vector.new(@data, :scale)
      end
      should "be the sample using []" do
        second=Statsample::Vector[*@data]
        assert_equal(@original, second)
      end
      # Scalars, ranges, nested arrays and vectors are all flattened.
      should "[] returns same results as R-c()" do
        reference=[0,4,5,6,10].to_scale
        assert_equal(reference, Statsample::Vector[0,4,5,6,10])
        assert_equal(reference, Statsample::Vector[0,4..6,10])
        assert_equal(reference, Statsample::Vector[[0],[4,5,6],[10]])
        assert_equal(reference, Statsample::Vector[[0],[4,[5,[6]]],[10]])

        assert_equal(reference, Statsample::Vector[[0],[4,5,6].to_vector,[10]])

      end
      should "be the same usign #to_vector" do
        lazy1=@data.to_vector(:scale)
        assert_equal(@original,lazy1)
      end
      should "be the same using #to_scale" do
        lazy2=@data.to_scale
        assert_equal(@original,lazy2)
        assert_equal(:scale,lazy2.type)
        assert_equal(@data.find_all{|v| !v.nil?},lazy2.valid_data)
      end
      should "could use new_scale with size only" do
        v1=10.times.map {nil}.to_scale
        v2=Statsample::Vector.new_scale(10)
        assert_equal(v1,v2)

      end
      should "could use new_scale with size and value" do
        a=rand
        v1=10.times.map {a}.to_scale
        v2=Statsample::Vector.new_scale(10,a)
        assert_equal(v1,v2)
      end
      should "could use new_scale with func" do
        v1=10.times.map {|i| i*2}.to_scale
        v2=Statsample::Vector.new_scale(10) {|i| i*2}
        assert_equal(v1,v2)
      end

    end

    context "#split_by_separator" do

      # @b is the Hash returned by splitting @a on ",".
      setup do
        @a = Statsample::Vector.new(["a","a,b","c,d","a,d",10,nil],:nominal)
        @b=@a.split_by_separator(",")
end should "returns a Hash" do assert_kind_of(Hash, @b) end should "return a Hash with keys with different values of @a" do expected=['a','b','c','d',10] assert_equal(expected, @b.keys) end should "returns a Hash, which values are Statsample::Vector" do @b.each_key {|k| assert_instance_of(Statsample::Vector, @b[k])} end should "hash values are n times the tokens appears" do assert_counting_tokens(@b) end should "#split_by_separator_freq returns the number of ocurrences of tokens" do assert_equal({'a'=>3,'b'=>1,'c'=>1,'d'=>2,10=>1}, @a.split_by_separator_freq()) end should "using a different separator give the same values" do a = Statsample::Vector.new(["a","a*b","c*d","a*d",10,nil],:nominal) b=a.split_by_separator("*") assert_counting_tokens(b) end end should "return correct median_absolute_deviation" do a=[1, 1, 2, 2, 4, 6, 9].to_scale assert_equal(1, a.median_absolute_deviation) end should "return correct histogram" do a=10.times.map {|v| v}.to_scale hist=a.histogram(2) assert_equal([5,5], hist.bin) 3.times do |i| assert_in_delta(i*4.5, hist.get_range(i)[0], 1e-9) end end should "have a name" do @c.name=="Test Vector" end should "without explicit name, returns vector with succesive numbers" do a=10.times.map{rand(100)}.to_scale b=10.times.map{rand(100)}.to_scale assert_match(/Vector \d+/, a.name) a.name=~/Vector (\d+)/ next_number=$1.to_i+1 assert_equal("Vector #{next_number}",b.name) end should "save to a file and load the same Vector" do outfile=Tempfile.new("vector.vec") @c.save(outfile.path) a=Statsample.load(outfile.path) assert_equal(@c,a) end should "#collect returns an array" do val=@c.collect {|v| v} assert_equal(val,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99]) end should "#recode returns a recoded array" do a=@c.recode{|v| @c.is_valid?(v) ? 0 : 1 } exp=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1].to_vector assert_equal(exp,a) exp.recode!{|v| v==0 ? 
1:0} exp2=(([1]*15)+([0]*3)).to_vector assert_equal(exp2,exp) end should "#product returns the * of all values" do a=[1,2,3,4,5].to_vector(:scale) assert_equal(120,a.product) end should "missing values" do @c.missing_values=[10] assert_equal([-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9], @c.valid_data.sort) assert_equal([5,5,5,5,5,6,6,7,8,9,nil,1,2,3,4,nil,-99,-99], @c.data_with_nils) @c.missing_values=[-99] assert_equal(@c.valid_data.sort,[1,2,3,4,5,5,5,5,5,6,6,7,8,9,10]) assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,nil,nil]) @c.missing_values=[] assert_equal(@c.valid_data.sort,[-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9,10]) assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99]) end should "correct has_missing_data? with missing data" do a=[1,2,3,nil].to_vector assert(a.has_missing_data?) end should "correct has_missing_data? without missing data" do a=[1,2,3,4,10].to_vector assert(!a.has_missing_data?) end should "with explicit missing_values, should respond has_missing_data?" do a=[1,2,3,4,10].to_vector a.missing_values=[10] assert(a.has_missing_data?) end should "label correctly fields" do @c.labels={5=>'FIVE'} assert_equal(["FIVE","FIVE","FIVE","FIVE","FIVE",6,6,7,8,9,10,1,2,3,4,nil,-99, -99],@c.vector_labeled.to_a) end should "verify" do h=@c.verify{|d| !d.nil? and d>0} e={15=>nil,16=>-99,17=>-99} assert_equal(e,h) end should "have a summary with name on it" do assert_match(/#{@c.name}/, @c.summary) end should "GSL::Vector based should push correcty" do if Statsample.has_gsl? v=GSL::Vector[1,2,3,4,5].to_scale v.push(nil) assert_equal([1,2,3,4,5,nil], v.to_a) assert(v.flawed?) 
else
        skip("Requires GSL")
      end
    end

    # splitted returns one array of tokens per case, nil for nil.
    should "split correctly" do
      a = Statsample::Vector.new(["a","a,b","c,d","a,d","d",10,nil],:nominal)
      assert_equal([%w{a},%w{a b},%w{c d},%w{a d},%w{d},[10],nil], a.splitted)
    end
    should "multiply correct for scalar" do
      a = [1,2,3].to_scale
      assert_equal([5,10,15].to_scale, a*5)
    end
    should "multiply correct with other vector" do
      a = [1,2,3].to_scale
      b = [2,4,6].to_scale

      assert_equal([2,8,18].to_scale, a*b)
    end
    should "sum correct for scalar" do
      a = [1,2,3].to_scale
      assert_equal([11,12,13].to_scale, a+10)
    end

    should "raise NoMethodError when method requires ordinal and vector is nominal" do
      @c.type=:nominal
      assert_raise(::NoMethodError) { @c.median }
    end

    should "raise NoMethodError when method requires scalar and vector is ordinal" do
      @c.type=:ordinal
      assert_raise(::NoMethodError) { @c.mean }
    end
    # jacknife accepts a single method name or an array of names.
    should "jacknife correctly with named method" do
      # First example
      a=[1,2,3,4].to_scale
      ds=a.jacknife(:mean)
      assert_equal(a.mean, ds[:mean].mean)
      ds=a.jacknife([:mean,:sd])
      assert_equal(a.mean, ds[:mean].mean)
      assert_equal(a.sd, ds[:mean].sd)
    end
    # jacknife also accepts a Hash of name => lambda.
    should "jacknife correctly with custom method" do
      # Second example
      a=[17.23, 18.71,13.93,18.81,15.78,11.29,14.91,13.39, 18.21, 11.57, 14.28, 10.94, 18.83, 15.52,13.45,15.25].to_scale
      ds=a.jacknife(:log_s2=>lambda {|v|  Math.log(v.variance) })
      exp=[1.605, 2.972, 1.151, 3.097, 0.998, 3.308, 0.942, 1.393, 2.416, 2.951, 1.043, 3.806, 3.122, 0.958, 1.362, 0.937].to_scale

      assert_similar_vector(exp, ds[:log_s2], 0.001)
      assert_in_delta(2.00389, ds[:log_s2].mean, 0.00001)
      assert_in_delta(1.091, ds[:log_s2].variance, 0.001)
    end
    # With k=2 and six values, each expected value leaves out one consecutive
    # pair: 3*mean - 2*(mean of the other four values).
    should "jacknife correctly with k>1" do
      a=rnorm(6)
      ds=a.jacknife(:mean,2)
      mean=a.mean
      exp=[3*mean-2*(a[2]+a[3]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[2]+a[3]) / 4].to_scale
      assert_similar_vector(exp, ds[:mean], 1e-13)
    end
    # 200 bootstrap replications of rnorm(100): the replicated means should be
    # centred near 0 with a spread near 1/sqrt(n).
    should "bootstrap should return a vector with mean=mu and sd=se" do
      a=rnorm(100)
      ds=a.bootstrap([:mean,:sd],200)
      se=1/Math.sqrt(a.size)
      assert_in_delta(0, ds[:mean].mean, 0.3)
      assert_in_delta(se, ds[:mean].sd, 0.02)
    end


  end



  # Frequencies, proportions, factors, mode and valid count of the nominal
  # fixture @c (15 valid values).
  def test_nominal
    assert_equal(@c[1],5)
    assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c.frequencies)
    assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c._frequencies)
    assert_equal({ 1 => 1.quo(15) ,2=>1.quo(15), 3=>1.quo(15),4=>1.quo(15),5=>5.quo(15),6=>2.quo(15),7=>1.quo(15), 8=>1.quo(15), 9=>1.quo(15),10=>1.quo(15)}, @c.proportions)
    assert_equal(@c.proportion, 1.quo(15))
    assert_equal(@c.proportion(2), 1.quo(15))
    assert_equal([1,2,3,4,5,6,7,8,9,10], @c.factors.sort)
    assert_equal(@c.mode,5)
    assert_equal(@c.n_valid,15)
  end
  # Vectors are equal only to vectors with the same data and type.
  def test_equality
    v1=[1,2,3].to_vector
    v2=[1,2,3].to_vector
    assert_equal(v1,v2)
    v1=[1,2,3].to_vector(:nominal)
    v2=[1,2,3].to_vector(:ordinal)
    assert_not_equal(v1,v2)
    v2=[1,2,3]
    assert_not_equal(v1,v2)
    v1=[1,2,3].to_vector()
    v2=[1,2,3].to_vector()
    assert_equal(v1,v2)
    assert_equal(false, v1 == Object.new)
  end
  # nil entries stay nil and do not change the other percentiles.
  def test_vector_percentil
    a=[1,2,2,3,4,5,5,5,6,10].to_scale
    expected=[10,25,25,40,50,70,70,70,90,100].to_scale
    assert_equal(expected, a.vector_percentil)
    a=[1,nil,nil,2,2,3,4,nil,nil,5,5,5,6,10].to_scale
    expected=[10,nil,nil,25,25,40,50,nil,nil,70,70,70,90,100].to_scale
    assert_equal(expected, a.vector_percentil)
  end
  # Median and percentiles on the fixture and on two unsorted vectors.
  def test_ordinal
    @c.type=:ordinal
    assert_equal(5,@c.median)
    assert_equal(4,@c.percentil(25))
    assert_equal(7,@c.percentil(75))

    v=[200000, 200000, 210000, 220000, 230000, 250000, 250000, 250000, 270000, 300000, 450000, 130000, 140000, 140000, 140000, 145000, 148000, 165000, 170000, 180000, 180000, 180000, 180000, 180000, 180000 ].to_scale
    assert_equal(180000,v.median)
    a=[7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 12.0, 12.0, 13.0, 14.0, 14.0, 2.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0].to_scale
    assert_equal(4.5, a.percentil(25))
    assert_equal(6.5, a.percentil(50))
    assert_equal(9.5, a.percentil(75))
    assert_equal(3.0, a.percentil(10))
  end
  # :linear strategy on shuffled input with ten and with eleven values.
  def test_linear_percentil_strategy
    values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116].shuffle.to_scale
    assert_equal 102, values.percentil(0, :linear)
    assert_equal 104.75, values.percentil(25, :linear)
    assert_equal 108.5, values.percentil(50, :linear)
    assert_equal 112.75, values.percentil(75, :linear)
    assert_equal 116, values.percentil(100, :linear)

    values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116, 118].shuffle.to_scale
    assert_equal 102, values.percentil(0, :linear)
    assert_equal 105, values.percentil(25, :linear)
    assert_equal 109, values.percentil(50, :linear)
    assert_equal 115, values.percentil(75, :linear)
    assert_equal 118, values.percentil(100, :linear)
  end
  # Tied values share the mean rank; nil entries stay nil.
  def test_ranked
    v1=[0.8,1.2,1.2,2.3,18].to_vector(:ordinal)
    expected=[1,2.5,2.5,4,5].to_vector(:ordinal)
    assert_equal(expected,v1.ranked)
    v1=[nil,0.8,1.2,1.2,2.3,18,nil].to_vector(:ordinal)
    expected=[nil,1,2.5,2.5,4,5,nil].to_vector(:ordinal)
    assert_equal(expected,v1.ranked)
  end
  # A scale vector containing a String: the expected sum is 10 and the
  # expected sorted factors are the integers 0..4.
  def test_scale
    a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale)
    assert_equal(10, a.sum)
    i=0
    factors=a.factors.sort
    [0,1,2,3,4].each{|v|
      assert(v==factors[i])
      assert(v.class==factors[i].class,"#{v} - #{v.class} != #{factors[i]} - #{factors[i].class}")
      i+=1
    }
  end
  # Shifting an already centred vector by a constant and centring it again
  # must give back the original values.
  def test_vector_centered
    mean=rand()
    samples=11
    centered=samples.times.map {|i| i-((samples/2).floor).to_i}.to_scale
    not_centered=centered.recode {|v| v+mean}
    obs=not_centered.centered
    centered.each_with_index do |v,i|
      assert_in_delta(v,obs[i],0.0001)
    end
  end
  # Standardised values use mean 2.5 and v1.sds; the nil entry stays nil.
  def test_vector_standarized
    v1=[1,2,3,4,nil].to_vector(:scale)
    sds=v1.sds
    expected=[((1-2.5).quo(sds)),((2-2.5).quo(sds)),((3-2.5).quo(sds)),((4-2.5).quo(sds)), nil].to_vector(:scale)
    vs=v1.vector_standarized
    assert_equal(expected, vs)
    assert_equal(0,vs.mean)
    assert_equal(1,vs.sds)
  end

  # A constant vector standardises to all nils.
  def test_vector_standarized_with_zero_variance
    v1=100.times.map {|i| 1}.to_scale
    exp=100.times.map {nil}.to_scale
    assert_equal(exp,v1.standarized)
  end

  def test_check_type
    v=Statsample::Vector.new
v.type=:nominal assert_raise(NoMethodError) { v.check_type(:scale)} assert_raise(NoMethodError) { v.check_type(:ordinal)} assert(v.check_type(:nominal).nil?) v.type=:ordinal assert_raise(NoMethodError) { v.check_type(:scale)} assert(v.check_type(:ordinal).nil?) assert(v.check_type(:nominal).nil?) v.type=:scale assert(v.check_type(:scale).nil?) assert(v.check_type(:ordinal).nil?) assert(v.check_type(:nominal).nil?) v.type=:date assert_raise(NoMethodError) { v.check_type(:scale)} assert_raise(NoMethodError) { v.check_type(:ordinal)} assert_raise(NoMethodError) { v.check_type(:nominal)} end def test_add a=Statsample::Vector.new([1,2,3,4,5], :scale) b=Statsample::Vector.new([11,12,13,14,15], :scale) assert_equal([3,4,5,6,7], (a+2).to_a) assert_equal([12,14,16,18,20], (a+b).to_a) assert_raise ArgumentError do a + @c end assert_raise TypeError do a+"string" end a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale) b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale) assert_equal([nil,13,nil,16,18,20], (a+b).to_a) assert_equal([nil,13,nil,16,18,20], (a+b.to_a).to_a) end def test_minus a=Statsample::Vector.new([1,2,3,4,5], :scale) b=Statsample::Vector.new([11,12,13,14,15], :scale) assert_equal([-1,0,1,2,3], (a-2).to_a) assert_equal([10,10,10,10,10], (b-a).to_a) assert_raise ArgumentError do a-@c end assert_raise TypeError do a-"string" end a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale) b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale) assert_equal([nil,11,nil,10,10,10], (b-a).to_a) assert_equal([nil,11,nil,10,10,10], (b-a.to_a).to_a) end def test_sum_of_squares a=[1,2,3,4,5,6].to_vector(:scale) assert_equal(17.5, a.sum_of_squared_deviation) end def test_average_deviation a=[1,2,3,4,5,6,7,8,9].to_scale assert_equal(20.quo(9), a.average_deviation_population) end def test_samples srand(1) assert_equal(100,@c.sample_with_replacement(100).size) assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort) assert_raise ArgumentError do 
@c.sample_without_replacement(20) end @c.type=:scale srand(1) assert_equal(100, @c.sample_with_replacement(100).size) assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort) end def test_valid_data a=Statsample::Vector.new([1,2,3,4,"STRING"]) a.missing_values=[-99] a.add(1,false) a.add(2,false) a.add(-99,false) a.set_valid_data exp_valid_data=[1,2,3,4,"STRING",1,2] assert_equal(exp_valid_data,a.valid_data) a.add(20,false) a.add(30,false) assert_equal(exp_valid_data,a.valid_data) a.set_valid_data exp_valid_data_2=[1,2,3,4,"STRING",1,2,20,30] assert_equal(exp_valid_data_2,a.valid_data) end def test_set_value @c[2]=10 expected=[5,5,10,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99].to_vector assert_equal(expected.data,@c.data) end def test_gsl if Statsample.has_gsl? a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale) assert_equal(2,a.mean) assert_equal(a.variance_sample_ruby,a.variance_sample) assert_equal(a.standard_deviation_sample_ruby,a.sds) assert_equal(a.variance_population_ruby,a.variance_population) assert_equal(a.standard_deviation_population_ruby,a.standard_deviation_population) assert_nothing_raised do a=[].to_vector(:scale) end a.add(1,false) a.add(2,false) a.set_valid_data assert_equal(3,a.sum) b=[1,2,nil,3,4,5,nil,6].to_vector(:scale) assert_equal(21, b.sum) assert_equal(3.5, b.mean) assert_equal(6,b.gsl.size) c=[10,20,30,40,50,100,1000,2000,5000].to_scale assert_in_delta(c.skew, c.skew_ruby ,0.0001) assert_in_delta(c.kurtosis, c.kurtosis_ruby ,0.0001) end end def test_vector_matrix v1=%w{a a a b b b c c}.to_vector v2=%w{1 3 4 5 6 4 3 2}.to_vector v3=%w{1 0 0 0 1 1 1 0}.to_vector ex=Matrix.rows([["a", "1", "1"], ["a", "3", "0"], ["a", "4", "0"], ["b", "5", "0"], ["b", "6", "1"], ["b", "4", "1"], ["c", "3", "1"], ["c", "2", "0"]]) assert_equal(ex,Statsample.vector_cols_matrix(v1,v2,v3)) end def test_marshalling v1=(0..100).to_a.collect{|n| rand(100)}.to_vector(:scale) v2=Marshal.load(Marshal.dump(v1)) assert_equal(v1,v2) end def test_dup 
v1=%w{a a a b b b c c}.to_vector v2=v1.dup assert_equal(v1.data,v2.data) assert_not_same(v1.data,v2.data) assert_equal(v1.type,v2.type) v1.type=:ordinal assert_not_equal(v1.type,v2.type) assert_equal(v1.missing_values,v2.missing_values) assert_not_same(v1.missing_values,v2.missing_values) assert_equal(v1.labels,v2.labels) assert_not_same(v1.labels,v2.labels) v3=v1.dup_empty assert_equal([],v3.data) assert_not_equal(v1.data,v3.data) assert_not_same(v1.data,v3.data) assert_equal(v1.type,v3.type) v1.type=:ordinal v3.type=:nominal assert_not_equal(v1.type,v3.type) assert_equal(v1.missing_values,v3.missing_values) assert_not_same(v1.missing_values,v3.missing_values) assert_equal(v1.labels,v3.labels) assert_not_same(v1.labels,v3.labels) end def test_paired_ties a=[0,0,0,1,1,2,3,3,4,4,4].to_vector(:ordinal) expected=[2,2,2,4.5,4.5,6,7.5,7.5,10,10,10].to_vector(:ordinal) assert_equal(expected,a.ranked) end def test_dichotomize a= [0,0,0,1,2,3,nil].to_vector exp=[0,0,0,1,1,1,nil].to_scale assert_equal(exp,a.dichotomize) a= [1,1,1,2,2,2,3].to_vector exp=[0,0,0,1,1,1,1].to_scale assert_equal(exp,a.dichotomize) a= [0,0,0,1,2,3,nil].to_vector exp=[0,0,0,0,1,1,nil].to_scale assert_equal(exp,a.dichotomize(1)) a= %w{a a a b c d}.to_vector exp=[0,0,0,1,1,1].to_scale assert_equal(exp, a.dichotomize) end def test_can_be_methods a= [0,0,0,1,2,3,nil].to_vector assert(a.can_be_scale?) a=[0,"s",0,1,2,3,nil].to_vector assert(!a.can_be_scale?) a.missing_values=["s"] assert(a.can_be_scale?) a=[Date.new(2009,10,10), Date.today(), "2009-10-10", "2009-1-1", nil, "NOW"].to_vector assert(a.can_be_date?) a=[Date.new(2009,10,10), Date.today(),nil,"sss"].to_vector assert(!a.can_be_date?) 
end def test_date_vector a=[Date.new(2009,10,10), :NOW, "2009-10-10", "2009-1-1", nil, "NOW","MISSING"].to_vector(:date, :missing_values=>["MISSING"]) assert(a.type==:date) expected=[Date.new(2009,10,10), Date.today(), Date.new(2009,10,10), Date.new(2009,1,1), nil, Date.today(), nil ] assert_equal(expected, a.date_data_with_nils) end end ================================================ FILE: test/test_wilcoxonsignedrank.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleUMannWhitneyTestCase < MiniTest::Unit::TestCase include Statsample::Test context Statsample::Test::WilcoxonSignedRank do context "Example 1" do setup do @v1=[110,122,125,120,140,124,123,137,135,145].to_scale @v2=[125,115,130,140,140,115,140,125,140,135].to_scale @u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2) end should "have same result using class or Test#u_mannwhitney" do assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w) end should "have correct W values" do assert_equal(9,@u.w) end should "have correct nr values" do assert_equal(9,@u.nr) end should "have correct value for z" do assert_in_delta(0.503,@u.z,0.001) end should "have correct value for probability_z" do assert_in_delta(0.614,@u.probability_z,0.001) end should "have correct value for probability_exact" do assert_in_delta(0.652,@u.probability_exact,0.001) end should "have summary" do assert(@u.summary!="") end end context "Example 2" do setup do @v2=[78,24,64,45,64,52,30,50,64,50,78,22,84,40,90,72].to_scale @v1=[78,24,62,48,68,56,25,44,56,40,68,36,68,20,58,32].to_scale @u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2) end should "have same result using class or Test#u_mannwhitney" do assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w) end should "have correct W values" do assert_equal(67,@u.w) end should "have correct nr values" do assert_equal(14,@u.nr) end should "have correct value for z" do 
assert_in_delta(2.087,@u.z,0.001) end should "have correct value for probability_z" do assert_in_delta(0.036,@u.probability_z,0.001) end should "have correct value for probability_exact" do assert_in_delta(0.036,@u.probability_exact,0.001) end should "have summary" do assert(@u.summary!="") end end end end ================================================ FILE: test/test_xls.rb ================================================ require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) class StatsampleExcelTestCase < MiniTest::Unit::TestCase context "Excel reader" do setup do @ds=Statsample::Excel.read(File.dirname(__FILE__)+"/fixtures/test_xls.xls") end should "set the number of cases" do assert_equal(6,@ds.cases) end should "set correct field names" do assert_equal(%w{id name age city a1},@ds.fields) end should "set a dataset equal to expected" do id=[1,2,3,4,5,6].to_vector(:scale) name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal) age=[20,23,25,nil,5.5,nil].to_vector(:scale) city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal) a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal) ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1}) ds_exp.fields.each{|f| assert_equal(ds_exp[f],@ds[f]) } assert_equal(ds_exp,@ds) end should "set to nil empty cells" do assert_equal(nil,@ds['age'][5]) end end context "Excel writer" do setup do a=100.times.map{rand(100)}.to_scale b=(["b"]*100).to_vector @ds={'b'=>b, 'a'=>a}.to_dataset(%w{b a}) tempfile=Tempfile.new("test_write.xls") Statsample::Excel.write(@ds,tempfile.path) @ds2=Statsample::Excel.read(tempfile.path) end should "return same fields as original" do assert_equal(@ds.fields ,@ds2.fields) end should "return same number of cases as original" do assert_equal(@ds.cases, @ds2.cases) end should "return same cases as original" do i=0 @ds2.each_array do |row| assert_equal(@ds.case_as_array(i),row) i+=1 end end 
end end ================================================ FILE: web/Rakefile ================================================ # -*- ruby -*- require 'rake' require 'fileutils' directory "examples" def get_base(f) f.sub(File.dirname(__FILE__)+"/../examples/","").gsub("/","_").gsub(".rb","") end EXAMPLES=Dir.glob(File.dirname(__FILE__)+"/../examples/**/*.rb").map {|v| [v, get_base(v)] }.find_all{|v| !v[0].include?"_data"} EXAMPLES_BASE=EXAMPLES.map {|v| v[1]} desc "Build all html, rtf and pdf files" task :build_site do ruby "build_site.rb" end task :clean do Dir.glob(File.dirname(__FILE__)+"/examples/*.pdf").each do |t| FileUtils.rm t end Dir.glob(File.dirname(__FILE__)+"/examples/*.html").each do |t| FileUtils.rm t end Dir.glob(File.dirname(__FILE__)+"/examples/*.rtf").each do |t| FileUtils.rm t end Dir.glob(File.dirname(__FILE__)+"/examples/images/*.*").each do |t| FileUtils.rm t end end load 'upload_task.rb' if File.exists? "upload_task.rb"