Repository: ondra-m/ruby-spark Branch: master Commit: d1b9787642fe Files: 191 Total size: 440.0 KB Directory structure: gitextract_h83fh3m2/ ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── Guardfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── TODO.md ├── benchmark/ │ ├── aggregate.rb │ ├── bisect.rb │ ├── comparison/ │ │ ├── prepare.sh │ │ ├── python.py │ │ ├── r.r │ │ ├── ruby.rb │ │ ├── run-all.sh │ │ └── scala.scala │ ├── custom_marshal.rb │ ├── digest.rb │ ├── enumerator.rb │ ├── serializer.rb │ ├── sort.rb │ ├── sort2.rb │ └── take.rb ├── bin/ │ └── ruby-spark ├── example/ │ ├── pi.rb │ └── website_search.rb ├── ext/ │ ├── ruby_c/ │ │ ├── extconf.rb │ │ ├── murmur.c │ │ ├── murmur.h │ │ └── ruby-spark.c │ ├── ruby_java/ │ │ ├── Digest.java │ │ ├── Murmur2.java │ │ ├── RubySparkExtService.java │ │ └── extconf.rb │ └── spark/ │ ├── build.sbt │ ├── project/ │ │ └── plugins.sbt │ ├── sbt/ │ │ └── sbt │ └── src/ │ ├── main/ │ │ └── scala/ │ │ ├── Exec.scala │ │ ├── MLLibAPI.scala │ │ ├── Marshal.scala │ │ ├── MarshalDump.scala │ │ ├── MarshalLoad.scala │ │ ├── RubyAccumulatorParam.scala │ │ ├── RubyBroadcast.scala │ │ ├── RubyConstant.scala │ │ ├── RubyMLLibAPI.scala │ │ ├── RubyMLLibUtilAPI.scala │ │ ├── RubyPage.scala │ │ ├── RubyRDD.scala │ │ ├── RubySerializer.scala │ │ ├── RubyTab.scala │ │ ├── RubyUtils.scala │ │ └── RubyWorker.scala │ └── test/ │ └── scala/ │ └── MarshalSpec.scala ├── lib/ │ ├── ruby-spark.rb │ ├── spark/ │ │ ├── accumulator.rb │ │ ├── broadcast.rb │ │ ├── build.rb │ │ ├── cli.rb │ │ ├── command/ │ │ │ ├── base.rb │ │ │ ├── basic.rb │ │ │ ├── pair.rb │ │ │ ├── sort.rb │ │ │ └── statistic.rb │ │ ├── command.rb │ │ ├── command_builder.rb │ │ ├── command_validator.rb │ │ ├── config.rb │ │ ├── constant.rb │ │ ├── context.rb │ │ ├── error.rb │ │ ├── ext/ │ │ │ ├── hash.rb │ │ │ ├── integer.rb │ │ │ ├── io.rb │ │ │ ├── ip_socket.rb │ │ │ ├── module.rb │ │ │ ├── object.rb │ │ │ └── string.rb │ │ ├── helper/ │ │ │ ├── logger.rb │ │ │ ├── parser.rb │ │ │ ├── serialize.rb │ │ │ ├── statistic.rb │ │ │ └── system.rb │ │ ├── helper.rb │ │ ├── java_bridge/ │ │ │ ├── base.rb │ │ │ ├── jruby.rb │ │ │ └── rjb.rb │ │ ├── java_bridge.rb │ │ ├── library.rb │ │ ├── logger.rb │ │ ├── mllib/ │ │ │ ├── classification/ │ │ │ │ ├── common.rb │ │ │ │ ├── logistic_regression.rb │ │ │ │ ├── naive_bayes.rb │ │ │ │ └── svm.rb │ │ │ ├── clustering/ │ │ │ │ ├── gaussian_mixture.rb │ │ │ │ └── kmeans.rb │ │ │ ├── matrix.rb │ │ │ ├── regression/ │ │ │ │ ├── common.rb │ │ │ │ ├── labeled_point.rb │ │ │ │ ├── lasso.rb │ │ │ │ ├── linear.rb │ │ │ │ └── ridge.rb │ │ │ ├── ruby_matrix/ │ │ │ │ ├── matrix_adapter.rb │ │ │ │ └── vector_adapter.rb │ │ │ ├── stat/ │ │ │ │ └── distribution.rb │ │ │ └── vector.rb │ │ ├── mllib.rb │ │ ├── rdd.rb │ │ ├── sampler.rb │ │ ├── serializer/ │ │ │ ├── auto_batched.rb │ │ │ ├── base.rb │ │ │ ├── batched.rb │ │ │ ├── cartesian.rb │ │ │ ├── compressed.rb │ │ │ ├── marshal.rb │ │ │ ├── message_pack.rb │ │ │ ├── oj.rb │ │ │ ├── pair.rb │ │ │ └── text.rb │ │ ├── serializer.rb │ │ ├── sort.rb │ │ ├── sql/ │ │ │ ├── column.rb │ │ │ ├── context.rb │ │ │ ├── data_frame.rb │ │ │ ├── data_frame_reader.rb │ │ │ ├── data_type.rb │ │ │ └── row.rb │ │ ├── sql.rb │ │ ├── stat_counter.rb │ │ ├── storage_level.rb │ │ ├── version.rb │ │ └── worker/ │ │ ├── master.rb │ │ ├── spark_files.rb │ │ └── worker.rb │ └── spark.rb ├── ruby-spark.gemspec └── spec/ ├── generator.rb ├── inputs/ │ ├── lorem_300.txt │ ├── numbers/ │ │ ├── 1.txt │ │ ├── 10.txt │ │ ├── 11.txt │ │ ├── 
12.txt │ │ ├── 13.txt │ │ ├── 14.txt │ │ ├── 15.txt │ │ ├── 16.txt │ │ ├── 17.txt │ │ ├── 18.txt │ │ ├── 19.txt │ │ ├── 2.txt │ │ ├── 20.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ ├── numbers_0_100.txt │ ├── numbers_1_100.txt │ └── people.json ├── lib/ │ ├── collect_spec.rb │ ├── command_spec.rb │ ├── config_spec.rb │ ├── context_spec.rb │ ├── ext_spec.rb │ ├── external_apps_spec.rb │ ├── filter_spec.rb │ ├── flat_map_spec.rb │ ├── group_spec.rb │ ├── helper_spec.rb │ ├── key_spec.rb │ ├── manipulation_spec.rb │ ├── map_partitions_spec.rb │ ├── map_spec.rb │ ├── mllib/ │ │ ├── classification_spec.rb │ │ ├── clustering_spec.rb │ │ ├── matrix_spec.rb │ │ ├── regression_spec.rb │ │ └── vector_spec.rb │ ├── reduce_by_key_spec.rb │ ├── reduce_spec.rb │ ├── sample_spec.rb │ ├── serializer_spec.rb │ ├── sort_spec.rb │ ├── sql/ │ │ ├── column_spec.rb │ │ └── data_frame_spec.rb │ ├── statistic_spec.rb │ └── whole_text_files_spec.rb └── spec_helper.rb ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ /.gemtags /.tags /java/spark.jar .jbundler target/* *.class *.jar pom.xml vendor/* *.gem *.rbc .bundle .config .yardoc Gemfile.lock InstalledFiles _yardoc coverage doc/ lib/bundler/man pkg rdoc spec/reports test/tmp test/version_tmp tmp *.bundle *.so *.o *.a mkmf.log ext/spark/target/* ext/spark/project/target/* ext/spark/project/project/target/* wiki /benchmark/performance/spark/* /benchmark/performance/rspark/* /_* ================================================ FILE: .travis.yml ================================================ language: ruby rvm: - 2.2.0 before_script: - bundle exec rake compile - bundle exec ruby bin/ruby-spark build cache: bundler: true directories: - $HOME/.m2 - $HOME/.ivy2 - $HOME/.sbt ================================================ FILE: CHANGELOG.md ================================================ ## Unreleased ## 1.3.0 - new method on RDD (lookup) - fix sbt url - Spark 1.5.0 ## 1.2.0 (15.06.2015) - target folder is now located at HOME - better serializators - error when java class does not exist - default setting at ~/.ruby-spark.conf - compatible with Spark 1.4.0 - added calling site to RDD ================================================ FILE: Gemfile ================================================ source 'https://rubygems.org' gemspec gem 'sourcify', '0.6.0.rc4' gem 'method_source' gem 'commander' gem 'pry' gem 'nio4r' gem 'distribution' platform :mri do gem 'rjb' gem 'msgpack' gem 'oj' gem 'narray' end platform :jruby do gem 'msgpack-jruby', require: 'msgpack' # NameError: no constructorfor arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime # gem 'mdarray' end group :stats do # gem 'nmatrix' # gem 'statsample' # gem 'statsample-glm' # gem 'statsample-timeseries' # gem 'statistics2' # gem 'statsample-optimization' # libgsl0-dev # gem 'narray' # gem 'gsl-nmatrix' end group :development do gem 'benchmark-ips' gem 'rspec' gem 'rake-compiler' gem 'guard' gem 'guard-rspec' gem 'listen' end group :test do gem 'simplecov', require: false end ================================================ FILE: Guardfile ================================================ guard :rspec, cmd: 'rspec' do 
watch(%r{^spec/.+_spec\.rb$}) watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" } watch('spec/spec_helper.rb') { "spec" } end ================================================ FILE: LICENSE.txt ================================================ Copyright (c) 2014 Ondřej Moravčík MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Ruby-Spark [![Build Status](https://travis-ci.org/ondra-m/ruby-spark.svg?branch=master)](https://travis-ci.org/ondra-m/ruby-spark) Apache Spark™ is a fast and general engine for large-scale data processing. This Gem allows the use Spark functionality on Ruby. > Word count in Spark's Ruby API ```ruby file = spark.text_file("hdfs://...") file.flat_map(:split) .map(lambda{|word| [word, 1]}) .reduce_by_key(lambda{|a, b| a+b}) ``` - [Apache Spark](http://spark.apache.org) - [Wiki](https://github.com/ondra-m/ruby-spark/wiki) - [Rubydoc](http://www.rubydoc.info/gems/ruby-spark) ## Installation ### Requirments - Java 7+ - Ruby 2+ - wget or curl - MRI or JRuby Add this line to your application's Gemfile: ```ruby gem 'ruby-spark' ``` And then execute: ``` $ bundle ``` Or install it yourself as: ``` $ gem install ruby-spark ``` Run `rake compile` if you are using gem from local filesystem. ### Build Apache Spark This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars will be stored at you HOME directory. ``` $ ruby-spark build ``` ## Usage You can use Ruby Spark via interactive shell (Pry is used) ``` $ ruby-spark shell ``` Or on existing project. If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details. ```ruby require 'ruby-spark' # Configuration Spark.config do set_app_name "RubySpark" set 'spark.ruby.serializer', 'oj' set 'spark.ruby.serializer.batch_size', 100 end # Start Apache Spark Spark.start # Context reference Spark.sc ``` Finally, to stop the cluster. On the shell is Spark stopped automatically when environment exit. ```ruby Spark.stop ``` After first use, global configuration is created at **~/.ruby-spark.conf**. There can be specified properties for Spark and RubySpark. 
## Creating an RDD (a new collection)

A single text file:

```ruby
rdd = sc.text_file(FILE, workers_num, serializer=nil)
```

All files in a directory:

```ruby
rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
```

Uploading structures directly from Ruby:

```ruby
rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
rdd = sc.parallelize(1..5, workers_num, serializer=nil)
```

There are two conditions:

1. the chosen serializer must be able to serialize the data
2. the data must be iterable

If you do not specify a serializer, the default one is used (defined by the `spark.ruby.serializer.*` options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want to create a custom serializer.

## Operations

All operations can be divided into 2 groups:

- **Transformations:** append a new operation to the current RDD and return a new RDD
- **Actions:** add an operation and start the calculation

More information:

- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)
- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)
- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)

You can also check the official Spark documentation. First make sure that the method is implemented here.

- [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)
- [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)

#### Transformations
rdd.map(function)
Return a new RDD by applying a function to all elements of this RDD.
rdd.flat_map(function)
Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.
rdd.map_partitions(function)
Return a new RDD by applying a function to each partition of this RDD.
rdd.filter(function)
Return a new RDD containing only the elements that satisfy a predicate.
rdd.cartesian(other)
Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.
rdd.intersection(other)
Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.
rdd.sample(with_replacement, fraction, seed)
Return a sampled subset of this RDD. Operations are based on Poisson and uniform distributions.
rdd.group_by_key(num_partitions)
Group the values for each key in the RDD into a single sequence.
...many more...
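
For illustration, here is a minimal sketch chaining a few of the transformations above. It assumes a started context (`Spark.sc`) and the default serializer; the exact grouping order in the result may differ.

```ruby
# Assumes Spark.start has already been called
rdd = Spark.sc.parallelize(1..10, 2)

# Keep even numbers, key them by remainder mod 4, then group by key
grouped = rdd.filter(lambda{|x| x.even?})
             .map(lambda{|x| [x % 4, x]})
             .group_by_key

grouped.collect
# => e.g. [[0, [4, 8]], [2, [2, 6, 10]]] (ordering may vary)
```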
#### Actions
rdd.take(count)
Take the first `count` elements of the RDD.
rdd.reduce(function)
Reduces the elements of this RDD using the specified lambda or method.
rdd.aggregate(zero_value, seq_op, comb_op)
Aggregate the elements of each partition, and then the results for all the partitions, using the given combine functions and a neutral "zero value".
rdd.histogram(buckets)
Compute a histogram using the provided buckets.
rdd.collect
Return an array that contains all of the elements in this RDD.
...many more...
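
A quick sketch of a few of the actions above (again assuming a running context; `take`, `reduce` and `histogram` are shown in more detail in the Examples section below):

```ruby
rdd = Spark.sc.parallelize(0..100, 2)

rdd.take(5)                      # => [0, 1, 2, 3, 4]
rdd.reduce(lambda{|a, b| a + b}) # => 5050
rdd.histogram(2)                 # => [[0.0, 50.0, 100], [50, 51]]
```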
## Examples ##### Basic methods ```ruby # Every batch will be serialized by Marshal and will have size 10 ser = Spark::Serializer.build('batched(marshal, 10)') # Range 0..100, 2 workers, custom serializer rdd = Spark.sc.parallelize(0..100, 2, ser) # Take first 5 items rdd.take(5) # => [0, 1, 2, 3, 4] # Numbers reducing rdd.reduce(lambda{|sum, x| sum+x}) rdd.reduce(:+) rdd.sum # => 5050 # Reducing with zero items seq = lambda{|x,y| x+y} com = lambda{|x,y| x*y} rdd.aggregate(1, seq, com) # 1. Every workers adds numbers # => [1226, 3826] # 2. Results are multiplied # => 4690676 # Statistic method rdd.stats # => StatCounter: (count, mean, max, min, variance, # sample_variance, stdev, sample_stdev) # Compute a histogram using the provided buckets. rdd.histogram(2) # => [[0.0, 50.0, 100], [50, 51]] # Mapping rdd.map(lambda {|x| x*2}).collect # => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...] rdd.map(:to_f).collect # => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...] # Mapping the whole collection rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect # => [1225, 3825] # Selecting rdd.filter(lambda{|x| x.even?}).collect # => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...] # Sampling rdd.sample(true, 10).collect # => [3, 36, 40, 54, 58, 82, 86, 95, 98] # Sampling X items rdd.take_sample(true, 10) # => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32] # Using external process rdd.pipe('cat', "awk '{print $1*10}'") # => ["0", "10", "20", "30", "40", "50", ...] ``` ##### Words count using methods ```ruby # Content: # "first line" # "second line" rdd = sc.text_file(PATH) # ["first", "line", "second", "line"] rdd = rdd.flat_map(lambda{|line| line.split}) # [["first", 1], ["line", 1], ["second", 1], ["line", 1]] rdd = rdd.map(lambda{|word| [word, 1]}) # [["first", 1], ["line", 2], ["second", 1]] rdd = rdd.reduce_by_key(lambda{|a, b| a+b}) # {"first"=>1, "line"=>2, "second"=>1} rdd.collect_as_hash ``` ##### Estimating PI with a custom serializer ```ruby slices = 3 n = 100000 * slices def map(_) x = rand * 2 - 1 y = rand * 2 - 1 if x**2 + y**2 < 1 return 1 else return 0 end end rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj') rdd = rdd.map(method(:map)) puts 'Pi is roughly %f' % (4.0 * rdd.sum / n) ``` ##### Estimating PI ```ruby rdd = sc.parallelize([10_000], 1) rdd = rdd.add_library('bigdecimal/math') rdd = rdd.map(lambda{|x| BigMath.PI(x)}) rdd.collect # => # ``` ### Mllib (Machine Learning Library) Mllib functions are using Spark's Machine Learning Library. Ruby objects are serialized and deserialized in Java so you cannot use custom classes. Supported are primitive types such as string or integers. 
All supported methods/models: - [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib) - [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib) ##### Linear regression ```ruby # Import Mllib classes into Object # Otherwise are accessible via Spark::Mllib::LinearRegressionWithSGD Spark::Mllib.import(Object) # Training data data = [ LabeledPoint.new(0.0, [0.0]), LabeledPoint.new(1.0, [1.0]), LabeledPoint.new(3.0, [2.0]), LabeledPoint.new(2.0, [3.0]) ] # Train a model lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0]) lrm.predict([0.0]) ``` ##### K-Mean ```ruby Spark::Mllib.import # Dense vectors data = [ DenseVector.new([0.0,0.0]), DenseVector.new([1.0,1.0]), DenseVector.new([9.0,8.0]), DenseVector.new([8.0,9.0]) ] model = KMeans.train(sc.parallelize(data), 2) model.predict([0.0, 0.0]) == model.predict([1.0, 1.0]) # => true model.predict([8.0, 9.0]) == model.predict([9.0, 8.0]) # => true ``` ## Benchmarks ================================================ FILE: Rakefile ================================================ #-*- mode: ruby -*- require "bundler/gem_tasks" require "rspec/core/rake_task" RSpec::Core::RakeTask.new task default: :spec task test: :spec def java? RUBY_PLATFORM =~ /java/ end if java? require "rake/javaextensiontask" Rake::JavaExtensionTask.new("ruby_java") do |ext| ext.name = "ruby_spark_ext" end else require "rake/extensiontask" Rake::ExtensionTask.new("ruby_c") do |ext| ext.name = "ruby_spark_ext" end end task :clean do Dir['lib/*.{jar,o,so}'].each do |path| puts "Deleting #{path} ..." File.delete(path) end FileUtils.rm_rf('./pkg') FileUtils.rm_rf('./tmp') end ================================================ FILE: TODO.md ================================================ - refactor JavaBridge - to_java, from_java - every type should have class - automatic registration - add Streaming - worker informations (time, memory, ...) - killing zombie workers - add_rb, add_inline_rb to Spark::{Context, RDD} - fix broadcast for cluster - dump to disk if there is memory limit - Add Partitioner to RDD - add NonExist serializer ================================================ FILE: benchmark/aggregate.rb ================================================ require 'benchmark' require 'benchmark/ips' data = 0..1_000_000 zero_value = rand(100_000) function = Proc.new{|sum, n| sum+n} Benchmark.ips do |r| r.report('each') do sum = zero_value data.each do |n| sum += n end end r.report('reduce') do data.reduce(zero_value){|sum, n| sum+n} end r.report('each with function') do sum = zero_value data.each do |n| sum = function.call(sum, n) end end r.report('reduce with function') do data.reduce(zero_value, &function) end r.compare! end ================================================ FILE: benchmark/bisect.rb ================================================ require "benchmark" def bisect_left1(a, x, opts={}) return nil if a.nil? return 0 if a.empty? 
lo = (opts[:lo] || opts[:low]).to_i hi = opts[:hi] || opts[:high] || a.length while lo < hi mid = (lo + hi) / 2 v = a[mid] if v < x lo = mid + 1 else hi = mid end end return lo end def bisect_left2(list, item) count = 0 list.each{|i| return count if i >= item count += 1 } nil end def bisect_left3(list, item, lo = 0, hi = list.size) while lo < hi i = (lo + hi - 1) >> 1 if 0 <= (list[i] <=> item) hi = i else lo = i + 1 end end return hi end array = Array.new(1000000) { rand(0..1000000) }; to_find = Array.new(500) { rand(0..10000) }; Benchmark.bm(20) do |x| x.report("bisect_left1") do to_find.each do |item| bisect_left1(array, item) end end x.report("bisect_left2") do to_find.each do |item| bisect_left2(array, item) end end x.report("bisect_left3") do to_find.each do |item| bisect_left3(array, item) end end end array = Array.new(100000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }; to_find = Array.new(500) { (97+rand(26)).chr }; Benchmark.bm(20) do |x| x.report("bisect_left1") do to_find.each do |item| bisect_left1(array, item) end end x.report("bisect_left2") do to_find.each do |item| bisect_left2(array, item) end end x.report("bisect_left3") do to_find.each do |item| bisect_left3(array, item) end end end ================================================ FILE: benchmark/comparison/prepare.sh ================================================ #!/usr/bin/env bash # Current dir cd "$(dirname "$0")" # Exit immediately if a pipeline returns a non-zero status. set -e # Spark wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz" -O spark.tgz tar xvzf spark.tgz mv spark-1.3.0-bin-hadoop2.4 spark rm spark.tgz # RSpark (only for 1.3.0) git clone git@github.com:amplab-extras/SparkR-pkg.git rspark cd rspark SPARK_VERSION=1.3.0 ./install-dev.sh ================================================ FILE: benchmark/comparison/python.py ================================================ import os import math from time import time from random import random from operator import add from pyspark import SparkContext sc = SparkContext(appName="Python", master="local[*]") log_file = open(os.environ.get('PYTHON_LOG'), 'w') def log(*values): values = map(lambda x: str(x), values) log_file.write(';'.join(values)) log_file.write('\n') workers = int(os.environ.get('WORKERS')) numbers_count = int(os.environ.get('NUMBERS_COUNT')) text_file = os.environ.get('TEXT_FILE') numbers = range(numbers_count) floats = [float(i) for i in numbers] with open(text_file) as t: strings = t.read().split("\n") # ============================================================================= # Serialization # ============================================================================= t = time() rdd_numbers = sc.parallelize(numbers, workers) t = time() - t log('NumbersSerialization', t) t = time() rdd_floats = sc.parallelize(floats, workers) t = time() - t log('FloatsSerialization', t) t = time() rdd_strings = sc.parallelize(strings, workers) t = time() - t log('StringsSerialization', t) # ============================================================================= # Computing # ============================================================================= # --- Is prime? 
--------------------------------------------------------------- def is_prime(x): if x < 2: return [x, False] elif x == 2: return [x, True] elif x % 2 == 0: return [x, False] else: upper = int(math.sqrt(float(x))) result = True i = 3 while i <= upper: if x % i == 0: result = False break i += 2 return [x, result] t = time() rdd_numbers.map(is_prime).collect() t = time() - t log('IsPrime', t) # --- Matrix multiplication --------------------------------------------------- matrix_size = int(os.environ.get('MATRIX_SIZE')) matrix = [] for row in range(matrix_size): matrix.append([]) for col in range(matrix_size): matrix[row].append(row+col) def multiplication_func(matrix): matrix = list(matrix) size = len(matrix) new_matrix = [] for row in range(size): new_matrix.append([]) for col in range(size): result = 0 for i in range(size): result += matrix[row][i] * matrix[col][i] new_matrix[row].append(result) return new_matrix t = time() rdd = sc.parallelize(matrix, 1) rdd.mapPartitions(multiplication_func).collect() t = time() - t log('MatrixMultiplication', t) # --- Pi digits --------------------------------------------------------------- # http://rosettacode.org/wiki/Pi#Python pi_digit = int(os.environ.get('PI_DIGIT')) def pi_func(size): size = size.next() result = '' q, r, t, k, n, l = 1, 0, 1, 1, 3, 3 while size > 0: if 4*q+r-t < n*t: result += str(n) size -= 1 nr = 10*(r-n*t) n = ((10*(3*q+r))//t)-10*n q *= 10 r = nr else: nr = (2*q+r)*l nn = (q*(7*k)+2+(r*l))//(t*l) q *= k t *= l l += 2 k += 1 n = nn r = nr return [result] t = time() rdd = sc.parallelize([pi_digit], 1) rdd.mapPartitions(pi_func).collect() t = time() - t log('PiDigit', t) log_file.close() ================================================ FILE: benchmark/comparison/r.r ================================================ library(SparkR) sc <- sparkR.init(master="local[*]") logFile <- file(Sys.getenv("R_LOG"), "w") logInfo <- function(...){ args <- list(...) 
line <- paste(args, collapse = ";") writeLines(line, logFile) } workers <- as.integer(Sys.getenv('WORKERS')) numbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT')) textFile <- Sys.getenv('TEXT_FILE') # ============================================================================= # Serialization # ============================================================================= time <- proc.time() rddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers) time <- as.double(proc.time()-time)[3] logInfo('NumbersSerialization', time) # ============================================================================= # Computing # ============================================================================= isPrime = function(x) { if(x < 2){ c(x, FALSE) } else if(x == 2){ c(x, TRUE) } else if(x %% 2 == 0){ c(x, FALSE) } else{ upper <- as.numeric(sqrt(as.double(x))) result <- TRUE i <- 3 while(i <= upper){ if(x %% i == 0){ result = FALSE break } i <- i+2 } c(x, result) } } time <- proc.time() rdd <- map(rddNumbers, isPrime) capture.output(collect(rdd), file='/dev/null') time <- as.double(proc.time()-time)[3] logInfo('IsPrime', time) close(logFile) sparkR.stop() ================================================ FILE: benchmark/comparison/ruby.rb ================================================ #!/usr/bin/env ruby lib = File.expand_path(File.dirname(__FILE__) + '/../../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark' require 'benchmark' Spark.start sc = Spark.context $log_file = File.open(ENV['RUBY_LOG'], 'w') def log(*values) $log_file.puts(values.join(';')) end workers = ENV['WORKERS'].to_i numbers_count = ENV['NUMBERS_COUNT'].to_i text_file = ENV['TEXT_FILE'] numbers = (0...numbers_count).to_a floats = numbers.map(&:to_f) strings = File.read(text_file).split("\n") # ============================================================================= # Serialization # ============================================================================= time = Benchmark.realtime do @rdd_numbers = sc.parallelize(numbers, workers) end log('NumbersSerialization', time) time = Benchmark.realtime do @rdd_floats = sc.parallelize(floats, workers) end log('FloatsSerialization', time) time = Benchmark.realtime do @rdd_strings = sc.parallelize(strings, workers) end log('StringsSerialization', time) # ============================================================================= # Computing # ============================================================================= # --- Is prime? 
--------------------------------------------------------------- is_prime = Proc.new do |x| case when x < 2 [x, false] when x == 2 [x, true] when x % 2 == 0 [x, false] else upper = Math.sqrt(x.to_f).to_i result = true i = 3 while i <= upper if x % i == 0 result = false break end i += 2 end [x, result] end end time = Benchmark.realtime do @rdd_numbers.map(is_prime).collect end log('IsPrime', time) # --- Matrix multiplication --------------------------------------------------- matrix_size = ENV['MATRIX_SIZE'].to_i matrix = Array.new(matrix_size) do |row| Array.new(matrix_size) do |col| row+col end end; multiplication_func = Proc.new do |matrix| size = matrix.size Array.new(size) do |row| Array.new(size) do |col| matrix[row] result = 0 size.times do |i| result += matrix[row][i] * matrix[col][i] end result end end end time = Benchmark.realtime do rdd = sc.parallelize(matrix, 1) rdd.map_partitions(multiplication_func).collect end log('MatrixMultiplication', time) # --- Pi digits --------------------------------------------------------------- # http://rosettacode.org/wiki/Pi#Ruby pi_digit = ENV['PI_DIGIT'].to_i pi_func = Proc.new do |size| size = size.first result = '' q, r, t, k, n, l = 1, 0, 1, 1, 3, 3 while size > 0 if 4*q+r-t < n*t result << n.to_s size -= 1 nr = 10*(r-n*t) n = ((10*(3*q+r)) / t) - 10*n q *= 10 r = nr else nr = (2*q+r) * l nn = (q*(7*k+2)+r*l) / (t*l) q *= k t *= l l += 2 k += 1 n = nn r = nr end end [result] end time = Benchmark.realtime do rdd = sc.parallelize([pi_digit], 1) rdd.map_partitions(pi_func).collect end log('PiDigit', time) $log_file.close ================================================ FILE: benchmark/comparison/run-all.sh ================================================ #!/usr/bin/env bash # Current dir cd "$(dirname "$0")" # Exit immediately if a pipeline returns a non-zero status. 
set -e # Settings export WORKERS=2 export MATRIX_SIZE=100 export NUMBERS_COUNT=1000000 export TEXT_FILE=$(mktemp) export PI_DIGIT=1000 export RUBY_BATCH_SIZE=2048 text_file_rows=10 text_file_per_line=10 text_file_duplicates=50 mx="4096m" ms="4096m" # Parse arguments while (( "$#" )); do case $1 in --workers) WORKERS="$2" shift ;; --matrix-size) MATRIX_SIZE="$2" shift ;; --numbers-count) NUMBERS_COUNT="$2" shift ;; --random-file-rows) text_file_rows="$2" shift ;; --text-file-per-line) text_file_per_line="$2" shift ;; --text-file-duplicates) text_file_duplicates="$2" shift ;; --pi-digit) PI_DIGIT="$2" shift ;; --ruby-batch-size) RUBY_BATCH_SIZE="$2" shift ;; --mx) mx="$2" shift ;; --ms) ms="$2" shift ;; *) break ;; esac shift done # Generating file=$(mktemp) for (( i=0; i<$text_file_rows; i++ )) do shuf -n $text_file_per_line /usr/share/dict/words | tr '\n' ' ' >> $file echo >> $file done for (( i=0; i<$text_file_duplicates; i++ )) do cat $file >> $TEXT_FILE done # Before run if [[ -z "$SPARK_HOME" ]]; then export SPARK_HOME=$(pwd)/spark fi if [[ -z "$RSPARK_HOME" ]]; then export RSPARK_HOME=$(pwd)/rspark fi export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE" SPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null) export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx" # Log files export RUBY_MARSHAL_LOG=$(mktemp) export RUBY_OJ_LOG=$(mktemp) export PYTHON_LOG=$(mktemp) export SCALA_LOG=$(mktemp) export R_LOG=$(mktemp) # Run: echo "Workers: $WORKERS" echo "Matrix size: $MATRIX_SIZE" echo "Numbers count: $NUMBERS_COUNT" echo "Pi digits: $PI_DIGIT" echo "File: rows = $(($text_file_rows * $text_file_duplicates))" echo " per line = $text_file_per_line" # --- Ruby export SPARK_RUBY_SERIALIZER='marshal' export RUBY_LOG="$RUBY_MARSHAL_LOG" /usr/bin/env ruby ruby.rb &>/dev/null export SPARK_RUBY_SERIALIZER='oj' export RUBY_LOG="$RUBY_OJ_LOG" /usr/bin/env ruby ruby.rb &>/dev/null # # --- Python "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null # # --- Scala /usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null # --- R # "$RSPARK_HOME"/sparkR r.r #&>/dev/null # Parse results echo "# Ruby (Marshal)" cat $RUBY_MARSHAL_LOG echo "" echo "# Ruby (Oj)" cat $RUBY_OJ_LOG echo "" echo "# Python" cat $PYTHON_LOG echo "" echo "# Scala" cat $SCALA_LOG echo "" echo "# R" cat $R_LOG ================================================ FILE: benchmark/comparison/scala.scala ================================================ import java.io._ import scala.math import scala.io.Source import org.apache.spark._ object Scala { val logFile = new PrintWriter(new File(System.getenv("SCALA_LOG"))) def log(args: Any*) { logFile.write(args.mkString(";")) logFile.write("\n") } def main(args: Array[String]) { val conf = new SparkConf().setAppName("Scala") val sc = new SparkContext(conf) val workers = System.getenv("WORKERS").toInt val numbersCount = System.getenv("NUMBERS_COUNT").toInt val textFile = System.getenv("TEXT_FILE") val numbers = 0 until numbersCount val floats = numbers.map(_.toDouble) val strings = Source.fromFile(textFile).mkString.split("\n") // ============================================================================= // Serialization // ============================================================================= var time: Long = 0 time = System.currentTimeMillis val rddNumbers = sc.parallelize(numbers, workers) time = System.currentTimeMillis - time log("NumbersSerialization", 
time/1000.0) time = System.currentTimeMillis val rddFloats = sc.parallelize(floats, workers) time = System.currentTimeMillis - time log("FloatsSerialization", time/1000.0) time = System.currentTimeMillis val rddStrings = sc.parallelize(strings, workers) time = System.currentTimeMillis - time log("StringsSerialization", time/1000.0) // ============================================================================= // Computing // ============================================================================= // --- Is prime? --------------------------------------------------------------- time = System.currentTimeMillis val primes = rddNumbers.map{ x => if(x < 2){ (x, false) } else if(x == 2){ (x, true) } else if(x % 2 == 0){ (x, false) } else{ val upper = math.sqrt(x.toDouble).toInt var result = true var i = 3 while(i <= upper && result == true){ if(x % i == 0){ result = false } else{ i += 2 } } (x, result) } } primes.collect() time = System.currentTimeMillis - time log("IsPrime", time/1000.0) // --- Matrix multiplication --------------------------------------------------- val matrixSize = System.getenv("MATRIX_SIZE").toInt val matrix = new Array[Array[Long]](matrixSize) for( row <- 0 until matrixSize ) { matrix(row) = new Array[Long](matrixSize) for( col <- 0 until matrixSize ) { matrix(row)(col) = row + col } } time = System.currentTimeMillis val rdd = sc.parallelize(matrix, 1) rdd.mapPartitions { it => val matrix = it.toArray val size = matrix.size val newMatrix = new Array[Array[Long]](size) for( row <- 0 until size ) { newMatrix(row) = new Array[Long](size) for( col <- 0 until size ) { var result: Long = 0 for( i <- 0 until size ) { result += matrix(row)(i) * matrix(col)(i) } newMatrix(row)(col) = result } } newMatrix.toIterator } time = System.currentTimeMillis - time log("MatrixMultiplication", time/1000.0) // --- Pi digits --------------------------------------------------------------- // http://rosettacode.org/wiki/Pi#Scala val piDigit = System.getenv("PI_DIGIT").toInt time = System.currentTimeMillis val piDigits = sc.parallelize(Array(piDigit), 1) piDigits.mapPartitions { it => var size = it.toArray.asInstanceOf[Array[Int]](0) var result = "" var r: BigInt = 0 var q, t, k: BigInt = 1 var n, l: BigInt = 3 var nr, nn: BigInt = 0 while(size > 0){ while((4*q+r-t) >= (n*t)){ nr = (2*q+r)*l nn = (q*(7*k)+2+(r*l))/(t*l) q = q * k t = t * l l = l + 2 k = k + 1 n = nn r = nr } result += n.toString size -= 1 nr = 10*(r-n*t) n = ((10*(3*q+r))/t)-(10*n) q = q * 10 r = nr } Iterator(result) } time = System.currentTimeMillis - time log("PiDigit", time/1000.0) sc.stop() logFile.close() } } ================================================ FILE: benchmark/custom_marshal.rb ================================================ require 'benchmark' require 'benchmark/ips' def pack_int(data) [data].pack('l>') end def pack_long(data) [data].pack('q>') end def pack_doubles(data) data.pack('G*') end module Standard class LabeledPoint def initialize(label, features) @label = label @features = Standard::Vector.new(features) end def marshal_dump [@label, @features] end def marshal_load(*) end end class Vector def initialize(array) @values = array end def marshal_dump [@values] end def marshal_load(*) end end end module Custom class LabeledPoint def initialize(label, features) @label = label @features = Custom::Vector.new(features) end def _dump(*) pack_long(@label) + @features._dump end def self._load(*) end end class Vector def initialize(array) @values = array end def _dump(*) result = 'v' result << 
pack_int(@values.size) result << pack_doubles(@values) result.encode(Encoding::ASCII_8BIT) end def self._load(*) end end end data_size = 10_000 vector_size = 1_000 values = Array.new(vector_size) { |x| rand(10_000..100_000) } @data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)} @data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)} Benchmark.ips do |r| r.report('standard') do Marshal.dump(@data1) end r.report('custom') do Marshal.dump(@data2) end r.compare! end ================================================ FILE: benchmark/digest.rb ================================================ lib = File.expand_path(File.dirname(__FILE__) + '/../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) def java? RUBY_PLATFORM =~ /java/ end unless java? require 'murmurhash3' end require 'digest' require 'benchmark' require 'ruby-spark' TEST = 5_000_000 WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"] puts "TEST COUNT = #{TEST*WORDS.size}" # ================================================================================================= # Pure ruby mumrumur # funny-falcon/murmurhash3-ruby MASK32 = 0xffffffff def murmur3_32_rotl(x, r) ((x << r) | (x >> (32 - r))) & MASK32 end def murmur3_32_fmix(h) h &= MASK32 h ^= h >> 16 h = (h * 0x85ebca6b) & MASK32 h ^= h >> 13 h = (h * 0xc2b2ae35) & MASK32 h ^ (h >> 16) end def murmur3_32__mmix(k1) k1 = (k1 * 0xcc9e2d51) & MASK32 k1 = murmur3_32_rotl(k1, 15) (k1 * 0x1b873593) & MASK32 end def murmur3_32_str_hash(str, seed=0) h1 = seed numbers = str.unpack('V*C*') tailn = str.bytesize % 4 tail = numbers.slice!(numbers.size - tailn, tailn) for k1 in numbers h1 ^= murmur3_32__mmix(k1) h1 = murmur3_32_rotl(h1, 13) h1 = (h1*5 + 0xe6546b64) & MASK32 end unless tail.empty? k1 = 0 tail.reverse_each do |c1| k1 = (k1 << 8) | c1 end h1 ^= murmur3_32__mmix(k1) end h1 ^= str.bytesize murmur3_32_fmix(h1) end # ================================================================================================= # Benchmark Benchmark.bm(18) do |x| x.report("ruby hash"){ TEST.times{ WORDS.each{ |word| word.hash } } } x.report("ext portable"){ TEST.times{ WORDS.each{ |word| Spark::Digest.portable_hash(word) } } } x.report("murmur3 32"){ TEST.times{ WORDS.each{ |word| # MurmurHash3::V128.str_hash(word) # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") # MurmurHash3::V128.str_hash(word) # a = MurmurHash3::V32.str_hash(word).to_s # a.slice!(0,8) MurmurHash3::V32.str_hash(word) } } } unless java? # Too slow # x.report("murmur3 32 (ruby)"){ # TEST.times{ # WORDS.each{ |word| # # MurmurHash3::V128.str_hash(word) # # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") # # MurmurHash3::V128.str_hash(word) # # a = murmur3_32_str_hash(word).to_s # # a.slice!(0,8) # murmur3_32_str_hash(word) # } # } # } x.report("murmur3 128"){ TEST.times{ WORDS.each{ |word| # MurmurHash3::V128.str_hash(word) # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") # a = MurmurHash3::V128.str_hash(word).to_s # a.slice!(0,8) MurmurHash3::V128.str_hash(word) } } } unless java? 
# x.report("sha256"){ # TEST.times{ # WORDS.each{ |word| # a = Digest::SHA256.digest(word) # # a.slice!(0,8) # } # } # } # x.report("md5"){ # TEST.times{ # WORDS.each{ |word| # a = Digest::MD5.digest(word) # # a.slice!(0,8) # } # } # } end ================================================ FILE: benchmark/enumerator.rb ================================================ require "benchmark" class Enumerator def defer(&blk) self.class.new do |y| each do |*input| blk.call(y, *input) end end end end ARRAY_SIZE = 50_000_000 def type_yield return to_enum(__callee__) unless block_given? ARRAY_SIZE.times { |i| yield i } end def yield_map_x2(enum) return to_enum(__callee__, enum) unless block_given? enum.each do |item| yield item*2 end end def type_enumerator_new Enumerator.new do |e| ARRAY_SIZE.times { |i| e << i } end end def enumerator_new_map_x2(enum) Enumerator.new do |e| enum.each do |item| e << item*2 end end end def enumerator_defer_x2(enum) enum.defer do |out, inp| out << inp*2 end end Benchmark.bm(26) do |x| x.report("yield max") do type_yield.max end x.report("yield sum") do type_yield.reduce(:+) end x.report("yield map x*2 sum") do yield_map_x2(type_yield).reduce(:+) end x.report("yield defer map x*2 sum") do enumerator_defer_x2(type_yield).reduce(:+) end x.report("-----"){} x.report("Enum.new max") do type_enumerator_new.max end x.report("Enum.new sum") do type_enumerator_new.reduce(:+) end x.report("Enum.new map x*2 sum") do enumerator_new_map_x2(type_enumerator_new).reduce(:+) end x.report("Enum.new defer map x*2 sum") do enumerator_defer_x2(type_enumerator_new).reduce(:+) end end ================================================ FILE: benchmark/serializer.rb ================================================ require "benchmark" require "yaml" require "msgpack" require "oj" # require "thrift" puts "Simple" data = (0..100000).to_a Benchmark.bmbm do |x| x.report("YAML") do serialized = YAML.dump(data) deserialized = YAML.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("Marshal") do serialized = Marshal.dump(data) deserialized = Marshal.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("MessagePack") do serialized = MessagePack.dump(data) deserialized = MessagePack.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("Oj") do serialized = Oj.dump(data) deserialized = Oj.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end # x.report("Thrift") do # serializer = Thrift::Serializer.new # deserializer = Thrift::Deserializer.new # serialized = serializer.serialize(data) # end end puts "" puts "More complex" data = Array.new(10000000) { [rand(97..122).chr, rand(10000000)] } Benchmark.bm do |x| # Take too long # x.report("YAML") do # serialized = YAML.dump(data) # YAML.load(serialized) # end x.report("Marshal") do serialized = Marshal.dump(data) deserialized = Marshal.load(serialized) puts " Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("MessagePack") do serialized = MessagePack.dump(data) deserialized = MessagePack.load(serialized) puts " Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("Oj") do serialized = Oj.dump(data) deserialized = Oj.load(serialized) puts " Size: #{serialized.size}, Equal: #{deserialized == data}" end # x.report("Thrift") do # serializer = Thrift::Serializer.new # deserializer = Thrift::Deserializer.new # serialized = serializer.serialize(data) # end end 
================================================ FILE: benchmark/sort.rb ================================================ require "benchmark" array = [] 1000.times { array << {:bar => rand(1000)} } n = 500 Benchmark.bm(20) do |x| x.report("sort") { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } } x.report("sort reverse") { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } } x.report("sort_by -a[:bar]") { n.times { array.sort_by{ |a| -a[:bar] } } } x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } } x.report("sort_by.reverse!") { n.times { array.sort_by{ |a| a[:bar] }.reverse } } end array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join } Benchmark.bm(20) do |x| x.report("sort asc") { n.times { array.sort } } x.report("sort asc block") { n.times { array.sort{|a,b| a <=> b} } } x.report("sort desc") { n.times { array.sort{|a,b| b <=> a} } } x.report("sort asc reverse") { n.times { array.sort.reverse } } end key_value = Struct.new(:key, :value) do def <=>(other) key <=> other.key end end count = 10000 item_range = 1000000 array1 = Array.new(count) { [rand(item_range), rand(item_range)] } array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) } Benchmark.bm(20) do |x| x.report("sort_by") { n.times { array1.sort_by {|a| a[0]} } } x.report("sort struct") { n.times { array2.sort } } end ================================================ FILE: benchmark/sort2.rb ================================================ require "benchmark" require "algorithms" NUMBER_OF_SORTING = 1 NUMBER_OF_ARRAY = 10 WORDS_IN_ARRAY = 100000 MAX_WORD_SIZE = 10 EVAL_N_VALUES = 10 puts "NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}" puts "NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}" puts "WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}" puts "MAX_WORD_SIZE: #{MAX_WORD_SIZE}" puts "EVAL_N_VALUES: #{EVAL_N_VALUES}" def words Array.new(WORDS_IN_ARRAY) { word } end def word Array.new(rand(1..MAX_WORD_SIZE)){(97+rand(26)).chr}.join end @array = Array.new(NUMBER_OF_ARRAY) { words.sort } # ================================================================================================= # Sort1 # Vrátí nový (nevyhodnocený) enumerator def sort1(data) return to_enum(__callee__, data) unless block_given? heap = [] # Inicializuji heap s prvními položkami # připojím samotné enumeratory pro volání .next data.each do |a| heap << [a.next, a] end while data.any? begin # Seřadím pole podle hodnot heap.sort_by!{|(item,_)| item} # Uložím si hodnotu a enumerator item, enum = heap.shift # Hodnota půjde do výsledku yield item # Místo odstraněné položky nahradí další ze stejného seznamu heap << [enum.next, enum] rescue StopIteration # Enumerator je prázdný data.delete(enum) end end end # ================================================================================================= # Sort1_2 # Vrátí nový (nevyhodnocený) enumerator def sort1_2(data) return to_enum(__callee__, data) unless block_given? heap = [] enums = [] # Inicializuji heap s prvními položkami # připojím samotné enumeratory pro volání .next data.each do |a| EVAL_N_VALUES.times { begin heap << [a.next, a] rescue StopIteration end } end while data.any? || heap.any? # Seřadím pole podle hodnot heap.sort_by!{|(item,_)| item} # Minimálně můžu vzít EVAL_N_VALUES EVAL_N_VALUES.times { break if heap.empty? 
# Uložím si hodnotu a enumerator item, enum = heap.shift # Hodnota půjde do výsledku yield item enums << enum } while (enum = enums.shift) begin heap << [enum.next, enum] rescue StopIteration data.delete(enum) enums.delete(enum) end end end end # ================================================================================================= # Sort 2 def sort2(data) return to_enum(__callee__, data) unless block_given? heap = Containers::Heap.new data.each do |enum| item = enum.next heap.push(item, [item, enum]) end while data.any? begin item, enum = heap.pop yield item item = enum.next heap.push(item, [item, enum]) rescue StopIteration data.delete(enum) end end end # ================================================================================================= # Benchmark Benchmark.bm(10) do |x| x.report("sort") do NUMBER_OF_SORTING.times { @result = @array.flatten.sort } end x.report("sort 1") do NUMBER_OF_SORTING.times { raise "Bad sorting" if @result != sort1(@array.map(&:each)).to_a } end x.report("sort 1_2") do NUMBER_OF_SORTING.times { raise "Bad sorting" if @result != sort1_2(@array.map(&:each)).to_a } end # x.report("sort 2") do # NUMBER_OF_SORTING.times { # raise "Bad sorting" if @result != sort2(@array.map(&:each)).to_a # } # end end ================================================ FILE: benchmark/take.rb ================================================ require "benchmark" SIZE = 100_000_000 @array1 = (0..SIZE).to_a; @array2 = (0..SIZE).to_a; @array3 = (0..SIZE).to_a; TAKE = 100_000 Benchmark.bm(15) do |x| # Fastest x.report("take"){ a=@array1.take(TAKE) } # Slowest and take most memory x.report("reverse drop"){ @array2.reverse! @array2.drop(@array2.size - TAKE) @array2.reverse! } # Least memory x.report("splice"){ a=@array2.slice!(0, TAKE) } end ================================================ FILE: bin/ruby-spark ================================================ #!/usr/bin/env ruby lib = File.expand_path(File.dirname(__FILE__) + '/../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark' Spark::CLI.new.run ================================================ FILE: example/pi.rb ================================================ #!/usr/bin/env ruby lib = File.expand_path(File.dirname(__FILE__) + '/../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark' Spark.logger.disable Spark.start slices = 3 n = 100000 * slices def map(_) x = rand * 2 - 1 y = rand * 2 - 1 if x**2 + y**2 < 1 return 1 else return 0 end end rdd = Spark.context.parallelize(1..n, slices) rdd = rdd.map(method(:map)) puts 'Pi is roughly %f' % (4.0 * rdd.sum / n) ================================================ FILE: example/website_search.rb ================================================ #!/usr/bin/env ruby # Parse sitemap and search word on every page require 'optparse' require 'open-uri' require 'nokogiri' require 'ruby-spark' options = { sitemap: 'http://fit.cvut.cz/sitemap.xml', query: 'cvut', workers: 2 } opt_parser = OptionParser.new do |opts| opts.banner = 'Usage: website_search.rb [options]' opts.separator '' opts.separator 'Specific options:' opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap| options[:sitemap] = sitemap end opts.on('-q', '--query QUERY', 'Query to search') do |query| options[:query] = query end opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers| options[:workers] = workers end opts.on('--quite', 'Run quitely') do |v| Spark.logger.disabled end 
opts.on_tail('-h', '--help', 'Show this message') do puts opts exit end end opt_parser.parse! @links = [] def parse_sitemap(doc) doc.xpath('//sitemapindex/sitemap/loc').each do |loc| next_doc = Nokogiri::HTML(open(loc.text)) parse_sitemap(next_doc) end doc.xpath('//url/loc').each do |loc| @links << loc.text end end doc = Nokogiri::HTML(open(options[:sitemap])) parse_sitemap(doc) # Map function func = Proc.new do |url| begin open(url) {|f| [url, f.read.scan(query).count] } rescue [url, 0] end end Spark.start rdd = Spark.sc.parallelize(@links, options[:workers]) .add_library('open-uri') .bind(query: options[:query]) .map(func) .sort_by(lambda{|(_, value)| value}, false) rdd.collect.each do |(url, count)| puts "#{url} => #{count}" end ================================================ FILE: ext/ruby_c/extconf.rb ================================================ require 'mkmf' create_makefile("ruby_spark_ext") ================================================ FILE: ext/ruby_c/murmur.c ================================================ #include "murmur.h" #if defined(_MSC_VER) #define BIG_CONSTANT(x) (x) #else #define BIG_CONSTANT(x) (x##LLU) #endif /*----------------------------------------------------------------------------- // MurmurHash2, 64-bit versions, by Austin Appleby // // The same caveats as 32-bit MurmurHash2 apply here - beware of alignment // and endian-ness issues if used across multiple platforms. // // 64-bit hash for 64-bit platforms */ uint64_t MurmurHash64A(const void * key, int len, uint64_t seed) { const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); const int r = 47; uint64_t h = seed ^ (len * m); const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len/8); while(data != end) { uint64_t k = *data++; k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; } const unsigned char * data2 = (const unsigned char*)data; switch(len & 7) { case 7: h ^= ((uint64_t) data2[6]) << 48; case 6: h ^= ((uint64_t) data2[5]) << 40; case 5: h ^= ((uint64_t) data2[4]) << 32; case 4: h ^= ((uint64_t) data2[3]) << 24; case 3: h ^= ((uint64_t) data2[2]) << 16; case 2: h ^= ((uint64_t) data2[1]) << 8; case 1: h ^= ((uint64_t) data2[0]); h *= m; }; h ^= h >> r; h *= m; h ^= h >> r; return h; } /* 64-bit hash for 32-bit platforms */ uint64_t MurmurHash64B(const void * key, int len, uint64_t seed) { const uint32_t m = 0x5bd1e995; const int r = 24; uint32_t h1 = ((uint32_t) seed) ^ len; uint32_t h2 = ((uint32_t) (seed >> 32)); const uint32_t * data = (const uint32_t *)key; while(len >= 8) { uint32_t k1 = *data++; k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; uint32_t k2 = *data++; k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; } if(len >= 4) { uint32_t k1 = *data++; k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; } switch(len) { case 3: h2 ^= ((unsigned char*)data)[2] << 16; case 2: h2 ^= ((unsigned char*)data)[1] << 8; case 1: h2 ^= ((unsigned char*)data)[0]; h2 *= m; }; h1 ^= h2 >> 18; h1 *= m; h2 ^= h1 >> 22; h2 *= m; h1 ^= h2 >> 17; h1 *= m; h2 ^= h1 >> 19; h2 *= m; uint64_t h = h1; h = (h << 32) | h2; return h; } // ================================================================================================ // Ruby methods #define PORTABLE_HASH_SEED 16154832 VALUE murmur2_digest(VALUE rb_str, uint64_t seed) { StringValue(rb_str); void * key = RSTRING_PTR(rb_str); long len = RSTRING_LEN(rb_str); uint64_t result = MurmurHash64A(key, len, seed); return LONG2FIX(result); } // 
------------------------------------------------------------------------------------------------ // Spark::Digest::Murmur2.digest VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass) { if(argc == 0 || argc > 2){ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); } uint64_t seed = (argc == 1 ? 0 : NUM2UINT(argv[1])); return murmur2_digest(argv[0], seed); } // ------------------------------------------------------------------------------------------------ // Spark::Digest.portable_hash VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass) { if(argc != 1){ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc); } return murmur2_digest(argv[0], PORTABLE_HASH_SEED); } ================================================ FILE: ext/ruby_c/murmur.h ================================================ #ifndef MURMUR_INCLUDED #define MURMUR_INCLUDED #include "ruby.h" VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass); VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass); #endif ================================================ FILE: ext/ruby_c/ruby-spark.c ================================================ #include "ruby.h" #include "murmur.h" VALUE SparkModule; VALUE SparkDigestModule; VALUE SparkDigestMurmur2Class; void Init_ruby_spark_ext() { SparkModule = rb_define_module("Spark"); SparkDigestModule = rb_define_module_under(SparkModule, "Digest"); SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, "Murmur2", rb_cObject); rb_define_singleton_method(SparkDigestModule, "portable_hash", method_portable_hash, -1); rb_define_singleton_method(SparkDigestMurmur2Class, "digest", method_murmur2_digest, -1); } ================================================ FILE: ext/ruby_java/Digest.java ================================================ import org.jruby.Ruby; import org.jruby.RubyModule; import org.jruby.RubyObject; import org.jruby.RubyClass; import org.jruby.RubyString; import org.jruby.RubyFixnum; import org.jruby.anno.JRubyModule; import org.jruby.anno.JRubyMethod; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; @JRubyModule(name="Spark::Digest") public class Digest extends RubyObject{ // Have to be the same as in C extension final static long PORTABLE_HASH_SEED = 16154832; public Digest(final Ruby ruby, RubyClass rubyClass) { super(ruby, rubyClass); } @JRubyMethod(module=true) public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) { Ruby ruby = self.getRuntime(); RubyString keyString = (RubyString)arg; long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED); RubyFixnum result = new RubyFixnum(ruby, hash); return result; } } ================================================ FILE: ext/ruby_java/Murmur2.java ================================================ import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyObject; import org.jruby.RubyString; import org.jruby.RubyFixnum; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; /** Murmur hash 2.0. * * The murmur hash is a relative fast hash function from * http://murmurhash.googlepages.com/ for platforms with efficient * multiplication. 
* * http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/ * */ @JRubyClass(name="Spark::Digest::Murmur2") public class Murmur2 extends RubyObject { public Murmur2(final Ruby ruby, RubyClass rubyClass) { super(ruby, rubyClass); } @JRubyMethod(required=1, optional=1, module=true) public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) { Ruby ruby = context.getRuntime(); RubyString keyString = (RubyString)args[0]; long seed; if(args.length > 1){ RubyFixnum rb_seed = (RubyFixnum)args[1]; seed = rb_seed.getLongValue(); } else{ seed = 0; } long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed); RubyFixnum result = new RubyFixnum(ruby, hash); return result; } /** Generates 64 bit hash from byte array of the given length and seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 64 bit hash of the given array */ public static long hash64(final byte[] data, int length, long seed) { final long m = 0xc6a4a7935bd1e995L; final int r = 47; long h = (seed&0xffffffffl)^(length*m); int length8 = length/8; for (int i=0; i>> r; k *= m; h ^= k; h *= m; } switch (length%8) { case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48; case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40; case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32; case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24; case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16; case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8; case 1: h ^= (long)(data[length&~7]&0xff); h *= m; }; h ^= h >>> r; h *= m; h ^= h >>> r; return h; } } ================================================ FILE: ext/ruby_java/RubySparkExtService.java ================================================ import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyModule; import org.jruby.runtime.ObjectAllocator; import org.jruby.runtime.builtin.IRubyObject; import org.jruby.runtime.load.BasicLibraryService; public class RubySparkExtService implements BasicLibraryService { public boolean basicLoad(final Ruby ruby) throws java.io.IOException { RubyModule sparkModule = ruby.defineModule("Spark"); RubyModule sparkDigestModule = sparkModule.defineModuleUnder("Digest"); RubyClass sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder("Murmur2", ruby.getObject(), sparkDigestMurmur2Allocator); sparkDigestModule.defineAnnotatedMethods(Digest.class); sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class); return true; } public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() { public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) { return new Murmur2(ruby, rubyClass); } }; } ================================================ FILE: ext/ruby_java/extconf.rb ================================================ require 'mkmf' create_makefile("ruby_spark_ext") ================================================ FILE: ext/spark/build.sbt ================================================ import AssemblyKeys._ assemblySettings // Default values val defaultScalaVersion = "2.10.4" val defaultSparkVersion = "1.6.0" val defaultSparkCoreVersion = "2.10" val defaultTargetDir = "target" val defaultHadoopVersion = "1.0.4" // Values val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion) val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion) val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion) val _sparkCoreVersion = 
scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion) val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir) // Project settings name := "ruby-spark" version := "1.0.0" scalaVersion := _scalaVersion javacOptions ++= Seq("-source", "1.7", "-target", "1.7") // Jar target folder artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar") outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar") // Protocol buffer support seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*) // Additional libraries libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % _sparkVersion excludeAll(ExclusionRule(organization = "org.apache.hadoop")), "org.apache.spark" %% "spark-graphx" % _sparkVersion, "org.apache.spark" %% "spark-mllib" % _sparkVersion, "org.apache.spark" %% "spark-sql" % _sparkVersion, "org.apache.hadoop" % "hadoop-client" % _hadoopVersion, "com.github.fommil.netlib" % "all" % "1.1.2", "org.scalatest" % "scalatest_2.10" % "2.2.1" % "test" ) // Repositories resolvers ++= Seq( "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", "Spray Repository" at "http://repo.spray.io/", "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", "Akka Repository" at "http://repo.akka.io/releases/", "Twitter4J Repository" at "http://twitter4j.org/maven2/", "Apache HBase" at "https://repository.apache.org/content/repositories/releases", "Twitter Maven Repo" at "http://maven.twttr.com/", "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", Resolver.sonatypeRepo("public") ) // Merge strategy mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => { case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard case m if m.startsWith("META-INF") => MergeStrategy.discard case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first case PathList("org", "apache", xs @ _*) => MergeStrategy.first case PathList("org", "jboss", xs @ _*) => MergeStrategy.first case "about.html" => MergeStrategy.rename case "reference.conf" => MergeStrategy.concat case _ => MergeStrategy.first } } ================================================ FILE: ext/spark/project/plugins.sbt ================================================ resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" resolvers += "Spray Repository" at "http://repo.spray.io/" addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3") ================================================ FILE: ext/spark/sbt/sbt ================================================ #!/bin/bash # This script launches sbt for this project. If present it uses the system # version of sbt. If there is no system version of sbt it attempts to download # sbt locally. 
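# Illustrative invocation, mirroring how lib/spark/build.rb drives this script: it runs from
# Spark.spark_ext_dir (this ext/spark directory) and passes the versions through environment
# variables such as SCALA_VERSION, SPARK_VERSION and HADOOP_VERSION.
#
#   sbt/sbt package assemblyPackageDependency clean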
SBT_VERSION=0.13.9 URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar JAR=sbt/sbt-launch-${SBT_VERSION}.jar # Download sbt launch jar if it hasn't been downloaded yet if [ ! -f ${JAR} ]; then # Download printf "Attempting to fetch sbt\n" JAR_DL=${JAR}.part if hash wget 2>/dev/null; then (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} elif hash curl 2>/dev/null; then (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR} else printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" exit -1 fi fi if [ ! -f ${JAR} ]; then # We failed to download printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" exit -1 fi printf "Launching sbt from ${JAR}\n" java \ -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ -jar ${JAR} \ "$@" ================================================ FILE: ext/spark/src/main/scala/Exec.scala ================================================ package org.apache.spark.api.ruby import java.io.{File, FileOutputStream, InputStreamReader, BufferedReader} import scala.collection.JavaConversions._ import org.apache.spark.{SparkEnv, Logging} import org.apache.spark.util._ /* ================================================================================================= * class FileCommand * ================================================================================================= * * Save command to file and than execute him because from Scala you cannot simply run * something like "bash --norc -i -c 'source .zshrc; ruby master.rb'" */ class FileCommand(command: String) extends Logging { var pb: ProcessBuilder = null var file: File = null // Command is complete. def this(command: String, env: SparkEnv) = { this(command) create(env) } // Template must contains %s which will be replaced for command def this(template: String, command: String, env: SparkEnv, envVars: Map[String, String]) = { this(template.format(command), env) setEnvVars(envVars) } private def create(env: SparkEnv) { val dir = new File(env.sparkFilesDir) val ext = if(Utils.isWindows) ".cmd" else ".sh" val shell = if(Utils.isWindows) "cmd" else "bash" file = File.createTempFile("command", ext, dir) val out = new FileOutputStream(file) out.write(command.getBytes) out.close logInfo(s"New FileCommand at ${file.getAbsolutePath}") pb = new ProcessBuilder(shell, file.getAbsolutePath) } def setEnvVars(vars: Map[String, String]) { pb.environment().putAll(vars) } def run = { new ExecutedFileCommand(pb.start) } } /* ================================================================================================= * class ExecutedFileCommand * ================================================================================================= * * Represent process executed from file. 
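 *
 * A FileCommand.run returns an instance of this class; a typical exchange (a sketch of the
 * pattern used in RubyWorker.createMaster, with illustrative argument values):
 *
 *   val proc = new FileCommand(commandTemplate, "ruby-spark home", env, envVars).run
 *   val gemHome = proc.readLine   // first line written to stdout by the spawned process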
*/ class ExecutedFileCommand(process: Process) { var reader: BufferedReader = null def readLine = { openInput reader.readLine.toString.trim } def openInput { if(reader != null){ return } val input = new InputStreamReader(process.getInputStream) reader = new BufferedReader(input) } // Delegation def destroy = process.destroy def getInputStream = process.getInputStream def getErrorStream = process.getErrorStream } ================================================ FILE: ext/spark/src/main/scala/MLLibAPI.scala ================================================ package org.apache.spark.mllib.api.python // PythonMLLibAPI is private for python class MLLibAPI extends PythonMLLibAPI {} ================================================ FILE: ext/spark/src/main/scala/Marshal.scala ================================================ package org.apache.spark.api.ruby.marshal import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ /* ================================================================================================= * object Marshal * ================================================================================================= */ object Marshal { def load(bytes: Array[Byte]) = { val is = new DataInputStream(new ByteArrayInputStream(bytes)) val majorVersion = is.readUnsignedByte // 4 val minorVersion = is.readUnsignedByte // 8 (new MarshalLoad(is)).load } def dump(data: Any) = { val aos = new ByteArrayOutputStream val os = new DataOutputStream(aos) os.writeByte(4) os.writeByte(8) (new MarshalDump(os)).dump(data) aos.toByteArray } } /* ================================================================================================= * class IterableMarshaller * ================================================================================================= */ class IterableMarshaller(iter: Iterator[Any]) extends Iterator[Array[Byte]] { private val buffer = new ArrayBuffer[Any] override def hasNext: Boolean = iter.hasNext override def next(): Array[Byte] = { while (iter.hasNext) { buffer += iter.next() } Marshal.dump(buffer) } } ================================================ FILE: ext/spark/src/main/scala/MarshalDump.scala ================================================ package org.apache.spark.api.ruby.marshal import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ import scala.reflect.{ClassTag, classTag} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector} /* ================================================================================================= * class MarshalDump * ================================================================================================= */ class MarshalDump(os: DataOutputStream) { val NAN_BYTELIST = "nan".getBytes val NEGATIVE_INFINITY_BYTELIST = "-inf".getBytes val INFINITY_BYTELIST = "inf".getBytes def dump(data: Any) { data match { case null => os.writeByte('0') case item: Boolean => val char = if(item) 'T' else 'F' os.writeByte(char) case item: Int => os.writeByte('i') dumpInt(item) case item: Array[_] => os.writeByte('[') dumpArray(item) case item: Double => os.writeByte('f') dumpFloat(item) case item: ArrayBuffer[Any] => dump(item.toArray) } } def dumpInt(data: Int) { if(data == 0){ os.writeByte(0) } else if (0 < data && data < 
123) { os.writeByte(data + 5) } else if (-124 < data && data < 0) { os.writeByte((data - 5) & 0xff) } else { val buffer = new Array[Byte](4) var value = data var i = 0 while(i != 4 && value != 0 && value != -1){ buffer(i) = (value & 0xff).toByte value = value >> 8 i += 1 } val lenght = i + 1 if(value < 0){ os.writeByte(-lenght) } else{ os.writeByte(lenght) } os.write(buffer, 0, lenght) } } def dumpArray(array: Array[_]) { dumpInt(array.size) for(item <- array) { dump(item) } } def dumpFloat(value: Double) { if(value.isPosInfinity){ dumpString(NEGATIVE_INFINITY_BYTELIST) } else if(value.isNegInfinity){ dumpString(INFINITY_BYTELIST) } else if(value.isNaN){ dumpString(NAN_BYTELIST) } else{ // dumpString("%.17g".format(value)) dumpString(value.toString) } } def dumpString(data: String) { dumpString(data.getBytes) } def dumpString(data: Array[Byte]) { dumpInt(data.size) os.write(data) } } ================================================ FILE: ext/spark/src/main/scala/MarshalLoad.scala ================================================ package org.apache.spark.api.ruby.marshal import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ import scala.reflect.{ClassTag, classTag} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector} /* ================================================================================================= * class MarshalLoad * ================================================================================================= */ class MarshalLoad(is: DataInputStream) { case class WaitForObject() val registeredSymbols = ArrayBuffer[String]() val registeredLinks = ArrayBuffer[Any]() def load: Any = { load(is.readUnsignedByte.toChar) } def load(dataType: Char): Any = { dataType match { case '0' => null case 'T' => true case 'F' => false case 'i' => loadInt case 'f' => loadAndRegisterFloat case ':' => loadAndRegisterSymbol case '[' => loadAndRegisterArray case 'U' => loadAndRegisterUserObject case _ => throw new IllegalArgumentException(s"Format is not supported: $dataType.") } } // ---------------------------------------------------------------------------------------------- // Load by type def loadInt: Int = { var c = is.readByte.toInt if (c == 0) { return 0 } else if (4 < c && c < 128) { return c - 5 } else if (-129 < c && c < -4) { return c + 5 } var result: Long = 0 if (c > 0) { result = 0 for( i <- 0 until c ) { result |= (is.readUnsignedByte << (8 * i)).toLong } } else { c = -c result = -1 for( i <- 0 until c ) { result &= ~((0xff << (8 * i)).toLong) result |= (is.readUnsignedByte << (8 * i)).toLong } } result.toInt } def loadAndRegisterFloat: Double = { val result = loadFloat registeredLinks += result result } def loadFloat: Double = { val string = loadString string match { case "nan" => Double.NaN case "inf" => Double.PositiveInfinity case "-inf" => Double.NegativeInfinity case _ => string.toDouble } } def loadString: String = { new String(loadStringBytes) } def loadStringBytes: Array[Byte] = { val size = loadInt val buffer = new Array[Byte](size) var readSize = 0 while(readSize < size){ val read = is.read(buffer, readSize, size-readSize) if(read == -1){ throw new IllegalArgumentException("Marshal too short.") } readSize += read } buffer } def loadAndRegisterSymbol: String = { val result = loadString registeredSymbols += result result } def loadAndRegisterArray: Array[Any] = { val 
size = loadInt val array = new Array[Any](size) registeredLinks += array for( i <- 0 until size ) { array(i) = loadNextObject } array } def loadAndRegisterUserObject: Any = { val klass = loadNextObject.asInstanceOf[String] // Register future class before load the next object registeredLinks += WaitForObject() val index = registeredLinks.size - 1 val data = loadNextObject val result = klass match { case "Spark::Mllib::LabeledPoint" => createLabeledPoint(data) case "Spark::Mllib::DenseVector" => createDenseVector(data) case "Spark::Mllib::SparseVector" => createSparseVector(data) case other => throw new IllegalArgumentException(s"Object $other is not supported.") } registeredLinks(index) = result result } // ---------------------------------------------------------------------------------------------- // Other loads def loadNextObject: Any = { val dataType = is.readUnsignedByte.toChar if(isLinkType(dataType)){ readLink(dataType) } else{ load(dataType) } } // ---------------------------------------------------------------------------------------------- // To java objects def createLabeledPoint(data: Any): LabeledPoint = { val array = data.asInstanceOf[Array[_]] new LabeledPoint(array(0).asInstanceOf[Double], array(1).asInstanceOf[Vector]) } def createDenseVector(data: Any): DenseVector = { new DenseVector(data.asInstanceOf[Array[_]].map(toDouble(_))) } def createSparseVector(data: Any): SparseVector = { val array = data.asInstanceOf[Array[_]] val size = array(0).asInstanceOf[Int] val indices = array(1).asInstanceOf[Array[_]].map(_.asInstanceOf[Int]) val values = array(2).asInstanceOf[Array[_]].map(toDouble(_)) new SparseVector(size, indices, values) } // ---------------------------------------------------------------------------------------------- // Helpers def toDouble(data: Any): Double = data match { case x: Int => x.toDouble case x: Double => x case _ => 0.0 } // ---------------------------------------------------------------------------------------------- // Cache def readLink(dataType: Char): Any = { val index = loadInt dataType match { case '@' => registeredLinks(index) case ';' => registeredSymbols(index) } } def isLinkType(dataType: Char): Boolean = { dataType == ';' || dataType == '@' } } ================================================ FILE: ext/spark/src/main/scala/RubyAccumulatorParam.scala ================================================ package org.apache.spark.api.ruby import java.io._ import java.net._ import java.util.{List, ArrayList} import scala.collection.JavaConversions._ import scala.collection.immutable._ import org.apache.spark._ import org.apache.spark.util.Utils /** * Internal class that acts as an `AccumulatorParam` for Ruby accumulators. Inside, it * collects a list of pickled strings that we pass to Ruby through a socket. 
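 *
 * The driver-side addInPlace below writes each batch of updates to that socket as:
 *   int   - number of updates
 * and then, per update:
 *   int   - byte length of the marshaled payload
 *   bytes - the payload itself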
*/ private class RubyAccumulatorParam(serverHost: String, serverPort: Int) extends AccumulatorParam[List[Array[Byte]]] { // Utils.checkHost(serverHost, "Expected hostname") val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536) // Socket shoudl not be serialized // Otherwise: SparkException: Task not serializable @transient var socket: Socket = null @transient var socketOutputStream: DataOutputStream = null @transient var socketInputStream: DataInputStream = null def openSocket(){ synchronized { if (socket == null || socket.isClosed) { socket = new Socket(serverHost, serverPort) socketInputStream = new DataInputStream(new BufferedInputStream(socket.getInputStream, bufferSize)) socketOutputStream = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize)) } } } override def zero(value: List[Array[Byte]]): List[Array[Byte]] = new ArrayList override def addInPlace(val1: List[Array[Byte]], val2: List[Array[Byte]]) : List[Array[Byte]] = synchronized { if (serverHost == null) { // This happens on the worker node, where we just want to remember all the updates val1.addAll(val2) val1 } else { // This happens on the master, where we pass the updates to Ruby through a socket openSocket() socketOutputStream.writeInt(val2.size) for (array <- val2) { socketOutputStream.writeInt(array.length) socketOutputStream.write(array) } socketOutputStream.flush() // Wait for acknowledgement // http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock // // if(in.readInt() != RubyConstant.ACCUMULATOR_ACK){ // throw new SparkException("Accumulator was not acknowledged") // } new ArrayList } } } ================================================ FILE: ext/spark/src/main/scala/RubyBroadcast.scala ================================================ package org.apache.spark.api.ruby import org.apache.spark.api.python.PythonBroadcast /** * An Wrapper for Ruby Broadcast, which is written into disk by Ruby. It also will * write the data into disk after deserialization, then Ruby can read it from disks. 
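 *
 * Instances are typically obtained from the Ruby driver through
 * RubyRDD.readBroadcastFromFile(jcontext, path, id), which wraps the file that
 * lib/spark/broadcast.rb has already written to disk with Marshal.dump.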
* * Class use Python logic - only for semantic */ class RubyBroadcast(@transient var _path: String, @transient var id: java.lang.Long) extends PythonBroadcast(_path) { } ================================================ FILE: ext/spark/src/main/scala/RubyConstant.scala ================================================ package org.apache.spark.api.ruby object RubyConstant { val DATA_EOF = -2 val WORKER_ERROR = -1 val WORKER_DONE = 0 val CREATE_WORKER = 1 val KILL_WORKER = 2 val KILL_WORKER_AND_WAIT = 3 val SUCCESSFULLY_KILLED = 4 val UNSUCCESSFUL_KILLING = 5 val ACCUMULATOR_ACK = 6 } ================================================ FILE: ext/spark/src/main/scala/RubyMLLibAPI.scala ================================================ package org.apache.spark.mllib.api.ruby import java.util.ArrayList import scala.collection.JavaConverters._ import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.api.python.MLLibAPI class RubyMLLibAPI extends MLLibAPI { // trainLinearRegressionModelWithSGD // trainLassoModelWithSGD // trainRidgeModelWithSGD // trainLogisticRegressionModelWithSGD // trainLogisticRegressionModelWithLBFGS // trainSVMModelWithSGD // trainKMeansModel // trainGaussianMixtureModel // Rjb have a problem with theta: Array[Array[Double]] override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = { val model = NaiveBayes.train(data.rdd, lambda) List( Vectors.dense(model.labels), Vectors.dense(model.pi), model.theta.toSeq ).map(_.asInstanceOf[Object]).asJava } // On python is wt just Object def predictSoftGMM( data: JavaRDD[Vector], wt: ArrayList[Object], mu: ArrayList[Object], si: ArrayList[Object]): RDD[Array[Double]] = { // val weight = wt.asInstanceOf[Array[Double]] val weight = wt.toArray.map(_.asInstanceOf[Double]) val mean = mu.toArray.map(_.asInstanceOf[DenseVector]) val sigma = si.toArray.map(_.asInstanceOf[DenseMatrix]) val gaussians = Array.tabulate(weight.length){ i => new MultivariateGaussian(mean(i), sigma(i)) } val model = new GaussianMixtureModel(weight, gaussians) model.predictSoft(data) } } ================================================ FILE: ext/spark/src/main/scala/RubyMLLibUtilAPI.scala ================================================ package org.apache.spark.mllib.api.ruby import java.util.ArrayList import org.apache.spark.mllib.util.LinearDataGenerator import org.apache.spark.mllib.regression.LabeledPoint object RubyMLLibUtilAPI { // Ruby does have a problem with creating Array[Double] def generateLinearInput( intercept: Double, weights: ArrayList[String], nPoints: Int, seed: Int, eps: Double = 0.1): Seq[LabeledPoint] = { LinearDataGenerator.generateLinearInput(intercept, weights.toArray.map(_.toString.toDouble), nPoints, seed, eps) } } ================================================ FILE: ext/spark/src/main/scala/RubyPage.scala ================================================ package org.apache.spark.ui.ruby // import javax.servlet.http.HttpServletRequest // import scala.xml.Node // import org.apache.spark.ui.{WebUIPage, UIUtils} // import org.apache.spark.util.Utils // private[ui] class RubyPage(parent: RubyTab, rbConfig: Array[Tuple2[String, String]]) extends WebUIPage("") { // def render(request: HttpServletRequest): 
Seq[Node] = { // val content = UIUtils.listingTable(header, row, rbConfig) // UIUtils.headerSparkPage("Ruby Config", content, parent) // } // private def header = Seq( // "Number" // ) // private def row(keyValue: (String, String)): Seq[Node] = { // // scalastyle:off // keyValue match { // case (key, value) => // // {key} // {value} // // } // // scalastyle:on // } // } class RubyPage {} ================================================ FILE: ext/spark/src/main/scala/RubyRDD.scala ================================================ package org.apache.spark.api.ruby import java.io._ import java.net._ import java.util.{List, ArrayList, Collections} import scala.util.Try import scala.reflect.ClassTag import scala.collection.JavaConversions._ import org.apache.spark._ import org.apache.spark.{SparkEnv, Partition, SparkException, TaskContext} import org.apache.spark.api.ruby._ import org.apache.spark.api.ruby.marshal._ import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils import org.apache.spark.InterruptibleIterator /* ================================================================================================= * Class RubyRDD * ================================================================================================= */ class RubyRDD( @transient parent: RDD[_], command: Array[Byte], broadcastVars: ArrayList[Broadcast[RubyBroadcast]], accumulator: Accumulator[List[Array[Byte]]]) extends RDD[Array[Byte]](parent){ val bufferSize = conf.getInt("spark.buffer.size", 65536) val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) override def getPartitions: Array[Partition] = firstParent.partitions override val partitioner = None /* ------------------------------------------------------------------------------------------ */ override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { val env = SparkEnv.get // Get worker and id val (worker, workerId) = RubyWorker.create(env) // Start a thread to feed the process input from our parent's iterator val writerThread = new WriterThread(env, worker, split, context) context.addTaskCompletionListener { context => writerThread.shutdownOnTaskCompletion() writerThread.join() // Cleanup the worker socket. This will also cause the worker to exit. try { RubyWorker.remove(worker, workerId) worker.close() } catch { case e: Exception => logWarning("Failed to close worker socket", e) } } val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) // Send data writerThread.start() // For violent termination of worker new MonitorThread(workerId, worker, context).start() // Return an iterator that read lines from the process's stdout val stdoutIterator = new StreamReader(stream, writerThread, context) // An iterator that wraps around an existing iterator to provide task killing functionality. new InterruptibleIterator(context, stdoutIterator) } // end compute /* ------------------------------------------------------------------------------------------ */ class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext) extends Thread("stdout writer for worker") { @volatile private var _exception: Exception = null setDaemon(true) // Contains the exception thrown while writing the parent iterator to the process. 
def exception: Option[Exception] = Option(_exception) // Terminates the writer thread, ignoring any exceptions that may occur due to cleanup. def shutdownOnTaskCompletion() { assert(context.isCompleted) this.interrupt() } // ------------------------------------------------------------------------------------------- // Send the necessary data for worker // - split index // - command // - iterator override def run(): Unit = Utils.logUncaughtExceptions { try { SparkEnv.set(env) val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) val dataOut = new DataOutputStream(stream) // Partition index dataOut.writeInt(split.index) // Spark files PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut) // Broadcast variables dataOut.writeInt(broadcastVars.length) for (broadcast <- broadcastVars) { dataOut.writeLong(broadcast.value.id) PythonRDD.writeUTF(broadcast.value.path, dataOut) } // Serialized command dataOut.writeInt(command.length) dataOut.write(command) // Send it dataOut.flush() // Data PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut) dataOut.writeInt(RubyConstant.DATA_EOF) dataOut.flush() } catch { case e: Exception if context.isCompleted || context.isInterrupted => logDebug("Exception thrown after task completion (likely due to cleanup)", e) case e: Exception => // We must avoid throwing exceptions here, because the thread uncaught exception handler // will kill the whole executor (see org.apache.spark.executor.Executor). _exception = e } finally { Try(worker.shutdownOutput()) // kill worker process } } } // end WriterThread /* ------------------------------------------------------------------------------------------ */ class StreamReader(stream: DataInputStream, writerThread: WriterThread, context: TaskContext) extends Iterator[Array[Byte]] { def hasNext = _nextObj != null var _nextObj = read() // ------------------------------------------------------------------------------------------- def next(): Array[Byte] = { val obj = _nextObj if (hasNext) { _nextObj = read() } obj } // ------------------------------------------------------------------------------------------- private def read(): Array[Byte] = { if (writerThread.exception.isDefined) { throw writerThread.exception.get } try { stream.readInt() match { case length if length > 0 => val obj = new Array[Byte](length) stream.readFully(obj) obj case RubyConstant.WORKER_DONE => val numAccumulatorUpdates = stream.readInt() (1 to numAccumulatorUpdates).foreach { _ => val updateLen = stream.readInt() val update = new Array[Byte](updateLen) stream.readFully(update) accumulator += Collections.singletonList(update) } null case RubyConstant.WORKER_ERROR => // Exception from worker // message val length = stream.readInt() val obj = new Array[Byte](length) stream.readFully(obj) // stackTrace val stackTraceLen = stream.readInt() val stackTrace = new Array[String](stackTraceLen) (0 until stackTraceLen).foreach { i => val length = stream.readInt() val obj = new Array[Byte](length) stream.readFully(obj) stackTrace(i) = new String(obj, "utf-8") } // Worker will be killed stream.close // exception val exception = new RubyException(new String(obj, "utf-8"), writerThread.exception.getOrElse(null)) exception.appendToStackTrace(stackTrace) throw exception } } catch { case e: Exception if context.isInterrupted => logDebug("Exception thrown after task interruption", e) throw new TaskKilledException case e: Exception if writerThread.exception.isDefined => logError("Worker exited unexpectedly (crashed)", e) throw 
writerThread.exception.get case eof: EOFException => throw new SparkException("Worker exited unexpectedly (crashed)", eof) } } } // end StreamReader /* --------------------------------------------------------------------------------------------- * Monitor thread for controll worker. Kill worker if task is interrupted. */ class MonitorThread(workerId: Long, worker: Socket, context: TaskContext) extends Thread("Worker Monitor for worker") { setDaemon(true) override def run() { // Kill the worker if it is interrupted, checking until task completion. while (!context.isInterrupted && !context.isCompleted) { Thread.sleep(2000) } if (!context.isCompleted) { try { logWarning("Incomplete task interrupted: Attempting to kill Worker "+workerId.toString()) RubyWorker.kill(workerId) } catch { case e: Exception => logError("Exception when trying to kill worker "+workerId.toString(), e) } } } } // end MonitorThread } // end RubyRDD /* ================================================================================================= * Class PairwiseRDD * ================================================================================================= * * Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Ruby. * This is used by PySpark's shuffle operations. * Borrowed from Python Package -> need new deserializeLongValue -> * Marshal will add the same 4b header */ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) { override def getPartitions = prev.partitions override def compute(split: Partition, context: TaskContext) = prev.iterator(split, context).grouped(2).map { case Seq(a, b) => (Utils.deserializeLongValue(a.reverse), b) case x => throw new SparkException("PairwiseRDD: unexpected value: " + x) } val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this) } /* ================================================================================================= * Object RubyRDD * ================================================================================================= */ object RubyRDD extends Logging { def runJob( sc: SparkContext, rdd: JavaRDD[Array[Byte]], partitions: ArrayList[Int], allowLocal: Boolean, filename: String): String = { type ByteArray = Array[Byte] type UnrolledPartition = Array[ByteArray] val allPartitions: Array[UnrolledPartition] = sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal) val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*) writeRDDToFile(flattenedPartition.iterator, filename) } def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = { val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename))) val objs = new collection.mutable.ArrayBuffer[Array[Byte]] try { while (true) { val length = file.readInt() val obj = new Array[Byte](length) file.readFully(obj) objs.append(obj) } } catch { case eof: EOFException => {} } JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism)) } def writeRDDToFile[T](items: Iterator[T], filename: String): String = { val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename))) try { PythonRDD.writeIteratorToStream(items, file) } finally { file.close() } filename } def writeRDDToFile[T](rdd: RDD[T], filename: String): String = { writeRDDToFile(rdd.collect.iterator, filename) } def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = { sc.broadcast(new RubyBroadcast(path, id)) 
} /** * Convert an RDD of serialized Ruby objects to RDD of objects, that is usable in Java. */ def toJava(rbRDD: JavaRDD[Array[Byte]], batched: Boolean): JavaRDD[Any] = { rbRDD.rdd.mapPartitions { iter => iter.flatMap { item => val obj = Marshal.load(item) if(batched){ obj.asInstanceOf[Array[_]] } else{ Seq(item) } } }.toJavaRDD() } /** * Convert an RDD of Java objects to an RDD of serialized Ruby objects, that is usable by Ruby. */ def toRuby(jRDD: JavaRDD[_]): JavaRDD[Array[Byte]] = { jRDD.rdd.mapPartitions { iter => new IterableMarshaller(iter) } } } /* ================================================================================================= * Class RubyException * ================================================================================================= */ class RubyException(msg: String, cause: Exception) extends RuntimeException(msg, cause) { def appendToStackTrace(toAdded: Array[String]) { val newStactTrace = getStackTrace.toBuffer var regexpMatch = "(.*):([0-9]+):in `([a-z]+)'".r for(item <- toAdded) { item match { case regexpMatch(fileName, lineNumber, methodName) => newStactTrace += new StackTraceElement("RubyWorker", methodName, fileName, lineNumber.toInt) case _ => null } } setStackTrace(newStactTrace.toArray) } } ================================================ FILE: ext/spark/src/main/scala/RubySerializer.scala ================================================ package org.apache.spark.api.ruby import scala.collection.JavaConverters._ import scala.reflect.{ClassTag, classTag} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.ruby.marshal._ /* ================================================================================================= * object RubySerializer * ================================================================================================= */ object RubySerializer { } ================================================ FILE: ext/spark/src/main/scala/RubyTab.scala ================================================ package org.apache.spark.ui.ruby import scala.collection.mutable.HashMap import org.apache.spark.ui._ // class RubyTab(parent: SparkUI, rbConfig: HashMap[String, String]) extends SparkUITab(parent, "ruby"){ // attachPage(new RubyPage(this, rbConfig.toArray)) // } class RubyTab {} ================================================ FILE: ext/spark/src/main/scala/RubyUtils.scala ================================================ package org.apache.spark.api.ruby import org.apache.spark.util._ import org.apache.spark.{SparkConf, Logging} object RubyUtils extends Logging { def loadPropertiesFile(conf: SparkConf, path: String): String = { Utils.getPropertiesFromFile(path).foreach { case (key, value) => conf.set(key, value) } path } } ================================================ FILE: ext/spark/src/main/scala/RubyWorker.scala ================================================ package org.apache.spark.api.ruby import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream} import java.net.{InetAddress, ServerSocket, Socket, SocketException} import java.nio.file.Paths import scala.collection.mutable import scala.collection.JavaConversions._ import org.apache.spark._ import org.apache.spark.api.python.PythonRDD import org.apache.spark.util.Utils import org.apache.spark.util.RedirectThread /* ================================================================================================= * Object RubyWorker * ================================================================================================= * 
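 *
 * Control protocol between this object and the Ruby master process (constants from
 * RubyConstant, as used by createWorker, kill and killAndWait below):
 *   CREATE_WORKER        - master spawns a worker, which connects back and sends its id
 *   KILL_WORKER          - kill the worker with the given id (fire and forget)
 *   KILL_WORKER_AND_WAIT - kill and wait for SUCCESSFULLY_KILLED or UNSUCCESSFUL_KILLING
 *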
* Create and store server for creating workers. */ object RubyWorker extends Logging { val PROCESS_WAIT_TIMEOUT = 10000 private var serverSocket: ServerSocket = null private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1)) private var serverPort: Int = 0 private var master: ExecutedFileCommand = null private var masterSocket: Socket = null private var masterOutputStream: DataOutputStream = null private var masterInputStream: DataInputStream = null private var workers = new mutable.WeakHashMap[Socket, Long]() /* ---------------------------------------------------------------------------------------------- * Create new worker but first check if exist SocketServer and master process. * If not it will create them. Worker have 2 chance to create. */ def create(env: SparkEnv): (Socket, Long) = { synchronized { // Create the server if it hasn't been started createServer(env) // Attempt to connect, restart and retry once if it fails try { createWorker } catch { case exc: SocketException => logWarning("Worker unexpectedly quit, attempting to restart") createWorker } } } /* ---------------------------------------------------------------------------------------------- * Create a worker throught master process. Return new socket and id. * According spark.ruby.worker.type id will be: * process: PID * thread: thread object id */ def createWorker: (Socket, Long) = { synchronized { masterOutputStream.writeInt(RubyConstant.CREATE_WORKER) var socket = serverSocket.accept() var id = new DataInputStream(socket.getInputStream).readLong() workers.put(socket, id) (socket, id) } } /* ---------------------------------------------------------------------------------------------- * Create SocketServer and bind it to the localhost. Max numbers of connection on queue * is set to default. If server is created withou exception -> create master. */ private def createServer(env: SparkEnv){ synchronized { // Already running? if(serverSocket != null && masterSocket != null) { return } try { // Start Socket Server for comunication serverSocket = new ServerSocket(0, 0, serverHost) serverPort = serverSocket.getLocalPort // Create a master for worker creations createMaster(env) } catch { case e: Exception => throw new SparkException("There was a problem with creating a server", e) } } } /* ---------------------------------------------------------------------------------------------- * In this point SocketServer must be created. Master process create and kill workers. * Creating workers from Java can be an expensive operation because new process can * get copy of address space. 
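 *
 * The master command assembled below looks roughly like (paths and option values are
 * illustrative):
 *   ruby <spark.ruby.executor.options> -C <gem root>/lib/spark/worker master.rb <worker type> <server port>
 * and is substituted into the spark.ruby.executor.command template before being run as a file.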
*/ private def createMaster(env: SparkEnv){ synchronized { val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER val executorOptions = env.conf.get("spark.ruby.executor.options", "") val commandTemplate = env.conf.get("spark.ruby.executor.command") val workerType = env.conf.get("spark.ruby.worker.type") // Where is root of ruby-spark var executorLocation = "" if(isDriver){ // Use worker from current active gem location executorLocation = env.conf.get("spark.ruby.driver_home") } else{ // Use gem installed on the system try { val homeCommand = (new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))).run executorLocation = homeCommand.readLine } catch { case e: Exception => throw new SparkException("Ruby-spark gem is not installed.", e) } } // Master and worker are saved in GEM_ROOT/lib/spark/worker executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString // Create master command // -C: change worker dir before execution val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort" val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env)) // Start master master = masterCommand.run // Redirect master stdout and stderr redirectStreamsToStderr(master.getInputStream, master.getErrorStream) // Wait for it to connect to our socket serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT) try { // Use socket for comunication. Keep stdout and stdin for log masterSocket = serverSocket.accept() masterOutputStream = new DataOutputStream(masterSocket.getOutputStream) masterInputStream = new DataInputStream(masterSocket.getInputStream) PythonRDD.writeUTF(executorOptions, masterOutputStream) } catch { case e: Exception => throw new SparkException("Ruby master did not connect back in time", e) } } } /* ---------------------------------------------------------------------------------------------- * Gel all environment variables for executor */ def getEnvVars(env: SparkEnv): Map[String, String] = { val prefix = "spark.ruby.executor.env." 
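    // For example, a (hypothetical) config entry
    //   spark.ruby.executor.env.GEM_HOME = /opt/gems
    // is passed to the master and worker processes as the env var GEM_HOME=/opt/gems.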
env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)} .map{case (k, v) => (k.substring(prefix.length), v)} .toMap } /* ------------------------------------------------------------------------------------------- */ def kill(workerId: Long){ masterOutputStream.writeInt(RubyConstant.KILL_WORKER) masterOutputStream.writeLong(workerId) } /* ------------------------------------------------------------------------------------------- */ def killAndWait(workerId: Long){ masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT) masterOutputStream.writeLong(workerId) // Wait for answer masterInputStream.readInt() match { case RubyConstant.SUCCESSFULLY_KILLED => logInfo(s"Worker $workerId was successfully killed") case RubyConstant.UNSUCCESSFUL_KILLING => logInfo(s"Worker $workerId cannot be killed (maybe is already killed)") } } /* ---------------------------------------------------------------------------------------------- * workers HashMap is week but it avoid long list of workers which cannot be killed (killAndWait) */ def remove(worker: Socket, workerId: Long){ try { workers.remove(worker) } catch { case e: Exception => logWarning(s"Worker $workerId does not exist (maybe is already removed)") } } /* ------------------------------------------------------------------------------------------- */ def stopServer{ synchronized { // Kill workers workers.foreach { case (socket, id) => killAndWait(id) } // Kill master master.destroy // Stop SocketServer serverSocket.close() // Clean variables serverSocket = null serverPort = 0 master = null masterSocket = null masterOutputStream = null masterInputStream = null } } /* ------------------------------------------------------------------------------------------- */ private def redirectStreamsToStderr(streams: InputStream*) { try { for(stream <- streams) { new RedirectThread(stream, System.err, "stream reader").start() } } catch { case e: Exception => logError("Exception in redirecting streams", e) } } /* ------------------------------------------------------------------------------------------- */ } ================================================ FILE: ext/spark/src/test/scala/MarshalSpec.scala ================================================ package org.apache.spark.api.ruby.marshal import org.scalatest._ import org.apache.spark.api.ruby.marshal._ class MarshalSpec extends FunSpec with Matchers { // ==================================================================================== // Load describe("Marshal.load"){ describe("single value"){ it("int"){ val data = 1 val serialized = Array[Byte](4, 8, 105, 6) Marshal.load(serialized) should equal(data) } it("double"){ val data = 1.2 val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50) Marshal.load(serialized) should equal(data) } } describe("array"){ it("ints"){ val data = Array(1, 2, 3, 4, 5) val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10) Marshal.load(serialized) should equal(data) } it("doubles"){ val data = Array(1.1, 2.2, 3.3) val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51) Marshal.load(serialized) should equal(data) } } } // ==================================================================================== // Dump describe("Marshal.dump"){ describe("single value"){ it("int"){ val data = 1 val serialized = Array(4, 8, 105, 6) Marshal.dump(data) should equal(serialized) } it("double"){ val data = 1.2 val serialized = Array(4, 8, 102, 8, 49, 46, 50) Marshal.dump(data) should equal(serialized) } } 
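    // Byte layout in the fixtures above and below: the leading 4, 8 is the Marshal format
    // version header; 105 ('i') marks an integer, 102 ('f') a float dumped as a string,
    // and 91 ('[') an array followed by its encoded length.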
describe("array"){ it("ints"){ val data = Array(1, 2, 3, 4, 5) val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10) Marshal.dump(data) should equal(serialized) } it("doubles"){ val data = Array(1.1, 2.2, 3.3) val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51) Marshal.dump(data) should equal(serialized) } } } } ================================================ FILE: lib/ruby-spark.rb ================================================ require_relative 'spark' ================================================ FILE: lib/spark/accumulator.rb ================================================ module Spark ## # A shared variable that can be accumulated, i.e., has a commutative and associative "add" # operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=` # operator, but only the driver program is allowed to access its value, using value. # Updates from the workers get propagated automatically to the driver program. # # == Arguments: # value:: # Initial value for accumulator. This values is stored only on driver process # # accum_param:: # How merge 2 value on worker or driver process. # Symbol or Proc (or String) # # zero_value:: # Initial value for worker process # # # == Examples: # # accum1 = $sc.accumulator(1) # accum2 = $sc.accumulator(2, :*, 1) # accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max}) # # accum1 += 1 # # accum2.add(2) # accum2.add(2) # accum2.add(2) # # accum3.add(9) # accum3.add(6) # accum3.add(7) # # accum1.value # => 2 # accum2.value # => 16 # accum3.value # => 9 # # func = Proc.new do |_, index| # accum1.add(1) # accum2.add(2) # accum3.add(index * 10) # end # # rdd = $sc.parallelize(0..4, 4) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3) # rdd = rdd.map_partitions_with_index(func) # rdd.collect # # accum1.value # => 6 # accum2.value # => 256 # accum3.value # => 30 # class Accumulator attr_reader :id, :value, :accum_param, :zero_value @@instances = {} @@changed = [] SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**] # ========================================================================= # Creating and selecting Spark::Accumulator def initialize(value, accum_param=:+, zero_value=0) @id = object_id @value = value @accum_param = accum_param @zero_value = zero_value @driver = true valid_accum_param @@instances[@id] = self end def inspect result = %{#<#{self.class.name}:0x#{object_id}\n} result << %{ ID: #{@id}\n} result << %{ Zero: #{@zero_value.to_s[0, 10]}\n} result << %{Value: #{@value.to_s[0, 10]}>} result end def self.changed @@changed end def self.instances @@instances end def valid_accum_param if @accum_param.is_a?(Symbol) raise Spark::AccumulatorError, "Unsupported symbol #{@accum_param}" unless SUPPORTED_SYMBOLS.include?(@accum_param) @serialized_accum_param = @accum_param return end if @accum_param.is_a?(Proc) begin @serialized_accum_param = @accum_param.to_source return rescue raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.' end end if @accum_param.is_a?(String) @serialized_accum_param = @accum_param @accum_param = eval(@accum_param) unless @accum_param.is_a?(Proc) raise Spark::SerializeError, 'Yours param is not a Proc.' end return end raise Spark::AccumulatorError, 'Unsupported param. Use Symbol, Proc or String.' end # Driver process or worker def driver? @driver end # ========================================================================= # Operations def add(term) if !driver? 
&& !@@changed.include?(self) @@changed << self end if @accum_param.is_a?(Proc) @value = @accum_param.call(@value, term) else add_by_symbol(term) end end def +(term) add(term) self end def add_by_symbol(term) case @accum_param when :+ @value += term when :- @value -= term when :* @value *= term when :/ @value /= term when :** @value **= term end end # ========================================================================= # Dump and load def marshal_dump [@id, @zero_value, @serialized_accum_param] end def marshal_load(array) @id, @zero_value, @serialized_accum_param = array @value = @zero_value @driver = false load_accum_param end def load_accum_param if @serialized_accum_param.is_a?(String) @accum_param = eval(@serialized_accum_param) else @accum_param = @serialized_accum_param end end end end # ============================================================================= # Server for handeling Accumulator update # module Spark class Accumulator class Server attr_reader :server, :host, :port def self.start @instance ||= Spark::Accumulator::Server.new end def self.stop @instance && @instance.stop end def self.host start @instance.host end def self.port start @instance.port end def initialize @server = TCPServer.new(0) @host = @server.hostname @port = @server.port @threads = [] handle_accept end def stop @threads.each(&:kill) rescue nil end def handle_accept @threads << Thread.new do loop { handle_connection(@server.accept) } end end def handle_connection(socket) @threads << Thread.new do until socket.closed? count = socket.read_int count.times do data = socket.read_data accum = Spark::Accumulator.instances[data[0]] if accum accum.add(data[1]) else Spark.logger.warn("Accumulator with id #{data[0]} does not exist.") end end # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # socket.write_int(Spark::Constant::ACCUMULATOR_ACK) end end end end end end ================================================ FILE: lib/spark/broadcast.rb ================================================ module Spark ## # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast # object for reading it in distributed functions. The variable will # be sent to each cluster only once. 
# # == Example: # # broadcast1 = $sc.broadcast('a') # broadcast2 = $sc.broadcast('b') # broadcast3 = $sc.broadcast([1,2,3]) # # func = Proc.new do |part, index| # [ # broadcast1.value * index, # broadcast2.value * index, # broadcast3.value.reduce(:+) # ] # end # # rdd = $sc.parallelize(0..5, 4) # rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3) # rdd = rdd.map_partitions_with_index(func) # rdd.collect # # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6] # class Broadcast LOADED = 0 # id, value, path NOT_LOADED = 1 # id, path WITHOUT_PATH = 2 # id attr_reader :id, :state, :path, :jbroadcast @@registered = {} # ========================================================================= # Creating broadcast for SparkContext # Create new Broadcast and dump value to the disk # # b = $sc.broadcast('a') # # b.value # => 'a' # b.path # b.jbroadcast # def initialize(sc, value) @id = object_id @value = value @state = LOADED file = Tempfile.create('broadcast', sc.temp_dir) file.binmode file.write(Marshal.dump(value)) file.close @path = file.path @jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id)) ObjectSpace.define_finalizer(self, proc { File.unlink(@path) }) end def inspect result = %{#<#{self.class.name}:0x#{object_id}\n} result << %{ ID: #{@id}\n} result << %{Value: #{@value.to_s[0, 10]}>} result end def self.register(id, path) @@registered[id] = path end def value case state when LOADED @value when NOT_LOADED @value = Marshal.load(File.read(@path)) @state = LOADED @value when WITHOUT_PATH @path = @@registered[id] if @path @state = NOT_LOADED value else raise Spark::BroadcastError, "Broadcast #{@id} do not have registered path." end end end def marshal_dump @id end def marshal_load(id) @id = id @state = WITHOUT_PATH end end end ================================================ FILE: lib/spark/build.rb ================================================ module Spark module Build DEFAULT_SCALA_VERSION = '2.10.4' DEFAULT_CORE_VERSION = '2.10' DEFAULT_SPARK_VERSION = '1.6.0' DEFAULT_HADOOP_VERSION = '1.0.4' SBT = 'sbt/sbt' SBT_DEPS = 'assemblyPackageDependency' SBT_EXT = 'package' SBT_CLEAN = 'clean' def self.build(options={}) scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION target = options[:target] || Spark.target_dir only_ext = options[:only_ext] || false env = { 'SCALA_VERSION' => scala_version, 'SPARK_VERSION' => spark_version, 'SPARK_CORE_VERSION' => spark_core_version, 'HADOOP_VERSION' => hadoop_version, 'TARGET_DIR' => target } cmd = [SBT] cmd << SBT_EXT cmd << SBT_DEPS unless only_ext cmd << SBT_CLEAN unless $DEBUG Dir.chdir(Spark.spark_ext_dir) do unless Kernel.system(env, cmd.join(' ')) raise Spark::BuildError, 'Spark cannot be assembled.' 
end end end end end ================================================ FILE: lib/spark/cli.rb ================================================ require 'commander' module Commander module UI # Disable paging # for 'classic' help def self.enable_paging end end end module Spark class CLI include Commander::Methods # IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history') # IRB_HISTORY_SIZE = 100 def run program :name, 'RubySpark' program :version, Spark::VERSION program :description, 'Ruby wrapper for Spark' global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true } default_command :help # Build --------------------------------------------------------------- command :build do |c| c.syntax = 'build [options]' c.description = 'Build spark and gem extensions' c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark' c.option '--spark-core-version STRING', String, 'Version of Spark core' c.option '--spark-version STRING', String, 'Version of Spark' c.option '--scala-version STRING', String, 'Version of Scala' c.option '--target STRING', String, 'Directory where Spark will be stored' c.option '--only-ext', 'Build only extension for RubySpark' c.action do |args, options| Spark::Build.build(options.__hash__) puts puts 'Everything is OK' end end alias_command :install, :build # Shell ----------------------------------------------------------------- command :shell do |c| c.syntax = 'shell [options]' c.description = 'Start ruby shell for spark' c.option '--target STRING', String, 'Directory where Spark is stored' c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties' c.option '--[no-]start', 'Start Spark immediately' c.option '--[no-]logger', 'Enable/disable logger (default: enable)' c.option '--auto-reload', 'Autoreload changed files' c.action do |args, options| options.default start: true, logger: true Spark.load_lib(options.target) Spark.logger.disable unless options.logger Spark.config do set_app_name 'RubySpark' end Spark.config.from_file(options.properties_file) if options.auto_reload require 'listen' listener = Listen.to(File.join(Spark.root, 'lib')) do |modified, added, removed| (modified+added).each do |file| silence_warnings { load(file) } end end listener.start end if options.start # Load Java and Spark Spark.start $sc = Spark.context Spark.print_logo('Spark context is loaded as $sc') else Spark.print_logo('You can start Spark with Spark.start') end # Load Pry require 'pry' Pry.start end end # # IRB ------------------------------------------------------------------- # command :irb do |c| # c.syntax = 'irb [options]' # c.description = 'Start ruby shell for spark' # c.option '--spark-home STRING', String, 'Directory where Spark is stored' # c.option '--[no-]start', 'Start Spark immediately' # c.option '--[no-]logger', 'Enable/disable logger (default: enable)' # # c.action do |args, options| # options.default start: true, logger: true # # Spark.load_lib(options.spark_home) # Spark::Logger.disable unless options.logger # # Spark.config do # set_app_name 'Pry RubySpark' # end # # if options.start # # Load Java and Spark # Spark.start # $sc = Spark.context # # Spark.print_logo('Spark context is loaded as $sc') # else # Spark.print_logo('You can start Spark with Spark.start') # end # # # Load IRB # require 'irb' # require 'irb/completion' # require 'irb/ext/save-history' # # begin # file = File.expand_path(IRB_HISTORY_FILE) # if File.exists?(file) # lines = IO.readlines(file).collect { |line| 
line.chomp } # Readline::HISTORY.push(*lines) # end # Kernel.at_exit do # lines = Readline::HISTORY.to_a.reverse.uniq.reverse # lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE # File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") } # end # rescue # end # # ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it # ARGV.concat ['--readline', '--prompt-mode', 'simple'] # IRB.start # end # end # Home ------------------------------------------------------------------ command :home do |c| c.action do |args, options| puts Spark.home exit(0) end end # Ruby spark jar -------------------------------------------------------- command :ruby_spark_jar do |c| c.action do |args, options| puts Spark.ruby_spark_jar exit(0) end end run! end end end ================================================ FILE: lib/spark/command/base.rb ================================================ ## # Spark::Command::Base # # Parent for all commands (Map, FlatMap, Sort, ...) # class Spark::Command::Base DEFAULT_VARIABLE_OPTIONS = { type: Hash, function: true } def initialize(*args) settings.variables.each do |name, options| instance_variable_set("@#{name}", args.shift) end end def to_s self.class.name.split('::').last end def self.error(message) raise Spark::CommandError, message end def error(message) self.class.error(message) end def log(message=nil) $stdout.puts %{==> #{Time.now.strftime("%H:%M:%S")} [#{self.class.name}] #{message}} $stdout.flush end # =============================================================================================== # Methods called during class loading # This is not nicer way but these methods set/get classes variables for child # Settings for command (variables) def self.settings init_settings class_variable_get(:@@settings) end def settings self.class.settings end # Init empty settings def self.init_settings if !class_variable_defined?(:@@settings) struct = Struct.new(:variables) class_variable_set(:@@settings, struct.new) settings.variables = {} end end # New variable for command # # == Example: # # class Map < Spark::Command::Base # variable :map_function # end # # command = Map.new(1) # # command.instance_variables # # => [:@map_function] # command.instance_variable_get(:@map_function) # # => 1 # def self.variable(name, options={}) if settings.variables.has_key?(name) error "Function #{name} already exist." end settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options) end # =============================================================================================== # Executing methods # Execute command for data and split index def execute(iterator, split_index) # Implemented on Base but can be override before_run # Run has to be implemented on child if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run) return lazy_run(iterator, split_index) end iterator = iterator.to_a run(iterator, split_index) end def prepared? !!@prepared end # This is called before execution. Executing will be stopped if # some command contains error (e.g. badly serialized lambda). # # == What is doing? # * evaluate lambda # * evaluate method # * make new lambda # def prepare return if prepared? 
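    # Each serialized function arrives as a Hash describing how to rebuild it; illustrative
    # shapes matching the case branches below:
    #   { type: 'proc',   content: "lambda{|x| x+1}" }
    #   { type: 'symbol', content: :+ }
    #   { type: 'method', name: 'add', content: "def add(x); x+1; end" }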
to_function = settings.variables.select {|_, options| options[:function]} to_function.each do |name, options| name = "@#{name}" data = instance_variable_get(name) case data[:type] when 'proc' result = eval(data[:content]) when 'symbol' result = lambda(&data[:content]) when 'method' # Method must me added to instance not Class instance_eval(data[:content]) # Method will be available as Proc result = lambda(&method(data[:name])) end instance_variable_set(name, result) end @prepared = true end # This method is called before every execution. def before_run end # =============================================================================================== # Bound objects attr_accessor :__objects__ def method_missing(method, *args, &block) if __objects__ && __objects__.has_key?(method) return __objects__[method] end super end end ================================================ FILE: lib/spark/command/basic.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # Map class Spark::Command::Map < _Base variable :map_function def run(iterator, *) iterator.map! do |item| @map_function.call(item) end iterator end def lazy_run(iterator, *) iterator.map do |item| @map_function.call(item) end end end # ------------------------------------------------------------------------------------------------- # FlatMap class Spark::Command::FlatMap < Spark::Command::Map def run(iterator, *) iterator = super iterator.flatten!(1) iterator end def lazy_run(iterator, *) iterator.flat_map do |item| @map_function.call(item) end end end # ------------------------------------------------------------------------------------------------- # MapPartitionsWithIndex class Spark::Command::MapPartitionsWithIndex < _Base variable :partition_function def run(iterator, index) iterator = @partition_function.call(iterator, index) iterator end # User should controll if there is Enumerator or not # alias_method :lazy_run, :run end # ------------------------------------------------------------------------------------------------- # MapPartitions class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex def run(iterator, *) # Do not use `super` because `@partition_function` can be method with 1 argument iterator = @partition_function.call(iterator) iterator end # alias_method :lazy_run, :run end # ------------------------------------------------------------------------------------------------- # Filter class Spark::Command::Filter < _Base variable :filter_function def run(iterator, *) iterator.select! do |item| @filter_function.call(item) end iterator end def lazy_run(iterator, *) iterator.select do |item| @filter_function.call(item) end end end # ------------------------------------------------------------------------------------------------- # Compact class Spark::Command::Compact < _Base def run(iterator, *) iterator.compact! iterator end def lazy_run(iterator, *) iterator.select do |item| !item.nil? 
end end end # ------------------------------------------------------------------------------------------------- # Glom class Spark::Command::Glom < _Base def run(iterator, *) [iterator] end def lazy_run(iterator, *) run(iterator.to_a) end end # ------------------------------------------------------------------------------------------------- # Shuffle class Spark::Command::Shuffle < _Base variable :seed, function: false, type: Integer def run(iterator, *) iterator.shuffle!(random: rng) iterator end def rng Random.new(@seed) end end # ------------------------------------------------------------------------------------------------- # PartitionBy class Spark::Command::PartitionBy class Base < Spark::Command::Base include Spark::Helper::Serialize def prepare super # Default. Keep it after super because Sorting has own key_function. @key_function ||= lambda{|x| x[0]} end def run(iterator, *) iterator.map! do |item| make_partition_item(item) end iterator.flatten!(1) iterator end def lazy_run(iterator, *) iterator.flat_map do |item| make_partition_item(item) end end private def make_partition_item(item) [ pack_long(@partition_func.call(@key_function[item])), item ] end end class Basic < Base variable :partition_func end class Sorting < Base variable :key_function variable :bounds, function: false, type: Array variable :ascending, function: false, type: [TrueClass, FalseClass] variable :num_partitions, function: false, type: Numeric def prepare super # Index by bisect alghoritm @partition_func ||= Proc.new do |key| count = 0 @bounds.each{|i| break if i >= key count += 1 } if @ascending count else @num_partitions - 1 - count end end end end # Sorting end # PartitionBy # ------------------------------------------------------------------------------------------------- # Aggregate class Spark::Command::Aggregate < _Base variable :reduce_func variable :zero_value, function: false, type: Object def run(iterator, *) [iterator.reduce(@zero_value, &@reduce_func)] end def lazy_run(iterator, *) run(iterator) end end # ------------------------------------------------------------------------------------------------- # Reduce class Spark::Command::Reduce < Spark::Command::Aggregate def run(iterator, *) [iterator.reduce(&@reduce_func)] end end # ------------------------------------------------------------------------------------------------- # Foreach class Spark::Command::Foreach < _Base variable :each_function def run(iterator, *) iterator.each do |item| @each_function.call(item) end nil end end # ------------------------------------------------------------------------------------------------- # ForeachPartition class Spark::Command::ForeachPartition < _Base variable :partition_function def run(iterator, *) @partition_function.call(iterator) nil end end # ------------------------------------------------------------------------------------------------- # KeyBy class Spark::Command::KeyBy < _Base variable :key_function def run(iterator, *) iterator.map! 
do |item| [@key_function.call(item), item] end iterator end def lazy_run(iterator, *) iterator.map do |item| [@key_function.call(item), item] end end end # ------------------------------------------------------------------------------------------------- # Take class Spark::Command::Take < _Base variable :total, function: false, type: Numeric variable :last_part, function: false, type: Numeric def run(iterator, index) if index == @last_part && iterator.size > @total return iterator.slice!(0, @total) end iterator end end # ------------------------------------------------------------------------------------------------- # Pipe class Spark::Command::Pipe < _Base variable :cmds, function: false, type: Array def before_run require 'open3' @in, @out, @threads = Open3.pipeline_rw(*@cmds) end def run(iterator, *) create_writing_thread(iterator) new_iterator = [] # Read full input begin loop { new_iterator << @out.readline.rstrip } rescue EOFError end new_iterator end def lazy_run(iterator, *) create_writing_thread(iterator) Enumerator::Lazy.new([nil]) do |yielder, _| begin loop { yielder << @out.readline.rstrip } rescue EOFError end end end private def create_writing_thread(iterator) @writing_thread = Thread.new do # Send complete iterator to the pipe iterator.each do |item| @in.puts(item.to_s.rstrip) end # Input must be closed for EOFError @in.close end end end ================================================ FILE: lib/spark/command/pair.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # CombineByKey class Spark::Command::CombineByKey # --------------- class Base < Spark::Command::Base def run(iterator, *) _run(iterator).to_a end def lazy_run(iterator, *) _run(iterator).lazy end end # --------------- class Combine < Base variable :create_combiner variable :merge_value def _run(iterator) # Not use combiners[key] ||= .. # it tests nil and not has_key? combiners = {} iterator.each do |key, value| if combiners.has_key?(key) combiners[key] = @merge_value.call(combiners[key], value) else combiners[key] = @create_combiner.call(value) end end combiners end end # --------------- class Merge < Base variable :merge_combiners def _run(iterator, *) combiners = {} iterator.each do |key, value| if combiners.has_key?(key) combiners[key] = @merge_combiners.call(combiners[key], value) else combiners[key] = value end end combiners end end # --------------- class CombineWithZero < Base variable :zero_value, function: false, type: Object variable :merge_value def _run(iterator) # Not use combiners[key] ||= .. # it tests nil and not has_key? combiners = {} iterator.each do |key, value| unless combiners.has_key?(key) combiners[key] = @zero_value end combiners[key] = @merge_value.call(combiners[key], value) end combiners end end # --------------- end # ------------------------------------------------------------------------------------------------- # MapValues class Spark::Command::MapValues < _Base variable :map_function def run(iterator, *) iterator.map! do |item| item[1] = @map_function.call(item[1]) item end iterator end def lazy_run(iterator, *) iterator.map do |item| item[1] = @map_function.call(item[1]) item end end end # ------------------------------------------------------------------------------------------------- # FlatMapValues class Spark::Command::FlatMapValues < _Base variable :map_function def run(iterator, *) iterator.map! 
do |(key, values)| values = @map_function.call(values) values.flatten!(1) values.map! do |value| [key, value] end end iterator.flatten!(1) iterator end end ================================================ FILE: lib/spark/command/sort.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # Sort class Spark::Command::SortByKey < _Base variable :key_function variable :ascending, function: false, type: [TrueClass, FalseClass] variable :spilling, function: false, type: [TrueClass, FalseClass] variable :memory, function: false, type: [Numeric, NilClass] variable :serializer, function: false, type: Spark::Serializer::Base # Currently disabled def before_run @spilling = false end def run(iterator, _) if @spilling iterator = run_with_spilling(iterator.each) else run_without_spilling(iterator) end iterator end def run_with_enum(iterator, _) if @spilling iterator = run_with_spilling(iterator) else iterator = iterator.to_a run_without_spilling(iterator) end iterator end private def run_with_spilling(iterator) sorter = Spark::ExternalSorter.new(@memory, @serializer) sorter.sort_by(iterator, @ascending, @key_function) end def run_without_spilling(iterator) iterator.sort_by!(&@key_function) iterator.reverse! unless @ascending end end ================================================ FILE: lib/spark/command/statistic.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # Sample class Spark::Command::Sample < _Base variable :with_replacement, function: false, type: [TrueClass, FalseClass] variable :fraction, function: false, type: Numeric variable :seed, function: false, type: [NilClass, Numeric] def run(iterator, _) sampler.sample(iterator) end def lazy_run(iterator, _) sampler.lazy_sample(iterator) end def sampler @sampler ||= _sampler end def _sampler if @with_replacement sampler = Spark::Sampler::Poisson else sampler = Spark::Sampler::Uniform end sampler = sampler.new(@fraction, @seed) end end # ------------------------------------------------------------------------------------------------- # Stats class Spark::Command::Stats < _Base def run(iterator, *) [Spark::StatCounter.new(iterator)] end def lazy_run(iterator, *) run(iterator) end end # ------------------------------------------------------------------------------------------------- # Histogram class Spark::Command::Histogram < _Base include Spark::Helper::Statistic variable :even, function: false, type: [TrueClass, FalseClass] variable :buckets, function: false, type: Array def run(iterator, *) counters = Array.new(counter_size) { 0 } iterator.each do |item| if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min next end x = bucket_function.call(item) if x.nil? # next else counters[x] += 1 end end [counters] end def lazy_run(iterator, *) run(iterator) end private def min @buckets.first end def max @buckets.last end def counter_size @buckets.size-1 end def increment @buckets[1]-@buckets[0] end # Decide which bucket function to pass. We decide here rather than having # a general function so that the decission need only be made once. def bucket_function @bucket_function ||= _bucket_function end def _bucket_function if @even fast_bucket_function else basic_bucket_function end end # Determine the bucket function in constant time. 
# Requires that buckets are evenly spaced def fast_bucket_function Proc.new do |item| if item.is_a?(Float) && item.nan? nil else bucket_number = (item - min)/increment if bucket_number > counter_size || bucket_number < 0 nil else [bucket_number.to_i, counter_size-1].min end end end end # Basic bucket function. Same as right bisect. def basic_bucket_function Proc.new do |item| bucket_number = bisect_right(@buckets, item) - 1 # Counters is @buckets.size - 1 # [bucket_number, counter_size-1].min if bucket_number > counter_size-1 counter_size-1 else bucket_number end end end end ================================================ FILE: lib/spark/command.rb ================================================ module Spark ## # Container which includes all commands and other things for worker # Every RDD have own copy of Command # class Command attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects def initialize @serializer = nil @deserializer = nil @commands = [] @libraries = [] @bound_objects = {} end def execute(iterator, split_index) # Require necessary libraries libraries.each{|lib| require lib} # Prepare bound objects @commands.each do |command| command.__objects__ = bound_objects end # Prepare for running @commands.each(&:prepare) # Run all task @commands.each do |command| iterator = command.execute(iterator, split_index) end # Return changed iterator. This is not be necessary for some tasks # because of using inplace changing but some task can return # only one value (for example reduce). iterator end def last @commands.last end def bound_objects # Objects from users # Already initialized objects on worker return @bound_objects if @bound_objects if @serialized_bound_objects # Still serialized @bound_objects = Marshal.load(@serialized_bound_objects) else # Something else @bound_objects = {} end end # Bound objects can depend on library which is loaded during @execute # In that case worker raise "undefined class/module" def marshal_dump [@serializer, @deserializer, @commands, @libraries, serialized_bound_objects] end def marshal_load(array) @serializer = array.shift @deserializer = array.shift @commands = array.shift @libraries = array.shift @serialized_bound_objects = array.shift end private def serialized_bound_objects @serialized_bound_objects ||= Marshal.dump(@bound_objects) end end end require 'spark/command/base' require 'spark/command/basic' require 'spark/command/pair' require 'spark/command/statistic' require 'spark/command/sort' ================================================ FILE: lib/spark/command_builder.rb ================================================ require 'spark/command_validator' module Spark ## # Builder for building correct {Spark::Command} # class CommandBuilder extend Forwardable include Spark::Helper::Serialize include Spark::Helper::System include Spark::CommandValidator attr_reader :command def_delegators :@command, :serializer, :serializer=, :deserializer, :deserializer=, :commands, :commands=, :libraries, :libraries=, :bound_objects, :bound_objects= def initialize(serializer, deserializer=nil) create_command self.serializer = serializer self.deserializer = deserializer || serializer.dup end def create_command @command = Spark::Command.new end # Do not user Marshal.dump(Marshal.load(self)) because some variables # have marshal_dump prepared for worker. 
def deep_copy copy = self.dup copy.create_command copy.serializer = self.serializer.deep_copy copy.deserializer = self.deserializer.deep_copy copy.commands = self.commands.dup copy.libraries = self.libraries.dup copy.bound_objects = self.bound_objects.dup copy end # Serialize Command class for worker # Java use signed number def build unpack_chars(Marshal.dump(@command)) end def add_command(klass, *args) variables = klass.settings.variables validate_size(variables, args) built_args = [] variables.values.zip(args) do |var, arg| if var[:function] arg = serialize_function(arg) end validate(arg, var) built_args << arg end comm = klass.new(*built_args) @command.commands << comm self end def add_library(*libraries) @command.libraries += libraries end def bind(objects) objects.symbolize_keys! @command.bound_objects.merge!(objects) end private # Serialized can be Proc and Method # # === Func # * *string:* already serialized proc # * *proc:* proc # * *symbol:* name of method # * *method:* Method class # def serialize_function(func) case func when String serialize_function_from_string(func) when Symbol serialize_function_from_symbol(func) when Proc serialize_function_from_proc(func) when Method serialize_function_from_method(func) else raise Spark::CommandError, 'You must enter String, Symbol, Proc or Method.' end end def serialize_function_from_string(string) {type: 'proc', content: string} end def serialize_function_from_symbol(symbol) {type: 'symbol', content: symbol} end # Serialize Proc as String # # lambda{|x| x*x}.to_source # # => "proc { |x| (x * x) }" # def serialize_function_from_proc(proc) serialize_function_from_string(proc.to_source) rescue raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.' end # Serialize method as string # # def test(x) # x*x # end # serialize_function_from_method(method(:test)) # # # => "def test(x)\n x*x\nend\n" # def serialize_function_from_method(meth) if pry? meth = Pry::Method.new(meth) end {type: 'method', name: meth.name, content: meth.source} rescue raise Spark::SerializeError, 'Method can not be serialized. Use full path or Proc.' end end end ================================================ FILE: lib/spark/command_validator.rb ================================================ module Spark module CommandValidator def validate(value, options) validate_type(value, options[:type]) end def valid?(value, options) begin validate(value, options) return true rescue return false end end def validate_type(value, types) types = [types] if !types.is_a?(Array) types.each do |type| return if value.is_a?(type) end error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}." end def validate_size(array1, array2) if array1.size != array2.size error "Wrong number of arguments (#{array1.size} for #{array2.size})" end end end end ================================================ FILE: lib/spark/config.rb ================================================ # Necessary libraries Spark.load_lib module Spark # Common configuration for RubySpark and Spark class Config include Spark::Helper::System TYPES = { 'spark.shuffle.spill' => :boolean, 'spark.ruby.serializer.compress' => :boolean } # Initialize java SparkConf and load default configuration. 
def initialize @spark_conf = SparkConf.new(true) set_default from_file(Spark::DEFAULT_CONFIG_FILE) end def from_file(file) check_read_only if file && File.exist?(file) file = File.expand_path(file) RubyUtils.loadPropertiesFile(spark_conf, file) end end def [](key) get(key) end def []=(key, value) set(key, value) end def spark_conf if Spark.started? # Get latest configuration Spark.context.jcontext.conf else @spark_conf end end def valid! errors = [] if !contains?('spark.app.name') errors << 'An application name must be set in your configuration.' end if !contains?('spark.master') errors << 'A master URL must be set in your configuration.' end if Spark::Serializer.find(get('spark.ruby.serializer')).nil? errors << 'Unknow serializer.' end scanned = get('spark.ruby.executor.command').scan('%s') if scanned.size == 0 errors << "Executor command must contain '%s'." end if scanned.size > 1 errors << "Executor command can contain only one '%s'." end if errors.any? errors.map!{|error| "- #{error}"} raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}" end end def read_only? Spark.started? end # Rescue from NoSuchElementException def get(key) value = spark_conf.get(key.to_s) case TYPES[key] when :boolean parse_boolean(value) when :integer parse_integer(value) else value end rescue nil end def get_all Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}] end def contains?(key) spark_conf.contains(key.to_s) end def set(key, value) check_read_only spark_conf.set(key.to_s, value.to_s) end def set_app_name(name) set('spark.app.name', name) end def set_master(master) set('spark.master', master) end def parse_boolean(value) case value when 'true' true when 'false' false end end def parse_integer(value) value.to_i end # ============================================================================= # Defaults def set_default set_app_name('RubySpark') set_master('local[*]') set('spark.ruby.driver_home', Spark.home) set('spark.ruby.serializer', default_serializer) set('spark.ruby.serializer.compress', default_serializer_compress) set('spark.ruby.serializer.batch_size', default_serializer_batch_size) set('spark.ruby.executor.command', default_executor_command) set('spark.ruby.executor.options', default_executor_options) set('spark.ruby.worker.type', default_worker_type) load_executor_envs # set('spark.ruby.executor.install', default_executor_install) end def default_serializer ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME end def default_serializer_compress ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS end def default_serializer_batch_size ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE end # Command template which is applied when scala want create a ruby # process (e.g. master, home request). Command is represented by '%s'. # # == Example: # bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s" # def default_executor_command ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s' end # Options for every worker. # # == Example: # -J-Xmx512m # def default_executor_options ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || '' end # # Install command which is triggered before on start. # # This command using executor command template. # # # # == Example: # # gem install ruby-spark -v 1.2.0 # # # def default_executor_install # ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || '' # end # Type of worker. 
# # == Options: # process:: (default) # thread:: (experimental) # def default_worker_type ENV['SPARK_RUBY_WORKER_TYPE'] || 'process' end # Load environment variables for executor from ENV. # # == Examples: # SPARK_RUBY_EXECUTOR_ENV_KEY1="1" # SPARK_RUBY_EXECUTOR_ENV_KEY2="2" # def load_executor_envs prefix = 'SPARK_RUBY_EXECUTOR_ENV_' envs = ENV.select{|key, _| key.start_with?(prefix)} envs.each do |key, value| key = key.dup # ENV keys are frozen key.slice!(0, prefix.size) set("spark.ruby.executor.env.#{key}", value) end end # Aliases alias_method :getAll, :get_all alias_method :setAppName, :set_app_name alias_method :setMaster, :set_master private def check_read_only if read_only? raise Spark::ConfigurationError, 'Configuration is ready only' end end end end ================================================ FILE: lib/spark/constant.rb ================================================ module Spark # Commond constant for Ruby and Spark module Constant DATA_EOF = -2 WORKER_ERROR = -1 WORKER_DONE = 0 CREATE_WORKER = 1 KILL_WORKER = 2 KILL_WORKER_AND_WAIT = 3 SUCCESSFULLY_KILLED = 4 UNSUCCESSFUL_KILLING = 5 ACCUMULATOR_ACK = 6 end end ================================================ FILE: lib/spark/context.rb ================================================ # Necessary libraries Spark.load_lib module Spark ## # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster. # class Context include Spark::Helper::System include Spark::Helper::Parser include Spark::Helper::Logger attr_reader :jcontext, :jaccumulator, :temp_dir # Constructor for Ruby context. Configuration is automatically is taken # from Spark. Config will be automatically set to default if user start # context first. # def initialize Spark.config.valid! @jcontext = JavaSparkContext.new(Spark.config.spark_conf) @jcontext.addJar(Spark.ruby_spark_jar) # Does not work on 1.2 # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG))) spark_local_dir = JUtils.getLocalDir(sc.conf) @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath accum_server = Spark::Accumulator::Server accum_server.start @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port)) log_info("Ruby accumulator server is running on port #{accum_server.port}") set_call_site('Ruby') # description of stage end def inspect result = %{#<#{self.class.name}:0x#{object_id}\n} result << %{Tempdir: "#{temp_dir}">} result end def stop Spark::Accumulator::Server.stop log_info('Ruby accumulator server was stopped') @jcontext.stop end def sc @jcontext.sc end def ui sc.ui end # Default level of parallelism to use when not given by user (e.g. 
parallelize and makeRDD) # def default_parallelism sc.defaultParallelism end # Default serializer # # Batch -> Compress -> Basic # def default_serializer # Basic serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new # Compress if config('spark.ruby.serializer.compress') serializer = Spark::Serializer.compressed(serializer) end # Bactching batch_size = default_batch_size if batch_size == 'auto' serializer = Spark::Serializer.auto_batched(serializer) else serializer = Spark::Serializer.batched(serializer, batch_size) end # Finally, "container" contains serializers serializer end def default_batch_size size = config('spark.ruby.serializer.batch_size').to_i if size >= 1 size else 'auto' end end # Set a local property that affects jobs submitted from this thread, such as the # Spark fair scheduler pool. # def set_local_property(key, value) jcontext.setLocalProperty(key, value) end # Get a local property set in this thread, or null if it is missing # def get_local_property(key) jcontext.getLocalProperty(key) end # Support function for API backtraces. # def set_call_site(site) jcontext.setCallSite(site) end def clear_call_site jcontext.clearCallSite end # Return a copy of this SparkContext's configuration. The configuration *cannot* # be changed at runtime. # def config(key=nil) if key Spark.config.get(key) else Spark.config end end # Add a file to be downloaded with this Spark job on every node. # The path of file passed can be either a local file, a file in HDFS # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI. # # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the # filename to find its download location. # # == Example: # `echo 10 > test.txt` # # $sc.add_file('test.txt') # $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect # # => [0, 10, 20, 30, 40, 50] # def add_file(*files) files.each do |file| sc.addFile(file) end end # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast # object for reading it in distributed functions. The variable will # be sent to each cluster only once. # # == Example: # broadcast1 = $sc.broadcast('a') # broadcast2 = $sc.broadcast('b') # # rdd = $sc.parallelize(0..5, 4) # rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2) # rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] }) # rdd.collect # # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"] # def broadcast(value) Spark::Broadcast.new(self, value) end # Create an Accumulator with the given initial value, using a given # accum_param helper object to define how to add values of the # data type if provided. 
# # == Example: # accum = $sc.accumulator(7) # # rdd = $sc.parallelize(0..5, 4) # rdd = rdd.bind(accum: accum) # rdd = rdd.map_partitions(lambda{|_| accum.add(1) }) # rdd = rdd.collect # # accum.value # # => 11 # def accumulator(value, accum_param=:+, zero_value=0) Spark::Accumulator.new(value, accum_param, zero_value) end # Distribute a local Ruby collection to form an RDD # Direct method can be slow so be careful, this method update data inplace # # == Parameters: # data:: Range or Array # num_slices:: number of slice # serializer:: custom serializer (default: serializer based on configuration) # # == Examples: # $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect # #=> [1, 2, 3] # # $sc.parallelize(1..3).map(:to_s).collect # #=> ["1", "2", "3"] # def parallelize(data, num_slices=nil, serializer=nil) num_slices ||= default_parallelism serializer ||= default_serializer serializer.check_each(data) # Through file file = Tempfile.new('to_parallelize', temp_dir) serializer.dump_to_io(data, file) file.close # not unlink jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices) Spark::RDD.new(jrdd, self, serializer) ensure file && file.unlink end # Read a text file from HDFS, a local file system (available on all nodes), or any # Hadoop-supported file system URI, and return it as an RDD of Strings. # # == Example: # f = Tempfile.new("test") # f.puts("1") # f.puts("2") # f.close # # $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect # # => [1, 2] # def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil) min_partitions ||= default_parallelism serializer ||= default_serializer deserializer = Spark::Serializer.build { __text__(encoding) } Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer) end # Read a directory of text files from HDFS, a local file system (available on all nodes), or any # Hadoop-supported file system URI. Each file is read as a single record and returned in a # key-value pair, where the key is the path of each file, the value is the content of each file. # # == Example: # dir = Dir.mktmpdir # f1 = Tempfile.new("test1", dir) # f2 = Tempfile.new("test2", dir) # f1.puts("1"); f1.puts("2"); # f2.puts("3"); f2.puts("4"); # f1.close # f2.close # # $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect # # => ["1", "2", "3", "4"] # def whole_text_files(path, min_partitions=nil, serializer=nil) min_partitions ||= default_parallelism serializer ||= default_serializer deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) } Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer) end # Executes the given partition function f on the specified set of partitions, # returning the result as an array of elements. # # If partitions is not specified, this will run over all partitions. # # == Example: # rdd = $sc.parallelize(0..10, 5) # $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2]) # # => ["[0, 1]", "[4, 5]"] # def run_job(rdd, f, partitions=nil, allow_local=false) run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f) end # Execute the given command on specific set of partitions. # def run_job_with_command(rdd, partitions, allow_local, command, *args) if !partitions.nil? && !partitions.is_a?(Array) raise Spark::ContextError, 'Partitions must be nil or Array' end partitions_size = rdd.partitions_size # Execute all parts if partitions.nil? 
partitions = (0...partitions_size).to_a end # Can happend when you use coalesce partitions.delete_if {|part| part >= partitions_size} # Rjb represent Fixnum as Integer but Jruby as Long partitions = to_java_array_list(convert_to_java_int(partitions)) # File for result file = Tempfile.new('collect', temp_dir) mapped = rdd.new_rdd_from_command(command, *args) RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path) mapped.collect_from_file(file) end # Aliases alias_method :textFile, :text_file alias_method :wholeTextFiles, :whole_text_files alias_method :defaultParallelism, :default_parallelism alias_method :setLocalProperty, :set_local_property alias_method :getLocalProperty, :get_local_property alias_method :setCallSite, :set_call_site alias_method :clearCallSite, :clear_call_site alias_method :runJob, :run_job alias_method :runJobWithCommand, :run_job_with_command alias_method :addFile, :add_file end end ================================================ FILE: lib/spark/error.rb ================================================ module Spark # Extension cannot be built class BuildError < StandardError end # Proc.to_source # Java object cannot be converted class SerializeError < StandardError end # Serializer method # Non-existing serializer class NotImplemented < StandardError end # Missison app_name or master class ConfigurationError < StandardError end # Wrong parameters class RDDError < StandardError end # Validations class CommandError < StandardError end # Parser helper # SQL DataType class ParseError < StandardError end # Validation in context class ContextError < StandardError end # Broadcasts # Missing path class BroadcastError < StandardError end # Accumulators # Existing keys # Wrong ID class AccumulatorError < StandardError end # Wrong instances class MllibError < StandardError end # Wrong datatype class SQLError < StandardError end # Missing Java class class JavaBridgeError < StandardError end end ================================================ FILE: lib/spark/ext/hash.rb ================================================ module Spark module CoreExtension module Hash module ClassMethods end module InstanceMethods # Destructively convert all keys to strings. def stringify_keys_with_spark! transform_keys!{ |key| key.to_s } end # Destructively convert all keys to symbols, as long as they respond def symbolize_keys_with_spark! transform_keys!{ |key| key.to_sym rescue key } end # Destructively convert all keys using the block operations. # Same as transform_keys but modifies +self+. def transform_keys_with_spark! 
keys.each do |key| self[yield(key)] = delete(key) end self end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do patch_unless_exist :stringify_keys!, :spark patch_unless_exist :symbolize_keys!, :spark patch_unless_exist :transform_keys!, :spark end end end end end Hash.__send__(:include, Spark::CoreExtension::Hash) ================================================ FILE: lib/spark/ext/integer.rb ================================================ module Spark module CoreExtension module Integer module ClassMethods end module InstanceMethods end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do const_set :MAX_WITH_SPARK, 1 << (1.size * 8 - 2) - 1 const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1 path_const_unless_exist :MAX, :SPARK path_const_unless_exist :MIN, :SPARK end end end end end Integer.__send__(:include, Spark::CoreExtension::Integer) ================================================ FILE: lib/spark/ext/io.rb ================================================ module Spark module CoreExtension module IO module ClassMethods end module InstanceMethods # Reading def read_int unpack_int(read(4)) end def read_int_or_eof bytes = read(4) return Spark::Constant::DATA_EOF if bytes.nil? unpack_int(bytes) end def read_long unpack_long(read(8)) end def read_string read(read_int) end def read_data Marshal.load(read_string) end # Writing def write_int(data) write(pack_int(data)) end def write_long(data) write(pack_long(data)) end # Size and data can have different encoding # Marshal: both ASCII # Oj: ASCII and UTF-8 def write_string(data) write_int(data.bytesize) write(data) end def write_data(data) write_string(Marshal.dump(data)) end end def self.included(base) base.extend(ClassMethods) base.send(:include, Spark::Helper::Serialize) base.send(:include, InstanceMethods) end end end end IO.__send__(:include, Spark::CoreExtension::IO) StringIO.__send__(:include, Spark::CoreExtension::IO) ================================================ FILE: lib/spark/ext/ip_socket.rb ================================================ module Spark module CoreExtension module IPSocket module ClassMethods end module InstanceMethods def port addr[1] end def hostname addr(true)[2] end def numeric_address addr[3] end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) end end end end IPSocket.__send__(:include, Spark::CoreExtension::IPSocket) ================================================ FILE: lib/spark/ext/module.rb ================================================ module Spark module CoreExtension module Module # Patch method to class unless already exist # # == Example: # # class Hash # def a # 1 # end # end # # module HashExtension # module InstanceMethods # def a_with_spark # 2 # end # # def b_with_spark # 1 # end # end # # def self.included(base) # base.send(:include, InstanceMethods) # base.class_eval do # patch_unless_exist :a, :spark # patch_unless_exist :b, :spark # end # end # end # # Hash.include(HashExtension) # # Hash.new.a # => 1 # Hash.new.b # => 1 # def patch_unless_exist(target, suffix) unless method_defined?(target) aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1 alias_method target, "#{aliased_target}_with_#{suffix}#{punctuation}" end end def path_const_unless_exist(target, suffix) unless const_defined?(target) const_set(target, const_get("#{target}_WITH_#{suffix}")) end end end end end Module.__send__(:include, 
Spark::CoreExtension::Module) ================================================ FILE: lib/spark/ext/object.rb ================================================ module Spark module CoreExtension module Object module ClassMethods end module InstanceMethods def deep_copy_with_spark Marshal.load(Marshal.dump(self)) end def silence_warnings old_verbose, $VERBOSE = $VERBOSE, nil yield ensure $VERBOSE = old_verbose end def cattr_reader_with_spark(*syms) syms.each do |sym| raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/ class_eval(<<-EOS, __FILE__, __LINE__ + 1) @@#{sym} = nil unless defined? @@#{sym} def self.#{sym} @@#{sym} end EOS class_eval(<<-EOS, __FILE__, __LINE__ + 1) def #{sym} @@#{sym} end EOS end end def cattr_writer_with_spark(*syms) syms.each do |sym| raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/ class_eval(<<-EOS, __FILE__, __LINE__ + 1) @@#{sym} = nil unless defined? @@#{sym} def self.#{sym}=(obj) @@#{sym} = obj end EOS class_eval(<<-EOS, __FILE__, __LINE__ + 1) def #{sym}=(obj) @@#{sym} = obj end EOS end end def cattr_accessor_with_spark(*syms) cattr_reader_with_spark(*syms) cattr_writer_with_spark(*syms) end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do patch_unless_exist :deep_copy, :spark patch_unless_exist :silence_warnings, :spark patch_unless_exist :cattr_accessor, :spark end end end end end Object.__send__(:include, Spark::CoreExtension::Object) ================================================ FILE: lib/spark/ext/string.rb ================================================ module Spark module CoreExtension module String module ClassMethods end module InstanceMethods def camelize_with_spark self.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase } end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do patch_unless_exist :camelize, :spark end end end end end String.__send__(:include, Spark::CoreExtension::String) ================================================ FILE: lib/spark/helper/logger.rb ================================================ module Spark module Helper module Logger def self.included(base) base.send :extend, Methods base.send :include, Methods end module Methods def log_info(message) Spark.logger.info(message) end def log_debug(message) Spark.logger.debug(message) end def log_trace(message) Spark.logger.trace(message) end def log_warning(message) Spark.logger.warning(message) end def log_error(message) Spark.logger.error(message) end alias_method :logInfo, :log_info alias_method :logDebug, :log_debug alias_method :logTrace, :log_trace alias_method :logWarning, :log_warning alias_method :logError, :log_error end # Methods end # Logger end # Helper end # Spark ================================================ FILE: lib/spark/helper/parser.rb ================================================ module Spark module Helper module Parser def self.included(base) base.send :extend, Methods base.send :include, Methods end module Methods def to_java_hash(hash) hash_map = HashMap.new hash.each_pair do |key, value| begin # RJB raise Object is NULL (but new record is put correctly) hash_map.put(key, value) rescue RuntimeError end end hash_map end def convert_to_java_int(data) if data.is_a?(Array) data.map{|x| JInteger.new(x)} else JInteger.new(data) end end def to_java_array_list(array) array_list = ArrayList.new array.each do |item| array_list.add(item) end array_list 
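# == Example (illustrative; requires the imported JVM classes ArrayList and JInteger):
#   to_java_array_list([1, 2, 3])   # => java.util.ArrayList [1, 2, 3]
#   convert_to_java_int(5)          # => java.lang.Integer 5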
end # Parse and convert memory size. Shifting be better but Float doesn't support it. # # == Examples: # to_memory_size("512mb") # # => 524288 # # to_memory_size("512 MB") # # => 524288 # # to_memory_size("512mb", "GB") # # => 0.5 # def to_memory_size(memory, result_unit="KB") match = memory.match(/([\d]+)[\s]*([\w]*)/) if match.nil? raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'" end size = match[1].to_f unit = match[2] size *= memory_multiplier_based_kb(unit) size /= memory_multiplier_based_kb(result_unit) size.round(2) end # Based to KB def memory_multiplier_based_kb(type) case type.to_s.upcase when "G", "GB" 1048576 when "M", "MB" 1024 when "K", "KB" 1 else raise Spark::ParseError, "Unsupported type #{type}" end end end # Methods end # Parser end # Helper end # Spark ================================================ FILE: lib/spark/helper/serialize.rb ================================================ module Spark module Helper module Serialize DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>' DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*' DIRECTIVE_LONG_BIG_ENDIAN = 'q>' DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*' DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G' DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*' DIRECTIVE_UNSIGNED_CHARS = 'C*' DIRECTIVE_CHARS = 'c*' # Packing def pack_int(data) [data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN) end def pack_long(data) [data].pack(DIRECTIVE_LONG_BIG_ENDIAN) end def pack_double(data) [data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN) end def pack_unsigned_chars(data) data.pack(DIRECTIVE_UNSIGNED_CHARS) end def pack_ints(data) __check_array(data) data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN) end def pack_longs(data) __check_array(data) data.pack(DIRECTIVE_LONGS_BIG_ENDIAN) end def pack_doubles(data) __check_array(data) data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN) end # Unpacking def unpack_int(data) data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0] end def unpack_long(data) data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0] end def unpack_chars(data) data.unpack(DIRECTIVE_CHARS) end private def __check_array(data) unless data.is_a?(Array) raise ArgumentError, 'Data must be an Array.' end end end end end ================================================ FILE: lib/spark/helper/statistic.rb ================================================ module Spark module Helper module Statistic # Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time. # # == How the sampling rate is determined: # Let p = num / total, where num is the sample size and total is the total number of # datapoints in the RDD. We're trying to compute q > p such that # * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), # where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total), # i.e. the failure rate of not having a sufficiently large sample < 0.0001. # Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for # num > 12, but we need a slightly larger q (9 empirically determined). # * when sampling without replacement, we're drawing each datapoint with prob_i # ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success # rate, where success rate is defined the same as in sampling with replacement. 
# def compute_fraction(lower_bound, total, with_replacement) lower_bound = lower_bound.to_f if with_replacement upper_poisson_bound(lower_bound) / total else fraction = lower_bound / total upper_binomial_bound(0.00001, total, fraction) end end def upper_poisson_bound(bound) num_std = if bound < 6 12 elsif bound < 16 9 else 6 end.to_f [bound + num_std * Math.sqrt(bound), 1e-10].max end def upper_binomial_bound(delta, total, fraction) gamma = -Math.log(delta) / total [1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min end # Bisect right # # == Examples: # data = [1,5,6,8,96,120,133] # # bisect_right(data, 0) # => 0 # bisect_right(data, 1) # => 1 # bisect_right(data, 5) # => 2 # bisect_right(data, 9) # => 4 # bisect_right(data, 150) # => 7 # def bisect_right(data, value, low=0, high=data.size) if low < 0 raise ArgumentError, 'Low must be >= 0.' end while low < high mid = (low + high) / 2 if value < data[mid] high = mid else low = mid + 1 end end low end # Determine bound of partitioning # # == Example: # data = [0,1,2,3,4,5,6,7,8,9,10] # determine_bounds(data, 3) # # => [3, 7] # def determine_bounds(data, num_partitions) if num_partitions > data.size return data end bounds = [] count = data.size (0...(num_partitions-1)).each do |index| bounds << data[count * (index+1) / num_partitions] end bounds end end end end ================================================ FILE: lib/spark/helper/system.rb ================================================ module Spark module Helper module System def self.included(base) base.send :extend, Methods base.send :include, Methods end module Methods def windows? RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ end def mri? RbConfig::CONFIG['ruby_install_name'] == 'ruby' end def jruby? RbConfig::CONFIG['ruby_install_name'] == 'jruby' end def pry? !!Thread.current[:__pry__] end # Memory usage in kb def memory_usage if jruby? runtime = java.lang.Runtime.getRuntime (runtime.totalMemory - runtime.freeMemory) >> 10 elsif windows? 
# not yet else `ps -o rss= -p #{Process.pid}`.to_i end end end # Methods end # System end # Helper end # Spark ================================================ FILE: lib/spark/helper.rb ================================================ module Spark module Helper autoload :System, "spark/helper/system" autoload :Logger, "spark/helper/logger" autoload :Statistic, "spark/helper/statistic" autoload :Serialize, "spark/helper/serialize" autoload :Partition, "spark/helper/partition" autoload :Parser, "spark/helper/parser" end end ================================================ FILE: lib/spark/java_bridge/base.rb ================================================ ## # Spark::JavaBridge::Base # # Parent for all adapter (ruby - java) # module Spark module JavaBridge class Base include Spark::Helper::System JAVA_OBJECTS = [ 'java.util.ArrayList', 'scala.collection.mutable.HashMap', 'org.apache.spark.SparkConf', 'org.apache.spark.api.java.JavaSparkContext', 'org.apache.spark.api.ruby.RubyRDD', 'org.apache.spark.api.ruby.RubyUtils', 'org.apache.spark.api.ruby.RubyWorker', 'org.apache.spark.api.ruby.PairwiseRDD', 'org.apache.spark.api.ruby.RubyAccumulatorParam', 'org.apache.spark.api.ruby.RubySerializer', 'org.apache.spark.api.python.PythonRDD', 'org.apache.spark.api.python.PythonPartitioner', 'org.apache.spark.api.python.PythonUtils', 'org.apache.spark.ui.ruby.RubyTab', 'org.apache.spark.mllib.api.ruby.RubyMLLibAPI', :JInteger => 'java.lang.Integer', :JLong => 'java.lang.Long', :JLogger => 'org.apache.log4j.Logger', :JLevel => 'org.apache.log4j.Level', :JPriority => 'org.apache.log4j.Priority', :JUtils => 'org.apache.spark.util.Utils', :JDataType => 'org.apache.spark.sql.types.DataType', :JSQLContext => 'org.apache.spark.sql.SQLContext', :JDenseVector => 'org.apache.spark.mllib.linalg.DenseVector', :JDenseMatrix => 'org.apache.spark.mllib.linalg.DenseMatrix', :JStorageLevel => 'org.apache.spark.storage.StorageLevel', :JSQLFunctions => 'org.apache.spark.sql.functions' ] JAVA_TEST_OBJECTS = [ 'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI' ] RUBY_TO_JAVA_SKIP = [Fixnum, Integer] def initialize(target) @target = target end # Import all important classes into Objects def import_all return if @imported java_objects.each do |name, klass| import(name, klass) end @imported = true nil end # Import classes for testing def import_all_test return if @imported_test java_test_objects.each do |name, klass| import(name, klass) end @imported_test = true nil end # Call java object def call(klass, method, *args) # To java args.map!{|item| to_java(item)} # Call java result = klass.__send__(method, *args) # To ruby to_ruby(result) end def to_array_list(array) array_list = ArrayList.new array.each do |item| array_list.add(to_java(item)) end array_list end def to_seq(array) PythonUtils.toSeq(to_array_list(array)) end def to_long(number) return nil if number.nil? 
JLong.new(number) end def to_java(object) if RUBY_TO_JAVA_SKIP.include?(object.class) # Some object are convert automatically # This is for preventing errors # For example: jruby store integer as long so 1.to_java is Long object elsif object.respond_to?(:to_java) object.to_java elsif object.is_a?(Array) to_array_list(object) else object end end # Array problem: # Rjb: object.toArray -> Array # Jruby: object.toArray -> java.lang.Object # def to_ruby(object) # Java object if java_object?(object) class_name = object.getClass.getSimpleName case class_name when 'ArraySeq' result = [] iterator = object.iterator while iterator.hasNext result << to_ruby(iterator.next) end result when 'Map2', 'Map3', 'Map4', 'HashTrieMap' Hash[ object.toSeq.array.to_a.map!{|item| [item._1, item._2]} ] when 'SeqWrapper'; object.toArray.to_a.map!{|item| to_ruby(item)} when 'ofRef'; object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object) when 'DenseVector'; Spark::Mllib::DenseVector.from_java(object) when 'KMeansModel'; Spark::Mllib::KMeansModel.from_java(object) when 'DenseMatrix'; Spark::Mllib::DenseMatrix.from_java(object) when 'GenericRowWithSchema'; Spark::SQL::Row.from_java(object, true) else # Some RDD if class_name != 'JavaRDD' && class_name.end_with?('RDD') object = object.toJavaRDD class_name = 'JavaRDD' end # JavaRDD if class_name == 'JavaRDD' jrdd = RubyRDD.toRuby(object) serializer = Spark::Serializer.build { __batched__(__marshal__) } serializer = Spark::Serializer.build { __batched__(__marshal__, 2) } return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer) end # Unknow Spark.logger.warn("Java object '#{object.getClass.name}' was not converted.") object end # Array can be automatically transfered but content not elsif object.is_a?(Array) object.map! do |item| to_ruby(item) end object # Already transfered else object end end alias_method :java_to_ruby, :to_ruby alias_method :ruby_to_java, :to_java private def jars result = Dir.glob(File.join(@target, '*.jar')) result.flatten! result end def objects_with_names(objects) hash = {} objects.each do |object| if object.is_a?(Hash) hash.merge!(object) else key = object.split('.').last.to_sym hash[key] = object end end hash end def java_objects objects_with_names(JAVA_OBJECTS) end def java_test_objects objects_with_names(JAVA_TEST_OBJECTS) end def raise_missing_class(klass) raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark is assembled." 
end end end end ================================================ FILE: lib/spark/java_bridge/jruby.rb ================================================ require 'java' module Spark module JavaBridge class JRuby < Base def initialize(*args) super jars.each {|jar| require jar} end def import(name, klass) klass = "Java::#{klass}" Object.const_set(name, eval(klass)) rescue NameError raise_missing_class(klass) end def java_object?(object) object.is_a?(JavaProxy) end end end end ================================================ FILE: lib/spark/java_bridge/rjb.rb ================================================ if !ENV.has_key?('JAVA_HOME') raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set' end require 'rjb' module Spark module JavaBridge class RJB < Base def initialize(*args) super Rjb.load(jars) Rjb.primitive_conversion = true end def import(name, klass) Object.const_set(name, silence_warnings { Rjb.import(klass) }) rescue NoClassDefFoundError raise_missing_class(klass) end def java_object?(object) object.is_a?(Rjb::Rjb_JavaProxy) end private def jars separator = windows? ? ';' : ':' super.join(separator) end end end end ================================================ FILE: lib/spark/java_bridge.rb ================================================ module Spark module JavaBridge autoload :Base, 'spark/java_bridge/base' autoload :JRuby, 'spark/java_bridge/jruby' autoload :RJB, 'spark/java_bridge/rjb' include Spark::Helper::System def self.init(*args) if jruby? klass = JRuby else klass = RJB end klass.new(*args) end end end ================================================ FILE: lib/spark/library.rb ================================================ module Spark module Library def autoload(klass, location, import=true) if import @for_importing ||= [] @for_importing << klass end super(klass, location) end def autoload_without_import(klass, location) autoload(klass, location, false) end def import(to=Object) @for_importing.each do |klass| to.const_set(klass, const_get(klass)) end nil end end end ================================================ FILE: lib/spark/logger.rb ================================================ # Necessary libraries Spark.load_lib module Spark class Logger attr_reader :jlogger def initialize @jlogger = JLogger.getLogger('Ruby') end def level_off JLevel.toLevel('OFF') end # Disable all Spark log def disable jlogger.setLevel(level_off) JLogger.getLogger('org').setLevel(level_off) JLogger.getLogger('akka').setLevel(level_off) JLogger.getRootLogger.setLevel(level_off) end def enabled? !disabled? end def info(message) jlogger.info(message) if info? end def debug(message) jlogger.debug(message) if debug? end def trace(message) jlogger.trace(message) if trace? end def warning(message) jlogger.warn(message) if warning? end def error(message) jlogger.error(message) if error? end def info? level_enabled?('info') end def debug? level_enabled?('debug') end def trace? level_enabled?('trace') end def warning? level_enabled?('warn') end def error? 
level_enabled?('error') end def level_enabled?(type) jlogger.isEnabledFor(JPriority.toPriority(type.upcase)) end alias_method :warn, :warning end end ================================================ FILE: lib/spark/mllib/classification/common.rb ================================================ module Spark module Mllib class ClassificationModel attr_reader :weights, :intercept, :threshold def initialize(weights, intercept) @weights = Spark::Mllib::Vectors.to_vector(weights) @intercept = intercept.to_f @threshold = nil end def threshold=(value) @threshold = value.to_f end def clear_threshold @threshold = nil end end end end module Spark module Mllib class ClassificationMethodBase < RegressionMethodBase end end end ================================================ FILE: lib/spark/mllib/classification/logistic_regression.rb ================================================ module Spark module Mllib ## # LogisticRegressionModel # # A linear binary classification model derived from logistic regression. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0, 1.0]), # LabeledPoint.new(1.0, [1.0, 0.0]), # ] # lrm = LogisticRegressionWithSGD.train($sc.parallelize(data)) # # lrm.predict([1.0, 0.0]) # # => 1 # lrm.predict([0.0, 1.0]) # # => 0 # # lrm.clear_threshold # lrm.predict([0.0, 1.0]) # # => 0.123... # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})), # LabeledPoint.new(0.0, SparseVector.new(2, {0 => 1.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0})) # ] # lrm = LogisticRegressionWithSGD.train($sc.parallelize(data)) # # lrm.predict([0.0, 1.0]) # # => 1 # lrm.predict([1.0, 0.0]) # # => 0 # lrm.predict(SparseVector.new(2, {1 => 1.0})) # # => 1 # lrm.predict(SparseVector.new(2, {0 => 1.0})) # # => 0 # # # # LogisticRegressionWithLBFGS # data = [ # LabeledPoint.new(0.0, [0.0, 1.0]), # LabeledPoint.new(1.0, [1.0, 0.0]), # ] # lrm = LogisticRegressionWithLBFGS.train($sc.parallelize(data)) # # lrm.predict([1.0, 0.0]) # # => 1 # lrm.predict([0.0, 1.0]) # # => 0 # class LogisticRegressionModel < ClassificationModel def initialize(*args) super @threshold = 0.5 end # Predict values for a single data point or an RDD of points using # the model trained. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) margin = weights.dot(vector) + intercept score = 1.0 / (1.0 + Math.exp(-margin)) if threshold.nil? return score end if score > threshold 1 else 0 end end end end end module Spark module Mllib class LogisticRegressionWithSGD < ClassificationMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, mini_batch_fraction: 1.0, initial_weights: nil, reg_param: 0.01, reg_type: 'l2', intercept: false, validate: true, convergence_tol: 0.001 } # Train a logistic regression model on the given data. # # == Arguments: # rdd:: # The training data, an RDD of LabeledPoint. # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration. # # initial_weights:: # The initial weights (default: nil). # # reg_param:: # The regularizer parameter (default: 0.01). # # reg_type:: # The type of regularizer used for training our model (default: "l2"). 
# # Allowed values: # - "l1" for using L1 regularization # - "l2" for using L2 regularization # - nil for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:reg_param].to_f, options[:reg_type], options[:intercept], options[:validate], options[:convergence_tol]) LogisticRegressionModel.new(weights, intercept) end end end end module Spark module Mllib class LogisticRegressionWithLBFGS < ClassificationMethodBase DEFAULT_OPTIONS = { iterations: 100, initial_weights: nil, reg_param: 0.01, reg_type: 'l2', intercept: false, corrections: 10, tolerance: 0.0001 } # Train a logistic regression model on the given data. # # == Arguments: # rdd:: # The training data, an RDD of LabeledPoint. # # iterations:: # The number of iterations (default: 100). # # initial_weights:: # The initial weights (default: nil). # # reg_param:: # The regularizer parameter (default: 0.01). # # reg_type:: # The type of regularizer used for training our model (default: "l2"). # # Allowed values: # - "l1" for using L1 regularization # - "l2" for using L2 regularization # - nil for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # # corrections:: # The number of corrections used in the LBFGS update (default: 10). # # tolerance:: # The convergence tolerance of iterations for L-BFGS (default: 0.0001). # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithLBFGS', rdd, options[:iterations].to_i, options[:initial_weights], options[:reg_param].to_f, options[:reg_type], options[:intercept], options[:corrections].to_i, options[:tolerance].to_f) LogisticRegressionModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/classification/naive_bayes.rb ================================================ module Spark module Mllib ## # NaiveBayesModel # # Model for Naive Bayes classifiers. 
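#
# Prediction (see NaiveBayesModel#predict below) scores each class as
# theta.dot(x) + pi and returns the label with the highest score.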
# # Contains two parameters: # pi:: vector of logs of class priors (dimension C) # theta:: matrix of logs of class conditional probabilities (CxD) # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0, 0.0]), # LabeledPoint.new(0.0, [0.0, 1.0]), # LabeledPoint.new(1.0, [1.0, 0.0]) # ] # model = NaiveBayes.train($sc.parallelize(data)) # # model.predict([0.0, 1.0]) # # => 0.0 # model.predict([1.0, 0.0]) # # => 1.0 # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(2, {1 => 0.0})), # LabeledPoint.new(0.0, SparseVector.new(2, {1 => 1.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {0 => 1.0})) # ] # model = NaiveBayes.train($sc.parallelize(data)) # # model.predict(SparseVector.new(2, {1 => 1.0})) # # => 0.0 # model.predict(SparseVector.new(2, {0 => 1.0})) # # => 1.0 # class NaiveBayesModel attr_reader :labels, :pi, :theta def initialize(labels, pi, theta) @labels = labels @pi = pi @theta = theta end # Predict values for a single data point or an RDD of points using # the model trained. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) array = (vector.dot(theta) + pi).to_a index = array.index(array.max) labels[index] end end end end module Spark module Mllib class NaiveBayes # Trains a Naive Bayes model given an RDD of (label, features) pairs. # # This is the Multinomial NB (http://tinyurl.com/lsdw6p) which can handle all kinds of # discrete data. For example, by converting documents into TF-IDF vectors, it can be used for # document classification. By making every vector a 0-1 vector, it can also be used as # Bernoulli NB (http://tinyurl.com/p7c96j6). The input feature values must be nonnegative. # # == Arguments: # rdd:: RDD of LabeledPoint. # lambda:: The smoothing parameter. # def self.train(rdd, lambda=1.0) # Validation first = rdd.first unless first.is_a?(LabeledPoint) raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}" end labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda) theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta) NaiveBayesModel.new(labels, pi, theta) end end end end ================================================ FILE: lib/spark/mllib/classification/svm.rb ================================================ module Spark module Mllib ## # SVMModel # # A support vector machine. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(1.0, [2.0]), # LabeledPoint.new(1.0, [3.0]) # ] # svm = SVMWithSGD.train($sc.parallelize(data)) # # svm.predict([1.0]) # # => 1 # svm.clear_threshold # svm.predict([1.0]) # # => 1.25... # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(2, {0 => -1.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})), # LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0})) # ] # svm = SVMWithSGD.train($sc.parallelize(data)) # # svm.predict(SparseVector.new(2, {1 => 1.0})) # # => 1 # svm.predict(SparseVector.new(2, {0 => -1.0})) # # => 0 # class SVMModel < ClassificationModel def initialize(*args) super @threshold = 0.0 end # Predict values for a single data point or an RDD of points using # the model trained. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) margin = weights.dot(vector) + intercept if threshold.nil? 
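# No threshold is set (see #clear_threshold), so return the raw margin instead of a 0/1 class.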
return margin end if margin > threshold 1 else 0 end end end end end module Spark module Mllib class SVMWithSGD < ClassificationMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, reg_param: 0.01, mini_batch_fraction: 1.0, initial_weights: nil, reg_type: 'l2', intercept: false, validate: true, convergence_tol: 0.001 } # Train a support vector machine on the given data. # # rdd:: # The training data, an RDD of LabeledPoint. # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # reg_param:: # The regularizer parameter (default: 0.01). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration. # # initial_weights:: # The initial weights (default: nil). # # reg_type:: # The type of regularizer used for training our model (default: "l2"). # # Allowed values: # - "l1" for using L1 regularization # - "l2" for using L2 regularization # - nil for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # (default: false) # # validateData:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainSVMModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:reg_param].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:reg_type], options[:intercept], options[:validate], options[:convergence_tol]) SVMModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/clustering/gaussian_mixture.rb ================================================ module Spark module Mllib ## # GaussianMixtureModel # # A clustering model derived from the Gaussian Mixture Model method. # # == Examples: # # Spark::Mllib.import # # data = [ # DenseVector.new([-0.1, -0.05]), # DenseVector.new([-0.01, -0.1]), # DenseVector.new([0.9, 0.8]), # DenseVector.new([0.75, 0.935]), # DenseVector.new([-0.83, -0.68]), # DenseVector.new([-0.91, -0.76]) # ] # # model = GaussianMixture.train($sc.parallelize(data), 3, convergence_tol: 0.0001, max_iterations: 50, seed: 10) # # labels = model.predict($sc.parallelize(data)).collect # class GaussianMixtureModel attr_reader :weights, :gaussians, :k def initialize(weights, gaussians) @weights = weights @gaussians = gaussians @k = weights.size end # Find the cluster to which the points in 'x' has maximum membership # in this model. def predict(rdd) if rdd.is_a?(Spark::RDD) predict_soft(rdd).map('lambda{|x| x.index(x.max)}') else raise ArgumentError, 'Argument must be a RDD.' end end # Find the membership of each point in 'x' to all mixture components. def predict_soft(rdd) Spark.jb.call(RubyMLLibAPI.new, 'predictSoftGMM', rdd, weights, means, sigmas) end def means @means ||= @gaussians.map(&:mu) end def sigmas @sigmas ||= @gaussians.map(&:sigma) end end end end module Spark module Mllib class GaussianMixture def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil) weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd, k, convergence_tol, max_iterations, Spark.jb.to_long(seed)) means.map! 
{|mu| Spark.jb.java_to_ruby(mu)} sigmas.map!{|sigma| Spark.jb.java_to_ruby(sigma)} mvgs = Array.new(k) do |i| MultivariateGaussian.new(means[i], sigmas[i]) end GaussianMixtureModel.new(weights, mvgs) end end end end ================================================ FILE: lib/spark/mllib/clustering/kmeans.rb ================================================ module Spark module Mllib ## # KMeansModel # # A clustering model derived from the k-means method. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # DenseVector.new([0.0,0.0]), # DenseVector.new([1.0,1.0]), # DenseVector.new([9.0,8.0]), # DenseVector.new([8.0,9.0]) # ] # # model = KMeans.train($sc.parallelize(data), 2, max_iterations: 10, # runs: 30, initialization_mode: "random") # # model.predict([0.0, 0.0]) == model.predict([1.0, 1.0]) # # => true # model.predict([8.0, 9.0]) == model.predict([9.0, 8.0]) # # => true # # # # Sparse vectors # data = [ # SparseVector.new(3, {1 => 1.0}), # SparseVector.new(3, {1 => 1.1}), # SparseVector.new(3, {2 => 1.0}), # SparseVector.new(3, {2 => 1.1}) # ] # model = KMeans.train($sc.parallelize(data), 2, initialization_mode: "k-means||") # # model.predict([0.0, 1.0, 0.0]) == model.predict([0, 1.1, 0.0]) # # => true # model.predict([0.0, 0.0, 1.0]) == model.predict([0, 0, 1.1]) # # => true # model.predict(data[0]) == model.predict(data[1]) # # => true # model.predict(data[2]) == model.predict(data[3]) # # => true # class KMeansModel attr_reader :centers def initialize(centers) @centers = centers end # Find the cluster to which x belongs in this model. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) best = 0 best_distance = Float::INFINITY @centers.each_with_index do |center, index| distance = vector.squared_distance(center) if distance < best_distance best = index best_distance = distance end end best end def self.from_java(object) centers = object.clusterCenters centers.map! do |center| Spark.jb.java_to_ruby(center) end KMeansModel.new(centers) end end end end module Spark module Mllib class KMeans # Trains a k-means model using the given set of parameters. # # == Arguments: # rdd:: # The training data, an RDD of Vectors. # # k:: # Number of clusters. # # max_iterations:: # Max number of iterations. # # runs:: # Number of parallel runs, defaults to 1. The best model is returned. # # initialization_mode:: # Initialization model, either "random" or "k-means||" (default). # # seed:: # Random seed value for cluster initialization. # # epsilon:: # The distance threshold within which we've consider centers to have converged. 
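#
# == Example:
#   (a rough sketch; assumes a SparkContext in $sc, as in the examples above)
#
#   data = [DenseVector.new([0.0, 0.0]), DenseVector.new([9.0, 8.0])]
#   model = KMeans.train($sc.parallelize(data), 2, max_iterations: 10, seed: 1)
#   model.predict([0.2, 0.1]) == model.predict([0.0, 0.0])
#   # => true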
# def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil, initialization_steps: 5, epsilon: 0.0001) cluster_initial_model = [] # Call returns KMeansModel Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd, k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon, cluster_initial_model) end end end end ================================================ FILE: lib/spark/mllib/matrix.rb ================================================ module Spark module Mllib module Matrices def self.dense(*args) DenseMatrix.new(*args) end def self.sparse(*args) SparseMatrix.new(*args) end def self.to_matrix(data) if data.is_a?(SparseMatrix) || data.is_a?(DenseMatrix) data elsif data.is_a?(Array) DenseMatrix.new(data) end end end end end module Spark module Mllib # @abstract Parent for all type of matrices class MatrixBase < MatrixAdapter end end end module Spark module Mllib ## # DenseMatrix # # DenseMatrix.new(2, 3, [[1,2,3], [4,5,6]]).values # # => [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] # class DenseMatrix < MatrixBase def initialize(rows, cols, values) super(:dense, rows, cols, values.to_a) end def to_java JDenseMatrix.new(shape[0], shape[1], values.flatten) end def self.from_java(object) rows = object.numRows cols = object.numCols values = object.values DenseMatrix.new(rows, cols, values) end end end end module Spark module Mllib ## # SparseMatrix # # == Arguments: # rows:: # Number of rows. # # cols:: # Number of columns. # # col_pointers:: # The index corresponding to the start of a new column. # # row_indices:: # The row index of the entry. They must be in strictly # increasing order for each column. # # values:: # Nonzero matrix entries in column major. # # == Examples: # # SparseMatrix.new(3, 3, [0, 2, 3, 6], [0, 2, 1, 0, 1, 2], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).values # # # => [ # # [1.0, 0.0, 4.0], # # [0.0, 3.0, 5.0], # # [2.0, 0.0, 6.0] # # ] # class SparseMatrix < MatrixBase attr_reader :col_pointers, :row_indices def initialize(rows, cols, col_pointers, row_indices, values) super(:sparse, rows, cols) @col_pointers = col_pointers @row_indices = row_indices @values = values j = 0 while j < cols idx = col_pointers[j] idx_end = col_pointers[j+1] while idx < idx_end self[row_indices[idx], j] = values[idx] idx += 1 end j += 1 end end end end end ================================================ FILE: lib/spark/mllib/regression/common.rb ================================================ module Spark module Mllib ## # RegressionModel # # A linear model that has a vector of coefficients and an intercept. # class RegressionModel attr_reader :weights, :intercept def initialize(weights, intercept) @weights = Spark::Mllib::Vectors.to_vector(weights) @intercept = intercept.to_f end # Predict the value of the dependent variable given a vector data # containing values for the independent variables. # # == Examples: # lm = RegressionModel.new([1.0, 2.0], 0.1) # # lm.predict([-1.03, 7.777]) - 14.624 < 1e-6 # # => true # # lm.predict(SparseVector.new(2, {0 => -1.03, 1 => 7.777})) - 14.624 < 1e-6 # # => true # def predict(data) data = Spark::Mllib::Vectors.to_vector(data) @weights.dot(data) + @intercept end end end end module Spark module Mllib ## # RegressionMethodBase # # Parent for regression methods # class RegressionMethodBase def self.train(rdd, options) # String keys to symbols options.symbolize_keys! 
# Reverse merge self::DEFAULT_OPTIONS.each do |key, value| if options.has_key?(key) # value from user else options[key] = value end end # Validation first = rdd.first unless first.is_a?(LabeledPoint) raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}" end # Initial weights is optional for user (not for Spark) options[:initial_weights] = Vectors.to_vector(options[:initial_weights] || [0.0] * first.features.size) end end end end ================================================ FILE: lib/spark/mllib/regression/labeled_point.rb ================================================ module Spark module Mllib ## # LabeledPoint # # The features and labels of a data point. # # == Parameters: # label:: # Label for this data point. # # features:: # Vector of features for this point # class LabeledPoint attr_reader :label, :features def initialize(label, features) @label = label.to_f @features = Spark::Mllib::Vectors.to_vector(features) end def self.from_java(object) LabeledPoint.new( object.label, Spark.jb.java_to_ruby(object.features) ) end def marshal_dump [@label, @features] end def marshal_load(array) initialize(array[0], array[1]) end end end end ================================================ FILE: lib/spark/mllib/regression/lasso.rb ================================================ ## # LassoModel # # Train a regression model with L1-regularization using Stochastic Gradient Descent. # This solves the l1-regularized least squares regression formulation # f(weights) = 1/2n ||A weights-y||^2^ + regParam ||weights||_1 # Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with # its corresponding right hand side label y. # See also the documentation for the precise formulation. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(3.0, [2.0]), # LabeledPoint.new(2.0, [3.0]) # ] # lrm = LassoWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict([1.0]) - 1 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})), # LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})), # LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0})) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # class Spark::Mllib::LassoModel < Spark::Mllib::RegressionModel end module Spark module Mllib class LassoWithSGD < RegressionMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, reg_param: 0.01, mini_batch_fraction: 1.0, initial_weights: nil, intercept: false, validate: true, convergence_tol: 0.001 } # Train a Lasso regression model on the given data. # # == Parameters: # rdd:: # The training data (RDD instance). # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # reg_param:: # The regularizer parameter (default: 0.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration (default: 1.0). # # initial_weights:: # The initial weights (default: nil). # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. 
whether bias features # are activated or not). # (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLassoModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:reg_param].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:intercept], options[:validate], options[:convergence_tol]) LassoModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/regression/linear.rb ================================================ ## # LinearRegressionModel # # Train a linear regression model with no regularization using Stochastic Gradient Descent. # This solves the least squares regression formulation # f(weights) = 1/n ||A weights-y||^2^ # (which is the mean squared error). # Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with # its corresponding right hand side label y. # See also the documentation for the precise formulation. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(3.0, [2.0]), # LabeledPoint.new(2.0, [3.0]) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.intercept # => 0.0 # lrm.weights # => [0.9285714285714286] # # lrm.predict([0.0]) < 0.5 # # => true # # lrm.predict([1.0]) - 1 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})), # LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})), # LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0})) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.intercept # => 0.0 # lrm.weights # => [0.9285714285714286] # # lrm.predict([0.0]) < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # class Spark::Mllib::LinearRegressionModel < Spark::Mllib::RegressionModel end module Spark module Mllib class LinearRegressionWithSGD < RegressionMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, mini_batch_fraction: 1.0, initial_weights: nil, reg_param: 0.0, reg_type: nil, intercept: false, validate: true, convergence_tol: 0.001 } # Train a linear regression model on the given data. # # == Parameters: # rdd:: # The training data (RDD instance). # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration (default: 1.0). # # initial_weights:: # The initial weights (default: nil). # # reg_param:: # The regularizer parameter (default: 0.0). # # reg_type:: # The type of regularizer used for training our model (default: nil). # # Allowed values: # - "l1" for using L1 regularization (lasso), # - "l2" for using L2 regularization (ridge), # - None for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). 
# (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLinearRegressionModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:reg_param].to_f, options[:reg_type], options[:intercept], options[:validate], options[:convergence_tol]) LinearRegressionModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/regression/ridge.rb ================================================ ## # RidgeRegressionModel # # Train a regression model with L2-regularization using Stochastic Gradient Descent. # This solves the l1-regularized least squares regression formulation # f(weights) = 1/2n ||A weights-y||^2^ + regParam/2 ||weights||^2^ # Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with # its corresponding right hand side label y. # See also the documentation for the precise formulation. # # == Examples: # # Spark::Mllib.import # # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(3.0, [2.0]), # LabeledPoint.new(2.0, [3.0]) # ] # lrm = RidgeRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict([1.0]) - 1 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # # data = [ # LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})), # LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})), # LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0})) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # class Spark::Mllib::RidgeRegressionModel < Spark::Mllib::RegressionModel end module Spark module Mllib class RidgeRegressionWithSGD < RegressionMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, reg_param: 0.01, mini_batch_fraction: 1.0, initial_weights: nil, intercept: false, validate: true, convergence_tol: 0.001 } # Train a ridge regression model on the given data. # # == Parameters: # rdd:: # The training data (RDD instance). # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # reg_param:: # The regularizer parameter (default: 0.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration (default: 1.0). # # initial_weights:: # The initial weights (default: nil). # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. 
# (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainRidgeModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:reg_param].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:intercept], options[:validate], options[:convergence_tol]) RidgeRegressionModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/ruby_matrix/matrix_adapter.rb ================================================ require 'matrix' module Spark module Mllib class MatrixAdapter < ::Matrix def self.new(*args) object = self.allocate if args.size == 2 # Matrix is initialized from Matrix # Arguments: rows, column count object.__send__(:original_initialize, *args) else object.__send__(:initialize, *args) end object end alias_method :original_initialize, :initialize def initialize(type, rows, cols, values=nil) case type when :dense values = values.dup if rows * cols == values.size # Values are on one row # 2x2 => [1,2,3,4] values = values.each_slice(cols).to_a else # 2x2 => [[1,2], [3,4]] end when :sparse values = Array.new(rows) { Array.new(cols) { 0.0 } } else raise Spark::MllibError, 'Unknow vector type.' end super(values, cols) end def shape [row_count, column_count] end def values @values || to_a end end end end ================================================ FILE: lib/spark/mllib/ruby_matrix/vector_adapter.rb ================================================ require 'matrix' # Based on ruby 2.1 class Vector def self.elements(array, copy=true) DenseVector.new(convert_to_array(array, copy)) end end module Spark module Mllib class VectorAdapter < ::Vector def self.new(*args) object = self.allocate object.__send__(:initialize, *args) object end def initialize(*args) case args.shift when :dense values = args.shift.dup when :sparse values = [0.0] * args.shift.to_i else raise Spark::MllibError, 'Unknow vector type.' end super(values) end def []=(index, value) @elements[index] = value end def dot(other) if other.is_a?(Spark::Mllib::MatrixBase) other * self else inner_product(other) end end def squared_distance(other) diff = self - other diff.dot(diff) end def values @values || to_a end end end end ================================================ FILE: lib/spark/mllib/stat/distribution.rb ================================================ ## # MultivariateGaussian # # This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In # the event that the covariance matrix is singular, the density will be computed in a # reduced dimensional subspace under which the distribution is supported. # # == Arguments: # mu:: The mean vector of the distribution # sigma:: The covariance matrix of the distribution # Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma) ================================================ FILE: lib/spark/mllib/vector.rb ================================================ module Spark module Mllib module Vectors def self.dense(*args) DenseVector.new(*args) end def self.sparse(*args) SparseVector.new(*args) end def self.parse(data) if data.start_with?('[') && data.end_with?(']') DenseVector.parse(data) elsif data.start_with?('(') && data.end_with?(')') SparseVector.parse(data) else raise ArgumentError, 'Unknow vector.' 
end end def self.to_vector(data) if data.is_a?(SparseVector) || data.is_a?(DenseVector) data elsif data.is_a?(Array) DenseVector.new(data) end end end end end module Spark module Mllib # @abstract Parent for all type of vectors class VectorBase < VectorAdapter end end end module Spark module Mllib ## # A dense vector represented by a value array. # # Dense vector is a vector in which most of the elements are non-zero. # # == Example: # DenseVector.new([1,2,3,4,5]).values # # => [1, 2, 3, 4, 5] # # DenseVector.new(1..5).values # # => [1, 2, 3, 4, 5] # class DenseVector < VectorBase def initialize(values) super(:dense, values.to_a) end # Covert string to vector # # DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]") # def self.parse(data) unless data =~ /\[[0-9., ]+\]/ raise ArgumentError, 'Unknow format for DenseVector.' end data.sub!('[', '') data.sub!(']', '') data = data.split(',') data.map!(&:to_f) DenseVector.new(data) end # Convert vector to string # # DenseVector.new([1,2,3,4,5]).to_s # # => "[1.0,2.0,3.0,4.0,5.0]" # def to_s "[#{values.join(',')}]" end def to_java JDenseVector.new(values) end def self.from_java(object) DenseVector.new(object.values) end def marshal_dump values end def marshal_load(array) initialize(array) end end end end module Spark module Mllib ## # A sparse vector represented by an index array and an value array. # # Sparse vector is a vector in which most of the elements are zero. # # == Example: # SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values # # => [0, 1.0, 0, 5.5] # # SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values # # => [0, 1.0, 0, 5.5] # # SparseVector.new(4, [1, 3], [1.0, 5.5]).values # # => [0, 1.0, 0, 5.5] # class SparseVector < VectorBase attr_reader :indices def initialize(arg1, arg2=nil, arg3=nil) super(:sparse, arg1) if arg2.is_a?(Hash) @indices = arg2.keys @values = arg2.values else @indices = arg2 @values = arg3 end @indices.zip(@values).each do |(index, value)| self[index] = value end end # Covert string to vector # # SparseVector.parse("(5,[1,4],[3.0,5.0])") # def self.parse(data) data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/) if data size = data[1].to_i indices = data[2].split(',') indices.map!(&:to_i) values = data[3].split(',') values.map!(&:to_f) SparseVector.new(size, indices, values) else raise ArgumentError, 'Unknow format for SparseVector.' end end # Convert vector to string # # SparseVector.new(5, {1 => 3, 4 => 5}).to_s # # => "(5,[1,4],[3.0,5.0])" # def to_s "(#{size},[#{indices.join(',')}],[#{values.join(',')}])" end def marshal_dump [size, indices, values] end def marshal_load(array) initialize(array[0], array[1], array[2]) end end end end ================================================ FILE: lib/spark/mllib.rb ================================================ module Spark # MLlib is Spark’s scalable machine learning library consisting of common learning algorithms and utilities, # including classification, regression, clustering, collaborative filtering, dimensionality reduction, # as well as underlying optimization primitives. 
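#
# A minimal usage sketch (assumes a SparkContext in $sc, as in the class examples below):
#
#   Spark::Mllib.import   # copies DenseVector, LabeledPoint, KMeans, ... into Object
#
#   data = [
#     LabeledPoint.new(0.0, [0.0, 1.0]),
#     LabeledPoint.new(1.0, [1.0, 0.0])
#   ]
#   model = LogisticRegressionWithSGD.train($sc.parallelize(data))
#   model.predict([1.0, 0.0])
#   # => 1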
module Mllib extend Spark::Library # Base classes autoload_without_import :VectorBase, 'spark/mllib/vector' autoload_without_import :MatrixBase, 'spark/mllib/matrix' autoload_without_import :RegressionMethodBase, 'spark/mllib/regression/common' autoload_without_import :ClassificationMethodBase, 'spark/mllib/classification/common' # Linear algebra autoload :Vectors, 'spark/mllib/vector' autoload :DenseVector, 'spark/mllib/vector' autoload :SparseVector, 'spark/mllib/vector' autoload :Matrices, 'spark/mllib/matrix' autoload :DenseMatrix, 'spark/mllib/matrix' autoload :SparseMatrix, 'spark/mllib/matrix' # Regression autoload :LabeledPoint, 'spark/mllib/regression/labeled_point' autoload :RegressionModel, 'spark/mllib/regression/common' autoload :LinearRegressionModel, 'spark/mllib/regression/linear' autoload :LinearRegressionWithSGD, 'spark/mllib/regression/linear' autoload :LassoModel, 'spark/mllib/regression/lasso' autoload :LassoWithSGD, 'spark/mllib/regression/lasso' autoload :RidgeRegressionModel, 'spark/mllib/regression/ridge' autoload :RidgeRegressionWithSGD, 'spark/mllib/regression/ridge' # Classification autoload :ClassificationModel, 'spark/mllib/classification/common' autoload :LogisticRegressionWithSGD, 'spark/mllib/classification/logistic_regression' autoload :LogisticRegressionWithLBFGS, 'spark/mllib/classification/logistic_regression' autoload :SVMModel, 'spark/mllib/classification/svm' autoload :SVMWithSGD, 'spark/mllib/classification/svm' autoload :NaiveBayesModel, 'spark/mllib/classification/naive_bayes' autoload :NaiveBayes, 'spark/mllib/classification/naive_bayes' # Clustering autoload :KMeans, 'spark/mllib/clustering/kmeans' autoload :KMeansModel, 'spark/mllib/clustering/kmeans' autoload :GaussianMixture, 'spark/mllib/clustering/gaussian_mixture' autoload :GaussianMixtureModel, 'spark/mllib/clustering/gaussian_mixture' # Stat autoload :MultivariateGaussian, 'spark/mllib/stat/distribution' def self.prepare return if @prepared # if narray? # require 'spark/mllib/narray/vector' # require 'spark/mllib/narray/matrix' # elsif mdarray? # require 'spark/mllib/mdarray/vector' # require 'spark/mllib/mdarray/matrix' # else # require 'spark/mllib/matrix/vector' # require 'spark/mllib/matrix/matrix' # end require 'spark/mllib/ruby_matrix/vector_adapter' require 'spark/mllib/ruby_matrix/matrix_adapter' @prepared = true nil end def self.narray? Gem::Specification::find_all_by_name('narray').any? end def self.mdarray? Gem::Specification::find_all_by_name('mdarray').any? end end end Spark::Mllib.prepare ================================================ FILE: lib/spark/rdd.rb ================================================ module Spark ## # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable, # partitioned collection of elements that can be operated on in parallel. This class contains the # basic operations available on all RDDs, such as `map`, `filter`, and `persist`. 
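#
# == Example:
#   (a rough sketch; assumes a SparkContext in $sc, as in the method examples below)
#
#   rdd = $sc.parallelize(0..5)
#   rdd.map(lambda{|x| x * 2}).filter(lambda{|x| x > 4}).collect
#   # => [6, 8, 10]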
# class RDD extend Forwardable attr_reader :jrdd, :context, :command include Spark::Helper::Logger include Spark::Helper::Parser include Spark::Helper::Statistic def_delegators :@command, :serializer, :deserializer, :libraries, :files # Initializing RDD, this method is root of all Pipelined RDD - its unique # If you call some operations on this class it will be computed in Java # # == Parameters: # jrdd:: org.apache.spark.api.java.JavaRDD # context:: {Spark::Context} # serializer:: {Spark::Serializer} # def initialize(jrdd, context, serializer, deserializer=nil) @jrdd = jrdd @context = context @cached = false @checkpointed = false @command = Spark::CommandBuilder.new(serializer, deserializer) end def inspect comms = @command.commands.join(' -> ') result = %{#<#{self.class.name}:0x#{object_id}} result << %{ (#{comms})} unless comms.empty? result << %{ (cached)} if cached? result << %{\n} result << %{ Serializer: "#{serializer}"\n} result << %{Deserializer: "#{deserializer}"} result << %{>} result end # ============================================================================= # Operators def +(other) self.union(other) end # ============================================================================= # Commad and serializer def add_command(klass, *args) @command.deep_copy.add_command(klass, *args) end # Add ruby library # Libraries will be included before computing # # == Example: # rdd.add_library('pry').add_library('nio4r', 'distribution') # def add_library(*libraries) @command.add_library(*libraries) self end # Bind object to RDD # # == Example: # text = "test" # # rdd = $sc.parallelize(0..5) # rdd = rdd.map(lambda{|x| x.to_s + " " + text}) # rdd = rdd.bind(text: text) # # rdd.collect # # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"] # def bind(objects) unless objects.is_a?(Hash) raise ArgumentError, 'Argument must be a Hash.' end @command.bind(objects) self end def new_rdd_from_command(klass, *args) comm = add_command(klass, *args) PipelinedRDD.new(self, comm) end # ============================================================================= # Variables and non-computing functions def config @context.config end def default_reduce_partitions config['spark.default.parallelism'] || partitions_size end # Count of ParallelCollectionPartition def partitions_size jrdd.rdd.partitions.size end # A unique ID for this RDD (within its SparkContext). def id jrdd.id end # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization. def cache persist('memory_only_ser') end # Set this RDD's storage level to persist its values across operations after the first time # it is computed. This can only be used to assign a new storage level if the RDD does not # have a storage level set yet. # # See StorageLevel for type of new_level # def persist(new_level) @cached = true jrdd.persist(Spark::StorageLevel.java_get(new_level)) self end # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk. # # == Parameters: # blocking:: whether to block until all blocks are deleted. # def unpersist(blocking=true) @cached = false jrdd.unpersist(blocking) self end def cached? @cached end def checkpointed? @checkpointed end # Return the name of this RDD. # def name _name = jrdd.name _name && _name.encode(Encoding::UTF_8) end # Assign a name to this RDD. # def set_name(value) jrdd.setName(value) value end def name=(value) set_name(value) end def to_java marshal = Spark::Serializer.marshal if deserializer.batched? 
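# Reuse the existing batching wrapper, but swap its inner serializer for Marshal
# before the RDD is handed over to the Java side.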
ser = deserializer.deep_copy ser.serializer = marshal else ser = Spark::Serializer.batched(marshal) end rdd = self.reserialize(ser) RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?) end # ============================================================================= # Actions which return value # Return an array that contains all of the elements in this RDD. # RJB raise an error if stage is killed. def collect(as_enum=false) file = Tempfile.new('collect', context.temp_dir) context.set_call_site(caller.first) RubyRDD.writeRDDToFile(jrdd.rdd, file.path) collect_from_file(file, as_enum) rescue => e raise Spark::RDDError, e.message ensure context.clear_call_site end def collect_from_file(file, as_enum=false) if self.is_a?(PipelinedRDD) klass = @command.serializer else klass = @command.deserializer end if as_enum result = klass.load_from_file(file) else result = klass.load_from_io(file).to_a file.close file.unlink end result end # Convert an Array to Hash # def collect_as_hash Hash[collect] end # Take the first num elements of the RDD. # # It works by first scanning one partition, and use the results from # that partition to estimate the number of additional partitions needed # to satisfy the limit. # # == Example: # rdd = $sc.parallelize(0..100, 20) # rdd.take(5) # # => [0, 1, 2, 3, 4] # def take(count) buffer = [] parts_count = self.partitions_size # No parts was scanned, yet last_scanned = -1 while buffer.empty? last_scanned += 1 buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1) end # Assumption. Depend on batch_size and how Spark divided data. items_per_part = buffer.size left = count - buffer.size while left > 0 && last_scanned < parts_count parts_to_take = (left.to_f/items_per_part).ceil parts_for_scanned = Array.new(parts_to_take) do last_scanned += 1 end # We cannot take exact number of items because workers are isolated from each other. # => once you take e.g. 50% from last part and left is still > 0 then its very # difficult merge new items items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned) buffer += items left = count - buffer.size # Average size of all parts items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2} end buffer.slice!(0, count) end # Return the first element in this RDD. # # == Example: # rdd = $sc.parallelize(0..100) # rdd.first # # => 0 # def first self.take(1)[0] end # Reduces the elements of this RDD using the specified lambda or method. # # == Example: # rdd = $sc.parallelize(0..10) # rdd.reduce(lambda{|sum, x| sum+x}) # # => 55 # def reduce(f) _reduce(Spark::Command::Reduce, f, f) end # Aggregate the elements of each partition, and then the results for all the partitions, using a # given associative function and a neutral "zero value". # # The function f(x, y) is allowed to modify x and return it as its result value to avoid # object allocation; however, it should not modify y. # # Be careful, zero_values is applied to all stages. See example. # # == Example: # rdd = $sc.parallelize(0..10, 2) # rdd.fold(1, lambda{|sum, x| sum+x}) # # => 58 # def fold(zero_value, f) self.aggregate(zero_value, f, f) end # Aggregate the elements of each partition, and then the results for all the partitions, using # given combine functions and a neutral "zero value". # # This function can return a different result type. We need one operation for merging. 
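# (seq_op folds the elements of each partition into the zero value; comb_op then
# merges the per-partition results into the final value.)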
# # Result must be an Array otherwise Serializer Array's zero value will be send # as multiple values and not just one. # # == Example: # # 1 2 3 4 5 => 15 + 1 = 16 # # 6 7 8 9 10 => 40 + 1 = 41 # # 16 * 41 = 656 # # seq = lambda{|x,y| x+y} # com = lambda{|x,y| x*y} # # rdd = $sc.parallelize(1..10, 2) # rdd.aggregate(1, seq, com) # # => 656 # def aggregate(zero_value, seq_op, comb_op) _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value) end # Return the max of this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.max # # => 10 # def max self.reduce('lambda{|memo, item| memo > item ? memo : item }') end # Return the min of this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.min # # => 0 # def min self.reduce('lambda{|memo, item| memo < item ? memo : item }') end # Return the sum of this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.sum # # => 55 # def sum self.reduce('lambda{|sum, item| sum + item}') end # Return the number of values in this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.count # # => 11 # def count # nil is for seq_op => it means the all result go directly to one worker for combine @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }') .aggregate(0, nil, 'lambda{|sum, item| sum + item }') end # Return a {Spark::StatCounter} object that captures the mean, variance # and count of the RDD's elements in one operation. def stats @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}') end # Compute the mean of this RDD's elements. # # == Example: # $sc.parallelize([1, 2, 3]).mean # # => 2.0 # def mean stats.mean end # Compute the variance of this RDD's elements. # # == Example: # $sc.parallelize([1, 2, 3]).variance # # => 0.666... # def variance stats.variance end # Compute the standard deviation of this RDD's elements. # # == Example: # $sc.parallelize([1, 2, 3]).stdev # # => 0.816... # def stdev stats.stdev end # Compute the sample standard deviation of this RDD's elements (which # corrects for bias in estimating the standard deviation by dividing by # N-1 instead of N). # # == Example: # $sc.parallelize([1, 2, 3]).sample_stdev # # => 1.0 # def sample_stdev stats.sample_stdev end # Compute the sample variance of this RDD's elements (which corrects # for bias in estimating the variance by dividing by N-1 instead of N). # # == Example: # $sc.parallelize([1, 2, 3]).sample_variance # # => 1.0 # def sample_variance stats.sample_variance end # Compute a histogram using the provided buckets. The buckets # are all open to the right except for the last which is closed. # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50], # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1 # and 50 we would have a histogram of 1,0,1. # # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), # this can be switched from an O(log n) inseration to O(1) per # element(where n = # buckets). # # Buckets must be sorted and not contain any duplicates, must be # at least two elements. # # == Examples: # rdd = $sc.parallelize(0..50) # # rdd.histogram(2) # # => [[0.0, 25.0, 50], [25, 26]] # # rdd.histogram([0, 5, 25, 50]) # # => [[0, 5, 25, 50], [5, 20, 26]] # # rdd.histogram([0, 15, 30, 45, 60]) # # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]] # def histogram(buckets) # ----------------------------------------------------------------------- # Integer # if buckets.is_a?(Integer) # Validation if buckets < 1 raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted." 
end # Filter invalid values # Nil and NaN func = 'lambda{|x| if x.nil? || (x.is_a?(Float) && x.nan?) false else true end }' filtered = self.filter(func) # Compute the minimum and the maximum func = 'lambda{|memo, item| [memo[0] < item[0] ? memo[0] : item[0], memo[1] > item[1] ? memo[1] : item[1]] }' min, max = filtered.map('lambda{|x| [x, x]}').reduce(func) # Min, max must be valid numbers if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?) raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN' end # Already finished if min == max || buckets == 1 return [min, max], [filtered.count] end # Custom range begin span = max - min # increment buckets = (0...buckets).map do |x| min + (x * span) / buckets.to_f end buckets << max rescue NoMethodError raise Spark::RDDError, 'Can not generate buckets with non-number in RDD' end even = true # ----------------------------------------------------------------------- # Array # elsif buckets.is_a?(Array) if buckets.size < 2 raise ArgumentError, 'Buckets should have more than one value.' end if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)} raise ArgumentError, 'Can not have nil or nan numbers in buckets.' end if buckets.detect{|x| buckets.count(x) > 1} raise ArgumentError, 'Buckets should not contain duplicated values.' end if buckets.sort != buckets raise ArgumentError, 'Buckets must be sorted.' end even = false # ----------------------------------------------------------------------- # Other # else raise Spark::RDDError, 'Buckets should be number or array.' end reduce_func = 'lambda{|memo, item| memo.size.times do |i| memo[i] += item[i] end memo }' return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func) end # Applies a function f to all elements of this RDD. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.foreach(lambda{|x| puts x}) # # => nil # def foreach(f, options={}) new_rdd_from_command(Spark::Command::Foreach, f).collect nil end # Applies a function f to each partition of this RDD. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.foreachPartition(lambda{|x| puts x.to_s}) # # => nil # def foreach_partition(f, options={}) new_rdd_from_command(Spark::Command::ForeachPartition, f).collect nil end # ============================================================================= # Transformations of RDD # Return a new RDD by applying a function to all elements of this RDD. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.map(lambda {|x| x*2}).collect # # => [0, 2, 4, 6, 8, 10] # def map(f) new_rdd_from_command(Spark::Command::Map, f) end # Return a new RDD by first applying a function to all elements of this # RDD, and then flattening the results. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.flat_map(lambda {|x| [x, 1]}).collect # # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1] # def flat_map(f) new_rdd_from_command(Spark::Command::FlatMap, f) end # Return a new RDD by applying a function to each partition of this RDD. # # == Example: # rdd = $sc.parallelize(0..10, 2) # rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect # # => [15, 40] # def map_partitions(f) new_rdd_from_command(Spark::Command::MapPartitions, f) end # Return a new RDD by applying a function to each partition of this RDD, while tracking the index # of the original partition. 
# # == Example: # rdd = $sc.parallelize(0...4, 4) # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect # # => [0, 1, 4, 9] # def map_partitions_with_index(f, options={}) new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f) end # Return a new RDD containing only the elements that satisfy a predicate. # # == Example: # rdd = $sc.parallelize(0..10) # rdd.filter(lambda{|x| x.even?}).collect # # => [0, 2, 4, 6, 8, 10] # def filter(f) new_rdd_from_command(Spark::Command::Filter, f) end # Return a new RDD containing non-nil elements. # # == Example: # rdd = $sc.parallelize([1, nil, 2, nil, 3]) # rdd.compact.collect # # => [1, 2, 3] # def compact new_rdd_from_command(Spark::Command::Compact) end # Return an RDD created by coalescing all elements within each partition into an array. # # == Example: # rdd = $sc.parallelize(0..10, 3) # rdd.glom.collect # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]] # def glom new_rdd_from_command(Spark::Command::Glom) end # Return a new RDD that is reduced into num_partitions partitions. # # == Example: # rdd = $sc.parallelize(0..10, 3) # rdd.coalesce(2).glom.collect # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]] # def coalesce(num_partitions) if self.is_a?(PipelinedRDD) deser = @command.serializer else deser = @command.deserializer end new_jrdd = jrdd.coalesce(num_partitions) RDD.new(new_jrdd, context, @command.serializer, deser) end # Return the Cartesian product of this RDD and another one, that is, the # RDD of all pairs of elements `(a, b)` where `a` is in `self` and # `b` is in `other`. # # == Example: # rdd1 = $sc.parallelize([1,2,3]) # rdd2 = $sc.parallelize([4,5,6]) # # rdd1.cartesian(rdd2).collect # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]] # def cartesian(other) _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer) new_jrdd = jrdd.cartesian(other.jrdd) RDD.new(new_jrdd, context, serializer, _deserializer) end # Return a new RDD containing the distinct elements in this RDD. # Ordering is not preserved because of reducing # # == Example: # rdd = $sc.parallelize([1,1,1,2,3]) # rdd.distinct.collect # # => [1, 2, 3] # def distinct self.map('lambda{|x| [x, nil]}') .reduce_by_key('lambda{|x,_| x}') .map('lambda{|x| x[0]}') end # Return a shuffled RDD. # # == Example: # rdd = $sc.parallelize(0..10) # rdd.shuffle.collect # # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5] # def shuffle(seed=nil) seed ||= Random.new_seed new_rdd_from_command(Spark::Command::Shuffle, seed) end # Return the union of this RDD and another one. Any identical elements will appear multiple # times (use .distinct to eliminate them). # # == Example: # rdd = $sc.parallelize([1, 2, 3]) # rdd.union(rdd).collect # # => [1, 2, 3, 1, 2, 3] # def union(other) if self.serializer != other.serializer other = other.reserialize(serializer) end new_jrdd = jrdd.union(other.jrdd) RDD.new(new_jrdd, context, serializer, deserializer) end # Return a new RDD with different serializer. This method is useful during union # and join operations. # # == Example: # rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal") # rdd = rdd.map(lambda{|x| x.to_s}) # rdd.reserialize("oj").collect # # => ["1", "2", "3"] # def reserialize(new_serializer) if serializer == new_serializer return self end new_command = @command.deep_copy new_command.serializer = new_serializer PipelinedRDD.new(self, new_command) end # Return the intersection of this RDD and another one. 
The output will not contain # any duplicate elements, even if the input RDDs did. # # == Example: # rdd1 = $sc.parallelize([1,2,3,4,5]) # rdd2 = $sc.parallelize([1,4,5,6,7]) # rdd1.intersection(rdd2).collect # # => [1, 4, 5] # def intersection(other) mapping_function = 'lambda{|item| [item, nil]}' filter_function = 'lambda{|(key, values)| values.size > 1}' self.map(mapping_function) .cogroup(other.map(mapping_function)) .filter(filter_function) .keys end # Return a copy of the RDD partitioned using the specified partitioner. # # == Example: # rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]}) # rdd.partitionBy(2).glom.collect # # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]] # def partition_by(num_partitions, partition_func=nil) num_partitions ||= default_reduce_partitions partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}' _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func) end # Return a sampled subset of this RDD. Operations are base on Poisson and Uniform # distributions. # TODO: Replace Unfirom for Bernoulli # # == Examples: # rdd = $sc.parallelize(0..100) # # rdd.sample(true, 10).collect # # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96] # # rdd.sample(false, 0.1).collect # # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98] # def sample(with_replacement, fraction, seed=nil) new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed) end # Return a fixed-size sampled subset of this RDD in an array # # == Examples: # rdd = $sc.parallelize(0..100) # # rdd.take_sample(true, 10) # # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54] # # rdd.take_sample(false, 10) # # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32] # def take_sample(with_replacement, num, seed=nil) if num < 0 raise Spark::RDDError, 'Size have to be greater than 0' elsif num == 0 return [] end # Taken from scala num_st_dev = 10.0 # Number of items initial_count = self.count return [] if initial_count == 0 # Create new generator seed ||= Random.new_seed rng = Random.new(seed) # Shuffle elements if requested num if greater than array size if !with_replacement && num >= initial_count return self.shuffle(seed).collect end # Max num max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i if num > max_sample_size raise Spark::RDDError, "Size can not be greate than #{max_sample_size}" end # Approximate fraction with tolerance fraction = compute_fraction(num, initial_count, with_replacement) # Compute first samled subset samples = self.sample(with_replacement, fraction, seed).collect # If the first sample didn't turn out large enough, keep trying to take samples; # this shouldn't happen often because we use a big multiplier for their initial size. index = 0 while samples.size < num log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}") samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect index += 1 end samples.shuffle!(random: rng) samples[0, num] end # Return an RDD created by piping elements to a forked external process. # # == Cmds: # cmd = [env,] command... [,options] # # env: hash # name => val : set the environment variable # name => nil : unset the environment variable # command...: # commandline : command line string which is passed to the standard shell # cmdname, arg1, ... : command name and one or more arguments (This form does # not use the shell. See below for caveats.) # [cmdname, argv0], arg1, ... 
: command name, argv[0] and zero or more arguments (no shell) # options: hash # # See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn # # == Examples: # $sc.parallelize(0..5).pipe('cat').collect # # => ["0", "1", "2", "3", "4", "5"] # # rdd = $sc.parallelize(0..5) # rdd = rdd.pipe('cat', "awk '{print $1*10}'") # rdd = rdd.map(lambda{|x| x.to_i + 1}) # rdd.collect # # => [1, 11, 21, 31, 41, 51] # def pipe(*cmds) new_rdd_from_command(Spark::Command::Pipe, cmds) end # ============================================================================= # Pair functions # Merge the values for each key using an associative reduce function. This will also perform # the merging locally on each mapper before sending results to a reducer, similarly to a # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/ # parallelism level. # # == Example: # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]}) # rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash # # => {"a"=>3, "b"=>2, "c"=>3} # def reduce_by_key(f, num_partitions=nil) combine_by_key('lambda {|x| x}', f, f, num_partitions) end # Generic function to combine the elements for each key using a custom set of aggregation # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a # "combined type" C * Note that V and C can be different -- for example, one might group an # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three # functions: # # == Parameters: # create_combiner:: which turns a V into a C (e.g., creates a one-element list) # merge_value:: to merge a V into a C (e.g., adds it to the end of a list) # merge_combiners:: to combine two C's into a single one. # # == Example: # def combiner(x) # x # end # # def merge(x,y) # x+y # end # # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]}) # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash # # => {"a"=>3, "b"=>2, "c"=>3} # def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil) _combine_by_key( [Spark::Command::CombineByKey::Combine, create_combiner, merge_value], [Spark::Command::CombineByKey::Merge, merge_combiners], num_partitions ) end # Return an RDD of grouped items. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.group_by(lambda{|x| x%2}).collect # # => [[0, [0, 2, 4]], [1, [1, 3, 5]]] # def group_by(f, num_partitions=nil) self.key_by(f).group_by_key(num_partitions) end # Group the values for each key in the RDD into a single sequence. Allows controlling the # partitioning of the resulting key-value pair RDD by passing a Partitioner. # # Note: If you are grouping in order to perform an aggregation (such as a sum or average) # over each key, using reduce_by_key or combine_by_key will provide much better performance. 
# # == Example: # rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]]) # rdd.group_by_key.collect # # => [["a", [1, 2]], ["b", [3]]] # def group_by_key(num_partitions=nil) create_combiner = 'lambda{|item| [item]}' merge_value = 'lambda{|combiner, item| combiner << item; combiner}' merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}' combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions) end # Merge the values for each key using an associative function f # and a neutral `zero_value` which may be added to the result an # arbitrary number of times, and must not change the result # (e.g., 0 for addition, or 1 for multiplication.). # # == Example: # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]]) # rdd.fold_by_key(1, lambda{|x,y| x+y}) # # => [["a", 9], ["c", 6], ["b", 3]] # def fold_by_key(zero_value, f, num_partitions=nil) self.aggregate_by_key(zero_value, f, f, num_partitions) end # Aggregate the values of each key, using given combine functions and a neutral zero value. # # == Example: # def combine(x,y) # x+y # end # # def merge(x,y) # x*y # end # # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2) # rdd.aggregate_by_key(1, method(:combine), method(:merge)) # # => [["b", 3], ["a", 16], ["c", 6]] # def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil) _combine_by_key( [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func], [Spark::Command::CombineByKey::Merge, comb_func], num_partitions ) end # The same functionality as cogroup but this can grouped only 2 rdd's and you # can change num_partitions. # # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]]) # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]]) # rdd1.group_with(rdd2).collect # # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]] # def group_with(other, num_partitions=nil) self.union(other).group_by_key(num_partitions) end # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the # list of values for that key in `this` as well as `other`. # # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]]) # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]]) # rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]]) # rdd1.cogroup(rdd2, rdd3).collect # # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]] # def cogroup(*others) unioned = self others.each do |other| unioned = unioned.union(other) end unioned.group_by_key end # Return each (key, value) pair in self RDD that has no pair with matching # key in other RDD. # # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]]) # rdd2 = $sc.parallelize([["b", 5], ["c", 6]]) # rdd1.subtract_by_key(rdd2).collect # # => [["a", 1], ["a", 2]] # def subtract_by_key(other, num_partitions=nil) create_combiner = 'lambda{|item| [[item]]}' merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}' merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}' self.union(other) .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions) .filter('lambda{|(key,values)| values.size == 1}') .flat_map_values('lambda{|item| item.first}') end # Return an RDD with the elements from self that are not in other. 
# # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]]) # rdd2 = $sc.parallelize([["a", 2], ["c", 6]]) # rdd1.subtract(rdd2).collect # # => [["a", 1], ["b", 3], ["c", 4]] # def subtract(other, num_partitions=nil) mapping_function = 'lambda{|x| [x,nil]}' self.map(mapping_function) .subtract_by_key(other.map(mapping_function), num_partitions) .keys end # Sort the RDD by key # # == Example: # rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]]) # rdd.sort_by_key.collect # # => [["a", 3], ["b", 2], ["c", 1]] # def sort_by_key(ascending=true, num_partitions=nil) self.sort_by('lambda{|(key, _)| key}') end # Sort the RDD by value # # == Example: # rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]]) # rdd.sort_by_value.collect # # => [["b", 1], ["c", 2], ["a", 3]] # def sort_by_value(ascending=true, num_partitions=nil) self.sort_by('lambda{|(_, value)| value}') end # Sorts this RDD by the given key_function # # This is a different implementation than spark. Sort by doesn't use # key_by method first. It can be slower but take less memory and # you can always use map.sort_by_key # # == Example: # rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"]) # # rdd.sort_by.collect # # => ["aaaaaaa", "b", "cc", "ddd", "eeee"] # # rdd.sort_by(lambda{|x| x.size}).collect # # => ["b", "cc", "ddd", "eeee", "aaaaaaa"] # def sort_by(key_function=nil, ascending=true, num_partitions=nil) key_function ||= 'lambda{|x| x}' num_partitions ||= default_reduce_partitions command_klass = Spark::Command::SortByKey # Allow spill data to disk due to memory limit # spilling = config['spark.shuffle.spill'] || false spilling = false memory = '' # Set spilling to false if worker has unlimited memory if memory.empty? spilling = false memory = nil else memory = to_memory_size(memory) end # Sorting should do one worker if num_partitions == 1 rdd = self rdd = rdd.coalesce(1) if partitions_size > 1 return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer) end # Compute boundary of collection # Collection should be evenly distributed # 20.0 is from scala RangePartitioner (for roughly balanced output partitions) count = self.count sample_size = num_partitions * 20.0 fraction = [sample_size / [count, 1].max, 1.0].min samples = self.sample(false, fraction, 1).map(key_function).collect samples.sort! # Reverse is much faster than reverse sort_by samples.reverse! if !ascending # Determine part bounds bounds = determine_bounds(samples, num_partitions) shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions) shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer) end # Creates array of the elements in this RDD by applying function f. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.key_by(lambda{|x| x%2}).collect # # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]] # def key_by(f) new_rdd_from_command(Spark::Command::KeyBy, f) end # Pass each value in the key-value pair RDD through a map function without changing # the keys. This also retains the original RDD's partitioning. 
#
# == Example:
#   rdd = $sc.parallelize(["ruby", "scala", "java"])
#   rdd = rdd.map(lambda{|x| [x, x]})
#   rdd = rdd.map_values(lambda{|x| x.upcase})
#   rdd.collect
#   # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
#
def map_values(f)
  new_rdd_from_command(Spark::Command::MapValues, f)
end

# Pass each value in the key-value pair RDD through a flat_map function
# without changing the keys; this also retains the original RDD's
# partitioning.
#
# == Example:
#   rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
#   rdd = rdd.flat_map_values(lambda{|x| x*2})
#   rdd.collect
#   # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
#
def flat_map_values(f)
  new_rdd_from_command(Spark::Command::FlatMapValues, f)
end

# Return an RDD with the first element (key) of each pair in this PairRDD.
#
# == Example:
#   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
#   rdd.keys.collect
#   # => [1, 3, 5]
#
def keys
  self.map('lambda{|(key, _)| key}')
end

# Return an RDD with the second element (value) of each pair in this PairRDD.
#
# == Example:
#   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
#   rdd.values.collect
#   # => [2, 4, 6]
#
def values
  self.map('lambda{|(_, value)| value}')
end

# Return the list of values in the RDD for key `key`.
# TODO: add Partitioner for efficient searching
#
# == Example:
#   rdd = $sc.parallelize(0..10)
#   rdd = rdd.group_by(lambda {|x| x%3})
#   rdd.lookup(2)
#   # => [[2, 5, 8]]
#
#   rdd = $sc.parallelize(0..10)
#   rdd = rdd.key_by(lambda{|x| x.even?})
#   rdd.lookup(true)
#   # => [0, 2, 4, 6, 8, 10]
#
def lookup(key)
  lookup_key = "lookup_key_#{object_id}"

  self.filter("lambda{|(key, _)| key == #{lookup_key}}")
      .bind(lookup_key => key)
      .values
      .collect
end

# Aliases
alias_method :partitionsSize, :partitions_size
alias_method :defaultReducePartitions, :default_reduce_partitions
alias_method :setName, :set_name
alias_method :addLibrary, :add_library
alias_method :require, :add_library
alias_method :flatMap, :flat_map
alias_method :mapPartitions, :map_partitions
alias_method :mapPartitionsWithIndex, :map_partitions_with_index
alias_method :reduceByKey, :reduce_by_key
alias_method :combineByKey, :combine_by_key
alias_method :groupByKey, :group_by_key
alias_method :groupWith, :group_with
alias_method :partitionBy, :partition_by
alias_method :defaultReducePartitions, :default_reduce_partitions
alias_method :foreachPartition, :foreach_partition
alias_method :mapValues, :map_values
alias_method :takeSample, :take_sample
alias_method :sortBy, :sort_by
alias_method :sortByKey, :sort_by_key
alias_method :keyBy, :key_by
alias_method :groupBy, :group_by
alias_method :foldByKey, :fold_by_key
alias_method :aggregateByKey, :aggregate_by_key
alias_method :subtractByKey, :subtract_by_key
alias_method :sampleStdev, :sample_stdev
alias_method :sampleVariance, :sample_variance

private

# This is the base method for reduce operations. It is used by reduce, fold and aggregate.
# The only difference is that fold has a zero value.
#
def _reduce(klass, seq_op, comb_op, zero_value=nil)
  if seq_op.nil?
    # Partitions are already reduced
    rdd = self
  else
    rdd = new_rdd_from_command(klass, seq_op, zero_value)
  end

  # Send all results to one worker and combine results
  rdd = rdd.coalesce(1).compact

  # Add the same function to new RDD
  comm = rdd.add_command(klass, comb_op, zero_value)
  comm.deserializer = @command.serializer

  # Value is returned in array
  PipelinedRDD.new(rdd, comm).collect[0]
end

def _partition_by(num_partitions, klass, *args)
  # The RDD is transformed from [key, value] to [hash, [key, value]]
  keyed = new_rdd_from_command(klass, *args)
  keyed.serializer.unbatch!
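  # Unbatching appears to be necessary here: the JVM-side partitioner below
  # hashes one serialized element at a time, so [hash, [key, value]] items
  # must be written to the stream individually rather than in batches.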
# PairwiseRDD and PythonPartitioner are borrowed from Python # but works great on ruby too pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD partitioner = PythonPartitioner.new(num_partitions, args.first.object_id) new_jrdd = pairwise_rdd.partitionBy(partitioner).values # Reset deserializer RDD.new(new_jrdd, context, @command.serializer, keyed.serializer) end # For using a different combine_by_key # # == Used for: # * combine_by_key # * fold_by_key (with zero value) # def _combine_by_key(combine, merge, num_partitions) num_partitions ||= default_reduce_partitions # Combine key combined = new_rdd_from_command(combine.shift, *combine) # Merge items shuffled = combined.partition_by(num_partitions) merge_comm = shuffled.add_command(merge.shift, *merge) PipelinedRDD.new(shuffled, merge_comm) end end # Pipelined Resilient Distributed Dataset, operations are pipelined and sended to worker # # RDD # `-- map # `-- map # `-- map # # Code is executed from top to bottom # class PipelinedRDD < RDD attr_reader :prev_jrdd, :command def initialize(prev, command) if prev.is_a?(PipelinedRDD) && prev.pipelinable? # Second, ... stages @prev_jrdd = prev.prev_jrdd else # First stage @prev_jrdd = prev.jrdd end @cached = false @checkpointed = false @context = prev.context @command = command end def pipelinable? !(cached? || checkpointed?) end # Serialization necessary things and sent it to RubyRDD (scala extension) def jrdd @jrdd ||= _jrdd end private def _jrdd command = @command.build broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast)) ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator) ruby_rdd.asJavaRDD end end end ================================================ FILE: lib/spark/sampler.rb ================================================ require 'distribution' # Random Generators module Spark module RandomGenerator class Poisson def initialize(mean, seed) generator = Random.new(seed) @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator) end def rand t = 0.0 number = 0 loop{ t += @exp_rng.call if t > 1 return number end number += 1 } end end end end # Samplers module Spark module Sampler class Base attr_reader :fraction, :seed def initialize(fraction, seed=nil) @fraction = fraction @seed = seed || Random.new_seed end end # Poisson Sampler # ------------------------------------------------------------------------- class Poisson < Base def sample(iterator) iterator.map! do |item| count = rng.rand Array.new(count) { item } end iterator.flatten! iterator.compact! iterator end def lazy_sample(iterator) Enumerator::Lazy.new(iterator) do |yielder, value| count = rng.rand count.times { yielder << value } end end def rng @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed) end end # Uniform Sampler # ------------------------------------------------------------------------- class Uniform < Base def sample(iterator) iterator.select!{|item| rng.rand <= fraction} iterator end def lazy_sample(iterator) iterator.select do |item| rng.rand <= fraction end end def rng @rng ||= Random.new(seed) end end end end ================================================ FILE: lib/spark/serializer/auto_batched.rb ================================================ module Spark module Serializer ## # AutoBatched serializator # # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer. 
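# The batch starts at two items and doubles after every chunk whose serialized
# size is still below `best_size`; once a chunk grows past best_size * MAX_RATIO,
# the batch is halved again. A rough usage sketch (assuming the default 64 kB
# target size):
#
#   serializer = Spark::Serializer.build { auto_batched(marshal) }
#   serializer.dump_to_io(data, io)   # writes self-sizing batches to io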
# class AutoBatched < Batched MAX_RATIO = 10 def initialize(serializer, best_size=65536) @serializer = serializer @best_size = best_size.to_i error('Batch size must be greater than 1') if @best_size < 2 end def batched? true end def unbatch! end def name "AutoBatched(#{@best_size})" end def dump_to_io(data, io) check_each(data) # Only Array have .slice data = data.to_a index = 0 batch = 2 max = @best_size * MAX_RATIO loop do chunk = data.slice(index, batch) if chunk.nil? || chunk.empty? break end serialized = @serializer.dump(chunk) io.write_string(serialized) index += batch size = serialized.bytesize if size < @best_size batch *= 2 elsif size > max && batch > 1 batch /= 2 end end io.flush end end end end Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched) ================================================ FILE: lib/spark/serializer/base.rb ================================================ module Spark module Serializer # @abstract Parent for all serializers class Base def load_from_io(io) return to_enum(__callee__, io) unless block_given? loop do size = io.read_int_or_eof break if size == Spark::Constant::DATA_EOF yield load(io.read(size)) end end def load_from_file(file, *args) return to_enum(__callee__, file, *args) unless block_given? load_from_io(file, *args).each do |item| yield item end file.close file.unlink end def ==(other) self.to_s == other.to_s end def batched? false end def unbatch! end def check_each(data) unless data.respond_to?(:each) error('Data must be iterable.') end end def error(message) raise Spark::SerializeError, message end def name self.class.name.split('::').last end def to_s name end def inspect %{#} end end end end ================================================ FILE: lib/spark/serializer/batched.rb ================================================ module Spark module Serializer class Batched < Base attr_writer :serializer def initialize(serializer, batch_size=nil) batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE @serializer = serializer @batch_size = batch_size.to_i error('Batch size must be greater than 0') if @batch_size < 1 end # Really batched def batched? @batch_size > 1 end def unbatch! @batch_size = 1 end def load(data) @serializer.load(data) end def dump(data) @serializer.dump(data) end def name "Batched(#{@batch_size})" end def to_s "#{name} -> #{@serializer}" end # === Dump ============================================================== def dump_to_io(data, io) check_each(data) if batched? data = data.each_slice(@batch_size) end data.each do |item| serialized = dump(item) io.write_string(serialized) end io.flush end # === Load ============================================================== def load_from_io(io) return to_enum(__callee__, io) unless block_given? loop do size = io.read_int_or_eof break if size == Spark::Constant::DATA_EOF data = io.read(size) data = load(data) if batched? 
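          # a batched chunk deserializes to an Array, so emit its items one by one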
data.each{|item| yield item } else yield data end end end end end end Spark::Serializer.register('batched', Spark::Serializer::Batched) ================================================ FILE: lib/spark/serializer/cartesian.rb ================================================ module Spark module Serializer class Cartesian < Pair def aggregate(item1, item2) item1.product(item2) end end end end Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian) ================================================ FILE: lib/spark/serializer/compressed.rb ================================================ module Spark module Serializer class Compressed < Base def initialize(serializer) @serializer = serializer end def dump(data) Zlib::Deflate.deflate(@serializer.dump(data)) end def load(data) @serializer.load(Zlib::Inflate.inflate(data)) end end end end begin # TODO: require only if it is necessary require 'zlib' Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed) rescue LoadError end ================================================ FILE: lib/spark/serializer/marshal.rb ================================================ module Spark module Serializer class Marshal < Base def dump(data) ::Marshal.dump(data) end def load(data) ::Marshal.load(data) end end end end Spark::Serializer.register('marshal', Spark::Serializer::Marshal) ================================================ FILE: lib/spark/serializer/message_pack.rb ================================================ module Spark module Serializer class MessagePack < Base def dump(data) ::MessagePack.dump(data) end def load(data) ::MessagePack.load(data) end end end end begin # TODO: require only if it is necessary require 'msgpack' Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack) rescue LoadError end ================================================ FILE: lib/spark/serializer/oj.rb ================================================ module Spark module Serializer class Oj < Base def dump(data) ::Oj.dump(data) end def load(data) ::Oj.load(data) end end end end begin # TODO: require only if it is necessary require 'oj' Spark::Serializer.register('oj', Spark::Serializer::Oj) rescue LoadError end ================================================ FILE: lib/spark/serializer/pair.rb ================================================ module Spark module Serializer class Pair < Base def initialize(serializer1, serializer2) @serializer1 = serializer1 @serializer2 = serializer2 end def to_s "#{name}(#{@serializer1}, #{@serializer2})" end def aggregate(item1, item2) item1.zip(item2) end def load_from_io(io) return to_enum(__callee__, io) unless block_given? loop do size = io.read_int_or_eof break if size == Spark::Constant::DATA_EOF item1 = @serializer1.load(io.read(size)) item2 = @serializer2.load(io.read_string) item1 = [item1] unless @serializer1.batched? item2 = [item2] unless @serializer2.batched? 
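        # both sides are Arrays at this point; #aggregate zips them into pairs
        # (the Cartesian subclass overrides #aggregate to build a product instead)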
aggregate(item1, item2).each do |item| yield item end end end end end end Spark::Serializer.register('pair', Spark::Serializer::Pair) ================================================ FILE: lib/spark/serializer/text.rb ================================================ module Spark module Serializer class Text < Base attr_reader :encoding def initialize(encoding=Encoding::UTF_8) error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding) @encoding = encoding end def load(data) data.to_s.force_encoding(@encoding) end def to_s "Text(#{@encoding})" end end end end Spark::Serializer.register('string', 'text', Spark::Serializer::Text) ================================================ FILE: lib/spark/serializer.rb ================================================ module Spark ## # Serializer # module Serializer DEFAULT_COMPRESS = false DEFAULT_BATCH_SIZE = 1024 DEFAULT_SERIALIZER_NAME = 'marshal' @@registered = {} # Register class and create method for quick access. # Class will be available also as __name__ for using # in build method (Proc binding problem). # # == Examples: # register('test1', 'test2', Class) # # Spark::Serializer.test1 # Spark::Serializer.test2 # # # Proc binding problem # build { marshal } # => Spark::Serializer::Marshal # # marshal = 1 # build { marshal } # => 1 # # build { __marshal__ } # => Spark::Serializer::Marshal # def self.register(*args) klass = args.pop args.each do |arg| @@registered[arg] = klass define_singleton_method(arg.to_sym){|*args| klass.new(*args) } define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) } end end def self.find(name) @@registered[name.to_s.downcase] end def self.find!(name) klass = find(name) if klass.nil? raise Spark::SerializeError, "Unknow serializer #{name}." end klass end def self.build(text=nil, &block) if block_given? class_eval(&block) else class_eval(text.to_s.downcase) end end end end # Parent require 'spark/serializer/base' # Basic require 'spark/serializer/oj' require 'spark/serializer/marshal' require 'spark/serializer/message_pack' require 'spark/serializer/text' # Others require 'spark/serializer/batched' require 'spark/serializer/auto_batched' require 'spark/serializer/compressed' require 'spark/serializer/pair' require 'spark/serializer/cartesian' ================================================ FILE: lib/spark/sort.rb ================================================ module Spark module InternalSorter class Base def initialize(key_function) @key_function = key_function end end class Ascending < Base def sort(data) data.sort_by!(&@key_function) end end class Descending < Ascending def sort(data) super data.reverse! end end def self.get(ascending, key_function) if ascending type = Ascending else type = Descending end type.new(key_function) end end end module Spark class ExternalSorter include Spark::Helper::System # Items from GC cannot be destroyed so #make_parts need some reserve MEMORY_RESERVE = 50 # % # How big will be chunk for adding new memory because GC not cleaning # immediately un-referenced variables MEMORY_FREE_CHUNK = 10 # % # How many items will be evaluate from iterator at start START_SLICE_SIZE = 10 # Maximum of slicing. Memory control can be avoided by large value. MAX_SLICE_SIZE = 10_000 # How many values will be taken from each enumerator. 
EVAL_N_VALUES = 10 # Default key function KEY_FUNCTION = lambda{|item| item} attr_reader :total_memory, :memory_limit, :memory_chunk, :serializer def initialize(total_memory, serializer) @total_memory = total_memory @memory_limit = total_memory * (100-MEMORY_RESERVE) / 100 @memory_chunk = total_memory * (100-MEMORY_FREE_CHUNK) / 100 @serializer = serializer end def add_memory! @memory_limit += memory_chunk end def sort_by(iterator, ascending=true, key_function=KEY_FUNCTION) return to_enum(__callee__, iterator, key_function) unless block_given? create_temp_folder internal_sorter = Spark::InternalSorter.get(ascending, key_function) # Make N sorted enumerators parts = make_parts(iterator, internal_sorter) return [] if parts.empty? # Need new key function because items have new structure # From: [1,2,3] to [[1, Enumerator],[2, Enumerator],[3, Enumerator]] key_function_with_enum = lambda{|(key, _)| key_function[key]} internal_sorter = Spark::InternalSorter.get(ascending, key_function_with_enum) heap = [] enums = [] # Load first items to heap parts.each do |part| EVAL_N_VALUES.times { begin heap << [part.next, part] rescue StopIteration break end } end # Parts can be empty but heap not while parts.any? || heap.any? internal_sorter.sort(heap) # Since parts are sorted and heap contains EVAL_N_VALUES method # can add EVAL_N_VALUES items to the result EVAL_N_VALUES.times { break if heap.empty? item, enum = heap.shift enums << enum yield item } # Add new element to heap from part of which was result item while (enum = enums.shift) begin heap << [enum.next, enum] rescue StopIteration parts.delete(enum) enums.delete(enum) end end end ensure destroy_temp_folder end private def create_temp_folder @dir = Dir.mktmpdir end def destroy_temp_folder FileUtils.remove_entry_secure(@dir) if @dir end # New part is created when current part exceeds memory limit (is variable) # Every new part have more memory because of ruby GC def make_parts(iterator, internal_sorter) slice = START_SLICE_SIZE parts = [] part = [] loop do begin # Enumerator does not have slice method slice.times { part << iterator.next } rescue StopIteration break end # Carefully memory_limit is variable if memory_usage > memory_limit # Sort current part with origin key_function internal_sorter.sort(part) # Tempfile for current part # will be destroyed on #destroy_temp_folder file = Tempfile.new("part", @dir) serializer.dump(part, file) # Peek is at the end of file file.seek(0) parts << serializer.load(file) # Some memory will be released but not immediately # need some new memory for start part.clear add_memory! else slice = [slice*2, MAX_SLICE_SIZE].min end end # Last part which is not in the file if part.any? internal_sorter.sort(part) parts << part.each end parts end end # ExternalSorter end # Spark ================================================ FILE: lib/spark/sql/column.rb ================================================ module Spark module SQL class Column # ============================================================================= # Creating def self.to_java(col) if col.is_a?(Column) col.jcolumn else from_name(col) end end def self.from_literal(literal) JSQLFunctions.lit(literal) end def self.from_name(name) JSQLFunctions.col(name) end # ============================================================================= # Functions for virtual columns # Evaluates a list of conditions and returns one of multiple possible result expressions. # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions. 
# # == Parameters: # condition:: a boolean {Column} expression # value:: a literal value, or a {Column} expression # # == Example: # df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() # # [Row(age=3), Row(age=4)] # # df.select(when(df.age == 2, df.age + 1).alias("age")).collect() # # [Row(age=3), Row(age=nil)] # def self.when(condition, value) Column.new(JSQLFunctions).when(condition, value) end # ============================================================================= # Initialized column attr_reader :jcolumn def initialize(jcolumn) @jcolumn = jcolumn end FUNC_OPERATORS = { '!' => 'not', '~' => 'negate', '-@' => 'negate' } BIN_OPERATORS = { '[]' => 'apply', '+' => 'plus', '-' => 'minus', '*' => 'multiply', '/' => 'divide', '%' => 'mod', '==' => 'equalTo', '!=' => 'notEqual', '<' => 'lt', '<=' => 'leq', '>' => 'gt', '>=' => 'geq', '&' => 'and', '|' => 'or', 'like' => 'like', 'starts_with' => 'startsWith', 'ends_with' => 'endsWith', 'bitwiseOR' => 'bitwiseOR', 'bitwiseAND' => 'bitwiseAND', 'bitwiseXOR' => 'bitwiseXOR', } UNARY_OPERATORS = { 'asc' => 'asc', 'desc' => 'desc', 'is_null' => 'isNull', 'is_not_null' => 'isNotNull' } FUNC_OPERATORS.each do |op, func| eval <<-METHOD def #{op} func_op('#{func}') end METHOD end BIN_OPERATORS.each do |op, func| eval <<-METHOD def #{op}(item) bin_op('#{func}', item) end METHOD end UNARY_OPERATORS.each do |op, func| eval <<-METHOD def #{op} unary_op('#{func}') end METHOD end # An expression that gets an item at position ordinal out of a list, # or gets an item by key out of a Hash. # # == Example: # df.select(df.l.get_item(0), df.d.get_item("key")).show # # +----+------+ # # |l[0]|d[key]| # # +----+------+ # # | 1| value| # # +----+------+ # # df.select(df.l[0], df.d["key"]).show # # +----+------+ # # |l[0]|d[key]| # # +----+------+ # # | 1| value| # # +----+------+ # def get_item(key) self[key] end # An expression that gets a field by name in a StructField. # # == Example: # df.select(df.r.get_field("b")).show # # +----+ # # |r[b]| # # +----+ # # | b| # # +----+ # # df.select(df.r.a).show # # +----+ # # |r[a]| # # +----+ # # | 1| # # +----+ # def get_field(name) self[name] end # Return a {Column} which is a substring of the column. # # == Parameters: # start:: start position (Integer or Column) # length:: length of the substring (Integer or Column) # # == Example: # df.select(df.name.substr(1, 3).alias("col")).collect # # => [#, #] # def substr(start, length) if start.is_a?(Integer) && length.is_a?(Integer) new_jcolumn = jcolumn.substr(start, length) elsif start.is_a?(Column) && length.is_a?(Column) new_jcolumn = jcolumn.substr(start.jcolumn, length.jcolumn) else raise ArgumentError, "Unsupported type: #{start.class} and #{length.class}." end Column.new(new_jcolumn) end # A boolean expression that is evaluated to true if the value of this # expression is contained by the evaluated values of the arguments. # # == Example: # df[df.name.isin("Bob", "Mike")].collect # # => [#] # # df[df.age.isin(1, 2, 3)].collect # # => [#] # def isin(*cols) if cols.size == 1 && cols.first.is_a?(Array) cols = cols.first end cols = cols.map do |col| Column.from_literal(col) end new_jcolumn = jcolumn.isin(Spark.jb.to_seq(cols)) Column.new(new_jcolumn) end # Returns this column aliased with a new name or names (in the case of expressions that # return more than one column, such as explode). 
# # == Example: # df.select(df.age.alias("age2")).collect # # => [#, #] # def alias(name) Column.new(jcolumn.as(name)) end # Convert the column into type data_type. # # == Example: # df.select(df.age.cast("string").alias('ages')).collect # # => [#, #] # # df.select(df.age.cast(StringType.new).alias('ages')).collect # # => [#, #] # def cast(data_type) case data_type when String new_jcolumn = jcolumn.cast(data_type) when DataType jdata_type = JDataType.fromJson(data_type.json) new_jcolumn = jcolumn.cast(jdata_type) else raise ArgumentError, "Unsupported type: #{data_type.class}" end Column.new(new_jcolumn) end # A boolean expression that is evaluated to true if the value of this # expression is between the given columns. # # == Example: # df.select(df.name, df.age.between(2, 4)).show # # +-----+--------------------------+ # # | name|((age >= 2) && (age <= 4))| # # +-----+--------------------------+ # # |Alice| true| # # | Bob| false| # # +-----+--------------------------+ # def between(lower, upper) (self >= lower) & (self <= upper) end # Evaluates a list of conditions and returns one of multiple possible result expressions. # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions. # # == Parameters: # condition:: a boolean {Column} expression. # value:: a literal value, or a {Column} expression. # # == Example: # df.select(df.name, Column.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show # # +-----+--------------------------------------------------------+ # # | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0| # # +-----+--------------------------------------------------------+ # # |Alice| -1| # # | Bob| 1| # # +-----+--------------------------------------------------------+ # def when(condition, value) unless condition.is_a?(Column) raise ArgumentError, "Condition must be a Column" end if value.is_a?(Column) value = value.jcolumn end new_jcolumn = jcolumn.when(condition.jcolumn, value) Column.new(new_jcolumn) end # Evaluates a list of conditions and returns one of multiple possible result expressions. # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions. 
# # == Example: # df.select(df.name, Column.when(df.age > 3, 1).otherwise(0)).show # # +-----+---------------------------------+ # # | name|CASE WHEN (age > 3) THEN 1 ELSE 0| # # +-----+---------------------------------+ # # |Alice| 0| # # | Bob| 1| # # +-----+---------------------------------+ # def otherwise(value) if value.is_a?(Column) value = value.jcolumn end new_jcolumn = jcolumn.otherwise(value) Column.new(new_jcolumn) end def over(*) raise Spark::NotImplemented end def method_missing(method, item) get_field(item) end def to_s "Column(\"#{jcolumn.toString}\")" end def inspect "#<#{to_s}>" end alias_method :as, :alias alias_method :slice, :substr alias_method :astype, :cast private def func_op(name) new_jcolumn = JSQLFunctions.__send__(name, jcolumn) Column.new(new_jcolumn) end def bin_op(name, item) if item.is_a?(Column) col = item.jcolumn else col = item end new_jcolumn = jcolumn.__send__(name, col) Column.new(new_jcolumn) end def unary_op(name) new_jcolumn = jcolumn.__send__(name) Column.new(new_jcolumn) end end end end ================================================ FILE: lib/spark/sql/context.rb ================================================ module Spark module SQL class Context attr_reader :spark_context, :jsql_context def initialize(spark_context) @spark_context = spark_context @jsql_context = JSQLContext.new(spark_context.sc) end def read DataFrameReader.new(self) end end end end ================================================ FILE: lib/spark/sql/data_frame.rb ================================================ module Spark module SQL ## # Spark::SQL::DataFrame # # All example are base on people.json # class DataFrame attr_reader :jdf, :sql_context def initialize(jdf, sql_context) @jdf = jdf @sql_context = sql_context end # Returns the column as a {Column}. # # == Examples: # df.select(df['age']).collect # # => [#2}>, #5}>] # # df[ ["name", "age"] ].collect # # => [#"Alice", "age"=>2}>, #"Bob", "age"=>5}>] # # df[ df.age > 3 ].collect # # => [#5, "name"=>"Bob"}>] # # df[df[0] > 3].collect # # => [#5, "name"=>"Bob"}>] # def [](item) case item when String jcolumn = jdf.apply(item) Column.new(jcolumn) when Array select(*item) when Numeric jcolumn = jdf.apply(columns[item]) Column.new(jcolumn) when Column where(item) else raise ArgumentError, "Unsupported type: #{item.class}" end end # Returns all column names as a Array. # # == Example: # df.columns # # => ['age', 'name'] # def columns schema.fields.map(&:name) end # Returns the schema of this {DataFrame} as a {StructType}. def schema return @schema if @schema begin @schema = DataType.parse(JSON.parse(jdf.schema.json)) rescue => e raise Spark::ParseError, 'Unable to parse datatype from schema' end end def show_string(n=20, truncate=true) jdf.showString(n, truncate) end # Prints the first n rows to the console. # # == Parameters: # n:: Number of rows to show. # truncate:: Whether truncate long strings and align cells right. # def show(n=20, truncate=true) puts show_string(n, truncate) end # Prints out the schema in the tree format. # # == Example: # df.print_schema # # root # # |-- age: integer (nullable = true) # # |-- name: string (nullable = true) # def print_schema puts jdf.schema.treeString end def explain(extended=false) if extended jdf.queryExecution.toString else jdf.queryExecution.executedPlan.toString end end # Prints the (logical and physical) plans to the console for debugging purpose. 
# # == Example: # df.print_explain # # Scan PhysicalRDD[age#0,name#1] # # df.print_explain(true) # # == Parsed Logical Plan == # # ... # # == Analyzed Logical Plan == # # ... # # == Optimized Logical Plan == # # ... # # == Physical Plan == # # ... # def print_explain(extended=false) puts explain(extended) end # Returns all column names and their data types as a list. # # == Example: # df.dtypes # # => [('age', 'int'), ('name', 'string')] # def dtypes schema.fields.map do |field| [field.name, field.data_type.simple_string] end end def inspect types = dtypes.map do |(name, type)| "#{name}: #{type}" end "#" end # Get column by name def method_missing(method, *args, &block) name = method.to_s if columns.include?(name) self[name] else super end end # ============================================================================= # Collect # Returns all the records as a list of {Row}. # # == Example: # df.collect # # => [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] # def collect Spark.jb.call(jdf, 'collect') end def collect_as_hash result = collect result.map!(&:to_h) result end def values result = collect result.map! do |item| item.to_h.values end result end # Returns the number of rows in this {DataFrame}. def count jdf.count.to_i end # Returns the first num rows as an Array of {Row}. def take(num) limit(num).collect end # Return first {Row}. def first take(1).first end # ============================================================================= # Queries # Projects a set of expressions and returns a new {DataFrame} # # == Parameters: # *cols:: # List of column names (string) or expressions {Column}. # If one of the column names is '*', that column is expanded to include all columns # in the current DataFrame. # # == Example: # df.select('*').collect # # => [#2, "name"=>"Alice"}>, #5, "name"=>"Bob"}>] # # df.select('name', 'age').collect # # => [#"Alice", "age"=>2}>, #"Bob", "age"=>5}>] # # df.select(df.name, (df.age + 10).alias('age')).collect # # => [#"Alice", "age"=>12}>, #"Bob", "age"=>15}>] # def select(*cols) jcols = cols.map do |col| Column.to_java(col) end new_jdf = jdf.select(jcols) DataFrame.new(new_jdf, sql_context) end # Filters rows using the given condition. # # == Examples: # df.filter(df.age > 3).collect # # => [#5, "name"=>"Bob"}>] # # df.where(df.age == 2).collect # # => [#2, "name"=>"Alice"}>] # # df.filter("age > 3").collect # # => [#5, "name"=>"Bob"}>] # # df.where("age = 2").collect # # => [#2, "name"=>"Alice"}>] # def filter(condition) case condition when String new_jdf = jdf.filter(condition) when Column new_jdf = jdf.filter(condition.jcolumn) else raise ArgumentError, 'Condition must be String or Column' end DataFrame.new(new_jdf, sql_context) end # Limits the result count to the number specified. def limit(num) new_jdf = jdf.limit(num) DataFrame.new(new_jdf, sql_context) end alias_method :where, :filter end end end ================================================ FILE: lib/spark/sql/data_frame_reader.rb ================================================ module Spark module SQL class DataFrameReader attr_reader :sql_context, :jreader def initialize(sql_context) @sql_context = sql_context @jreader = sql_context.jsql_context.read end def df(jdf) DataFrame.new(jdf, sql_context) end # Specifies the input data source format. # Parameter is name of the data source, e.g. 'json', 'parquet'. def format(source) jreader.format(source) self end # Adds an input option for the underlying data source. 
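# == Example (illustrative; available options depend on the data source):
#   sql.read.option('samplingRatio', '0.5').json('people.json')
#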
def option(key, value) jreader.option(key, value.to_s) self end # Adds input options for the underlying data source. def options(options) options.each do |key, value| jreader.option(key, value.to_s) end self end # Loads data from a data source and returns it as a :class`DataFrame`. # # == Parameters: # path:: Optional string for file-system backed data sources. # format:: Optional string for format of the data source. Default to 'parquet'. # schema:: Optional {StructType} for the input schema. # options:: All other string options. # def load(path=nil, new_format=nil, new_schema=nil, new_options=nil) new_format && format(new_format) new_schema && schema(new_schema) new_options && options(new_options) if path.nil? df(jreader.load) else df(jreader.load(path)) end end # Specifies the input schema. # # Some data sources (e.g. JSON) can infer the input schema automatically from data. # By specifying the schema here, the underlying data source can skip the schema # inference step, and thus speed up data loading. # # Parameter schema must be StructType object. # def schema(new_schema) unless new_schema.is_a?(StructType) raise ArgumentError, 'Schema must be a StructType.' end jschema = sql_context.jsql_context.parseDataType(new_schema.json) jreader.schema(jschema) self end # Loads a JSON file (one object per line) and returns the result as {DataFrame} # # If the schema parameter is not specified, this function goes # through the input once to determine the input schema. # # == Parameters: # path:: string, path to the JSON dataset # schema:: an optional {StructType} for the input schema. # # == Example: # df = sql.read.json('people.json') # df.dtypes # # => [('age', 'bigint'), ('name', 'string')] # def json(path, new_schema=nil) # ClassNotFoundException: Failed to load class for data source: json # df(jreader.json(path)) load(path, 'org.apache.spark.sql.execution.datasources.json', new_schema) end end end end ================================================ FILE: lib/spark/sql/data_type.rb ================================================ module Spark module SQL ## # Spark::SQL::DataType # class DataType cattr_accessor :atomic_types self.atomic_types = {} cattr_accessor :complex_types self.complex_types = {} def self.parse(data) if data.is_a?(Hash) type = data['type'] if complex_types.has_key?(type) complex_types[type].from_json(data) # elsif type == 'udt' # UserDefinedType.from_json(data) else raise Spark::SQLError, "Unsupported type: #{type}" end else if atomic_types.has_key?(data) atomic_types[data].new else raise Spark::SQLError, "Unsupported type: #{type}" end end end def self.class_name name.split('::').last end def self.type_name class_name.sub('Type', '').downcase end def self.complex complex_types[type_name] = self end def self.atomic atomic_types[type_name] = self end def ==(other) self.class == other.class && self.to_s == other.to_s end def type_name self.class.type_name end def simple_string type_name end def json_value type_name end def json json_value.to_json end def to_s self.class.class_name end def inspect "#<#{to_s}>" end end ## # Spark::SQL::StructType # # Struct type, consisting of a list of {StructField}. # This is the data type representing a {Row}. 
# # == Example: # struct1 = StructType.new([StructField.new('f1', StringType.new, true)]) # struct2 = StructType.new([StructField.new('f2', StringType.new, true)]) # struct1 == struct2 # # => true # class StructType < DataType complex attr_reader :fields def self.from_json(json) fields = json['fields'].map do |field| StructField.from_json(field) end StructType.new(fields) end def initialize(fields=[]) @fields = fields @names = fields.map(&:name) end def json_value { 'type' => type_name, 'fields' => fields.map(&:json_value) } end def to_s "StructType(#{fields.join(', ')})" end end ## # Spark::SQL::StructField # class StructField < DataType attr_reader :name, :data_type, :nullable, :metadata def self.from_json(json) StructField.new(json['name'], DataType.parse(json['type']), json['nullable'], json['metadata']) end # A field in {StructType}. # # == Parameters: # name:: string, name of the field. # data_type:: {DataType} of the field. # nullable:: boolean, whether the field can be null (nil) or not. # metadata:: a dict from string to simple type that can be to_internald to JSON automatically # # == Example: # f1 = StructField.new('f1', StringType.new, true) # f2 = StructField.new('f2', StringType.new, true) # f1 == f2 # # => true # def initialize(name, data_type, nullable=true, metadata={}) @name = name @data_type = data_type @nullable = nullable @metadata = metadata end def json_value { 'name' => name, 'type' => data_type.json_value, 'nullable' => nullable, 'metadata' => metadata, } end def to_s %{StructField(#{name}, #{data_type}, #{nullable})} end end ## # Spark::SQL::AtomicType # # An internal type used to represent everything that is not # null, UDTs, arrays, structs, and maps. # class AtomicType < DataType end ## # Spark::SQL::BooleanType # # Boolean data type. # class BooleanType < AtomicType atomic end ## # Spark::SQL::NumericType # # Numeric data types. # class NumericType < AtomicType end ## # Spark::SQL::IntegralType # # Integral data types. # class IntegralType < NumericType end ## # Spark::SQL::StringType # # String data type. # class StringType < AtomicType atomic end ## # Spark::SQL::LongType # # Long data type, i.e. a signed 64-bit integer. # # If the values are beyond the range of [-9223372036854775808, 9223372036854775807], # please use {DecimalType}. # class LongType < IntegralType atomic end end end ================================================ FILE: lib/spark/sql/row.rb ================================================ module Spark module SQL ## # Spark::SQL::Row # class Row attr_reader :data def self.from_java(object, with_schema=true) if with_schema fields = object.schema.fieldNames else # Create virtual schema (t0, t1, t2, ...) 
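          # (schema-less rows are not supported yet; generated names like t0, t1, ...
          #  would be needed before the row could be converted to a Hash)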
raise Spark::NotImplemented, 'Row must have a schema' end if object.anyNull data = {} object.size.times do |i| if object.isNullAt(i) value = nil else value = Spark.jb.to_ruby(object.get(i)) end data[ fields[i] ] = value end else data = fields.zip(Spark.jb.to_ruby(object.values)) end Row.new(data) end def initialize(data={}) @data = data.to_h end def [](item) @data[item] end def to_h @data end def inspect formated = data.map do |key, value| "#{key}: \"#{value}\"" end %{#} end end end end ================================================ FILE: lib/spark/sql.rb ================================================ module Spark module SQL extend Spark::Library autoload_without_import :Context, 'spark/sql/context' autoload_without_import :DataType, 'spark/sql/data_type' autoload_without_import :DataFrame, 'spark/sql/data_frame' autoload_without_import :DataFrameReader, 'spark/sql/data_frame_reader' autoload :Row, 'spark/sql/row' autoload :Column, 'spark/sql/column' # Types autoload :StructType, 'spark/sql/data_type' autoload :StructField, 'spark/sql/data_type' autoload :AtomicType, 'spark/sql/data_type' autoload :NumericType, 'spark/sql/data_type' autoload :IntegralType, 'spark/sql/data_type' autoload :StringType, 'spark/sql/data_type' autoload :LongType, 'spark/sql/data_type' end SQLContext = Spark::SQL::Context end ================================================ FILE: lib/spark/stat_counter.rb ================================================ module Spark class StatCounter attr_reader :n # count of our values attr_reader :mu # mean of our values attr_reader :m2 # variance numerator (sum of (x - mean)^2) attr_reader :max # max of our values attr_reader :min # min of our values def initialize(iterator) @n = 0 @mu = 0.0 @m2 = 0.0 @max = -Float::INFINITY @min = Float::INFINITY merge(iterator) end def merge(other) if other.is_a?(Spark::StatCounter) merge_stat_counter(other) elsif other.respond_to?(:each) merge_array(other) else merge_value(other) end self end def sum @n * @mu end # Return the variance of the values. def variance if @n == 0 Float::NAN else @m2 / @n end end # Return the sample variance, which corrects for bias in estimating the variance by dividing # by N-1 instead of N. def sample_variance if @n <= 1 Float::NAN else @m2 / (@n - 1) end end # Return the standard deviation of the values. def stdev Math.sqrt(variance) end # Return the sample standard deviation of the values, which corrects for bias in estimating the # variance by dividing by N-1 instead of N. 
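    # == Example (a small worked illustration):
    #   stat = Spark::StatCounter.new([1, 2, 3, 4])
    #   stat.sample_variance  # => 5.0 / 3
    #   stat.sample_stdev     # => 1.2909944487358056
    #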
def sample_stdev Math.sqrt(sample_variance) end def to_s "(count: #{count}, mean: #{mean}, stdev: #{stdev}, max: #{max}, min: #{min})" end alias_method :count, :n alias_method :mean, :mu alias_method :max_value, :max alias_method :min_value, :min alias_method :sampleStdev, :sample_stdev alias_method :sampleVariance, :sample_variance private def merge_stat_counter(other) if other == self other = self.deep_copy end if @n == 0 @n = other.n @mu = other.mu @m2 = other.m2 @max = other.max @min = other.min elsif other.n != 0 delta = other.mu - @mu if other.n * 10 < @n @mu = @mu + (delta * other.n) / (@n + other.n) elsif @n * 10 < other.n @mu = other.mu - (delta * @n) / (@n + other.n) else @mu = (@mu * @n + other.mu * other.n) / (@n + other.n) end @max = [@max, other.max].max @min = [@min, other.min].min @m2 += other.m2 + (delta * delta * @n * other.n) / (@n + other.n) @n += other.n end end def merge_array(array) array.each do |item| merge_value(item) end end def merge_value(value) delta = value - @mu @n += 1 @mu += delta / @n @m2 += delta * (value - @mu) @max = [@max, value].max @min = [@min, value].min end end end ================================================ FILE: lib/spark/storage_level.rb ================================================ # Necessary libraries Spark.load_lib module Spark class StorageLevel def self.reload return if @reloaded reload! @reloaded = true end def self.reload! self.const_set(:NONE, JStorageLevel.NONE) self.const_set(:DISK_ONLY, JStorageLevel.DISK_ONLY) self.const_set(:DISK_ONLY_2, JStorageLevel.DISK_ONLY_2) self.const_set(:MEMORY_ONLY, JStorageLevel.MEMORY_ONLY) self.const_set(:MEMORY_ONLY_SER, JStorageLevel.MEMORY_ONLY_SER) self.const_set(:MEMORY_ONLY_2, JStorageLevel.MEMORY_ONLY_2) self.const_set(:MEMORY_ONLY_SER_2, JStorageLevel.MEMORY_ONLY_SER_2) self.const_set(:MEMORY_AND_DISK, JStorageLevel.MEMORY_AND_DISK) self.const_set(:MEMORY_AND_DISK_2, JStorageLevel.MEMORY_AND_DISK_2) self.const_set(:MEMORY_AND_DISK_SER, JStorageLevel.MEMORY_AND_DISK_SER) self.const_set(:MEMORY_AND_DISK_SER_2, JStorageLevel.MEMORY_AND_DISK_SER_2) self.const_set(:OFF_HEAP, JStorageLevel.OFF_HEAP) end def self.java_get(arg) reload if arg.is_a?(String) const_get(arg.upcase) else arg end end end end ================================================ FILE: lib/spark/version.rb ================================================ module Spark VERSION = '1.2.1' end ================================================ FILE: lib/spark/worker/master.rb ================================================ #!/usr/bin/env ruby $PROGRAM_NAME = 'RubySparkMaster' require 'socket' require 'io/wait' require 'nio' require_relative 'worker' # New process group # Otherwise master can be killed from pry console Process.setsid # ================================================================================================= # Master # module Master def self.create case ARGV[0].to_s.strip when 'thread' Master::Thread.new else Master::Process.new end end class Base include Spark::Constant def initialize @port = ARGV[1].to_s.strip.to_i @socket = TCPSocket.open('localhost', @port) @worker_arguments = @socket.read_string end def run selector = NIO::Selector.new monitor = selector.register(@socket, :r) monitor.value = Proc.new { receive_message } loop { selector.select {|monitor| monitor.value.call} } end def receive_message command = @socket.read_int case command when CREATE_WORKER create_worker when KILL_WORKER kill_worker when KILL_WORKER_AND_WAIT kill_worker_and_wait end end def kill_worker_and_wait if kill_worker 
@socket.write_int(SUCCESSFULLY_KILLED) else @socket.write_int(UNSUCCESSFUL_KILLING) end end end # =============================================================================================== # Worker::Process # class Process < Base def create_worker if fork? pid = ::Process.fork do Worker::Process.new(@port).run end else pid = ::Process.spawn("ruby #{@worker_arguments} worker.rb #{@port}") end # Detach child from master to avoid zombie process ::Process.detach(pid) end def kill_worker worker_id = @socket.read_long ::Process.kill('TERM', worker_id) rescue nil end def fork? @can_fork ||= _fork? end def _fork? return false if !::Process.respond_to?(:fork) pid = ::Process.fork exit unless pid # exit the child immediately true rescue NotImplementedError false end end # =============================================================================================== # Worker::Thread # class Thread < Base def initialize ::Thread.abort_on_exception = true # For synchronous access to socket IO $mutex_for_command = Mutex.new $mutex_for_iterator = Mutex.new super end def create_worker ::Thread.new do Worker::Thread.new(@port).run end end def kill_worker worker_id = @socket.read_long thread = ObjectSpace._id2ref(worker_id) thread.kill rescue nil end end end # Create proper master by worker_type Master.create.run ================================================ FILE: lib/spark/worker/spark_files.rb ================================================ class SparkFiles class << self attr_accessor :root_directory end def self.get(file_name) File.join(root_directory, file_name) end def self.get_content(file_name) File.read(get(file_name)) end end ================================================ FILE: lib/spark/worker/worker.rb ================================================ #!/usr/bin/env ruby # Load root of the gem lib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__)) $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark.rb' require 'socket' require_relative 'spark_files' # ================================================================================================= # Worker # # Iterator is LAZY !!! 
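# Because the iterator is lazy, nothing is deserialized or executed until the
# result is written back to the socket; errors from the deserializer and from
# user code therefore both surface inside #compute and are reported via #send_error.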
# module Worker class Base include Spark::Helper::Serialize include Spark::Helper::System include Spark::Constant attr_accessor :socket def initialize(port) # Open socket to Spark @socket = TCPSocket.open('localhost', port) # Send back worker ID socket.write_long(id) end def run begin compute rescue => e send_error(e) else successful_finish end end private def before_start # Should be implemented in sub-classes end def before_end # Should be implemented in sub-classes end # These methods must be on one method because iterator is Lazy # which mean that exception can be raised at `serializer` or `compute` def compute before_start # Load split index @split_index = socket.read_int # Load files SparkFiles.root_directory = socket.read_string # Load broadcast count = socket.read_int count.times do Spark::Broadcast.register(socket.read_long, socket.read_string) end # Load command @command = socket.read_data # Load iterator @iterator = @command.deserializer.load_from_io(socket).lazy # Compute @iterator = @command.execute(@iterator, @split_index) # Result is not iterable @iterator = [@iterator] unless @iterator.respond_to?(:each) # Send result @command.serializer.dump_to_io(@iterator, socket) end def send_error(e) # Flag socket.write_int(WORKER_ERROR) # Message socket.write_string(e.message) # Backtrace socket.write_int(e.backtrace.size) e.backtrace.each do |item| socket.write_string(item) end socket.flush # Wait for spark # Socket is closed before throwing an exception # Singal that ruby exception was fully received until socket.closed? sleep(0.1) end # Depend on type of worker kill_worker end def successful_finish # Finish socket.write_int(WORKER_DONE) # Send changed accumulator changed = Spark::Accumulator.changed socket.write_int(changed.size) changed.each do |accumulator| socket.write_data([accumulator.id, accumulator.value]) end # Send it socket.flush before_end end def log(message=nil) return if !$DEBUG $stdout.puts %{==> #{Time.now.strftime('%H:%M:%S')} [#{id}] #{message}} $stdout.flush end end # =============================================================================================== # Worker::Process # class Process < Base def id ::Process.pid end private def before_start $PROGRAM_NAME = 'RubySparkWorker' end def kill_worker Process.exit(false) end end # =============================================================================================== # Worker::Thread # class Thread < Base def id ::Thread.current.object_id end private def load_command $mutex_for_command.synchronize { super } end # Threads changing for reading is very slow # Faster way is do it one by one def load_iterator # Wait for incoming connection for preventing deadlock if jruby? 
socket.io_wait else socket.wait_readable end $mutex_for_iterator.synchronize { super } end def kill_worker Thread.current.kill end end end # Worker is loaded as standalone if $PROGRAM_NAME == __FILE__ worker = Worker::Process.new(ARGV[0]) worker.run end ================================================ FILE: lib/spark.rb ================================================ # Gems and libraries require 'method_source' require 'securerandom' require 'forwardable' require 'sourcify' require 'socket' require 'tempfile' require 'tmpdir' require 'json' module Spark autoload :Context, 'spark/context' autoload :Config, 'spark/config' autoload :RDD, 'spark/rdd' autoload :CLI, 'spark/cli' autoload :Build, 'spark/build' autoload :Serializer, 'spark/serializer' autoload :Helper, 'spark/helper' autoload :StorageLevel, 'spark/storage_level' autoload :Command, 'spark/command' autoload :CommandBuilder, 'spark/command_builder' autoload :Sampler, 'spark/sampler' autoload :Logger, 'spark/logger' autoload :JavaBridge, 'spark/java_bridge' autoload :ExternalSorter, 'spark/sort' autoload :Constant, 'spark/constant' autoload :Broadcast, 'spark/broadcast' autoload :Accumulator, 'spark/accumulator' autoload :StatCounter, 'spark/stat_counter' autoload :Library, 'spark/library' # Mllib autoload :Mllib, 'spark/mllib' # SQL autoload :SQL, 'spark/sql' autoload :SQLContext, 'spark/sql' include Helper::System DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf') def self.print_logo(message=nil) puts <<-STRING Welcome to __ ____ __ ______ __/ / __ __ / __/__ ___ _____/ /__ / __/ // / _ \\/ // / _\\ \\/ _ \\/ _ `/ __/ '_/ /_/ \\_,_/_.__/\\_, / /___/ .__/\\_,_/_/ /_/\\_\\ version #{Spark::VERSION} /___/ /_/ #{message} STRING end # Returns current configuration. Configurations can be changed until # context is initialized. In this case config is locked only for reading. # # == Configuration can be changed: # # Spark.config.set('spark.app.name', 'RubySpark') # # Spark.config['spark.app.name'] = 'RubySpark' # # Spark.config do # set 'spark.app.name', 'RubySpark' # end # def self.config(&block) @config ||= Spark::Config.new if block_given? @config.instance_eval(&block) else @config end end # Destroy current configuration. This can be useful for restarting config # to set new. It has no effect if context is already started. def self.clear_config @config = nil end # Return a current active context or nil. def self.context @context end # Current active SQLContext or nil. def self.sql_context @sql_context end # Initialize spark context if not already. Config will be automatically # loaded on constructor. From that point `config` will use configuration # from running Spark and will be locked only for reading. def self.start @context ||= Spark::Context.new end def self.start_sql @sql_context ||= Spark::SQL::Context.new(start) end def self.stop @context.stop RubyWorker.stopServer logger.info('Workers were stopped') rescue nil ensure @context = nil @sql_context = nil clear_config end def self.started? 
!!@context end # =============================================================================== # Defaults # Load default configuration for Spark and RubySpark # By default are values stored at ~/.ruby-spark.conf # File is automatically created def self.load_defaults unless File.exists?(DEFAULT_CONFIG_FILE) save_defaults_to(DEFAULT_CONFIG_FILE) end load_defaults_from(DEFAULT_CONFIG_FILE) end # Clear prev setting and load new from file def self.load_defaults_from(file_path) # Parse values values = File.readlines(file_path) values.map!(&:strip) values.select!{|value| value.start_with?('gem.')} values.map!{|value| value.split(nil, 2)} values = Hash[values] # Clear prev values @target_dir = nil @ruby_spark_jar = nil @spark_home = nil # Load new @target_dir = values['gem.target'] end # Create target dir and new config file def self.save_defaults_to(file_path) dir = File.join(Dir.home, ".ruby-spark.#{SecureRandom.uuid}") if Dir.exist?(dir) save_defaults_to(file_path) else Dir.mkdir(dir, 0700) file = File.open(file_path, 'w') file.puts "# Directory where will be Spark saved" file.puts "gem.target #{dir}" file.puts "" file.puts "# You can also defined spark properties" file.puts "# spark.master spark://master:7077" file.puts "# spark.ruby.serializer marshal" file.puts "# spark.ruby.serializer.batch_size 2048" file.close end end # =============================================================================== # Global settings and variables def self.logger @logger ||= Spark::Logger.new end # Root of the gem def self.root @root ||= File.expand_path('..', File.dirname(__FILE__)) end # Default directory for java extensions def self.target_dir @target_dir ||= File.join(root, 'target') end # Directory where is worker.rb def self.worker_dir @worker_dir ||= File.join(root, 'lib', 'spark', 'worker') end def self.ruby_spark_jar @ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar') end def self.spark_ext_dir @spark_ext_dir ||= File.join(root, 'ext', 'spark') end # =============================================================================== # Load JVM and jars # Load dependent libraries, can be use once # Cannot load before CLI::install # # == Parameters: # target:: # path to directory where are located sparks .jar files or single Spark jar # def self.load_lib(target=nil) return if @java_bridge target ||= Spark.target_dir @java_bridge = JavaBridge.init(target) @java_bridge.import_all nil end def self.java_bridge @java_bridge end # Aliases class << self alias_method :sc, :context alias_method :jb, :java_bridge alias_method :home, :root end end # C/Java extensions require 'ruby_spark_ext' # Ruby core extensions require 'spark/ext/module' require 'spark/ext/object' require 'spark/ext/hash' require 'spark/ext/string' require 'spark/ext/integer' require 'spark/ext/ip_socket' require 'spark/ext/io' # Other requirments require 'spark/version' require 'spark/error' # Load default settings for gem and Spark Spark.load_defaults # Make sure that Spark be always stopped Kernel.at_exit do begin Spark.started? 
&& Spark.stop rescue end end ================================================ FILE: ruby-spark.gemspec ================================================ # coding: utf-8 lib = File.expand_path('../lib', __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'spark/version' Gem::Specification.new do |spec| spec.name = 'ruby-spark' spec.version = Spark::VERSION spec.authors = ['Ondřej Moravčík'] spec.email = ['moravcik.ondrej@gmail.com'] spec.summary = %q{Ruby wrapper for Apache Spark} spec.description = %q{} spec.homepage = '' spec.license = 'MIT' spec.files = `git ls-files -z`.split("\x0") spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) spec.require_paths = ['lib'] if RUBY_PLATFORM =~ /java/ spec.platform = 'java' extensions = ['ext/ruby_java/extconf.rb'] else extensions = ['ext/ruby_c/extconf.rb'] spec.add_dependency 'rjb' end spec.extensions = extensions spec.required_ruby_version = '>= 2.0' spec.requirements << 'java, scala' spec.add_dependency 'sourcify', '0.6.0.rc4' spec.add_dependency 'method_source' spec.add_dependency 'commander' spec.add_dependency 'pry' spec.add_dependency 'nio4r' spec.add_dependency 'distribution' spec.add_development_dependency 'bundler', '~> 1.6' spec.add_development_dependency 'rake' end ================================================ FILE: spec/generator.rb ================================================ class Generator def self.numbers(size=1000) Array.new(size){ rand(1..1000) } end def self.numbers_with_zero(size=1000) Array.new(size){ rand(0..1000) } end def self.words(size=1000) Array.new(size) { word } end def self.word(size=10) Array.new(rand(1..size)){(97+rand(26)).chr}.join end def self.lines(size=1000, letters=3) Array.new(size) do Array.new(rand(50..100)){ (97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0)) }.join end end def self.hash(size=1000) Array.new(size) do [word(2), rand(1..10)] end end def self.hash_with_values(size=1000, values_count=10) Array.new(size) do [word(2), Array.new(values_count) { rand(1..10) }] end end end ================================================ FILE: spec/inputs/lorem_300.txt ================================================ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ligula neque, ultricies et lorem vel, accumsan cursus felis. Maecenas ullamcorper, magna eu lobortis gravida, diam leo rutrum diam, eget elementum sapien felis non magna. Etiam scelerisque, mauris et cursus fermentum, ipsum nisl vulputate nisl, sit amet pulvinar libero sem at lectus. Vivamus nibh lectus, elementum eget dui non, fermentum volutpat orci. Nam imperdiet, dui id placerat pellentesque, purus sem semper augue, id dictum est ipsum et erat. Integer arcu tortor, ullamcorper ac libero a, iaculis sollicitudin orci. Sed dapibus hendrerit neque, ac aliquet arcu elementum sed. Phasellus ornare interdum erat, eget fringilla sapien ornare vitae. In condimentum, mi sed condimentum viverra, nisl sapien scelerisque mi, vel varius metus dolor eu lorem. Nulla pulvinar ac metus eu volutpat. Suspendisse potenti. Duis vitae mauris arcu. Proin et dignissim dolor, eget congue purus. Ut malesuada neque massa. Ut viverra faucibus turpis, in pharetra nulla iaculis quis. Morbi imperdiet risus eu eros varius facilisis. Aenean nec dapibus sapien. Fusce tempus, risus vitae volutpat faucibus, dolor diam cursus risus, sit amet faucibus mauris mauris quis orci. Aliquam massa ante, accumsan non sapien quis, ullamcorper fermentum elit. 
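
The compute/send_error pair in lib/spark/worker/worker.rb above fixes the order in which a worker talks to Spark over its socket: it first reports its id, then reads the split index, the SparkFiles root directory, the broadcast variables and the serialized command, and finally streams the result back (or, on failure, an error flag, message and backtrace). Below is a minimal, self-contained sketch of that read order; FakeSocket and all payload values are invented for illustration, while the real worker reads from a TCPSocket using the IO extensions in lib/spark/ext/io.rb.

  # Illustration only: FakeSocket stands in for the read_* helpers from
  # lib/spark/ext/io.rb by handing back pre-recorded frames.
  class FakeSocket
    def initialize(frames)
      @frames = frames
    end

    def read_int;    @frames.shift end
    def read_long;   @frames.shift end
    def read_string; @frames.shift end
    def read_data;   @frames.shift end
  end

  # Frames in the order Worker::Base#compute consumes them.
  socket = FakeSocket.new([
    0,                    # split index
    '/tmp/spark-files',   # SparkFiles.root_directory
    1,                    # number of broadcast variables
    42, '/tmp/broadcast', # one broadcast: id and payload string
    :serialized_command   # normally a marshaled Spark::Command
  ])

  split_index = socket.read_int
  files_root  = socket.read_string
  socket.read_int.times { [socket.read_long, socket.read_string] }
  command     = socket.read_data

  puts "split=#{split_index} files=#{files_root} command=#{command.inspect}"

After executing the command on the lazy iterator, the worker answers with either WORKER_DONE followed by any changed accumulators, or WORKER_ERROR with the message and backtrace, and then flushes the socket.
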
Pellentesque risus orci, rhoncus ac mi sed, volutpat vehicula sem. Mauris suscipit odio vel mi scelerisque, at cursus libero ullamcorper. Nulla aliquam metus arcu, in vestibulum sem ullamcorper eu. Pellentesque laoreet venenatis metus ut accumsan. Quisque ut enim interdum, fringilla lorem nec, dignissim orci. Fusce vel diam sed ante dictum scelerisque. Vestibulum lectus enim, gravida sit amet ullamcorper sit amet, rhoncus nec dui. Praesent eget molestie tellus, quis iaculis sapien. Sed ut rutrum velit. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Donec tortor quam, venenatis ac rhoncus et, gravida non orci. Ut lacus dolor, auctor id ante varius, pharetra placerat nulla. Nulla facilisi. Nam quis feugiat nibh, ut ultrices est. Nulla at mi nec metus porttitor tempor. Donec leo lorem, rhoncus ut arcu eu, venenatis eleifend risus. Phasellus non porttitor neque, sit amet accumsan nisl. Pellentesque non urna tempor, interdum orci non, gravida enim. Sed in urna et dolor cursus aliquet et vel magna. Quisque vestibulum tortor scelerisque orci mattis, eu aliquet sem condimentum. Proin ac ultricies erat. Integer sodales, turpis quis volutpat pretium, justo lacus lobortis mauris, nec commodo orci leo sit amet metus. Ut ornare ipsum vitae malesuada aliquam. Quisque lobortis semper elit id consectetur. Aenean facilisis sapien eu ipsum adipiscing mattis. Praesent malesuada aliquet venenatis. Ut aliquet vel sapien nec euismod. Morbi eros urna, rutrum ut iaculis sed, vulputate sit amet nunc. Nulla facilisi. Morbi sagittis nec magna sed scelerisque. Maecenas a euismod eros. Vestibulum suscipit pharetra velit porta fermentum. Phasellus euismod auctor metus ut interdum. Quisque lectus lorem, tristique ut libero vel, rhoncus tincidunt tellus. Sed malesuada vestibulum purus, at tincidunt massa imperdiet vitae. Ut mollis eleifend elit, et sodales nisl facilisis eu. Fusce ligula ligula, porta id est sed, tincidunt malesuada odio. Maecenas ultricies dignissim nunc, quis adipiscing urna auctor commodo. Phasellus tincidunt odio non nulla luctus sollicitudin. Mauris pharetra porttitor est iaculis sollicitudin. Curabitur quam sem, fringilla id tellus vitae, elementum convallis eros. Morbi sollicitudin eleifend leo, ut euismod ligula ornare sagittis. Nullam luctus, mi eget dapibus elementum, diam purus fringilla lectus, sit amet sodales neque turpis sed mi. Sed volutpat sem euismod posuere mollis. Integer viverra egestas lacinia. Quisque viverra metus massa, in condimentum sem tincidunt a. Proin ac ipsum non leo sollicitudin consectetur id a sem. Cras tempus venenatis nisl sit amet venenatis. Nulla facilisi. Morbi scelerisque mi est, vitae lobortis sem ultricies faucibus. In urna ante, faucibus ac eros et, dignissim mollis justo. Quisque aliquet tortor sem, ac mattis tortor faucibus sed. Donec tortor lacus, egestas in convallis at, vulputate eu nibh. Aenean ligula augue, imperdiet in tempor id, consequat vitae erat. Sed id eros a justo semper ultricies. Curabitur nunc nisi, placerat at leo sed, vehicula pulvinar velit. Nullam ut ipsum augue. Fusce condimentum quam commodo, venenatis massa eleifend, dignissim neque. Curabitur sit amet hendrerit tortor, a condimentum sem. Morbi lobortis porta porttitor. Maecenas mollis ipsum ac est venenatis auctor at vel lectus. Mauris luctus euismod dolor. Cras vitae nibh eget sem placerat adipiscing. Pellentesque ac molestie ligula. Vivamus sit amet lectus odio. Duis lacinia rutrum faucibus. 
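
lib/spark.rb above ties the lifecycle together: Spark.config is only writable until Spark.start creates the context, Spark.load_defaults seeds ~/.ruby-spark.conf on first use, and a Kernel.at_exit hook stops the context when the process ends. The short usage sketch below assumes the gem has already been built (ruby bin/ruby-spark build) so that ruby-spark.jar and the JVM bridge are available; the app name and the lambda are purely illustrative.

  require 'ruby-spark'

  # Configuration can only be changed before the context is started.
  Spark.config do
    set 'spark.app.name', 'RubySparkExample'
  end

  Spark.start
  sc = Spark.sc                 # alias for Spark.context

  rdd = sc.parallelize(1..10)
  p rdd.map(lambda{|x| x * 2}).collect

  Spark.stop                    # also triggered by the at_exit hook
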
Curabitur luctus ultricies enim, id imperdiet ipsum viverra vitae. Mauris et iaculis erat, vel faucibus purus. Fusce non nisl tristique, dignissim lacus id, fermentum velit. Sed facilisis sapien at interdum viverra. Aliquam erat volutpat. Maecenas suscipit diam vitae velit vulputate tincidunt. Nulla facilisi. Sed eget tortor et ante mollis cursus. Nullam vitae porttitor magna. Quisque iaculis massa dui, id rutrum purus blandit eu. Duis convallis ipsum id commodo iaculis. Praesent sagittis ut tortor ut varius. Curabitur consequat volutpat scelerisque. Cras pharetra lectus eget urna imperdiet ullamcorper. Sed lacinia ut eros non malesuada. Quisque hendrerit suscipit convallis. Vivamus posuere vestibulum massa, non accumsan diam tincidunt eu. Nulla bibendum dictum mi sit amet faucibus. Nullam egestas lorem nunc, vel malesuada elit imperdiet vitae. Sed luctus ligula at erat tempus tristique. Proin varius mi quis libero sollicitudin ullamcorper. In hac habitasse platea dictumst. Praesent auctor arcu vel luctus consequat. Curabitur consequat magna sit amet ante feugiat dictum. Morbi scelerisque faucibus urna, ac dapibus sem ultricies eu. Pellentesque rhoncus sapien nec eros facilisis consectetur. Duis eleifend vestibulum suscipit. Morbi orci metus, malesuada sit amet urna ac, laoreet vehicula lacus. Quisque gravida, nunc fringilla tincidunt vestibulum, lacus urna commodo nisl, quis sodales lectus ipsum et augue. Ut non erat sit amet neque fermentum ultricies. Vestibulum tincidunt est elit, ac dapibus velit faucibus id. Praesent in viverra libero. Proin eleifend, odio eget sodales dignissim, nunc arcu ullamcorper libero, sit amet sodales diam ipsum in tellus. Suspendisse enim nunc, accumsan non ligula et, vulputate viverra ante. Ut id elit eu dui dictum malesuada at id orci. Vivamus sed felis aliquam metus consequat euismod nec eu libero. Phasellus mattis malesuada ipsum eu posuere. Nullam at massa enim. Duis vitae urna blandit, ultricies nisi in, consequat elit. Quisque nec nibh ut tortor pulvinar euismod. Praesent molestie felis ac risus elementum sollicitudin. Donec eu leo in augue convallis mattis. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Integer ut dignissim lectus. Vivamus eros felis, gravida et auctor ut, volutpat vitae dui. Nunc adipiscing sapien et lectus rutrum vestibulum. Mauris fermentum, metus eu sollicitudin malesuada, lorem diam vestibulum metus, ut elementum metus nibh sed augue. Cras lectus risus, feugiat eget fringilla a, cursus et eros. Praesent aliquam justo vel condimentum lacinia. Sed condimentum dui nec leo blandit, vel elementum odio laoreet. Quisque suscipit molestie iaculis. Nullam dignissim, mauris sit amet condimentum aliquet, magna sapien scelerisque nisl, tincidunt auctor purus libero at lectus. Nulla facilisi. Sed egestas erat at dictum egestas. Cras non mauris ut dolor interdum condimentum. Fusce quis hendrerit purus, dictum cursus mi. Maecenas mattis, turpis sit amet mollis ultricies, mi turpis ornare velit, eget suscipit magna eros sit amet purus. Integer ut viverra elit. Praesent eu augue viverra nunc convallis porta. Etiam venenatis dignissim nisl et semper. Cras eu nisl vitae justo ornare porttitor vel nec augue. Pellentesque faucibus mollis neque, nec ullamcorper purus mollis sed. Suspendisse ut molestie lectus, faucibus aliquet libero. Aliquam tristique, neque ut lobortis ultricies, tellus elit ultrices risus, sodales dapibus sem mauris et magna. Sed et sem porttitor, fringilla mauris vestibulum, porttitor dui. 
Proin vitae viverra elit. Integer nec adipiscing velit. Nunc quis urna tristique, ultrices orci eget, aliquet lorem. Curabitur consequat adipiscing sodales. In elementum condimentum ante id placerat. Cras ac turpis tristique lacus vulputate dictum vel nec libero. Curabitur fringilla interdum tempus. Integer placerat dolor ut magna aliquet bibendum. Cras ac metus magna. Curabitur vehicula magna ut sapien viverra ornare. Donec risus nisi, imperdiet eu laoreet in, tempor lobortis urna. Etiam malesuada et lacus ac consectetur. Morbi facilisis sapien quis nisl laoreet semper. Suspendisse volutpat sapien vel quam blandit faucibus. Nam sagittis velit eros, vitae suscipit tortor elementum ac. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Donec nec nibh dictum, pretium nulla eu, pharetra mauris. Vestibulum leo mi, convallis et euismod ac, molestie in ligula. Vestibulum tempor tincidunt porttitor. Integer nisl orci, dignissim ac volutpat a, auctor eget augue. Suspendisse eget euismod nunc, eu elementum ipsum. Cras libero tortor, gravida quis vestibulum a, tincidunt aliquam mauris. Integer elementum pellentesque posuere. Donec accumsan feugiat pulvinar. Aliquam eros justo, dictum non elementum nec, tristique vel massa. Nulla a velit porttitor, aliquam turpis nec, ultricies ligula. Nam id dignissim dui. Ut placerat arcu nec accumsan varius. Sed quis accumsan nunc, in dapibus lorem. Morbi egestas sagittis pulvinar. Morbi id mauris ante. Sed magna nibh, venenatis quis lacinia in, congue quis metus. Nunc lacus lectus, adipiscing sed consequat id, luctus vel dui. Mauris eu nisi erat. Proin eleifend lectus sit amet ligula fringilla semper. Suspendisse tristique, quam ac pharetra dictum, libero risus rutrum ipsum, eget tristique arcu neque vel nisi. Ut auctor nulla vitae porta faucibus. Suspendisse ut tellus enim. Morbi commodo posuere quam. Proin consequat in quam pulvinar posuere. Nunc id ullamcorper est. Cras ac molestie massa. Cras leo tellus, tempus id nibh quis, porttitor laoreet elit. Mauris in ornare nisi. Duis vel velit felis. Suspendisse gravida felis nec nulla hendrerit pretium. Cras at orci neque. Phasellus vehicula, ipsum at tempus sodales, mauris est condimentum metus, a vehicula ante tellus sit amet diam. Suspendisse fermentum elit in volutpat viverra. Nullam gravida in augue sed mollis. Curabitur aliquam diam non quam aliquam ultrices. Quisque pretium semper diam eget malesuada. Suspendisse porttitor sagittis sem at malesuada. Donec euismod elementum nulla, sit amet eleifend enim adipiscing nec. Nullam porta, enim ac tincidunt molestie, turpis mi porta justo, ornare tristique sem orci quis turpis. Nullam leo dolor, pellentesque ac hendrerit et, tempus quis nisi. Fusce pretium mattis tortor sagittis suscipit. Vestibulum vitae suscipit libero. Mauris consequat sagittis mi, id tempus est condimentum et. In eget condimentum odio, a malesuada quam. Vivamus id turpis non nulla eleifend cursus ut sit amet tellus. Proin ultrices luctus nibh, eget condimentum ligula vestibulum in. Aliquam pharetra aliquet erat nec lacinia. Cras fringilla est fringilla ante tristique, vitae bibendum dolor malesuada. Praesent ut dui pulvinar, suscipit velit gravida, malesuada nunc. Cras tempus feugiat interdum. Vivamus lectus lorem, rutrum ut neque at, sollicitudin euismod nulla. Vestibulum ac ligula suscipit, ultricies felis eget, adipiscing lectus. Maecenas nec enim vel eros molestie lobortis faucibus sit amet urna. Sed ac consequat nulla. Nulla et libero nisi. 
Pellentesque euismod nunc quis ipsum tristique, suscipit elementum magna aliquam. Praesent sit amet tincidunt leo. Duis tempor arcu eget est posuere imperdiet. Quisque vel dui adipiscing, auctor nibh vel, vulputate sapien. Curabitur eu sodales lacus. Aliquam felis eros, mattis a diam eu, ullamcorper vestibulum turpis. Vivamus vitae vulputate lacus, sed convallis lorem. Vestibulum mattis sollicitudin vulputate. Mauris cursus erat eget nisi accumsan, nec commodo tellus blandit. Etiam gravida nulla et lorem molestie auctor. Mauris venenatis iaculis nulla vel mollis. Morbi pretium sed eros at commodo. Aliquam eu justo turpis. Pellentesque lobortis, nisl eget ultricies dictum, augue sem placerat elit, vitae pretium lectus massa eget tortor. Nulla accumsan, massa eu rutrum pharetra, mi sapien aliquam massa, viverra facilisis metus nisi in dolor. Duis felis velit, interdum a elit non, cursus pellentesque libero. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Nunc vel nisi quis augue accumsan aliquam. Suspendisse ante lectus, lobortis nec suscipit at, ullamcorper at diam. Aliquam hendrerit, eros ac egestas condimentum, enim metus lobortis nibh, sit amet convallis augue nulla nec lorem. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut ac ligula eget est blandit scelerisque at vitae nunc. Sed venenatis eros non quam auctor posuere. Curabitur convallis dapibus semper. Fusce et leo sed massa posuere porta. Morbi convallis lobortis eros. Quisque ac nisl dictum, sagittis eros et, pellentesque metus. Quisque mattis sodales lorem quis malesuada. Aenean neque sapien, rutrum vitae euismod quis, euismod eu mi. Etiam ante tellus, auctor vitae pulvinar a, mattis nec tellus. Morbi libero lectus, mattis sit amet convallis at, viverra et nisi. Proin a ante tristique, blandit urna at, lobortis leo. Praesent nec odio sit amet ligula adipiscing pretium at rhoncus felis. Ut ut velit turpis. Sed tempor lectus massa, vel gravida libero gravida a. Nunc mollis, lorem id dapibus hendrerit, mi orci gravida orci, at vehicula neque nisl quis nibh. Mauris feugiat, ligula sit amet interdum laoreet, lectus leo accumsan dolor, eu cursus tortor quam eget lectus. Sed commodo, est in bibendum condimentum, magna neque dictum sapien, at lacinia sem ipsum ut eros. In eget erat eu nulla hendrerit tincidunt id vulputate nibh. Nunc sed imperdiet urna, eu tempor orci. Phasellus pellentesque sapien eu risus tincidunt, ut iaculis risus fermentum. Suspendisse condimentum erat vitae porta malesuada. Ut a vulputate lorem. Nulla ullamcorper, neque in posuere vulputate, neque magna tempor erat, sit amet luctus nisi nibh quis ligula. Duis porta urna et fermentum interdum. Sed pellentesque odio euismod nisi auctor rutrum. Suspendisse mi nibh, dignissim eget porttitor quis, commodo a massa. Nunc vel eleifend turpis. Sed iaculis, massa quis egestas pellentesque, nibh ante feugiat ante, a euismod lacus nunc et felis. Nam in aliquet odio. Nulla eget enim aliquam, faucibus est at, fringilla tellus. Duis molestie massa ornare, sodales leo eget, lobortis nibh. Nam bibendum mi a facilisis mattis. Duis ultrices arcu tellus, vitae interdum tortor dictum et. Sed id luctus lectus, eu tempus quam. Duis mi nisl, iaculis vel tortor sit amet, vulputate sodales risus. Cras vitae lobortis nisi, eu adipiscing ante. Nam eget scelerisque libero. Nulla pulvinar, velit et posuere sagittis, odio risus venenatis sapien, at tristique enim augue quis sem. Integer rutrum blandit eros eu faucibus. 
Etiam eget iaculis felis, in fermentum ante. Nullam a placerat risus, id accumsan quam. Donec est orci, elementum eu sapien non, ultricies ullamcorper leo. Praesent tincidunt, mauris in viverra hendrerit, dolor nisi cursus orci, vel lacinia neque ante eu magna. Nam facilisis massa at nisi accumsan, non condimentum turpis facilisis. Cras quis ipsum at orci ornare venenatis vitae et ante. Morbi vitae luctus lacus. Nullam eu felis at mi hendrerit commodo a eu diam. Maecenas ultricies, urna sit amet egestas tempor, dolor ligula dictum nibh, vehicula commodo ipsum diam at nunc. Proin facilisis tincidunt elit, sed vulputate leo lobortis sed. In tincidunt risus lorem, venenatis pellentesque tellus accumsan vitae. Integer ullamcorper mi ut risus consectetur dictum in quis dui. Pellentesque sed diam sed purus egestas mollis id at sapien. Nunc cursus mi nec accumsan porta. Nullam pulvinar pharetra felis. Etiam porta massa et diam scelerisque, ut iaculis nisl luctus. Curabitur vel metus id lacus faucibus tempus. Nullam ornare neque orci, nec scelerisque erat mattis nec. Phasellus ultrices ultrices nisi quis venenatis. Sed ultrices iaculis diam a faucibus. Phasellus quis suscipit nulla. Nulla ultricies, turpis et dictum ullamcorper, urna metus porta tellus, quis congue dolor libero quis sem. Nam tempus metus risus, sed rutrum nibh cursus malesuada. Vivamus bibendum odio eget mi aliquet, sed tempor eros tincidunt. Suspendisse eu ultricies ligula, non commodo sem. Ut aliquet elit sed leo laoreet aliquam. Vivamus feugiat a justo non auctor. Sed rhoncus orci ut dictum dignissim. Duis eros libero, tempus non venenatis quis, suscipit eget turpis. Aliquam sed ullamcorper velit, in tincidunt tellus. Ut dapibus erat vel nunc feugiat elementum. Cras congue, erat sit amet lacinia venenatis, nisi magna rhoncus nulla, eu blandit eros neque ac eros. Donec vulputate placerat dapibus. Integer dignissim odio eget iaculis ultrices. Vestibulum ligula neque, tincidunt at pretium ac, tincidunt sit amet tellus. Sed fermentum egestas tortor, non volutpat sapien. Aliquam erat volutpat. Duis semper placerat sapien at placerat. Praesent facilisis pharetra dignissim. Morbi laoreet sed tortor eu rhoncus. Vivamus eleifend felis eu dui ornare ornare sed at urna. Nulla nulla justo, hendrerit id enim vitae, blandit consequat nibh. Aliquam mattis diam mattis fringilla tempor. Suspendisse suscipit est sed pulvinar commodo. Lorem ipsum dolor sit amet, consectetur adipiscing elit. In in scelerisque enim. Phasellus ornare nisl consequat volutpat bibendum. Vivamus et nunc viverra, ultrices lorem a, cursus purus. Curabitur nibh libero, hendrerit lobortis malesuada sit amet, fringilla et augue. Vestibulum est lacus, fringilla sit amet dictum pulvinar, lacinia at leo. Proin iaculis felis vitae metus viverra blandit. Mauris accumsan sagittis semper. Quisque non diam a quam volutpat faucibus. Pellentesque eros orci, commodo eget fringilla eu, euismod et turpis. Duis molestie et eros ac ullamcorper. Phasellus consequat risus eget elementum semper. Donec at mi a justo laoreet condimentum porttitor in purus. Nulla sit amet libero consectetur, iaculis neque nec, scelerisque turpis. Aliquam interdum nibh eget accumsan dictum. Ut lobortis, mi non eleifend lobortis, lorem mauris pretium urna, at fermentum tellus felis eu nunc. Aliquam in nibh tristique, tempus purus a, cursus massa. Suspendisse potenti. Maecenas porttitor et erat in sollicitudin. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. 
Vestibulum commodo placerat velit, vel pellentesque neque sagittis eget. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nullam eu massa placerat, iaculis eros eget, viverra orci. Aliquam ac lacus porttitor, eleifend elit id, vehicula mauris. Sed ac interdum libero. Sed laoreet suscipit mi, ac accumsan massa condimentum nec. Suspendisse sodales libero sollicitudin, malesuada quam ac, viverra enim. Sed sapien libero, egestas sit amet orci non, venenatis interdum augue. In hac habitasse platea dictumst. Fusce gravida orci at ligula fringilla adipiscing. Nunc quis ipsum quis nibh egestas porta. Proin et faucibus elit. Etiam in neque at nunc pharetra adipiscing nec vel magna. Donec at nunc scelerisque, tincidunt risus ut, bibendum nisi. Donec pulvinar fermentum purus, ac adipiscing urna iaculis at. Nulla ut nunc vitae lorem dapibus fringilla. Ut placerat dignissim nulla ornare mattis. Mauris rutrum tellus quis odio dictum, ac tempor velit scelerisque. Quisque ligula elit, convallis nec volutpat vitae, pulvinar id mauris. Vivamus vel accumsan tortor. Donec eu sollicitudin dolor. Pellentesque egestas congue tristique. Phasellus ut sollicitudin nisl. Praesent diam neque, malesuada id tincidunt id, malesuada in eros. Phasellus adipiscing ipsum vel justo molestie vulputate. Praesent ultricies dapibus lacus pulvinar gravida. Donec consequat, orci et mattis ultrices, nibh enim sagittis metus, vitae eleifend enim tellus vitae augue. Suspendisse placerat iaculis risus nec iaculis. Ut ullamcorper ultrices dui, sed blandit mauris hendrerit vitae. Nulla ac dolor lectus. Etiam pellentesque neque at odio bibendum, at venenatis tellus fermentum. Maecenas a condimentum metus. Phasellus semper scelerisque feugiat. Fusce varius varius tincidunt. Ut vel auctor magna. Cras dui turpis, euismod in enim a, scelerisque adipiscing lectus. Duis mollis pharetra risus, sed ultrices nulla blandit non. Integer ac pulvinar magna. Aenean fermentum auctor magna. Ut in viverra sapien. Proin ac bibendum magna, cursus gravida elit. Phasellus vehicula facilisis nibh, tempor sagittis mauris accumsan et. Vestibulum sed lacus luctus diam ornare venenatis non vel felis. Morbi posuere sit amet nisl quis pulvinar. Suspendisse blandit tempus risus quis pretium. Nullam gravida libero vel aliquam suscipit. Nunc vel nunc at leo pharetra tempor et ut mi. Aliquam erat volutpat. Nulla placerat odio tellus. Nam adipiscing massa nec varius posuere. Proin placerat tellus posuere lorem suscipit, sit amet sagittis sem condimentum. Ut pharetra odio quis tellus mattis facilisis. Quisque eget interdum est. Quisque mattis, felis eu semper feugiat, quam augue interdum mauris, eget sodales nisi neque quis erat. Curabitur semper, mi posuere luctus molestie, neque ante sagittis nulla, sit amet vehicula eros eros in justo. Integer aliquet vehicula arcu, quis iaculis justo. Sed tincidunt sem id est porta volutpat. Mauris varius felis ut est venenatis, ornare porttitor arcu adipiscing. Sed luctus rutrum ante, consectetur sollicitudin sapien accumsan vulputate. Vivamus id diam vehicula, fermentum nunc id, viverra justo. Quisque porttitor, odio in molestie hendrerit, libero eros vehicula odio, id vestibulum sapien neque quis nibh. Donec vel faucibus est. Ut nec sapien vitae nibh congue egestas vel euismod tellus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum quis lacus lorem. Integer egestas euismod ante, vitae condimentum neque eleifend non. Sed posuere bibendum ante, ut facilisis dui condimentum at. In ut varius augue. 
Vivamus bibendum eu odio vel convallis. Vivamus cursus sodales iaculis. Nullam convallis facilisis blandit. Phasellus iaculis porttitor elit, eget vestibulum ipsum convallis eu. Quisque volutpat justo ipsum, eleifend cursus urna facilisis a. Sed at diam nec sem semper scelerisque. Aliquam euismod erat quis nisi dictum, at sodales leo fermentum. Nam at nisl metus. Proin luctus porttitor ante in tincidunt. Maecenas laoreet vitae enim eget elementum. Nulla id sagittis enim, nec ultrices tortor. Nam rutrum ipsum sit amet erat auctor, eu venenatis libero ultricies. Ut condimentum neque non diam ullamcorper, ultrices feugiat neque egestas. Pellentesque at lobortis est, in blandit mi. Maecenas tincidunt eros id massa pulvinar, quis varius eros lobortis. Curabitur vitae sodales orci. Suspendisse potenti. Pellentesque eu fringilla nibh. Etiam sed pretium enim, lacinia consequat lectus. Quisque sed mi risus. Praesent posuere dolor sed mauris dapibus, id tristique mi mattis. Quisque nec urna rutrum, consectetur mauris ut, egestas libero. Fusce a justo orci. Etiam vitae aliquet ipsum. Curabitur consequat tempor eros, ut placerat lectus tempus et. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Sed ligula mi, laoreet sit amet nunc id, ullamcorper fermentum magna. Maecenas enim dui, viverra at nulla ut, lacinia pretium nunc. Donec at ultricies nulla, nec cursus odio. Donec ullamcorper nec turpis imperdiet hendrerit. Sed euismod aliquam vehicula. Nunc sed enim eleifend turpis venenatis sagittis. Sed laoreet velit erat. Proin nisl erat, vulputate et fermentum iaculis, mollis suscipit magna. Sed porta, augue ut accumsan fermentum, arcu tortor rutrum tellus, sit amet sollicitudin lectus turpis non felis. Vestibulum ante leo, interdum sed venenatis non, porttitor ut nibh. Sed sit amet luctus erat. Duis id rhoncus justo, non rutrum lorem. Mauris ut laoreet elit. Praesent sed diam porta, rhoncus massa a, tincidunt lorem. Mauris bibendum nunc nec est ullamcorper bibendum. Nullam venenatis libero sed ligula scelerisque euismod quis at dui. Donec ac velit luctus, molestie mi at, tempor leo. Pellentesque a ultricies risus. Maecenas malesuada faucibus nulla quis consectetur. Phasellus pretium interdum risus sit amet aliquet. Nullam eleifend sem id magna laoreet, ut lobortis mi tincidunt. Maecenas in justo tempor, viverra ipsum eu, tincidunt nulla. Sed sed molestie turpis. Pellentesque imperdiet, eros non vulputate fringilla, turpis odio luctus lectus, eu lacinia purus nisl vitae justo. Etiam non dapibus dolor. Fusce non urna scelerisque, interdum massa vitae, venenatis metus. Vestibulum scelerisque dolor ac lectus sollicitudin, eget fringilla sapien fringilla. Suspendisse non quam massa. Donec a sollicitudin eros, ut mollis turpis. Nullam gravida congue semper. Phasellus vitae tellus vitae nulla cursus tempor et non elit. Vestibulum pharetra in ligula a venenatis. Maecenas at erat sed nulla vulputate pulvinar et eu libero. Donec pulvinar arcu nisi, sed posuere turpis cursus a. In nec turpis interdum, condimentum velit in, consectetur lacus. Duis porta, felis a rhoncus ornare, ligula est elementum nunc, eu adipiscing massa lorem in nibh. In consequat gravida eros. Phasellus condimentum malesuada sapien ultrices tempor. Suspendisse sit amet diam in est pulvinar iaculis nec vitae nibh. Vivamus rhoncus enim lorem, elementum posuere est pretium ut. Duis lectus lorem, ultricies ac dignissim in, egestas et ipsum. Proin nec est ac dui sagittis dictum. 
Cras dictum augue ipsum, sit amet gravida ligula scelerisque nec. Ut congue blandit porta. Nunc porta vitae risus at sagittis. Donec viverra, ante id porta consectetur, felis turpis fringilla dui, ut vulputate nulla eros sit amet augue. Donec aliquet, felis ut tempor pretium, enim leo suscipit risus, eget mollis justo ipsum ut augue. Nullam at lacus eu orci dapibus laoreet nec convallis leo. Fusce rhoncus sed neque sit amet viverra. Donec arcu nisl, hendrerit non pulvinar eu, blandit ac neque. Curabitur porta velit metus, non ullamcorper nibh volutpat non. Proin tristique orci nec pretium lobortis. Curabitur quam neque, lacinia vitae massa id, molestie pellentesque risus. Praesent vitae lectus bibendum, tincidunt augue vel, volutpat magna. Curabitur quis feugiat magna. In libero risus, commodo eu mauris vitae, euismod ullamcorper libero. Cras elementum rutrum lacus eu euismod. Morbi purus metus, rutrum nec varius sed, dignissim eget nisi. Vivamus mauris nibh, hendrerit eu massa sed, ultrices suscipit est. Cras id odio dui. Nulla condimentum luctus ipsum, eu molestie turpis commodo sed. Aliquam erat volutpat. Ut sodales urna sit amet est dapibus pharetra. In nec vestibulum mi. Nullam mattis fringilla venenatis. Sed risus sem, tempor vitae suscipit a, viverra in quam. In malesuada odio nec laoreet accumsan. Donec justo diam, lacinia eu ante eget, pulvinar molestie mauris. Interdum et malesuada fames ac ante ipsum primis in faucibus. Sed vulputate ornare dolor a tempor. Maecenas egestas, augue et semper egestas, elit ipsum varius sem, a dapibus eros velit in sapien. Nulla sit amet eros ullamcorper, hendrerit nunc eu, aliquet ipsum. Sed sit amet lacus enim. Curabitur faucibus rutrum dui, a tempor velit vestibulum sed. Curabitur sed nunc id lorem semper malesuada. Maecenas semper eros eu pellentesque vulputate. Nulla accumsan dolor placerat eros euismod facilisis. Nam vitae velit tortor. Fusce tincidunt felis luctus, scelerisque dui in, rutrum nulla. Proin a pharetra tellus. Aenean varius dolor nec risus eleifend fringilla. Proin at tellus ligula. Cras imperdiet mollis nisi eget auctor. Etiam libero nunc, dictum at fermentum vitae, vehicula tincidunt justo. Proin tempor risus elit, vestibulum auctor erat tristique vel. Etiam varius dui ante, a fringilla erat ullamcorper vel. Quisque cursus quam imperdiet ornare dictum. Suspendisse turpis nunc, scelerisque a congue eget, faucibus ut mauris. Suspendisse venenatis nisi nec dolor pharetra, id euismod sem accumsan. Quisque et accumsan justo, elementum vulputate nulla. Etiam et sapien scelerisque, malesuada lacus non, pretium enim. Curabitur ultrices, ipsum hendrerit pulvinar volutpat, dui tortor mattis tortor, sed tincidunt magna lectus non eros. Ut hendrerit velit non metus pellentesque mattis. Nullam velit nisi, ornare sit amet ipsum id, commodo tincidunt nisi. Aliquam egestas, ante non placerat convallis, mi mauris posuere ligula, nec auctor lectus mi quis quam. In auctor facilisis ante id elementum. Donec interdum ipsum vitae lorem sollicitudin rutrum. Etiam congue pharetra lorem ac dictum. Donec feugiat interdum vulputate. Curabitur mollis suscipit nisi, vel tincidunt risus fringilla at. Phasellus tincidunt, nulla a tincidunt tempor, libero turpis imperdiet tortor, vel convallis orci neque vitae nisi. Nunc euismod massa quis mollis ultricies. Proin non ante elit. Pellentesque et convallis massa. Curabitur blandit mattis metus, non aliquam erat iaculis ut. Nam vestibulum ipsum vitae nulla varius, sit amet sodales ipsum congue. 
Nullam eget mauris ut est blandit rhoncus sit amet ac arcu. Nulla at purus consequat, lobortis massa sit amet, posuere ante. Nam bibendum laoreet tempus. Fusce ac nulla consequat, placerat sem vitae, condimentum enim. Vestibulum sed tellus nec elit varius venenatis. Donec et dapibus dui. Nullam est metus, ultrices nec lectus vel, fermentum elementum lacus. Curabitur imperdiet vestibulum enim. Aenean sollicitudin at leo quis ullamcorper. Suspendisse in posuere risus. In quis mattis sem, eu facilisis arcu. Vestibulum faucibus auctor accumsan. Morbi mattis sit amet augue ac sodales. Integer varius eget orci iaculis aliquet. Suspendisse a auctor turpis. Fusce vestibulum vestibulum ante sed mattis. Mauris ornare rhoncus enim ac egestas. Donec turpis eros, interdum non placerat nec, adipiscing eu urna. Integer feugiat mi quis eros fringilla vehicula. Proin suscipit magna ultricies laoreet dignissim. Donec vehicula ac lacus non vehicula. Sed euismod mattis facilisis. Etiam nec risus vitae risus iaculis lobortis. Duis eu dui sit amet turpis tincidunt vulputate. Nunc tortor diam, egestas in ante ac, scelerisque placerat ante. Nullam interdum ultricies nisl a vehicula. Integer id nunc elit. Sed rutrum sit amet neque quis tristique. ================================================ FILE: spec/inputs/numbers/1.txt ================================================ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 ================================================ FILE: spec/inputs/numbers/10.txt ================================================ 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 ================================================ FILE: spec/inputs/numbers/11.txt ================================================ 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 ================================================ FILE: spec/inputs/numbers/12.txt ================================================ 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 ================================================ FILE: spec/inputs/numbers/13.txt ================================================ 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 ================================================ FILE: spec/inputs/numbers/14.txt ================================================ 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 ================================================ FILE: spec/inputs/numbers/15.txt ================================================ 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 ================================================ FILE: 
spec/inputs/numbers/16.txt ================================================ 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 ================================================ FILE: spec/inputs/numbers/17.txt ================================================ 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 ================================================ FILE: spec/inputs/numbers/18.txt ================================================ 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 ================================================ FILE: spec/inputs/numbers/19.txt ================================================ 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 ================================================ FILE: spec/inputs/numbers/2.txt ================================================ 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 ================================================ FILE: spec/inputs/numbers/20.txt ================================================ 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 ================================================ FILE: spec/inputs/numbers/3.txt ================================================ 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 ================================================ FILE: spec/inputs/numbers/4.txt ================================================ 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 ================================================ FILE: spec/inputs/numbers/5.txt ================================================ 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 ================================================ FILE: spec/inputs/numbers/6.txt ================================================ 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 ================================================ FILE: spec/inputs/numbers/7.txt ================================================ 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 
================================================ FILE: spec/inputs/numbers/8.txt ================================================ 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 ================================================ FILE: spec/inputs/numbers/9.txt ================================================ 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 ================================================ FILE: spec/inputs/numbers_0_100.txt ================================================ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 ================================================ FILE: spec/inputs/numbers_1_100.txt ================================================ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 ================================================ FILE: spec/inputs/people.json ================================================ {"id":1,"name":"Matthew Fuller","age":49,"email":"mfuller0@blogger.com","active":false} {"id":2,"name":"Pamela Thomas","age":58,"email":"pthomas1@apache.org","address":"92 Beilfuss Lane","active":false,"ip_address":"41.52.54.168"} {"id":3,"name":"Joan Stevens","age":33,"email":"jstevens2@xrea.com","address":"1 Wayridge Circle","active":true,"ip_address":"159.204.170.10"} {"id":4,"name":"Laura Reynolds","email":"lreynolds3@admin.ch","address":"431 Spenser Court","active":true,"ip_address":"164.254.150.90"} {"id":5,"name":"Daniel Baker","email":"dbaker4@blinklist.com","active":true,"ip_address":"165.138.63.70"} {"id":6,"name":"Christina Lane","email":"clane5@cnbc.com","address":"7 Chinook Park","active":true,"ip_address":"46.240.67.103"} {"id":7,"name":"Carlos Washington","age":50,"email":"cwashington6@issuu.com","address":"6487 Memorial Trail","active":false,"ip_address":"152.45.154.18"} {"id":8,"name":"Harold Reid","age":53,"email":"hreid7@seesaa.net","active":true} {"id":9,"name":"Earl Harris","age":37,"email":"eharris8@homestead.com","active":false} {"id":10,"name":"Jack Hernandez","age":30,"email":"jhernandez9@adobe.com","address":"29407 Memorial Alley","active":false,"ip_address":"129.222.144.1"} {"id":11,"name":"Nicole Torres","age":25,"email":"ntorresa@amazon.de","address":"34804 Havey Point","active":false,"ip_address":"5.114.113.83"} {"id":12,"name":"Theresa Gordon","age":19,"email":"tgordonb@xinhuanet.com","active":false} {"id":13,"name":"Emily Schmidt","age":25,"email":"eschmidtc@arstechnica.com","address":"115 Bluestem Pass","active":true} {"id":14,"name":"Dennis Ford","age":50,"email":"dfordd@hc360.com","address":"4107 Kim Avenue","active":true,"ip_address":"44.170.237.89"} {"id":15,"name":"Deborah Williams","age":28,"email":"dwilliamse@cmu.edu","address":"7 Kipling Pass","active":false} {"id":16,"name":"Rachel 
Sullivan","age":31,"email":"rsullivanf@pagesperso-orange.fr","address":"8196 Harbort Park","active":true,"ip_address":"216.142.141.210"} {"id":17,"name":"Phillip Jordan","email":"pjordang@liveinternet.ru","active":false} {"id":18,"name":"Fred Mitchell","email":"fmitchellh@shinystat.com","address":"279 Gateway Parkway","active":false} {"id":19,"name":"Antonio Dunn","age":23,"email":"adunni@mediafire.com","address":"71 Maple Place","active":true,"ip_address":"39.50.250.70"} {"id":20,"name":"Alan Boyd","age":59,"email":"aboydj@sbwire.com","address":"4302 Warner Road","active":false,"ip_address":"106.253.236.0"} {"id":21,"name":"Louise Wright","age":19,"email":"lwrightk@so-net.ne.jp","address":"5 Maryland Hill","active":false,"ip_address":"51.0.99.116"} {"id":22,"name":"Diane Greene","age":39,"email":"dgreenel@jugem.jp","address":"38 Merrick Lane","active":false,"ip_address":"146.124.156.180"} {"id":23,"name":"Emily Richardson","age":23,"email":"erichardsonm@csmonitor.com","active":true} {"id":24,"name":"Joseph Henderson","age":36,"email":"jhendersonn@drupal.org","address":"55 Morningstar Lane","active":true,"ip_address":"54.187.254.99"} {"id":25,"name":"Chris Fowler","age":31,"email":"cfowlero@msu.edu","address":"4 Oakridge Center","active":false} {"id":26,"name":"Helen West","age":38,"email":"hwestp@time.com","address":"93 Blaine Parkway","active":true,"ip_address":"159.131.255.177"} {"id":27,"name":"Jimmy Black","age":46,"email":"jblackq@house.gov","address":"80157 Bay Drive","active":true,"ip_address":"163.137.84.52"} {"id":28,"name":"Melissa Allen","age":56,"email":"mallenr@upenn.edu","address":"381 Merrick Way","active":false} {"id":29,"name":"Scott Walker","age":48,"email":"swalkers@etsy.com","active":true} {"id":30,"name":"Jimmy Wood","email":"jwoodt@bloomberg.com","address":"1041 Claremont Lane","active":true} {"id":31,"name":"Betty Jacobs","email":"bjacobsu@ihg.com","address":"6520 Anderson Junction","active":false,"ip_address":"166.45.58.141"} {"id":32,"name":"Richard Stone","age":34,"email":"rstonev@rakuten.co.jp","address":"51 Bay Pass","active":true,"ip_address":"9.35.132.204"} {"id":33,"name":"Melissa Henderson","age":21,"email":"mhendersonw@washington.edu","address":"06 Delaware Avenue","active":false} {"id":34,"name":"David Stanley","age":57,"email":"dstanleyx@ucoz.com","address":"692 Lien Avenue","active":true,"ip_address":"194.251.38.0"} {"id":35,"name":"Cynthia Murphy","age":20,"email":"cmurphyy@xinhuanet.com","active":false} {"id":36,"name":"Todd Henry","age":38,"address":"589 Katie Center","active":true,"ip_address":"177.233.117.222"} {"id":37,"name":"Christina Stephens","age":40,"email":"cstephens10@illinois.edu","address":"51039 Hermina Point","active":true} {"id":38,"name":"Sharon Gomez","email":"sgomez11@parallels.com","address":"57089 Texas Way","active":true,"ip_address":"149.85.104.141"} {"id":39,"name":"Benjamin Fisher","age":30,"email":"bfisher12@gmpg.org","address":"3 Welch Plaza","active":false,"ip_address":"116.184.105.191"} {"id":40,"name":"Mark Stewart","age":38,"email":"mstewart13@uiuc.edu","active":false,"ip_address":"167.115.237.197"} {"id":41,"name":"Mark Black","age":45,"email":"mblack14@tuttocitta.it","address":"9 Rutledge Pass","active":false,"ip_address":"108.90.166.239"} {"id":42,"name":"Christina Lawrence","age":47,"email":"clawrence15@simplemachines.org","address":"239 Eggendart Junction","active":true,"ip_address":"8.118.127.22"} {"id":43,"name":"Howard Lynch","age":52,"email":"hlynch16@slideshare.net","active":true} {"id":44,"name":"Heather 
Perez","age":60,"email":"hperez17@techcrunch.com","address":"1 Almo Court","active":false,"ip_address":"110.184.153.36"} {"id":45,"name":"Michael Howell","age":57,"email":"mhowell18@wufoo.com","address":"341 Shelley Alley","active":false} {"id":46,"name":"Gregory Johnson","age":57,"email":"gjohnson19@japanpost.jp","address":"4 Basil Plaza","active":true,"ip_address":"249.29.102.40"} {"id":47,"name":"Christopher Miller","age":50,"email":"cmiller1a@google.es","address":"76 Granby Way","active":true} {"id":48,"name":"Beverly Hall","age":60,"email":"bhall1b@cam.ac.uk","address":"9 Novick Place","active":true} {"id":49,"name":"Todd Adams","age":58,"email":"tadams1c@yahoo.co.jp","active":false} {"id":50,"name":"Judith Watkins","age":30,"email":"jwatkins1d@comcast.net","address":"5874 Esker Parkway","active":true,"ip_address":"229.176.89.163"} {"id":51,"name":"Cheryl Howard","age":34,"email":"choward1e@cam.ac.uk","address":"492 Mandrake Lane","active":false,"ip_address":"255.117.98.35"} {"id":52,"name":"Mary West","email":"mwest1f@cnn.com","address":"4 Vera Avenue","active":false,"ip_address":"118.130.207.177"} {"id":53,"name":"Carol Welch","age":39,"email":"cwelch1g@sun.com","address":"794 Burrows Pass","active":true,"ip_address":"205.98.9.218"} {"id":54,"name":"Donald Reed","age":23,"email":"dreed1h@wsj.com","address":"0769 Dryden Trail","active":true,"ip_address":"35.72.239.99"} {"id":55,"name":"Michael Wells","age":29,"email":"mwells1i@deviantart.com","address":"9033 Crescent Oaks Way","active":false,"ip_address":"33.18.26.152"} {"id":56,"name":"Joyce Montgomery","age":34,"email":"jmontgomery1j@sciencedaily.com","address":"29093 Lyons Circle","active":true,"ip_address":"85.155.89.174"} {"id":57,"name":"Angela Garza","age":24,"email":"agarza1k@hc360.com","address":"388 Kenwood Street","active":false,"ip_address":"204.191.24.172"} {"id":58,"name":"Rose Green","age":26,"email":"rgreen1l@businessinsider.com","address":"3 Mesta Pass","active":true} {"id":59,"name":"Wanda Williamson","age":39,"email":"wwilliamson1m@cafepress.com","address":"18596 Westridge Crossing","active":true,"ip_address":"215.98.196.209"} {"id":60,"name":"Irene Washington","age":49,"email":"iwashington1n@ameblo.jp","address":"83 Monica Crossing","active":false,"ip_address":"141.46.156.186"} {"id":61,"name":"Anna Freeman","age":50,"email":"afreeman1o@blogs.com","address":"3 Gulseth Way","active":true} {"id":62,"name":"Kathleen Romero","age":23,"email":"kromero1p@craigslist.org","address":"419 Leroy Court","active":true} {"id":63,"name":"Matthew Alexander","age":58,"email":"malexander1q@gnu.org","active":false} {"id":64,"name":"Louis Moore","age":50,"email":"lmoore1r@salon.com","address":"671 Buhler Hill","active":true,"ip_address":"21.247.160.104"} {"id":65,"name":"Christina Brooks","age":27,"email":"cbrooks1s@google.cn","address":"80405 Jana Circle","active":true,"ip_address":"121.100.200.46"} {"id":66,"name":"Sarah Moreno","age":30,"address":"03 Cottonwood Way","active":true,"ip_address":"111.174.142.117"} {"id":67,"name":"Harold Rodriguez","age":24,"email":"hrodriguez1u@squidoo.com","address":"76 Green Circle","active":true} {"id":68,"name":"Louise Black","age":18,"email":"lblack1v@yale.edu","address":"951 Blackbird Junction","active":false,"ip_address":"212.47.220.126"} {"id":69,"name":"Adam Montgomery","email":"amontgomery1w@mlb.com","address":"1 Mesta Terrace","active":false} {"id":70,"name":"Jacqueline Pierce","age":58,"email":"jpierce1x@google.com.au","address":"0161 Village 
Plaza","active":false,"ip_address":"116.164.88.112"} {"id":71,"name":"Ann Stone","age":45,"email":"astone1y@yelp.com","address":"1011 Heath Terrace","active":false} {"id":72,"name":"Teresa Arnold","age":33,"email":"tarnold1z@mayoclinic.com","active":false,"ip_address":"81.165.73.142"} {"id":73,"name":"Arthur Shaw","age":27,"email":"ashaw20@latimes.com","address":"9956 Hooker Road","active":true} {"id":74,"name":"Wayne Garrett","age":41,"email":"wgarrett21@adobe.com","address":"34 Grasskamp Street","active":true,"ip_address":"29.26.28.17"} {"id":75,"name":"Russell Castillo","age":46,"email":"rcastillo22@printfriendly.com","address":"444 South Avenue","active":false} {"id":76,"name":"Shirley Burke","age":47,"email":"sburke23@lulu.com","address":"70 Florence Drive","active":false} {"id":77,"name":"Tammy Washington","age":46,"email":"twashington24@youtube.com","address":"559 Hollow Ridge Road","active":true,"ip_address":"230.169.245.123"} {"id":78,"name":"Diane Freeman","age":49,"email":"dfreeman25@github.com","address":"04 Transport Center","active":false,"ip_address":"138.200.234.169"} {"id":79,"name":"Anne Morrison","email":"amorrison26@telegraph.co.uk","address":"525 Shasta Junction","active":true} {"id":80,"name":"Paul Johnston","age":51,"email":"pjohnston27@youku.com","address":"16254 Ryan Center","active":false,"ip_address":"214.38.125.121"} {"id":81,"name":"Virginia Welch","age":58,"email":"vwelch28@china.com.cn","address":"2 Michigan Hill","active":true} {"id":82,"name":"Louis Hughes","age":44,"email":"lhughes29@mysql.com","address":"423 Meadow Valley Pass","active":false,"ip_address":"213.45.167.91"} {"id":83,"name":"Betty Reynolds","age":57,"email":"breynolds2a@furl.net","address":"4486 Kedzie Road","active":true} {"id":84,"name":"Norma Olson","age":18,"email":"nolson2b@goo.gl","active":true} {"id":85,"name":"David Ward","age":28,"email":"dward2c@ibm.com","address":"3 Kings Place","active":true} {"id":86,"name":"Phyllis Williamson","age":26,"email":"pwilliamson2d@nationalgeographic.com","address":"7 Northview Street","active":false,"ip_address":"234.86.8.89"} {"id":87,"name":"Kathleen Holmes","age":46,"email":"kholmes2e@zdnet.com","address":"4814 Colorado Place","active":false} {"id":88,"name":"George King","age":23,"email":"gking2f@ask.com","address":"966 Morrow Junction","active":false,"ip_address":"89.94.24.41"} {"id":89,"name":"Raymond Garcia","age":47,"email":"rgarcia2g@quantcast.com","active":true,"ip_address":"135.10.187.167"} {"id":90,"name":"Rose Meyer","age":38,"active":true,"ip_address":"228.216.201.80"} {"id":91,"name":"Jennifer Gray","age":50,"email":"jgray2i@princeton.edu","address":"58241 Calypso Court","active":true,"ip_address":"158.144.236.158"} {"id":92,"name":"Bonnie Franklin","age":24,"email":"bfranklin2j@slideshare.net","address":"629 Prairieview Center","active":false} {"id":93,"name":"Sarah Martin","age":52,"email":"smartin2k@cnn.com","address":"997 Kensington Lane","active":false} {"id":94,"name":"Shirley Hamilton","age":39,"email":"shamilton2l@nih.gov","address":"934 Clarendon Lane","active":false} {"id":95,"name":"Gregory Kim","age":37,"email":"gkim2m@tinyurl.com","active":true,"ip_address":"216.24.238.78"} {"id":96,"name":"Betty Sanchez","age":46,"email":"bsanchez2n@washington.edu","active":true} {"id":97,"name":"Ann Cooper","age":41,"email":"acooper2o@issuu.com","active":false} {"id":98,"name":"Christopher Cole","active":true} {"id":99,"name":"Debra Lopez","age":36,"address":"4 Grim Drive","active":false,"ip_address":"1.217.64.60"} 
{"id":100,"name":"Shawn Moore","age":35,"email":"smoore2r@mayoclinic.com","active":true} ================================================ FILE: spec/lib/collect_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::RDD do let(:mapping) { lambda{|x| [x, 1]} } let(:numbers) { Generator.numbers } it '.collect_as_hash' do rdd = $sc.parallelize(numbers) rdd = rdd.map(mapping) expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)]) end context '.take' do let(:size) { 1000 } let(:numbers) { Generator.numbers(size) } let(:rdd) { $sc.parallelize(numbers) } it 'nothing' do expect(rdd.take(0)).to eql([]) end it 'first' do expect(rdd.first).to eql(numbers.first) end it 'less than limit' do _size = size / 2 expect(rdd.take(_size)).to eql(numbers.take(_size)) end it 'all' do expect(rdd.take(size)).to eql(numbers) end it 'more than limit' do expect(rdd.take(size*2)).to eql(numbers) end end end ================================================ FILE: spec/lib/command_spec.rb ================================================ require 'spec_helper' def to_s_method(x) x.to_s end RSpec::describe Spark::CommandBuilder do let(:numbers) { Generator.numbers } let(:rdd) { $sc.parallelize(numbers, 1) } context '.serialize_function' do let(:result) { numbers.map(&:to_s) } it 'string' do expect(rdd.map('lambda{|x| x.to_s}').collect).to eql(result) end it 'symbol' do expect(rdd.map(:to_s).collect).to eql(result) end it 'lambda' do expect(rdd.map(lambda{|x| x.to_s}).collect).to eql(result) end it 'method' do expect(rdd.map(method(:to_s_method)).collect).to eql(result) end end context '.bind' do it 'number' do number = rand(0..10000000) rdd2 = rdd.map(lambda{|x| x * number}).bind(number: number) expect(rdd2.collect).to eq(numbers.map{|x| x * number}) end it 'open struct' do require 'ostruct' struct = OpenStruct.new struct.number = 3 struct.string = '3' struct.array = [1, 2, 3] func = lambda{|item| item * struct.number + struct.string.to_i + struct.array[0] } rdd2 = rdd.add_library('ostruct') rdd2 = rdd2.map(func) rdd2 = rdd2.bind(struct: struct) expect(rdd2.collect).to eq(numbers.map(&func)) end it 'different naming' do array = [1, 2, 3] rdd2 = rdd.map(lambda{|_| my_array.size}) rdd2 = rdd2.bind(my_array: array) expect(rdd2.sum).to eq(numbers.size * array.size) end end end ================================================ FILE: spec/lib/config_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::Config do before(:context) do Spark.stop end after(:context) do spark_start end it 'should be stopped' do expect(Spark.started?).to be_falsy end context 'new config' do let(:configuration) do { 'test.test1' => 'test1', 'test.test2' => 'test2', 'test.test3' => 'test3' } end before(:each) do Spark.clear_config end it 'throught methods' do configuration.each do |key, value| Spark.config.set(key, value) end configuration.each do |key, value| expect(Spark.config.get(key)).to eql(value) end end it 'throught hash style' do configuration.each do |key, value| Spark.config[key] = value end configuration.each do |key, value| expect(Spark.config[key]).to eql(value) end end it 'throught dsl' do configuration.each do |key, value| Spark.config { set key, value } end configuration.each do |key, value| expect(Spark.config[key]).to eql(value) end end end end ================================================ FILE: spec/lib/context_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::Context do it 
'.run_job' do workers = 5 numbers = (0...100).to_a func = lambda{|part| part.size} ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd = $sc.parallelize(numbers, workers, ser) rdd_result = $sc.run_job(rdd, func) result = numbers.each_slice(numbers.size/workers).map(&func) expect(rdd_result).to eql(result) parts = [0, 2] func = lambda{|part| part.to_s} rdd_result = $sc.run_job(rdd, func, parts) result = [] sliced_numbers = numbers.each_slice(numbers.size/workers).to_a parts.each do |part| result << func.call(sliced_numbers[part]) end expect(rdd_result).to eql(result) end it '.broadcast' do workers = rand(1..5) values1 = [1,2,3] values2 = [4,5,6] broadcast1 = $sc.broadcast(values1) broadcast2 = $sc.broadcast(values2) rdd = $sc.parallelize(0..5, workers) rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2) rdd = rdd.map_partitions(lambda{|_| broadcast1.value + broadcast2.value }) expect(rdd.sum).to eql( (values1 + values2).reduce(:+) * workers ) end # context '.accumulator' do # it 'test' do # accum1 = $sc.accumulator(0,) # accum2 = $sc.accumulator(1, :*, 1) # accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max}) # accum1 += 1 # accum2.add(2) # accum2.add(2) # accum2.add(2) # accum3.add(9) # accum3.add(6) # accum3.add(7) # expect(accum1.value).to eql(1) # expect(accum2.value).to eql(8) # expect(accum3.value).to eql(9) # func = Proc.new do |_, index| # accum1.add(1) # accum2.add(2) # accum3.add(index * 10) # end # rdd = $sc.parallelize(0..4, 4) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3) # rdd = rdd.map_partitions_with_index(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum1.value).to eql(5) # expect(accum2.value).to eql(128) # expect(accum3.value).to eql(30) # end # context 'accum param' do # it 'symbol' do # accum1 = $sc.accumulator(1, :+, 0) # accum2 = $sc.accumulator(5, :-, 3) # accum3 = $sc.accumulator(1, :*, 1) # accum4 = $sc.accumulator(1.0, :/, 1.0) # accum5 = $sc.accumulator(2, :**, 2) # func = Proc.new do |_| # accum1.add(1) # accum2.add(1) # accum3.add(2) # accum4.add(2) # accum5.add(2) # end # rdd = $sc.parallelize(0..4, 2) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5) # rdd = rdd.map_partitions(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum1.value).to eq(3) # expect(accum2.value).to eq(1) # expect(accum3.value).to eq(4) # expect(accum4.value).to eq(4) # expect(accum5.value).to eq(65536) # end # it 'proc' do # accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0) # accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '') # accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, []) # func = Proc.new do |_| # accum1.add(1) # accum2.add('a') # accum3.add(1) # end # rdd = $sc.parallelize(0..4, 2) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3) # rdd = rdd.map_partitions(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum1.value).to eq(3) # expect(accum2.value).to eq('aaa') # expect(accum3.value).to eq([[1], [1]]) # end # it 'string' do # expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError) # accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0) # func = Proc.new do |_| # accum.add(1) # end # rdd = $sc.parallelize(0..4, 2) # rdd = rdd.bind(accum: accum) # rdd = 
rdd.map_partitions(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum.value).to eq(3) # end # end # end end ================================================ FILE: spec/lib/ext_spec.rb ================================================ require 'spec_helper' RSpec.describe Array do it '.deep_copy' do data = ['a', 'b', 'c'] new_data = data.dup data[0] << 'a' expect(data).to eql(new_data) new_data = data.deep_copy data[1] << 'b' expect(data).to_not eql(new_data) end end RSpec.describe Hash do it '.stringify_keys!' do data = { a: 'a', b: 'b', c: 'c' } data.stringify_keys! expect(data).to eql({ 'a' => 'a', 'b' => 'b', 'c' => 'c' }) end end RSpec.describe String do it '.camelize' do data = 'aaa_bbb_ccc'.camelize expect(data).to eql('AaaBbbCcc') end end RSpec.describe IO do it 'serialize' do file = Tempfile.new('serialize') file.binmode file.write_int(1) file.write_long(2) file.write_string('3') file.write_data([4]) file.rewind expect(file.read_int).to eq(1) expect(file.read_long).to eq(2) expect(file.read_string).to eq('3') expect(file.read_data).to eq([4]) file.unlink end end ================================================ FILE: spec/lib/external_apps_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::RDD do context '.pipe' do let(:words) { Generator.words } let(:numbers) { Generator.numbers } it 'single program' do skip if windows? rdd = $sc.parallelize(words, 1) rdd = rdd.pipe('tr a b') result = words.dup result.map! do |x| x.gsub('a', 'b') end expect(rdd.collect).to eql(result) end it 'multiple program' do skip if windows? rdd = $sc.parallelize(numbers, 1) rdd = rdd.pipe("tr 1 5", "awk '{print $1*10}'") rdd = rdd.map(lambda{|x| x.to_i * 100}) result = numbers.dup result.map! do |x| x.to_s.gsub('1', '5') end result.map! do |x| x.to_i * 10 end result.map! 
do |x| x * 100 end expect(rdd.collect).to eql(result) end end end ================================================ FILE: spec/lib/filter_spec.rb ================================================ require 'spec_helper' def func4(item) item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106 end RSpec.shared_examples 'a filtering' do |workers| context "with #{workers || 'default'} worker" do it 'when numbers' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.filter(func1) result = numbers.select(&func1) expect(rdd2.collect).to eql(result) rdd3 = rdd_numbers(workers) rdd3 = rdd3.filter(func1) rdd3 = rdd3.filter(func2) expect(rdd3.collect).to eql([]) end it 'when words' do rdd2 = rdd_words(workers) rdd2 = rdd2.filter(func3) result = words.select{|x| func3.call(x)} expect(rdd2.collect).to eql(result) rdd3 = rdd_words(workers) rdd3 = rdd3.filter(method(:func4)) result = words.select{|x| func4(x)} expect(rdd3.collect).to eql(result) end end end RSpec.describe 'Spark::RDD.filter' do let(:func1) { lambda{|x| x.to_i.even?} } let(:func2) { lambda{|x| x.to_i.odd?} } let(:func3) { lambda{|x| x.to_s.start_with?('b')} } context 'throught parallelize' do let(:numbers) { Generator.numbers_with_zero } let(:words) { Generator.words } def rdd_numbers(workers) $sc.parallelize(numbers, workers) end def rdd_words(workers) $sc.parallelize(words, workers) end it_behaves_like 'a filtering', 2 # it_behaves_like 'a filtering', nil # it_behaves_like 'a filtering', rand(2..10) end context 'throught text_file' do let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:file_words) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:numbers) { File.readlines(file_numbers).map(&:strip) } let(:words) { File.readlines(file_words).map(&:strip) } def rdd_numbers(workers) $sc.text_file(file_numbers, workers) end def rdd_words(workers) $sc.text_file(file_words, workers) end it_behaves_like 'a filtering', 2 # it_behaves_like 'a filtering', nil # it_behaves_like 'a filtering', rand(2..10) end end ================================================ FILE: spec/lib/flat_map_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a flat mapping' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map(func1) result = numbers.flat_map(&func1) expect(rdd2.collect).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.flat_map(func1) rdd3 = rdd3.flat_map(func2) rdd3 = rdd3.flat_map(func3) result = numbers.flat_map(&func1).flat_map(&func2).flat_map(&func3) expect(rdd3.collect).to eql(result) rdd4 = rdd(workers) rdd4 = rdd4.flat_map(func1) rdd4 = rdd4.flat_map(func2) rdd4 = rdd4.flat_map(func3) expect(rdd4.collect).to eql(rdd3.collect) end end RSpec.shared_examples 'a flat mapping values' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).flat_map_values(func1) result = [] hash_with_values.each do |(key, values)| values = func1.call(values).flatten values.each do |value| result << [key, value] end end expect(rdd2.collect).to eql(result) rdd2 = rdd(workers).flat_map_values(func2) result = [] hash_with_values.each do |(key, values)| values = func2.call(values).flatten values.each do |value| result << [key, value] end end expect(rdd2.collect).to eql(result) end end RSpec.describe 'Spark::RDD' do let(:func1) { lambda{|x| x*2} } let(:func2) { lambda{|x| [x*3, 1, 1]} } let(:func3) { lambda{|x| [x*4, 2, 2]} } context 'throught parallelize' do context '.flat_map' do let(:numbers) { Generator.numbers_with_zero } def rdd(workers) 
$sc.parallelize(numbers, workers) end it_behaves_like 'a flat mapping', 1 it_behaves_like 'a flat mapping', 2 # it_behaves_like 'a flat mapping', nil # it_behaves_like 'a flat mapping', rand(2..10) end context '.flat_map_values' do let(:func1) { lambda{|x| x*2} } let(:func2) { lambda{|x| [x.first]} } let(:hash_with_values) { Generator.hash_with_values } def rdd(workers) $sc.parallelize(hash_with_values, workers) end it_behaves_like 'a flat mapping values', 1 it_behaves_like 'a flat mapping values', 2 # it_behaves_like 'a flat mapping values', nil # it_behaves_like 'a flat mapping values', rand(2..10) end end context 'throught text_file' do context '.flat_map' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:numbers) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a flat mapping', 1 it_behaves_like 'a flat mapping', 2 # it_behaves_like 'a flat mapping', nil # it_behaves_like 'a flat mapping', rand(2..10) end end end ================================================ FILE: spec/lib/group_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a groupping by key' do |workers| it "with #{workers || 'default'} worker" do expect(rdd_result(workers)).to eql(result) end end RSpec.shared_examples 'a cogroupping by key' do |workers| context "with #{workers || 'default'} worker" do it '.group_with' do rdd = rdd_1(workers).group_with(rdd_2(workers)) expect(rdd.collect_as_hash).to eql(result_12) end it '.cogroup' do rdd = rdd_1(workers).cogroup(rdd_2(workers), rdd_3(workers)) expect(rdd.collect_as_hash).to eql(result_123) end end end RSpec.shared_examples 'a groupping by' do |workers| it "with #{workers || 'default'} worker" do rdd = rdd_numbers(workers) rdd = rdd.group_by(key_function1) expect(rdd.collect_as_hash).to eql(numbers.group_by(&key_function1)) rdd = rdd_words(workers) rdd = rdd.group_by(key_function2) expect(rdd.collect_as_hash).to eql(words.group_by(&key_function2)) end end RSpec.describe 'Spark::RDD' do def make_result(*hashes) _result = {} hashes.each do |data| data.each do |key, value| _result[key] ||= [] _result[key] << value end end _result end context '.group_by_key' do let(:hash) { Generator.hash } let(:result) { make_result(hash) } def rdd_result(workers) rdd = $sc.parallelize(hash) rdd.group_by_key.collect_as_hash end it_behaves_like 'a groupping by key', 1 it_behaves_like 'a groupping by key', 2 # it_behaves_like 'a groupping by key', nil # it_behaves_like 'a groupping by key', rand(2..10) end context 'cogroup' do let(:hash1) { Generator.hash } let(:hash2) { Generator.hash } let(:hash3) { Generator.hash } let(:result_12) { make_result(hash1, hash2) } let(:result_123) { make_result(hash1, hash2, hash3) } def rdd_1(workers) $sc.parallelize(hash1) end def rdd_2(workers) $sc.parallelize(hash2) end def rdd_3(workers) $sc.parallelize(hash3) end it_behaves_like 'a cogroupping by key', 1 it_behaves_like 'a cogroupping by key', 2 # it_behaves_like 'a cogroupping by key', nil # it_behaves_like 'a cogroupping by key', rand(2..10) end context 'group_by' do let(:key_function1) { lambda{|x| x%2} } let(:key_function2) { lambda{|x| x.size} } let(:numbers) { Generator.numbers } let(:words) { Generator.words } def rdd_numbers(workers) $sc.parallelize(numbers) end def rdd_words(workers) $sc.parallelize(words) end it_behaves_like 'a groupping by', 1 it_behaves_like 'a groupping by', 2 # it_behaves_like 'a groupping by', nil # it_behaves_like 'a groupping by', rand(2..10) end 
end ================================================ FILE: spec/lib/helper_spec.rb ================================================ require 'spec_helper' RSpec.configure do |c| c.include Spark::Helper::Parser c.include Spark::Helper::Statistic end RSpec.describe Spark::Helper do it 'memory size' do expect(to_memory_size('512mb')).to eql(524288.0) expect(to_memory_size('1586 mb')).to eql(1624064.0) expect(to_memory_size('3 MB')).to eql(3072.0) expect(to_memory_size('9gb')).to eql(9437184.0) expect(to_memory_size('9gb', 'mb')).to eql(9216.0) expect(to_memory_size('9mb', 'gb')).to eql(0.01) expect(to_memory_size('6652548796kb', 'mb')).to eql(6496629.68) end context 'statistic' do it 'compute_fraction' do expect(compute_fraction(1, 1000, true)).to be_within(0.001).of(0.013) expect(compute_fraction(2, 1000, true)).to be_within(0.001).of(0.018) expect(compute_fraction(3, 1000, true)).to be_within(0.001).of(0.023) expect(compute_fraction(4, 1000, true)).to be_within(0.001).of(0.028) expect(compute_fraction(5, 1000, true)).to be_within(0.001).of(0.031) expect(compute_fraction(1, 1000, false)).to be_within(0.001).of(0.0249) expect(compute_fraction(2, 1000, false)).to be_within(0.001).of(0.0268) expect(compute_fraction(3, 1000, false)).to be_within(0.001).of(0.0287) expect(compute_fraction(4, 1000, false)).to be_within(0.001).of(0.0305) expect(compute_fraction(5, 1000, false)).to be_within(0.001).of(0.0322) end it 'bisect_right' do data = [10, 20, 30, 40, 50, 60, 70, 80, 90] expect(bisect_right(data, 0)).to eq(0) expect(bisect_right(data, 1)).to eq(0) expect(bisect_right(data, 1, 2)).to eq(2) expect(bisect_right(data, 1, 3)).to eq(3) expect(bisect_right(data, 1, 4)).to eq(4) expect(bisect_right(data, 9)).to eq(0) expect(bisect_right(data, 10)).to eq(1) expect(bisect_right(data, 40)).to eq(4) expect(bisect_right(data, 42)).to eq(4) expect(bisect_right(data, 72)).to eq(7) expect(bisect_right(data, 80, 4)).to eq(8) expect(bisect_right(data, 80, 5)).to eq(8) expect(bisect_right(data, 80, 8)).to eq(8) expect(bisect_right(data, 80, 9)).to eq(9) expect(bisect_right(data, 200)).to eq(9) end it 'determine_bounds' do data = [10, 20, 30, 40, 50, 60, 70, 80, 90] expect(determine_bounds(data, 0)).to eq([]) expect(determine_bounds(data, 1)).to eq([]) expect(determine_bounds(data, 2)).to eq([50]) expect(determine_bounds(data, 3)).to eq([40, 70]) expect(determine_bounds(data, 4)).to eq([30, 50, 70]) expect(determine_bounds(data, 20)).to eq(data) end end end ================================================ FILE: spec/lib/key_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a keying by' do |workers| it "with #{workers || 'default'} worker" do rdd = rdd_numbers(workers) rdd = rdd.key_by(key_function1) result = numbers.map{|item| [key_function1.call(item), item]} expect(rdd.collect).to eql(result) rdd = rdd_words(workers) rdd = rdd.key_by(key_function2) result = words.map{|item| [key_function2.call(item), item]} expect(rdd.collect).to eql(result) end end RSpec.describe 'Spark::RDD' do context 'key_by' do let(:key_function1) { lambda{|x| x.even?} } let(:key_function2) { lambda{|x| x.include?('a')} } let(:numbers) { Generator.numbers } let(:words) { Generator.words } def rdd_numbers(workers) $sc.parallelize(numbers) end def rdd_words(workers) $sc.parallelize(words) end it_behaves_like 'a keying by', 1 it_behaves_like 'a keying by', 2 # it_behaves_like 'a keying by', nil # it_behaves_like 'a keying by', rand(2..10) end it 'lookup' do numbers = Generator.numbers 
rdd_numbers = $sc.parallelize(numbers, 2) rdd = rdd_numbers.group_by(lambda {|x| x%3}) rdd.lookup(2) expect(rdd.lookup(2).first).to eq( numbers.group_by{|x| x%3}[2] ) rdd = rdd_numbers.key_by(lambda{|x| x.even?}) expect(rdd.lookup(true)).to eq( numbers.select(&:even?) ) end end ================================================ FILE: spec/lib/manipulation_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::RDD' do let(:numbers) { 1..100 } let(:rand_numbers) { Generator.numbers } it '.glom' do rdd = $sc.parallelize(numbers, 1).glom expect(rdd.collect).to eql([numbers.to_a]) ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd = $sc.parallelize(numbers, 5, ser).glom expect(rdd.collect).to eql(numbers.each_slice(20).to_a) end it '.coalesce' do rdd = $sc.parallelize(numbers, 5) rdd2 = rdd.glom expect(rdd2.collect.size).to eql(5) rdd3 = rdd.coalesce(4).glom expect(rdd3.collect.size).to eql(4) end it '.distinct' do rdd = $sc.parallelize(rand_numbers, 5) rdd = rdd.distinct expect(rdd.collect.sort).to eql(rand_numbers.uniq.sort) rdd = $sc.parallelize(numbers, 5) rdd = rdd.map(lambda{|x| 1}) rdd = rdd.distinct expect(rdd.collect).to eql([1]) end context '.union' do it 'classic method' do rdd = $sc.parallelize(numbers, 5) rdd = rdd.union(rdd).collect expect(rdd.collect.sort).to eql((numbers.to_a+numbers.to_a).sort) end it 'with a different serializer' do rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) }) rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) }) expect { rdd1.union(rdd2).collect }.to_not raise_error end it 'as operator' do rdd1 = $sc.parallelize(numbers) rdd2 = $sc.parallelize(rand_numbers) expect((rdd1+rdd2).sum).to eql((numbers.to_a+rand_numbers).reduce(:+)) end end it '.compact' do data = [nil, nil , 0, 0, 1, 2, nil, 6] result = data.compact ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd = $sc.parallelize(data, 1).compact expect(rdd.collect).to eql(result) rdd = $sc.parallelize(data, 5, ser).compact expect(rdd.collect).to eql(result) rdd = $sc.parallelize(data, 1, ser).compact expect(rdd.collect).to eql(result) end it '.intersection' do data1 = [0,1,2,3,4,5,6,7,8,9,10] data2 = [5,6,7,8,9,10,11,12,13,14,15] rdd1 = $sc.parallelize(data1) rdd2 = $sc.parallelize(data2) expect(rdd1.intersection(rdd2).collect.sort).to eql(data1 & data2) end it '.shuffle' do data = Generator.numbers rdd = $sc.parallelize(data) expect(rdd.shuffle.collect).to_not eql(data) end context '.cartesian' do let(:data1) { Generator.numbers(100) } let(:data2) { Generator.numbers(100) } let(:result) { data1.product(data2).map(&:to_s).sort } it 'unbatched' do ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd1 = $sc.parallelize(data1, 2, ser) rdd2 = $sc.parallelize(data2, 2, ser) rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s}) expect(rdd.collect.sort).to eql(result) end it 'batched' do ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) } ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) } rdd1 = $sc.parallelize(data1, 2, ser1) rdd2 = $sc.parallelize(data2, 2, ser2) rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s}) expect(rdd.collect.sort).to eql(result) end end end ================================================ FILE: spec/lib/map_partitions_spec.rb ================================================ require 'spec_helper' def func3(x) x.map(&:to_i).reduce(:+) end def func4_with_index(data, index) [{ index => 
data.map(&:to_i).reduce(:*) }] end RSpec.shared_examples 'a map partitions' do |workers| context "with #{workers || 'default'} worker" do it 'without index' do rdd2 = rdd(workers).map_partitions(func1) result = func1.call(numbers) expect(func1.call(rdd2.collect)).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.map_partitions(func1) rdd3 = rdd3.map_partitions(func2) rdd3 = rdd3.map_partitions(method(:func3)) result = func3(func2.call(func1.call(numbers))) # Not same number of workers expect(rdd3.collect.size).to be >= 1 rdd4 = rdd(workers) rdd4 = rdd4.map_partitions(func1) rdd4 = rdd4.map_partitions(func2) rdd4 = rdd4.map_partitions(method(:func3)) expect(rdd4.collect).to eql(rdd3.collect) end it 'with index' do rdd2 = rdd(workers).map_partitions_with_index(method(:func4_with_index)) result = rdd2.collect expect(result).to be_a(Array) result.each do |x| expect(x).to be_a(Hash) end # Multiply by 0 # Some values are 0 because of batched serialization expect(result.map(&:values).flatten.compact.uniq.first).to eql(0) end end end RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do let(:func1) { lambda{|x| x.map(&:to_i)} } let(:func2) { lambda{|x| x.map{|y| y*2} } } context 'throught parallelize' do let(:numbers) { 0..1000 } def rdd(workers) $sc.parallelize(numbers, workers) end it_behaves_like 'a map partitions', 1 it_behaves_like 'a map partitions', 2 # it_behaves_like 'a map partitions', nil # it_behaves_like 'a map partitions', rand(2..10) end context 'throught text_file' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:numbers) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a map partitions', 1 it_behaves_like 'a map partitions', 2 # it_behaves_like 'a map partitions', nil # it_behaves_like 'a map partitions', rand(2..10) end end ================================================ FILE: spec/lib/map_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a mapping' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map(func1) result = numbers.map(&func1) expect(rdd2.collect).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.map(func1) rdd3 = rdd3.map(func2) rdd3 = rdd3.map(func3) result = numbers.map(&func1).map(&func2).map(&func3) expect(rdd3.collect).to eql(result) rdd4 = rdd(workers) rdd4 = rdd4.map(func3) rdd4 = rdd4.map(func2) rdd4 = rdd4.map(func1) expect(rdd4.collect).to eql(rdd3.collect) end end RSpec.shared_examples 'a mapping values' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map_values(func1) result = hash.map{|key, value| [key, func1.call(value)]} expect(rdd2.collect).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.map_values(func1) rdd3 = rdd3.map_values(func2) rdd3 = rdd3.map_values(func3) result = hash.map{|key, value| [key, func1.call(value)]} .map{|key, value| [key, func2.call(value)]} .map{|key, value| [key, func3.call(value)]} expect(rdd3.collect).to eql(result) end end RSpec.describe 'Spark::RDD' do let(:func1) { lambda{|x| x*2} } let(:func2) { lambda{|x| x*3} } let(:func3) { lambda{|x| x*4} } context 'throught parallelize' do context '.map' do let(:numbers) { Generator.numbers } def rdd(workers) $sc.parallelize(numbers, workers) end it_behaves_like 'a mapping', 1 it_behaves_like 'a mapping', 2 # it_behaves_like 'a mapping', nil # it_behaves_like 'a mapping', rand(2..10) end context '.map_values' do let!(:hash) { Generator.hash } def rdd(workers) $sc.parallelize(hash, workers) end 
it_behaves_like 'a mapping values', 1 it_behaves_like 'a mapping values', 2 # it_behaves_like 'a mapping values', nil # it_behaves_like 'a mapping values', rand(2..10) end end context 'throught text_file' do context '.map' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:numbers) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a mapping', 1 it_behaves_like 'a mapping', 2 # it_behaves_like 'a mapping', nil # it_behaves_like 'a mapping', rand(2..10) end end end ================================================ FILE: spec/lib/mllib/classification_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib classification' do let(:data1) do [ LabeledPoint.new(0.0, [1, 0, 0]), LabeledPoint.new(1.0, [0, 1, 1]), LabeledPoint.new(0.0, [2, 0, 0]), LabeledPoint.new(1.0, [0, 2, 1]) ] end let(:values1) do data1.map do |lp| lp.features.values end end let(:rdd1) { $sc.parallelize(data1) } context 'logistic regression' do it 'test' do lrm = LogisticRegressionWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end context 'svm' do it 'test' do lrm = SVMWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end context 'naive bayes' do it 'test' do lrm = NaiveBayes.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end end ================================================ FILE: spec/lib/mllib/clustering_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib clustering' do context 'kmeans' do it 'test' do data = [ DenseVector.new([0, 1.1]), DenseVector.new([0, 1.2]), DenseVector.new([1.1, 0]), DenseVector.new([1.2, 0]) ] model = KMeans.train($sc.parallelize(data), 2, initialization_mode: 'k-means||') expect(model.predict(data[0])).to eq(model.predict(data[1])) expect(model.predict(data[2])).to eq(model.predict(data[3])) end it 'deterministic' do data = Array.new(10) do |i| i *= 10 DenseVector.new([i, i]) end clusters1 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42) clusters2 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42) centers1 = clusters1.centers.to_a centers2 = clusters2.centers.to_a centers1.zip(centers2).each do |c1, c2| expect(c1).to eq(c2) end end end end ================================================ FILE: spec/lib/mllib/matrix_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib::Matrix' do context 'dense' do it 'construct' do values = [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]] matrix = DenseMatrix.new(3, 3, [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]) expect(matrix.shape).to eq([3, 3]) expect(matrix.values).to eq([[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]) end end context 'sparse' do it 'construct' do values = [1.0, 2.0, 4.0, 5.0] column_pointers = [0, 2, 2, 4, 4] row_indices = [1, 2, 1, 2] matrix = SparseMatrix.new(3, 4, column_pointers, row_indices, values) expect(matrix.shape).to eq([3, 4]) expect(matrix.to_a).to eq( [ [0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 4.0, 
0.0], [2.0, 0.0, 5.0, 0.0] ] ) end end end ================================================ FILE: spec/lib/mllib/regression_spec.rb ================================================ require 'spec_helper' # Mllib functions are tested on Spark # This just test if ruby call proper methods RSpec.describe 'Spark::Mllib regression' do let(:data1) do [ LabeledPoint.new(-1.0, [0, -1]), LabeledPoint.new(1.0, [0, 1]), LabeledPoint.new(-1.0, [0, -2]), LabeledPoint.new(1.0, [0, 2]) ] end let(:values1) do data1.map do |lp| lp.features.values end end let(:rdd1) { $sc.parallelize(data1) } context 'labeled point' do let(:lp) { LabeledPoint.new(1, [1,2,3]) } it 'from array' do expect(lp.label).to eql(1.0) expect(lp.features).to be_a(DenseVector) end it 'serialize' do lp2 = Marshal.load(Marshal.dump(lp)) expect(lp2.label).to eql(lp.label) expect(lp2.features.values).to eql(lp.features.values) end end context 'linear regression' do context 'test' do let(:lrm) { LinearRegressionWithSGD.train(rdd1) } it 'test' do expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end it 'test via rdd' do rdd = $sc.parallelize(values1, 1) rdd = rdd.map(lambda{|value| model.predict(value)}) rdd = rdd.bind(model: lrm) result = rdd.collect expect(result[0]).to be <= 0 expect(result[1]).to be > 0 expect(result[2]).to be <= 0 expect(result[3]).to be > 0 end end # Y = 3 + 10*X1 + 10*X2 it 'linear regression' do data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 3.0, ['10.0', '10.0'], 100, 42, 0.1) rdd = $sc.parallelize(data) lrm = LinearRegressionWithSGD.train(rdd, iterations: 1000, intercept: true, step: 1.0) expect(lrm.intercept).to be_between(2.5, 3.5) expect(lrm.weights.size).to eq(2) expect(lrm.weights[0]).to be_between(9.0, 11.0) expect(lrm.weights[1]).to be_between(9.0, 11.0) end end context 'lasso' do it 'test' do lrm = LassoWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end it 'local random SGD with initial weights' do data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 2.0, ['-1.5', '0.01'], 1000, 42, 0.1) data.map! 
do |lp| LabeledPoint.new(lp.label, [1.0] + lp.features.values) end rdd = $sc.parallelize(data); lrm = LassoWithSGD.train(rdd, step: 1.0, reg_param: 0.01, iterations: 40, initial_weights: [-1.0, -1.0, -1.0]) expect(lrm.weights[0]).to be_between(1.9, 2.1) expect(lrm.weights[1]).to be_between(-1.60, -1.40) expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2) end end context 'ridge' do it 'test' do lrm = RidgeRegressionWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end end ================================================ FILE: spec/lib/mllib/vector_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib::Vector' do context 'parsing' do it 'dense vector' do dv = DenseVector.new([1.0, 2.0, 3.0, 4.0, 5.0]) dv2 = DenseVector.parse(dv.to_s) dv3 = Vectors.parse(dv.to_s) expect(dv.to_s).to eq("[1.0,2.0,3.0,4.0,5.0]") expect(dv2.values).to eq(dv.values) expect(dv3.values).to eq(dv.values) end it 'sparse vector' do sv = SparseVector.new(5, {1 => 3, 4 => 5}) sv2 = SparseVector.parse(sv.to_s) sv3 = Vectors.parse(sv.to_s) expect(sv.to_s).to eq("(5,[1,4],[3,5])") expect(sv2.size).to eq(sv.size) expect(sv2.indices).to eq(sv.indices) expect(sv2.values).to eq(sv.values) expect(sv3.size).to eq(sv.size) expect(sv3.indices).to eq(sv.indices) expect(sv3.values).to eq(sv.values) end end it 'dot' do sv = SparseVector.new(4, {1 => 1, 3 => 2}) dv = DenseVector.new([1.0, 2.0, 3.0, 4.0]) lst = DenseVector.new([1, 2, 3, 4]) expect(sv.dot(dv)).to eq(10.0) expect(dv.dot(dv)).to eq(30.0) expect(lst.dot(dv)).to eq(30.0) end it 'squared distance' do sv = SparseVector.new(4, {1 => 1, 3 => 2}) dv = DenseVector.new([1.0, 2.0, 3.0, 4.0]) lst = DenseVector.new([4, 3, 2, 1]) expect(sv.squared_distance(dv)).to eq(15) expect(sv.squared_distance(lst)).to eq(25) expect(dv.squared_distance(lst)).to eq(20) expect(dv.squared_distance(sv)).to eq(15) expect(lst.squared_distance(sv)).to eq(25) expect(lst.squared_distance(dv)).to eq(20) expect(sv.squared_distance(sv)).to eq(0) expect(dv.squared_distance(dv)).to eq(0) expect(lst.squared_distance(lst)).to eq(0) end it 'sparse vector indexing' do sv1 = SparseVector.new(4, {1 => 1, 3 => 2}) sv2 = SparseVector.new(4, [1, 3], [1, 2]) expect(sv1[0]).to eq(0) expect(sv1[3]).to eq(2) expect(sv1[1]).to eq(1) expect(sv1[2]).to eq(0) expect(sv1[-1]).to eq(2) expect(sv1[-2]).to eq(0) expect(sv1[-4]).to eq(0) expect(sv2[0]).to eq(0) expect(sv2[3]).to eq(2) expect(sv2[1]).to eq(1) expect(sv2[2]).to eq(0) expect(sv2[-1]).to eq(2) expect(sv2[-2]).to eq(0) expect(sv2[-4]).to eq(0) end end ================================================ FILE: spec/lib/reduce_by_key_spec.rb ================================================ require 'spec_helper' def flat_map(line) line.split end def map(item) [item, 1] end def reduce(x,y) x+y end RSpec.shared_examples 'a words counting' do |workers| context "with #{workers || 'default'} worker" do let(:result) do keyyed = lines.flat_map{|x| x.split}.map{|x| [x,1]} result = keyyed.reduce({}){|memo, item| key = item[0] value = item[1] memo[key] ||= 0 memo[key] += value memo } result end it 'when lambda' do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(lambda{|line| line.split}) rdd2 = rdd2.map(lambda{|word| [word, 1]}) rdd2 = rdd2.reduce_by_key(lambda{|x,y| x+y}) expect(rdd2.collect_as_hash).to eql(result) end it 'when method' do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(method(:flat_map)) 
rdd2 = rdd2.map(method(:map)) rdd2 = rdd2.reduce_by_key(method(:reduce)) expect(rdd2.collect_as_hash).to eql(result) end it 'keys, values' do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(method(:flat_map)) rdd2 = rdd2.map(method(:map)) rdd2 = rdd2.reduce_by_key(method(:reduce)) expect(rdd2.keys.collect.sort).to eql(result.keys.sort) expect { rdd2.values.collect.reduce(:+) }.to_not raise_error end end end RSpec.describe 'Spark::RDD' do context '.reduce_by_key' do context 'through parallelize' do let(:lines) { Generator.lines } def rdd(workers) $sc.parallelize(lines, workers) end it_behaves_like 'a words counting', 2 # it_behaves_like 'a words counting', nil # it_behaves_like 'a words counting', rand(2..10) end context 'through text_file' do let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:lines) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a words counting', 2 # it_behaves_like 'a words counting', nil # it_behaves_like 'a words counting', rand(2..10) end end context '.fold_by_key' do let(:numbers) { Generator.numbers } let(:zero_value) { 0 } let(:rdd) { $sc.parallelize(numbers) } let(:map) { lambda{|x| [x, 1]} } let(:add) { lambda{|x,y| x+y} } let(:result) do _result = {} numbers.map(&map).each do |key, value| _result[key] ||= zero_value _result[key] = add.call(_result[key], value) end _result end def fold_by_key(num_partitions=nil) rdd.map(map).fold_by_key(zero_value, add, num_partitions).collect_as_hash end it 'default num_partitions' do expect(fold_by_key).to eq(result) end it 'custom num_partitions' do expect( fold_by_key rand(1..10) ).to eq(result) end end end ================================================ FILE: spec/lib/reduce_spec.rb ================================================ require 'spec_helper' def longest_words(memo, word) memo.length > word.length ? 
memo : word end RSpec.shared_examples 'a reducing' do |workers| context "with #{workers || 'default'} worker" do it '.reduce' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) rdd2 = rdd2.reduce(func1) result = numbers.map(&:to_i).reduce(&func1) expect(rdd2).to eql(result) rdd3 = rdd_numbers(workers) rdd3 = rdd3.map(to_i) rdd3 = rdd3.reduce(func2) result = numbers.map(&:to_i).reduce(&func2) expect(rdd3).to eql(result) rdd4 = rdd_lines(workers) rdd4 = rdd4.flat_map(split) rdd4 = rdd4.reduce(method(:longest_words)) result = lines.flat_map(&split).reduce(&lambda(&method(:longest_words))) expect(rdd4).to eql(result) end it '.fold' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) rdd_result = rdd2.fold(1, func1) # all workers add 1 + last reducing phase result = numbers.map(&:to_i).reduce(&func1) + rdd2.partitions_size + 1 expect(rdd_result).to eql(result) end it '.aggregate' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) # Sum of items + their count seq = lambda{|x,y| [x[0] + y, x[1] + 1]} com = lambda{|x,y| [x[0] + y[0], x[1] + y[1]]} rdd_result = rdd2.aggregate([0,0], seq, com) result = [numbers.reduce(:+), numbers.size] expect(rdd_result).to eql(result) end it '.max' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.max).to eql(numbers.map(&:to_i).max) end it '.min' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.min).to eql(numbers.map(&:to_i).min) end it '.sum' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.sum).to eql(numbers.map(&:to_i).reduce(:+)) end it '.count' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.count).to eql(numbers.size) end end end RSpec.describe 'Spark::RDD' do let(:func1) { lambda{|sum, x| sum+x} } let(:func2) { lambda{|product, x| product*x} } let(:to_i) { lambda{|item| item.to_i} } let(:split) { lambda{|item| item.split} } context 'throught parallelize' do let(:numbers) { Generator.numbers } let(:lines) { Generator.lines } def rdd_numbers(workers) $sc.parallelize(numbers, workers) end def rdd_lines(workers) $sc.parallelize(lines, workers) end it_behaves_like 'a reducing', 1 it_behaves_like 'a reducing', 2 # it_behaves_like 'a reducing', nil # it_behaves_like 'a reducing', rand(2..10) end context 'throught text_file' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:file_lines) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:numbers) { File.readlines(file).map(&:strip).map(&:to_i) } let(:lines) { File.readlines(file_lines).map(&:strip) } def rdd_numbers(workers) $sc.text_file(file, workers) end def rdd_lines(workers) $sc.text_file(file_lines, workers) end it_behaves_like 'a reducing', 1 it_behaves_like 'a reducing', 2 # it_behaves_like 'a reducing', nil # it_behaves_like 'a reducing', rand(2..10) end end ================================================ FILE: spec/lib/sample_spec.rb ================================================ require 'spec_helper' # Sample method can not be tested because of random generator # Just test it for raising error RSpec.shared_examples 'a sampler' do |workers| context "with #{workers || 'default'} worker" do context '.sample' do it 'with replacement' do rdd2 = rdd(workers).sample(true, rand) expect { rdd2.collect }.to_not raise_error end it 'without replacement' do rdd2 = rdd(workers).sample(false, rand) expect { rdd2.collect }.to_not raise_error end end context '.take_sample' do it 'with replacement' do size = rand(10..999) expect(rdd(workers).take_sample(true, size).size).to eql(size) end it 'without replacement' do size = 
rand(10..999) expect(rdd(workers).take_sample(false, size).size).to eql(size) end end end end RSpec.describe 'Spark::RDD' do let(:numbers) { Generator.numbers(1000) } def rdd(workers) $sc.parallelize(numbers, workers) end it_behaves_like 'a sampler', 1 it_behaves_like 'a sampler', 2 # it_behaves_like 'a sampler', nil # it_behaves_like 'a sampler', rand(2..10) end ================================================ FILE: spec/lib/serializer_spec.rb ================================================ require 'spec_helper' require 'zlib' RSpec.describe Spark::Serializer do let(:data) { [1, 'test', 2.0, [3], {key: 'value'}, :test, String] } it 'find' do expect(described_class.find('not_existed_class')).to eql(nil) expect(described_class.find('Marshal')).to eq(described_class::Marshal) expect(described_class.find('marshal')).to eq(described_class::Marshal) expect(described_class.find(:marshal)).to eq(described_class::Marshal) expect(described_class.find('batched')).to eq(described_class::Batched) end it 'find!' do expect { expect(described_class.find!('not_existed_class')) }.to raise_error(Spark::SerializeError) expect { expect(described_class.find!('marshal')) }.to_not raise_error expect { expect(described_class.find!('batched')) }.to_not raise_error end it 'register' do NewSerializer = Class.new expect(described_class.find('new_serializer_1')).to eql(nil) expect(described_class.find('new_serializer_2')).to eql(nil) expect(described_class.find('new_serializer_3')).to eql(nil) described_class.register('new_serializer_1', 'new_serializer_2', 'new_serializer_3', NewSerializer) expect(described_class.find('new_serializer_1')).to eql(NewSerializer) expect(described_class.find('new_serializer_2')).to eql(NewSerializer) expect(described_class.find('new_serializer_3')).to eql(NewSerializer) end it '==' do # One class marshal1 = described_class::Marshal.new marshal2 = described_class::Marshal.new expect(marshal1).to eq(marshal1) expect(marshal1).to eq(marshal2) # Two classes compressed1 = described_class::Compressed.new(marshal1) compressed2 = described_class::Compressed.new(marshal2) expect(compressed1).to eq(compressed1) expect(compressed1).to eq(compressed2) # Three classes batched1 = described_class::Batched.new(compressed1, 1) batched2 = described_class::Batched.new(compressed2, 1) batched3 = described_class::Batched.new(compressed1, 2) expect(batched1).to eq(batched2) expect(batched1).to_not eq(batched3) end context 'build' do let(:marshal1) { described_class::Marshal.new } let(:compressed1) { described_class::Compressed.new(marshal1) } let(:batched1) { described_class::Batched.new(compressed1, 1) } it 'block' do expect(described_class.build{ marshal }).to eq(marshal1) expect(described_class.build{ marshal }).to eq(described_class.build{ __marshal__ }) expect(described_class.build{ compressed(marshal) }).to eq(compressed1) expect(described_class.build{ batched(compressed(marshal), 1) }).to eq(batched1) end it 'text' do expect(described_class.build('marshal')).to eq(marshal1) expect(described_class.build('compressed(marshal)')).to eq(compressed1) expect(described_class.build('batched(compressed(marshal), 1)')).to eq(batched1) end end it 'serialization' do marshal1 = described_class.build{ marshal } compressed1 = described_class.build{ compressed(marshal) } expect(marshal1.dump(data)).to eq(Marshal.dump(data)) expect(compressed1.dump(data)).to eq( Zlib::Deflate.deflate(Marshal.dump(data)) ) end context 'Auto batched' do let(:klass) { Spark::Serializer::AutoBatched } let(:marshal) { 
Spark::Serializer::Marshal.new } let(:numbers) { Generator.numbers } it 'initialize' do expect { klass.new }.to raise_error(ArgumentError) expect { klass.new(marshal) }.to_not raise_error expect { klass.new(marshal, 1) }.to raise_error(Spark::SerializeError) end it 'serialization' do serializer1 = klass.new(marshal) serializer2 = klass.new(marshal, 2) rdd1 = Spark.sc.parallelize(numbers, 2, serializer1) rdd2 = Spark.sc.parallelize(numbers, 2, serializer2).map(:to_i) result = rdd1.collect expect(rdd1.serializer).to eq(serializer1) expect(result).to eq(numbers) expect(result).to eq(rdd2.collect) end end end ================================================ FILE: spec/lib/sort_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a sorting' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(split) result = lines.flat_map(&split) # Sort by self rdd3 = rdd2.map(map).sort_by_key result2 = result.map(&map).sort_by{|(key, _)| key} expect(rdd3.collect).to eql(result2) # Sort by len rdd3 = rdd2.map(len_map).sort_by_key result2 = result.map(&len_map).sort_by{|(key, _)| key} expect(rdd3.collect).to eql(result2) end end RSpec.describe 'Spark::RDD' do let(:split) { lambda{|x| x.split} } let(:map) { lambda{|x| [x.to_s, 1]} } let(:len_map) { lambda{|x| [x.size, x]} } context 'throught parallelize' do context '.map' do let(:lines) { Generator.lines } def rdd(workers) $sc.parallelize(lines, workers) end it_behaves_like 'a sorting', 1 it_behaves_like 'a sorting', 2 # it_behaves_like 'a sorting', nil # it_behaves_like 'a sorting', rand(2..10) end end context 'throught text_file' do context '.map' do let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:lines) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a sorting', 1 it_behaves_like 'a sorting', 2 # it_behaves_like 'a sorting', nil # it_behaves_like 'a sorting', rand(2..10) end end end ================================================ FILE: spec/lib/sql/column_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'binary comparison' do |op| it "#{op}" do to_test = 20 result = df.select('age').where( df.age.__send__(op, to_test) ).values.flatten result.each do |item| if op == '!=' expect(item).to_not eq(to_test) else expect(item).to be.__send__(op, to_test) end end end end RSpec.describe Spark::SQL::Column do let(:file) { File.join('spec', 'inputs', 'people.json') } let(:df) { $sql.read.json(file) } let(:data) do # Data are line delimited result = [] File.readlines(file).each do |line| result << JSON.parse(line) end result end context 'operators' do it 'func' do result = df.select( df.id, df.active, ~df.id, !df.active ).collect_as_hash.map(&:values) result.each do |item| expect(item[0]).to eq(-item[2]) expect(item[1]).to eq(!item[3]) end end context 'binary' do it 'arithmetic' do result = df.select( df.id, df.id+1, df.id-1, df.id*2, df.id/2, df.id%2 ).collect_as_hash.map(&:values) result.each do |item| expect(item[1]).to eq(item[0]+1) expect(item[2]).to eq(item[0]-1) expect(item[3]).to eq(item[0]*2) expect(item[4]).to eq(item[0]/2.0) expect(item[5]).to eq(item[0]%2) end end # comparison it_behaves_like 'binary comparison', '==' it_behaves_like 'binary comparison', '!=' it_behaves_like 'binary comparison', '<' it_behaves_like 'binary comparison', '<=' it_behaves_like 'binary comparison', '>' it_behaves_like 'binary comparison', '>=' it 'logical' do 
result = df.select('id').where( (df.id >= 20) & (df.id <= 30) ).values.flatten expect(result).to all( be_between(20, 30) ) result = df.select('id').where( (df.id == 1) | (df.id == 2) ).values.flatten expect(result).to eq([1, 2]) end it 'like' do result = df.select('email').where( df.email.like('%com%') ).values.flatten expect(result).to all( include('com') ) end it 'null' do result1 = df.select('address').where( df.address.is_null ).values.flatten result2 = df.select('address').where( df.address.is_not_null ).values.flatten expect(result1).to all( be_nil ) expect(result2).to all( be_an(String) ) end end end it 'substr' do result = df.select( df.name.substr(1, 3) ).values.flatten result.each do |item| expect(item.size).to eq(3) end end it 'isin' do result = df.select('age').where( df.age.isin(20, 21, 22) ).values.flatten expect(result).to all( eq(20).or eq(21).or eq(22) ) end it 'alias' do result = df.select( df.id.as('id2') ).collect_as_hash.map(&:keys).flatten expect(result).to all( eq('id2') ) end it 'cast' do result = df.select( df.id, df.id.cast('string').alias('age2') ).values result.each do |item| expect(item[0]).to be_an(Integer) expect(item[0].to_s).to eq(item[1]) end end it 'when, otherwise' do result = df.select(df.id, Spark::SQL::Column.when(df.id <= 20, 1).when(df.id >= 30, 3).otherwise(2)).values result.each do |item| id = item[0] value = item[1] if id <= 20 expect(value).to eq(1) elsif id >= 30 expect(value).to eq(3) else expect(value).to eq(2) end end end end ================================================ FILE: spec/lib/sql/data_frame_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::SQL::DataFrame do let(:file) { File.join('spec', 'inputs', 'people.json') } let(:df) { $sql.read.json(file) } context '[]' do it 'String' do value = df['age'] expect(value).to be_a(Spark::SQL::Column) expect(value.to_s).to eq('Column("age")') end it 'Array' do value = df[ ['name', 'age'] ] expect(value).to be_a(Spark::SQL::DataFrame) expect(value.columns).to eq(['name', 'age']) end it 'Numeric' do value = df[0] expect(value).to be_a(Spark::SQL::Column) expect(value.to_s).to eq('Column("active")') end it 'Column' do value = df[ df[0] == true ] expect(value).to be_a(Spark::SQL::DataFrame) end end it 'columns' do expect(df.columns).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name']) end it 'schema' do schema = df.schema expect(schema).to be_a(Spark::SQL::StructType) expect(schema.type_name).to eq('struct') end it 'show_string' do expect(df.show_string).to start_with('+--') end it 'dtypes' do expect(df.dtypes).to eq([['active', 'boolean'], ['address', 'string'], ['age', 'long'], ['email', 'string'], ['id', 'long'], ['ip_address', 'string'], ['name', 'string']]) end it 'take' do expect(df.take(10).size).to eq(10) end it 'count' do expect(df.count).to eq(100) end context 'select' do it '*' do row = df.select('*').first expect(row.data.keys).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name']) end it 'with string' do row = df.select('name', 'age').first expect(row.data.keys).to eq(['name', 'age']) end it 'with column' do row = df.select(df.name, df.age).first expect(row.data.keys).to eq(['name', 'age']) end end context 'where' do it 'with string' do eq_20 = df.filter('age = 20').collect expect(eq_20.map{|c| c['age']}).to all(be == 20) end it 'with column' do nil_values = df.where(df.age.is_null).collect greater_or_eq_20 = df.where(df.age >= 20).collect lesser_than_20 = df.where(df.age < 20).collect 
expect(nil_values.size + greater_or_eq_20.size + lesser_than_20.size).to eq(df.count) expect(nil_values.map{|c| c['age']}).to all(be_nil) expect(greater_or_eq_20.map{|c| c['age']}).to all(be >= 20) expect(lesser_than_20.map{|c| c['age']}).to all(be < 20) end end end ================================================ FILE: spec/lib/statistic_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a stats' do |workers| let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] } context "with #{workers || 'default'} worker" do it 'stats class' do stats = $sc.parallelize(numbers, workers).stats expect(stats.sum).to be_within(0.1).of(20) expect(stats.mean).to be_within(0.1).of(20/6.0) expect(stats.max).to be_within(0.1).of(8.0) expect(stats.min).to be_within(0.1).of(1.0) expect(stats.variance).to be_within(0.1).of(6.22222) expect(stats.sample_variance).to be_within(0.1).of(7.46667) expect(stats.stdev).to be_within(0.1).of(2.49444) expect(stats.sample_stdev).to be_within(0.1).of(2.73252) end it 'rdd methods' do rdd = $sc.parallelize([1, 2, 3], workers) expect(rdd.mean).to be_within(0.1).of(2.0) expect(rdd.variance).to be_within(0.1).of(0.666) expect(rdd.stdev).to be_within(0.1).of(0.816) expect(rdd.sample_stdev).to be_within(0.1).of(1.0) expect(rdd.sample_variance).to be_within(0.1).of(1.0) end end end RSpec.shared_examples 'a histogram' do |workers| context "with #{workers || 'default'} worker" do it 'empty' do rdd = $sc.parallelize([], workers, ser) expect( rdd.histogram([0, 10])[1] ).to eq([0]) expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0]) end it 'validation' do rdd = $sc.parallelize([], workers, ser) expect { rdd.histogram(0) }.to raise_error(ArgumentError) end it 'double' do rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser) buckets, counts = rdd.histogram(2) expect(buckets).to eq([1.0, 2.5, 4.0]) expect(counts).to eq([2, 2]) end it 'out of range' do rdd = $sc.parallelize([10.01, -0.01], workers, ser) expect( rdd.histogram([0, 10])[1] ).to eq([0]) expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0]) end it 'in range with one bucket' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram([0, 10])[1] ).to eq([4]) expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1]) end it 'in range with one bucket exact match' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram([1, 4])[1] ).to eq([4]) end it 'out of range with two buckets' do rdd = $sc.parallelize([10.01, -0.01], workers, ser) expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0]) end it 'out of range with two uneven buckets' do rdd = $sc.parallelize([10.01, -0.01], workers, ser) expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0]) end it 'in range with two buckets' do rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser) expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2]) end it 'in range with two bucket and nil' do rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser) expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2]) end it 'in range with two uneven buckets' do rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser) expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2]) end it 'mixed range with two uneven buckets' do rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser) expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3]) end it 'mixed range with four uneven buckets' do rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser) expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 
200.0])[1] ).to eq([4, 2, 1, 3]) end it 'mixed range with uneven buckets and NaN' do rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser) expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3]) end it 'out of range with infinite buckets' do rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser) expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1]) end it 'without buckets' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram(1) ).to eq([[1, 4], [4]]) end it 'without buckets single element' do rdd = $sc.parallelize([1], workers, ser) expect( rdd.histogram(1) ).to eq([[1, 1], [1]]) end it 'without bucket no range' do rdd = $sc.parallelize([1, 1, 1, 1], workers, ser) expect( rdd.histogram(1) ).to eq([[1, 1], [4]]) end it 'without buckets basic two' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]]) end it 'without buckets with more requested than elements' do rdd = $sc.parallelize([1, 2], workers, ser) buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0] hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1] expect( rdd.histogram(10) ).to eq([buckets, hist]) end it 'string' do rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser) expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2]) expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]]) expect { rdd.histogram(2) }.to raise_error(Spark::RDDError) end end end RSpec.describe Spark::RDD do let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } } context '.stats' do it_behaves_like 'a stats', 1 it_behaves_like 'a stats', 2 # it_behaves_like 'a stats', rand(2..5) end context '.histogram' do it_behaves_like 'a histogram', 1 it_behaves_like 'a histogram', 2 # it_behaves_like 'a histogram', rand(2..5) end end ================================================ FILE: spec/lib/whole_text_files_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a whole_text_files' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map(get_numbers) result = files.size expect(rdd2.collect.size).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.flat_map(get_numbers) result = 0 files.each{|f| result += File.read(f).split.map(&:to_i).reduce(:+)} expect(rdd3.sum).to eql(result) end end RSpec.describe 'Spark::Context' do let(:get_numbers) { lambda{|file, content| content.split.map(&:to_i)} } let(:dir) { File.join('spec', 'inputs', 'numbers') } let(:files) { Dir.glob(File.join(dir, '*')) } def rdd(workers) $sc.whole_text_files(dir, workers) end it_behaves_like 'a whole_text_files', 1 it_behaves_like 'a whole_text_files', 2 # it_behaves_like 'a whole_text_files', nil # it_behaves_like 'a whole_text_files', rand(2..10) end ================================================ FILE: spec/spec_helper.rb ================================================ require 'simplecov' SimpleCov.start $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib' require 'ruby-spark' require 'generator' # Loading Spark.load_lib Spark.jb.import_all_test Spark::Mllib.import # Keep it on method because its called from config test def spark_start Spark.logger.disable Spark.config do set 'spark.ruby.serializer.batch_size', 100 end $sc = Spark.start $sql = Spark.start_sql end def windows? 
RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ end RSpec.configure do |config| config.default_formatter = 'doc' config.color = true config.tty = true config.before(:suite) do spark_start end config.after(:suite) do Spark.stop end end