Repository: ondra-m/ruby-spark
Branch: master
Commit: d1b9787642fe
Files: 191
Total size: 440.0 KB
Directory structure:
gitextract_h83fh3m2/
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── Gemfile
├── Guardfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── TODO.md
├── benchmark/
│ ├── aggregate.rb
│ ├── bisect.rb
│ ├── comparison/
│ │ ├── prepare.sh
│ │ ├── python.py
│ │ ├── r.r
│ │ ├── ruby.rb
│ │ ├── run-all.sh
│ │ └── scala.scala
│ ├── custom_marshal.rb
│ ├── digest.rb
│ ├── enumerator.rb
│ ├── serializer.rb
│ ├── sort.rb
│ ├── sort2.rb
│ └── take.rb
├── bin/
│ └── ruby-spark
├── example/
│ ├── pi.rb
│ └── website_search.rb
├── ext/
│ ├── ruby_c/
│ │ ├── extconf.rb
│ │ ├── murmur.c
│ │ ├── murmur.h
│ │ └── ruby-spark.c
│ ├── ruby_java/
│ │ ├── Digest.java
│ │ ├── Murmur2.java
│ │ ├── RubySparkExtService.java
│ │ └── extconf.rb
│ └── spark/
│ ├── build.sbt
│ ├── project/
│ │ └── plugins.sbt
│ ├── sbt/
│ │ └── sbt
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ ├── Exec.scala
│ │ ├── MLLibAPI.scala
│ │ ├── Marshal.scala
│ │ ├── MarshalDump.scala
│ │ ├── MarshalLoad.scala
│ │ ├── RubyAccumulatorParam.scala
│ │ ├── RubyBroadcast.scala
│ │ ├── RubyConstant.scala
│ │ ├── RubyMLLibAPI.scala
│ │ ├── RubyMLLibUtilAPI.scala
│ │ ├── RubyPage.scala
│ │ ├── RubyRDD.scala
│ │ ├── RubySerializer.scala
│ │ ├── RubyTab.scala
│ │ ├── RubyUtils.scala
│ │ └── RubyWorker.scala
│ └── test/
│ └── scala/
│ └── MarshalSpec.scala
├── lib/
│ ├── ruby-spark.rb
│ ├── spark/
│ │ ├── accumulator.rb
│ │ ├── broadcast.rb
│ │ ├── build.rb
│ │ ├── cli.rb
│ │ ├── command/
│ │ │ ├── base.rb
│ │ │ ├── basic.rb
│ │ │ ├── pair.rb
│ │ │ ├── sort.rb
│ │ │ └── statistic.rb
│ │ ├── command.rb
│ │ ├── command_builder.rb
│ │ ├── command_validator.rb
│ │ ├── config.rb
│ │ ├── constant.rb
│ │ ├── context.rb
│ │ ├── error.rb
│ │ ├── ext/
│ │ │ ├── hash.rb
│ │ │ ├── integer.rb
│ │ │ ├── io.rb
│ │ │ ├── ip_socket.rb
│ │ │ ├── module.rb
│ │ │ ├── object.rb
│ │ │ └── string.rb
│ │ ├── helper/
│ │ │ ├── logger.rb
│ │ │ ├── parser.rb
│ │ │ ├── serialize.rb
│ │ │ ├── statistic.rb
│ │ │ └── system.rb
│ │ ├── helper.rb
│ │ ├── java_bridge/
│ │ │ ├── base.rb
│ │ │ ├── jruby.rb
│ │ │ └── rjb.rb
│ │ ├── java_bridge.rb
│ │ ├── library.rb
│ │ ├── logger.rb
│ │ ├── mllib/
│ │ │ ├── classification/
│ │ │ │ ├── common.rb
│ │ │ │ ├── logistic_regression.rb
│ │ │ │ ├── naive_bayes.rb
│ │ │ │ └── svm.rb
│ │ │ ├── clustering/
│ │ │ │ ├── gaussian_mixture.rb
│ │ │ │ └── kmeans.rb
│ │ │ ├── matrix.rb
│ │ │ ├── regression/
│ │ │ │ ├── common.rb
│ │ │ │ ├── labeled_point.rb
│ │ │ │ ├── lasso.rb
│ │ │ │ ├── linear.rb
│ │ │ │ └── ridge.rb
│ │ │ ├── ruby_matrix/
│ │ │ │ ├── matrix_adapter.rb
│ │ │ │ └── vector_adapter.rb
│ │ │ ├── stat/
│ │ │ │ └── distribution.rb
│ │ │ └── vector.rb
│ │ ├── mllib.rb
│ │ ├── rdd.rb
│ │ ├── sampler.rb
│ │ ├── serializer/
│ │ │ ├── auto_batched.rb
│ │ │ ├── base.rb
│ │ │ ├── batched.rb
│ │ │ ├── cartesian.rb
│ │ │ ├── compressed.rb
│ │ │ ├── marshal.rb
│ │ │ ├── message_pack.rb
│ │ │ ├── oj.rb
│ │ │ ├── pair.rb
│ │ │ └── text.rb
│ │ ├── serializer.rb
│ │ ├── sort.rb
│ │ ├── sql/
│ │ │ ├── column.rb
│ │ │ ├── context.rb
│ │ │ ├── data_frame.rb
│ │ │ ├── data_frame_reader.rb
│ │ │ ├── data_type.rb
│ │ │ └── row.rb
│ │ ├── sql.rb
│ │ ├── stat_counter.rb
│ │ ├── storage_level.rb
│ │ ├── version.rb
│ │ └── worker/
│ │ ├── master.rb
│ │ ├── spark_files.rb
│ │ └── worker.rb
│ └── spark.rb
├── ruby-spark.gemspec
└── spec/
├── generator.rb
├── inputs/
│ ├── lorem_300.txt
│ ├── numbers/
│ │ ├── 1.txt
│ │ ├── 10.txt
│ │ ├── 11.txt
│ │ ├── 12.txt
│ │ ├── 13.txt
│ │ ├── 14.txt
│ │ ├── 15.txt
│ │ ├── 16.txt
│ │ ├── 17.txt
│ │ ├── 18.txt
│ │ ├── 19.txt
│ │ ├── 2.txt
│ │ ├── 20.txt
│ │ ├── 3.txt
│ │ ├── 4.txt
│ │ ├── 5.txt
│ │ ├── 6.txt
│ │ ├── 7.txt
│ │ ├── 8.txt
│ │ └── 9.txt
│ ├── numbers_0_100.txt
│ ├── numbers_1_100.txt
│ └── people.json
├── lib/
│ ├── collect_spec.rb
│ ├── command_spec.rb
│ ├── config_spec.rb
│ ├── context_spec.rb
│ ├── ext_spec.rb
│ ├── external_apps_spec.rb
│ ├── filter_spec.rb
│ ├── flat_map_spec.rb
│ ├── group_spec.rb
│ ├── helper_spec.rb
│ ├── key_spec.rb
│ ├── manipulation_spec.rb
│ ├── map_partitions_spec.rb
│ ├── map_spec.rb
│ ├── mllib/
│ │ ├── classification_spec.rb
│ │ ├── clustering_spec.rb
│ │ ├── matrix_spec.rb
│ │ ├── regression_spec.rb
│ │ └── vector_spec.rb
│ ├── reduce_by_key_spec.rb
│ ├── reduce_spec.rb
│ ├── sample_spec.rb
│ ├── serializer_spec.rb
│ ├── sort_spec.rb
│ ├── sql/
│ │ ├── column_spec.rb
│ │ └── data_frame_spec.rb
│ ├── statistic_spec.rb
│ └── whole_text_files_spec.rb
└── spec_helper.rb
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
/.gemtags
/.tags
/java/spark.jar
.jbundler
target/*
*.class
*.jar
pom.xml
vendor/*
*.gem
*.rbc
.bundle
.config
.yardoc
Gemfile.lock
InstalledFiles
_yardoc
coverage
doc/
lib/bundler/man
pkg
rdoc
spec/reports
test/tmp
test/version_tmp
tmp
*.bundle
*.so
*.o
*.a
mkmf.log
ext/spark/target/*
ext/spark/project/target/*
ext/spark/project/project/target/*
wiki
/benchmark/performance/spark/*
/benchmark/performance/rspark/*
/_*
================================================
FILE: .travis.yml
================================================
language: ruby
rvm:
- 2.2.0
before_script:
- bundle exec rake compile
- bundle exec ruby bin/ruby-spark build
cache:
bundler: true
directories:
- $HOME/.m2
- $HOME/.ivy2
- $HOME/.sbt
================================================
FILE: CHANGELOG.md
================================================
## Unreleased
## 1.3.0
- new method on RDD (lookup)
- fix sbt url
- Spark 1.5.0
## 1.2.0 (15.06.2015)
- target folder is now located at HOME
- better serializers
- error when java class does not exist
- default setting at ~/.ruby-spark.conf
- compatible with Spark 1.4.0
- added calling site to RDD
================================================
FILE: Gemfile
================================================
source 'https://rubygems.org'
gemspec
gem 'sourcify', '0.6.0.rc4'
gem 'method_source'
gem 'commander'
gem 'pry'
gem 'nio4r'
gem 'distribution'
platform :mri do
gem 'rjb'
gem 'msgpack'
gem 'oj'
gem 'narray'
end
platform :jruby do
gem 'msgpack-jruby', require: 'msgpack'
# NameError: no constructor for arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime
# gem 'mdarray'
end
group :stats do
# gem 'nmatrix'
# gem 'statsample'
# gem 'statsample-glm'
# gem 'statsample-timeseries'
# gem 'statistics2'
# gem 'statsample-optimization' # libgsl0-dev
# gem 'narray'
# gem 'gsl-nmatrix'
end
group :development do
gem 'benchmark-ips'
gem 'rspec'
gem 'rake-compiler'
gem 'guard'
gem 'guard-rspec'
gem 'listen'
end
group :test do
gem 'simplecov', require: false
end
================================================
FILE: Guardfile
================================================
guard :rspec, cmd: 'rspec' do
watch(%r{^spec/.+_spec\.rb$})
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
watch('spec/spec_helper.rb') { "spec" }
end
================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2014 Ondřej Moravčík
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
# Ruby-Spark [](https://travis-ci.org/ondra-m/ruby-spark)
Apache Spark™ is a fast and general engine for large-scale data processing.
This gem lets you use Spark functionality from Ruby.
> Word count in Spark's Ruby API
```ruby
file = spark.text_file("hdfs://...")
file.flat_map(:split)
.map(lambda{|word| [word, 1]})
.reduce_by_key(lambda{|a, b| a+b})
```
- [Apache Spark](http://spark.apache.org)
- [Wiki](https://github.com/ondra-m/ruby-spark/wiki)
- [Rubydoc](http://www.rubydoc.info/gems/ruby-spark)
## Installation
### Requirements
- Java 7+
- Ruby 2+
- wget or curl
- MRI or JRuby
Add this line to your application's Gemfile:
```ruby
gem 'ruby-spark'
```
And then execute:
```
$ bundle
```
Or install it yourself as:
```
$ gem install ruby-spark
```
Run `rake compile` if you are using the gem from a local filesystem.
### Build Apache Spark
This command downloads Spark and builds the extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars are stored in your HOME directory.
```
$ ruby-spark build
```
## Usage
You can use Ruby-Spark via the interactive shell (Pry is used):
```
$ ruby-spark shell
```
Or use it in an existing project.
If you want to configure Spark first, see [configuration](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
```ruby
require 'ruby-spark'
# Configuration
Spark.config do
set_app_name "RubySpark"
set 'spark.ruby.serializer', 'oj'
set 'spark.ruby.serializer.batch_size', 100
end
# Start Apache Spark
Spark.start
# Context reference
Spark.sc
```
Finally, stop the cluster. In the shell, Spark is stopped automatically when the environment exits.
```ruby
Spark.stop
```
After the first use, a global configuration file is created at **~/.ruby-spark.conf**. Properties for Spark and RubySpark can be specified there.
## Creating an RDD (a new collection)
Single text file:
```ruby
rdd = sc.text_file(FILE, workers_num, serializer=nil)
```
All files in a directory:
```ruby
rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
```
Uploading structures directly from Ruby:
```ruby
rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
rdd = sc.parallelize(1..5, workers_num, serializer=nil)
```
There are two conditions:
1. the chosen serializer must be able to serialize the data
2. the data must be iterable
If you do not specify a serializer, the default one is used (defined by the spark.ruby.serializer.* options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want to create a custom serializer.
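For example, a minimal sketch of passing an explicit serializer (it assumes Spark is already started and `sc` is the context, as in the Usage section; the serializer string follows the `Spark::Serializer.build` format used in the Examples below):
```ruby
# Batches of 1024 items, each batch serialized by Marshal
ser = Spark::Serializer.build('batched(marshal, 1024)')

# Explicit serializer, 2 workers
rdd = sc.parallelize(1..1000, 2, ser)

# No serializer given: the default from spark.ruby.serializer.* is used
rdd_default = sc.parallelize(1..1000, 2)
```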
## Operations
All operations can be divided into 2 groups:
- **Transformations:** append a new operation to the current RDD and return a new RDD; nothing is computed yet
- **Actions:** add an operation and start the computation, returning a result
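A minimal sketch of the difference (it assumes a started context, with `sc` as in the Usage section):
```ruby
numbers = sc.parallelize(1..10, 2)

# Transformation: only appends the operation and returns a new RDD,
# nothing is computed yet
doubled = numbers.map(lambda{|x| x * 2})

# Action: starts the computation and returns the result
doubled.collect
# => [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
```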
More information:
- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)
- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)
- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)
You can also check the official Spark documentation. First make sure that the method is implemented here.
- [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)
- [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)
#### Transformations
rdd.map(function)
- Return a new RDD by applying a function to all elements of this RDD.
rdd.flat_map(function)
- Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.
rdd.map_partitions(function)
- Return a new RDD by applying a function to each partition of this RDD.
rdd.filter(function)
- Return a new RDD containing only the elements that satisfy a predicate.
rdd.cartesian(other)
- Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.
rdd.intersection(other)
- Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.
rdd.sample(with_replacement, fraction, seed)
- Return a sampled subset of this RDD. Sampling is based on Poisson and uniform distributions.
rdd.group_by_key(num_partitions)
- Group the values for each key in the RDD into a single sequence.
...many more...
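A short illustrative sketch of two of the transformations above (it assumes a started context with `sc`; the exact shape of the returned pairs may differ):
```ruby
pairs = sc.parallelize([['a', 1], ['b', 2], ['a', 3]], 2)

# Group the values for each key into a single sequence
pairs.group_by_key.collect
# => e.g. [["a", [1, 3]], ["b", [2]]]

# Cartesian product of two RDDs
left  = sc.parallelize([1, 2], 2)
right = sc.parallelize(['x', 'y'], 2)
left.cartesian(right).collect
# => e.g. [[1, "x"], [1, "y"], [2, "x"], [2, "y"]]
```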
#### Actions
rdd.take(count)
- Take the first `count` elements of the RDD.
rdd.reduce(function)
- Reduces the elements of this RDD using the specified lambda or method.
rdd.aggregate(zero_value, seq_op, comb_op)
- Aggregate the elements of each partition, and then the results for all the partitions, using given combine functions and a neutral “zero value”.
rdd.histogram(buckets)
- Compute a histogram using the provided buckets.
rdd.collect
- Return an array that contains all of the elements in this RDD.
...many more...
## Examples
##### Basic methods
```ruby
# Every batch will be serialized by Marshal and will have size 10
ser = Spark::Serializer.build('batched(marshal, 10)')
# Range 0..100, 2 workers, custom serializer
rdd = Spark.sc.parallelize(0..100, 2, ser)
# Take first 5 items
rdd.take(5)
# => [0, 1, 2, 3, 4]
# Numbers reducing
rdd.reduce(lambda{|sum, x| sum+x})
rdd.reduce(:+)
rdd.sum
# => 5050
# Aggregating with a zero value
seq = lambda{|x,y| x+y}
com = lambda{|x,y| x*y}
rdd.aggregate(1, seq, com)
# 1. Every worker adds up its numbers
# => [1226, 3826]
# 2. Results are multiplied
# => 4690676
# Statistic method
rdd.stats
# => StatCounter: (count, mean, max, min, variance,
# sample_variance, stdev, sample_stdev)
# Compute a histogram using the provided buckets.
rdd.histogram(2)
# => [[0.0, 50.0, 100], [50, 51]]
# Mapping
rdd.map(lambda {|x| x*2}).collect
# => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...]
rdd.map(:to_f).collect
# => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...]
# Mapping over whole partitions
rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
# => [1225, 3825]
# Filtering
rdd.filter(lambda{|x| x.even?}).collect
# => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...]
# Sampling
rdd.sample(true, 10).collect
# => [3, 36, 40, 54, 58, 82, 86, 95, 98]
# Sampling a fixed number of items
rdd.take_sample(true, 10)
# => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32]
# Using an external process
rdd.pipe('cat', "awk '{print $1*10}'")
# => ["0", "10", "20", "30", "40", "50", ...]
```
##### Word count using methods
```ruby
# Content:
# "first line"
# "second line"
rdd = sc.text_file(PATH)
# ["first", "line", "second", "line"]
rdd = rdd.flat_map(lambda{|line| line.split})
# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
rdd = rdd.map(lambda{|word| [word, 1]})
# [["first", 1], ["line", 2], ["second", 1]]
rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
# {"first"=>1, "line"=>2, "second"=>1}
rdd.collect_as_hash
```
##### Estimating PI with a custom serializer
```ruby
slices = 3
n = 100000 * slices
def map(_)
x = rand * 2 - 1
y = rand * 2 - 1
if x**2 + y**2 < 1
return 1
else
return 0
end
end
rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
rdd = rdd.map(method(:map))
puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
```
##### Estimating PI
```ruby
rdd = sc.parallelize([10_000], 1)
rdd = rdd.add_library('bigdecimal/math')
rdd = rdd.map(lambda{|x| BigMath.PI(x)})
rdd.collect # => #
```
### Mllib (Machine Learning Library)
Mllib functions use Spark's Machine Learning Library. Ruby objects are serialized and deserialized in Java, so you cannot use custom classes. Only primitive types such as strings or integers are supported.
All supported methods/models:
- [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib)
- [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib)
##### Linear regression
```ruby
# Import Mllib classes into Object
# Otherwise they are accessible via Spark::Mllib::LinearRegressionWithSGD
Spark::Mllib.import(Object)
# Training data
data = [
LabeledPoint.new(0.0, [0.0]),
LabeledPoint.new(1.0, [1.0]),
LabeledPoint.new(3.0, [2.0]),
LabeledPoint.new(2.0, [3.0])
]
# Train a model
lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
lrm.predict([0.0])
```
##### K-Means
```ruby
Spark::Mllib.import
# Dense vectors
data = [
DenseVector.new([0.0,0.0]),
DenseVector.new([1.0,1.0]),
DenseVector.new([9.0,8.0]),
DenseVector.new([8.0,9.0])
]
model = KMeans.train(sc.parallelize(data), 2)
model.predict([0.0, 0.0]) == model.predict([1.0, 1.0])
# => true
model.predict([8.0, 9.0]) == model.predict([9.0, 8.0])
# => true
```
## Benchmarks
================================================
FILE: Rakefile
================================================
#-*- mode: ruby -*-
require "bundler/gem_tasks"
require "rspec/core/rake_task"
RSpec::Core::RakeTask.new
task default: :spec
task test: :spec
def java?
RUBY_PLATFORM =~ /java/
end
if java?
require "rake/javaextensiontask"
Rake::JavaExtensionTask.new("ruby_java") do |ext|
ext.name = "ruby_spark_ext"
end
else
require "rake/extensiontask"
Rake::ExtensionTask.new("ruby_c") do |ext|
ext.name = "ruby_spark_ext"
end
end
task :clean do
Dir['lib/*.{jar,o,so}'].each do |path|
puts "Deleting #{path} ..."
File.delete(path)
end
FileUtils.rm_rf('./pkg')
FileUtils.rm_rf('./tmp')
end
================================================
FILE: TODO.md
================================================
- refactor JavaBridge
- to_java, from_java
- every type should have class
- automatic registration
- add Streaming
- worker information (time, memory, ...)
- killing zombie workers
- add_rb, add_inline_rb to Spark::{Context, RDD}
- fix broadcast for cluster
- dump to disk if there is memory limit
- Add Partitioner to RDD
- add NonExist serializer
================================================
FILE: benchmark/aggregate.rb
================================================
require 'benchmark'
require 'benchmark/ips'
data = 0..1_000_000
zero_value = rand(100_000)
function = Proc.new{|sum, n| sum+n}
Benchmark.ips do |r|
r.report('each') do
sum = zero_value
data.each do |n|
sum += n
end
end
r.report('reduce') do
data.reduce(zero_value){|sum, n| sum+n}
end
r.report('each with function') do
sum = zero_value
data.each do |n|
sum = function.call(sum, n)
end
end
r.report('reduce with function') do
data.reduce(zero_value, &function)
end
r.compare!
end
================================================
FILE: benchmark/bisect.rb
================================================
require "benchmark"
def bisect_left1(a, x, opts={})
return nil if a.nil?
return 0 if a.empty?
lo = (opts[:lo] || opts[:low]).to_i
hi = opts[:hi] || opts[:high] || a.length
while lo < hi
mid = (lo + hi) / 2
v = a[mid]
if v < x
lo = mid + 1
else
hi = mid
end
end
return lo
end
def bisect_left2(list, item)
count = 0
list.each{|i|
return count if i >= item
count += 1
}
nil
end
def bisect_left3(list, item, lo = 0, hi = list.size)
while lo < hi
i = (lo + hi - 1) >> 1
if 0 <= (list[i] <=> item)
hi = i
else
lo = i + 1
end
end
return hi
end
array = Array.new(1000000) { rand(0..1000000) };
to_find = Array.new(500) { rand(0..10000) };
Benchmark.bm(20) do |x|
x.report("bisect_left1") do
to_find.each do |item|
bisect_left1(array, item)
end
end
x.report("bisect_left2") do
to_find.each do |item|
bisect_left2(array, item)
end
end
x.report("bisect_left3") do
to_find.each do |item|
bisect_left3(array, item)
end
end
end
array = Array.new(100000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join };
to_find = Array.new(500) { (97+rand(26)).chr };
Benchmark.bm(20) do |x|
x.report("bisect_left1") do
to_find.each do |item|
bisect_left1(array, item)
end
end
x.report("bisect_left2") do
to_find.each do |item|
bisect_left2(array, item)
end
end
x.report("bisect_left3") do
to_find.each do |item|
bisect_left3(array, item)
end
end
end
================================================
FILE: benchmark/comparison/prepare.sh
================================================
#!/usr/bin/env bash
# Current dir
cd "$(dirname "$0")"
# Exit immediately if a pipeline returns a non-zero status.
set -e
# Spark
wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz" -O spark.tgz
tar xvzf spark.tgz
mv spark-1.3.0-bin-hadoop2.4 spark
rm spark.tgz
# RSpark (only for 1.3.0)
git clone git@github.com:amplab-extras/SparkR-pkg.git rspark
cd rspark
SPARK_VERSION=1.3.0 ./install-dev.sh
================================================
FILE: benchmark/comparison/python.py
================================================
import os
import math
from time import time
from random import random
from operator import add
from pyspark import SparkContext
sc = SparkContext(appName="Python", master="local[*]")
log_file = open(os.environ.get('PYTHON_LOG'), 'w')
def log(*values):
values = map(lambda x: str(x), values)
log_file.write(';'.join(values))
log_file.write('\n')
workers = int(os.environ.get('WORKERS'))
numbers_count = int(os.environ.get('NUMBERS_COUNT'))
text_file = os.environ.get('TEXT_FILE')
numbers = range(numbers_count)
floats = [float(i) for i in numbers]
with open(text_file) as t:
strings = t.read().split("\n")
# =============================================================================
# Serialization
# =============================================================================
t = time()
rdd_numbers = sc.parallelize(numbers, workers)
t = time() - t
log('NumbersSerialization', t)
t = time()
rdd_floats = sc.parallelize(floats, workers)
t = time() - t
log('FloatsSerialization', t)
t = time()
rdd_strings = sc.parallelize(strings, workers)
t = time() - t
log('StringsSerialization', t)
# =============================================================================
# Computing
# =============================================================================
# --- Is prime? ---------------------------------------------------------------
def is_prime(x):
if x < 2:
return [x, False]
elif x == 2:
return [x, True]
elif x % 2 == 0:
return [x, False]
else:
upper = int(math.sqrt(float(x)))
result = True
i = 3
while i <= upper:
if x % i == 0:
result = False
break
i += 2
return [x, result]
t = time()
rdd_numbers.map(is_prime).collect()
t = time() - t
log('IsPrime', t)
# --- Matrix multiplication ---------------------------------------------------
matrix_size = int(os.environ.get('MATRIX_SIZE'))
matrix = []
for row in range(matrix_size):
matrix.append([])
for col in range(matrix_size):
matrix[row].append(row+col)
def multiplication_func(matrix):
matrix = list(matrix)
size = len(matrix)
new_matrix = []
for row in range(size):
new_matrix.append([])
for col in range(size):
result = 0
for i in range(size):
result += matrix[row][i] * matrix[col][i]
new_matrix[row].append(result)
return new_matrix
t = time()
rdd = sc.parallelize(matrix, 1)
rdd.mapPartitions(multiplication_func).collect()
t = time() - t
log('MatrixMultiplication', t)
# --- Pi digits ---------------------------------------------------------------
# http://rosettacode.org/wiki/Pi#Python
pi_digit = int(os.environ.get('PI_DIGIT'))
def pi_func(size):
size = size.next()
result = ''
q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
while size > 0:
if 4*q+r-t < n*t:
result += str(n)
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r))//t)-10*n
q *= 10
r = nr
else:
nr = (2*q+r)*l
nn = (q*(7*k)+2+(r*l))//(t*l)
q *= k
t *= l
l += 2
k += 1
n = nn
r = nr
return [result]
t = time()
rdd = sc.parallelize([pi_digit], 1)
rdd.mapPartitions(pi_func).collect()
t = time() - t
log('PiDigit', t)
log_file.close()
================================================
FILE: benchmark/comparison/r.r
================================================
library(SparkR)
sc <- sparkR.init(master="local[*]")
logFile <- file(Sys.getenv("R_LOG"), "w")
logInfo <- function(...){
args <- list(...)
line <- paste(args, collapse = ";")
writeLines(line, logFile)
}
workers <- as.integer(Sys.getenv('WORKERS'))
numbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT'))
textFile <- Sys.getenv('TEXT_FILE')
# =============================================================================
# Serialization
# =============================================================================
time <- proc.time()
rddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers)
time <- as.double(proc.time()-time)[3]
logInfo('NumbersSerialization', time)
# =============================================================================
# Computing
# =============================================================================
isPrime = function(x) {
if(x < 2){
c(x, FALSE)
}
else if(x == 2){
c(x, TRUE)
}
else if(x %% 2 == 0){
c(x, FALSE)
}
else{
upper <- as.numeric(sqrt(as.double(x)))
result <- TRUE
i <- 3
while(i <= upper){
if(x %% i == 0){
result = FALSE
break
}
i <- i+2
}
c(x, result)
}
}
time <- proc.time()
rdd <- map(rddNumbers, isPrime)
capture.output(collect(rdd), file='/dev/null')
time <- as.double(proc.time()-time)[3]
logInfo('IsPrime', time)
close(logFile)
sparkR.stop()
================================================
FILE: benchmark/comparison/ruby.rb
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
require 'benchmark'
Spark.start
sc = Spark.context
$log_file = File.open(ENV['RUBY_LOG'], 'w')
def log(*values)
$log_file.puts(values.join(';'))
end
workers = ENV['WORKERS'].to_i
numbers_count = ENV['NUMBERS_COUNT'].to_i
text_file = ENV['TEXT_FILE']
numbers = (0...numbers_count).to_a
floats = numbers.map(&:to_f)
strings = File.read(text_file).split("\n")
# =============================================================================
# Serialization
# =============================================================================
time = Benchmark.realtime do
@rdd_numbers = sc.parallelize(numbers, workers)
end
log('NumbersSerialization', time)
time = Benchmark.realtime do
@rdd_floats = sc.parallelize(floats, workers)
end
log('FloatsSerialization', time)
time = Benchmark.realtime do
@rdd_strings = sc.parallelize(strings, workers)
end
log('StringsSerialization', time)
# =============================================================================
# Computing
# =============================================================================
# --- Is prime? ---------------------------------------------------------------
is_prime = Proc.new do |x|
case
when x < 2
[x, false]
when x == 2
[x, true]
when x % 2 == 0
[x, false]
else
upper = Math.sqrt(x.to_f).to_i
result = true
i = 3
while i <= upper
if x % i == 0
result = false
break
end
i += 2
end
[x, result]
end
end
time = Benchmark.realtime do
@rdd_numbers.map(is_prime).collect
end
log('IsPrime', time)
# --- Matrix multiplication ---------------------------------------------------
matrix_size = ENV['MATRIX_SIZE'].to_i
matrix = Array.new(matrix_size) do |row|
Array.new(matrix_size) do |col|
row+col
end
end;
multiplication_func = Proc.new do |matrix|
size = matrix.size
Array.new(size) do |row|
Array.new(size) do |col|
matrix[row]
result = 0
size.times do |i|
result += matrix[row][i] * matrix[col][i]
end
result
end
end
end
time = Benchmark.realtime do
rdd = sc.parallelize(matrix, 1)
rdd.map_partitions(multiplication_func).collect
end
log('MatrixMultiplication', time)
# --- Pi digits ---------------------------------------------------------------
# http://rosettacode.org/wiki/Pi#Ruby
pi_digit = ENV['PI_DIGIT'].to_i
pi_func = Proc.new do |size|
size = size.first
result = ''
q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
while size > 0
if 4*q+r-t < n*t
result << n.to_s
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r)) / t) - 10*n
q *= 10
r = nr
else
nr = (2*q+r) * l
nn = (q*(7*k+2)+r*l) / (t*l)
q *= k
t *= l
l += 2
k += 1
n = nn
r = nr
end
end
[result]
end
time = Benchmark.realtime do
rdd = sc.parallelize([pi_digit], 1)
rdd.map_partitions(pi_func).collect
end
log('PiDigit', time)
$log_file.close
================================================
FILE: benchmark/comparison/run-all.sh
================================================
#!/usr/bin/env bash
# Current dir
cd "$(dirname "$0")"
# Exit immediately if a pipeline returns a non-zero status.
set -e
# Settings
export WORKERS=2
export MATRIX_SIZE=100
export NUMBERS_COUNT=1000000
export TEXT_FILE=$(mktemp)
export PI_DIGIT=1000
export RUBY_BATCH_SIZE=2048
text_file_rows=10
text_file_per_line=10
text_file_duplicates=50
mx="4096m"
ms="4096m"
# Parse arguments
while (( "$#" )); do
case $1 in
--workers)
WORKERS="$2"
shift
;;
--matrix-size)
MATRIX_SIZE="$2"
shift
;;
--numbers-count)
NUMBERS_COUNT="$2"
shift
;;
--random-file-rows)
text_file_rows="$2"
shift
;;
--text-file-per-line)
text_file_per_line="$2"
shift
;;
--text-file-duplicates)
text_file_duplicates="$2"
shift
;;
--pi-digit)
PI_DIGIT="$2"
shift
;;
--ruby-batch-size)
RUBY_BATCH_SIZE="$2"
shift
;;
--mx)
mx="$2"
shift
;;
--ms)
ms="$2"
shift
;;
*)
break
;;
esac
shift
done
# Generating
file=$(mktemp)
for (( i=0; i<$text_file_rows; i++ ))
do
shuf -n $text_file_per_line /usr/share/dict/words | tr '\n' ' ' >> $file
echo >> $file
done
for (( i=0; i<$text_file_duplicates; i++ ))
do
cat $file >> $TEXT_FILE
done
# Before run
if [[ -z "$SPARK_HOME" ]]; then
export SPARK_HOME=$(pwd)/spark
fi
if [[ -z "$RSPARK_HOME" ]]; then
export RSPARK_HOME=$(pwd)/rspark
fi
export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE"
SPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null)
export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx"
# Log files
export RUBY_MARSHAL_LOG=$(mktemp)
export RUBY_OJ_LOG=$(mktemp)
export PYTHON_LOG=$(mktemp)
export SCALA_LOG=$(mktemp)
export R_LOG=$(mktemp)
# Run:
echo "Workers: $WORKERS"
echo "Matrix size: $MATRIX_SIZE"
echo "Numbers count: $NUMBERS_COUNT"
echo "Pi digits: $PI_DIGIT"
echo "File: rows = $(($text_file_rows * $text_file_duplicates))"
echo " per line = $text_file_per_line"
# --- Ruby
export SPARK_RUBY_SERIALIZER='marshal'
export RUBY_LOG="$RUBY_MARSHAL_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null
export SPARK_RUBY_SERIALIZER='oj'
export RUBY_LOG="$RUBY_OJ_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null
# # --- Python
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null
# # --- Scala
/usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null
# --- R
# "$RSPARK_HOME"/sparkR r.r #&>/dev/null
# Parse results
echo "# Ruby (Marshal)"
cat $RUBY_MARSHAL_LOG
echo ""
echo "# Ruby (Oj)"
cat $RUBY_OJ_LOG
echo ""
echo "# Python"
cat $PYTHON_LOG
echo ""
echo "# Scala"
cat $SCALA_LOG
echo ""
echo "# R"
cat $R_LOG
================================================
FILE: benchmark/comparison/scala.scala
================================================
import java.io._
import scala.math
import scala.io.Source
import org.apache.spark._
object Scala {
val logFile = new PrintWriter(new File(System.getenv("SCALA_LOG")))
def log(args: Any*) {
logFile.write(args.mkString(";"))
logFile.write("\n")
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Scala")
val sc = new SparkContext(conf)
val workers = System.getenv("WORKERS").toInt
val numbersCount = System.getenv("NUMBERS_COUNT").toInt
val textFile = System.getenv("TEXT_FILE")
val numbers = 0 until numbersCount
val floats = numbers.map(_.toDouble)
val strings = Source.fromFile(textFile).mkString.split("\n")
// =============================================================================
// Serialization
// =============================================================================
var time: Long = 0
time = System.currentTimeMillis
val rddNumbers = sc.parallelize(numbers, workers)
time = System.currentTimeMillis - time
log("NumbersSerialization", time/1000.0)
time = System.currentTimeMillis
val rddFloats = sc.parallelize(floats, workers)
time = System.currentTimeMillis - time
log("FloatsSerialization", time/1000.0)
time = System.currentTimeMillis
val rddStrings = sc.parallelize(strings, workers)
time = System.currentTimeMillis - time
log("StringsSerialization", time/1000.0)
// =============================================================================
// Computing
// =============================================================================
// --- Is prime? ---------------------------------------------------------------
time = System.currentTimeMillis
val primes = rddNumbers.map{ x =>
if(x < 2){
(x, false)
}
else if(x == 2){
(x, true)
}
else if(x % 2 == 0){
(x, false)
}
else{
val upper = math.sqrt(x.toDouble).toInt
var result = true
var i = 3
while(i <= upper && result == true){
if(x % i == 0){
result = false
}
else{
i += 2
}
}
(x, result)
}
}
primes.collect()
time = System.currentTimeMillis - time
log("IsPrime", time/1000.0)
// --- Matrix multiplication ---------------------------------------------------
val matrixSize = System.getenv("MATRIX_SIZE").toInt
val matrix = new Array[Array[Long]](matrixSize)
for( row <- 0 until matrixSize ) {
matrix(row) = new Array[Long](matrixSize)
for( col <- 0 until matrixSize ) {
matrix(row)(col) = row + col
}
}
time = System.currentTimeMillis
val rdd = sc.parallelize(matrix, 1)
rdd.mapPartitions { it =>
val matrix = it.toArray
val size = matrix.size
val newMatrix = new Array[Array[Long]](size)
for( row <- 0 until size ) {
newMatrix(row) = new Array[Long](size)
for( col <- 0 until size ) {
var result: Long = 0
for( i <- 0 until size ) {
result += matrix(row)(i) * matrix(col)(i)
}
newMatrix(row)(col) = result
}
}
newMatrix.toIterator
}
time = System.currentTimeMillis - time
log("MatrixMultiplication", time/1000.0)
// --- Pi digits ---------------------------------------------------------------
// http://rosettacode.org/wiki/Pi#Scala
val piDigit = System.getenv("PI_DIGIT").toInt
time = System.currentTimeMillis
val piDigits = sc.parallelize(Array(piDigit), 1)
piDigits.mapPartitions { it =>
var size = it.toArray.asInstanceOf[Array[Int]](0)
var result = ""
var r: BigInt = 0
var q, t, k: BigInt = 1
var n, l: BigInt = 3
var nr, nn: BigInt = 0
while(size > 0){
while((4*q+r-t) >= (n*t)){
nr = (2*q+r)*l
nn = (q*(7*k)+2+(r*l))/(t*l)
q = q * k
t = t * l
l = l + 2
k = k + 1
n = nn
r = nr
}
result += n.toString
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r))/t)-(10*n)
q = q * 10
r = nr
}
Iterator(result)
}
time = System.currentTimeMillis - time
log("PiDigit", time/1000.0)
sc.stop()
logFile.close()
}
}
================================================
FILE: benchmark/custom_marshal.rb
================================================
require 'benchmark'
require 'benchmark/ips'
def pack_int(data)
[data].pack('l>')
end
def pack_long(data)
[data].pack('q>')
end
def pack_doubles(data)
data.pack('G*')
end
module Standard
class LabeledPoint
def initialize(label, features)
@label = label
@features = Standard::Vector.new(features)
end
def marshal_dump
[@label, @features]
end
def marshal_load(*)
end
end
class Vector
def initialize(array)
@values = array
end
def marshal_dump
[@values]
end
def marshal_load(*)
end
end
end
module Custom
class LabeledPoint
def initialize(label, features)
@label = label
@features = Custom::Vector.new(features)
end
def _dump(*)
pack_long(@label) + @features._dump
end
def self._load(*)
end
end
class Vector
def initialize(array)
@values = array
end
def _dump(*)
result = 'v'
result << pack_int(@values.size)
result << pack_doubles(@values)
result.encode(Encoding::ASCII_8BIT)
end
def self._load(*)
end
end
end
data_size = 10_000
vector_size = 1_000
values = Array.new(vector_size) { |x| rand(10_000..100_000) }
@data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)}
@data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)}
Benchmark.ips do |r|
r.report('standard') do
Marshal.dump(@data1)
end
r.report('custom') do
Marshal.dump(@data2)
end
r.compare!
end
================================================
FILE: benchmark/digest.rb
================================================
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
def java?
RUBY_PLATFORM =~ /java/
end
unless java?
require 'murmurhash3'
end
require 'digest'
require 'benchmark'
require 'ruby-spark'
TEST = 5_000_000
WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"]
puts "TEST COUNT = #{TEST*WORDS.size}"
# =================================================================================================
# Pure ruby murmur
# funny-falcon/murmurhash3-ruby
MASK32 = 0xffffffff
def murmur3_32_rotl(x, r)
((x << r) | (x >> (32 - r))) & MASK32
end
def murmur3_32_fmix(h)
h &= MASK32
h ^= h >> 16
h = (h * 0x85ebca6b) & MASK32
h ^= h >> 13
h = (h * 0xc2b2ae35) & MASK32
h ^ (h >> 16)
end
def murmur3_32__mmix(k1)
k1 = (k1 * 0xcc9e2d51) & MASK32
k1 = murmur3_32_rotl(k1, 15)
(k1 * 0x1b873593) & MASK32
end
def murmur3_32_str_hash(str, seed=0)
h1 = seed
numbers = str.unpack('V*C*')
tailn = str.bytesize % 4
tail = numbers.slice!(numbers.size - tailn, tailn)
for k1 in numbers
h1 ^= murmur3_32__mmix(k1)
h1 = murmur3_32_rotl(h1, 13)
h1 = (h1*5 + 0xe6546b64) & MASK32
end
unless tail.empty?
k1 = 0
tail.reverse_each do |c1|
k1 = (k1 << 8) | c1
end
h1 ^= murmur3_32__mmix(k1)
end
h1 ^= str.bytesize
murmur3_32_fmix(h1)
end
# =================================================================================================
# Benchmark
Benchmark.bm(18) do |x|
x.report("ruby hash"){
TEST.times{
WORDS.each{ |word|
word.hash
}
}
}
x.report("ext portable"){
TEST.times{
WORDS.each{ |word|
Spark::Digest.portable_hash(word)
}
}
}
x.report("murmur3 32"){
TEST.times{
WORDS.each{ |word|
# MurmurHash3::V128.str_hash(word)
# [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# MurmurHash3::V128.str_hash(word)
# a = MurmurHash3::V32.str_hash(word).to_s
# a.slice!(0,8)
MurmurHash3::V32.str_hash(word)
}
}
} unless java?
# Too slow
# x.report("murmur3 32 (ruby)"){
# TEST.times{
# WORDS.each{ |word|
# # MurmurHash3::V128.str_hash(word)
# # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# # MurmurHash3::V128.str_hash(word)
# # a = murmur3_32_str_hash(word).to_s
# # a.slice!(0,8)
# murmur3_32_str_hash(word)
# }
# }
# }
x.report("murmur3 128"){
TEST.times{
WORDS.each{ |word|
# MurmurHash3::V128.str_hash(word)
# [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# a = MurmurHash3::V128.str_hash(word).to_s
# a.slice!(0,8)
MurmurHash3::V128.str_hash(word)
}
}
} unless java?
# x.report("sha256"){
# TEST.times{
# WORDS.each{ |word|
# a = Digest::SHA256.digest(word)
# # a.slice!(0,8)
# }
# }
# }
# x.report("md5"){
# TEST.times{
# WORDS.each{ |word|
# a = Digest::MD5.digest(word)
# # a.slice!(0,8)
# }
# }
# }
end
================================================
FILE: benchmark/enumerator.rb
================================================
require "benchmark"
class Enumerator
def defer(&blk)
self.class.new do |y|
each do |*input|
blk.call(y, *input)
end
end
end
end
ARRAY_SIZE = 50_000_000
def type_yield
return to_enum(__callee__) unless block_given?
ARRAY_SIZE.times { |i|
yield i
}
end
def yield_map_x2(enum)
return to_enum(__callee__, enum) unless block_given?
enum.each do |item|
yield item*2
end
end
def type_enumerator_new
Enumerator.new do |e|
ARRAY_SIZE.times { |i|
e << i
}
end
end
def enumerator_new_map_x2(enum)
Enumerator.new do |e|
enum.each do |item|
e << item*2
end
end
end
def enumerator_defer_x2(enum)
enum.defer do |out, inp|
out << inp*2
end
end
Benchmark.bm(26) do |x|
x.report("yield max") do
type_yield.max
end
x.report("yield sum") do
type_yield.reduce(:+)
end
x.report("yield map x*2 sum") do
yield_map_x2(type_yield).reduce(:+)
end
x.report("yield defer map x*2 sum") do
enumerator_defer_x2(type_yield).reduce(:+)
end
x.report("-----"){}
x.report("Enum.new max") do
type_enumerator_new.max
end
x.report("Enum.new sum") do
type_enumerator_new.reduce(:+)
end
x.report("Enum.new map x*2 sum") do
enumerator_new_map_x2(type_enumerator_new).reduce(:+)
end
x.report("Enum.new defer map x*2 sum") do
enumerator_defer_x2(type_enumerator_new).reduce(:+)
end
end
================================================
FILE: benchmark/serializer.rb
================================================
require "benchmark"
require "yaml"
require "msgpack"
require "oj"
# require "thrift"
puts "Simple"
data = (0..100000).to_a
Benchmark.bmbm do |x|
x.report("YAML") do
serialized = YAML.dump(data)
deserialized = YAML.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Marshal") do
serialized = Marshal.dump(data)
deserialized = Marshal.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("MessagePack") do
serialized = MessagePack.dump(data)
deserialized = MessagePack.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Oj") do
serialized = Oj.dump(data)
deserialized = Oj.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
# x.report("Thrift") do
# serializer = Thrift::Serializer.new
# deserializer = Thrift::Deserializer.new
# serialized = serializer.serialize(data)
# end
end
puts ""
puts "More complex"
data = Array.new(10000000) {
[rand(97..122).chr, rand(10000000)]
}
Benchmark.bm do |x|
# Take too long
# x.report("YAML") do
# serialized = YAML.dump(data)
# YAML.load(serialized)
# end
x.report("Marshal") do
serialized = Marshal.dump(data)
deserialized = Marshal.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("MessagePack") do
serialized = MessagePack.dump(data)
deserialized = MessagePack.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Oj") do
serialized = Oj.dump(data)
deserialized = Oj.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
# x.report("Thrift") do
# serializer = Thrift::Serializer.new
# deserializer = Thrift::Deserializer.new
# serialized = serializer.serialize(data)
# end
end
================================================
FILE: benchmark/sort.rb
================================================
require "benchmark"
array = []
1000.times {
array << {:bar => rand(1000)}
}
n = 500
Benchmark.bm(20) do |x|
x.report("sort") { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } }
x.report("sort reverse") { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } }
x.report("sort_by -a[:bar]") { n.times { array.sort_by{ |a| -a[:bar] } } }
x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } }
x.report("sort_by.reverse!") { n.times { array.sort_by{ |a| a[:bar] }.reverse } }
end
array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }
Benchmark.bm(20) do |x|
x.report("sort asc") { n.times { array.sort } }
x.report("sort asc block") { n.times { array.sort{|a,b| a <=> b} } }
x.report("sort desc") { n.times { array.sort{|a,b| b <=> a} } }
x.report("sort asc reverse") { n.times { array.sort.reverse } }
end
key_value = Struct.new(:key, :value) do
def <=>(other)
key <=> other.key
end
end
count = 10000
item_range = 1000000
array1 = Array.new(count) { [rand(item_range), rand(item_range)] }
array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) }
Benchmark.bm(20) do |x|
x.report("sort_by") { n.times { array1.sort_by {|a| a[0]} } }
x.report("sort struct") { n.times { array2.sort } }
end
================================================
FILE: benchmark/sort2.rb
================================================
require "benchmark"
require "algorithms"
NUMBER_OF_SORTING = 1
NUMBER_OF_ARRAY = 10
WORDS_IN_ARRAY = 100000
MAX_WORD_SIZE = 10
EVAL_N_VALUES = 10
puts "NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}"
puts "NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}"
puts "WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}"
puts "MAX_WORD_SIZE: #{MAX_WORD_SIZE}"
puts "EVAL_N_VALUES: #{EVAL_N_VALUES}"
def words
Array.new(WORDS_IN_ARRAY) { word }
end
def word
Array.new(rand(1..MAX_WORD_SIZE)){(97+rand(26)).chr}.join
end
@array = Array.new(NUMBER_OF_ARRAY) { words.sort }
# =================================================================================================
# Sort1
# Returns a new (not yet evaluated) enumerator
def sort1(data)
return to_enum(__callee__, data) unless block_given?
heap = []
# Initialize the heap with the first items
# and keep the enumerators themselves so .next can be called
data.each do |a|
heap << [a.next, a]
end
while data.any?
begin
# Sort the array by values
heap.sort_by!{|(item,_)| item}
# Take the value and its enumerator
item, enum = heap.shift
# The value goes into the result
yield item
# The removed item is replaced by the next one from the same list
heap << [enum.next, enum]
rescue StopIteration
# The enumerator is exhausted
data.delete(enum)
end
end
end
# =================================================================================================
# Sort1_2
# Returns a new (not yet evaluated) enumerator
def sort1_2(data)
return to_enum(__callee__, data) unless block_given?
heap = []
enums = []
# Initialize the heap with the first items
# and keep the enumerators themselves so .next can be called
data.each do |a|
EVAL_N_VALUES.times {
begin
heap << [a.next, a]
rescue StopIteration
end
}
end
while data.any? || heap.any?
# Sort the array by values
heap.sort_by!{|(item,_)| item}
# At least EVAL_N_VALUES items can be taken now
EVAL_N_VALUES.times {
break if heap.empty?
# Take the value and its enumerator
item, enum = heap.shift
# The value goes into the result
yield item
enums << enum
}
while (enum = enums.shift)
begin
heap << [enum.next, enum]
rescue StopIteration
data.delete(enum)
enums.delete(enum)
end
end
end
end
# =================================================================================================
# Sort 2
def sort2(data)
return to_enum(__callee__, data) unless block_given?
heap = Containers::Heap.new
data.each do |enum|
item = enum.next
heap.push(item, [item, enum])
end
while data.any?
begin
item, enum = heap.pop
yield item
item = enum.next
heap.push(item, [item, enum])
rescue StopIteration
data.delete(enum)
end
end
end
# =================================================================================================
# Benchmark
Benchmark.bm(10) do |x|
x.report("sort") do
NUMBER_OF_SORTING.times {
@result = @array.flatten.sort
}
end
x.report("sort 1") do
NUMBER_OF_SORTING.times {
raise "Bad sorting" if @result != sort1(@array.map(&:each)).to_a
}
end
x.report("sort 1_2") do
NUMBER_OF_SORTING.times {
raise "Bad sorting" if @result != sort1_2(@array.map(&:each)).to_a
}
end
# x.report("sort 2") do
# NUMBER_OF_SORTING.times {
# raise "Bad sorting" if @result != sort2(@array.map(&:each)).to_a
# }
# end
end
================================================
FILE: benchmark/take.rb
================================================
require "benchmark"
SIZE = 100_000_000
@array1 = (0..SIZE).to_a;
@array2 = (0..SIZE).to_a;
@array3 = (0..SIZE).to_a;
TAKE = 100_000
Benchmark.bm(15) do |x|
# Fastest
x.report("take"){
a=@array1.take(TAKE)
}
# Slowest and take most memory
x.report("reverse drop"){
@array2.reverse!
@array2.drop(@array2.size - TAKE)
@array2.reverse!
}
# Least memory
x.report("splice"){
a=@array2.slice!(0, TAKE)
}
end
================================================
FILE: bin/ruby-spark
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
Spark::CLI.new.run
================================================
FILE: example/pi.rb
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
Spark.logger.disable
Spark.start
slices = 3
n = 100000 * slices
def map(_)
x = rand * 2 - 1
y = rand * 2 - 1
if x**2 + y**2 < 1
return 1
else
return 0
end
end
rdd = Spark.context.parallelize(1..n, slices)
rdd = rdd.map(method(:map))
puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
================================================
FILE: example/website_search.rb
================================================
#!/usr/bin/env ruby
# Parse sitemap and search word on every page
require 'optparse'
require 'open-uri'
require 'nokogiri'
require 'ruby-spark'
options = {
sitemap: 'http://fit.cvut.cz/sitemap.xml',
query: 'cvut',
workers: 2
}
opt_parser = OptionParser.new do |opts|
opts.banner = 'Usage: website_search.rb [options]'
opts.separator ''
opts.separator 'Specific options:'
opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
options[:sitemap] = sitemap
end
opts.on('-q', '--query QUERY', 'Query to search') do |query|
options[:query] = query
end
opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
options[:workers] = workers
end
opts.on('--quiet', 'Run quietly') do |v|
Spark.logger.disable
end
opts.on_tail('-h', '--help', 'Show this message') do
puts opts
exit
end
end
opt_parser.parse!
@links = []
def parse_sitemap(doc)
doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
next_doc = Nokogiri::HTML(open(loc.text))
parse_sitemap(next_doc)
end
doc.xpath('//url/loc').each do |loc|
@links << loc.text
end
end
doc = Nokogiri::HTML(open(options[:sitemap]))
parse_sitemap(doc)
# Map function
func = Proc.new do |url|
begin
open(url) {|f|
[url, f.read.scan(query).count]
}
rescue
[url, 0]
end
end
Spark.start
rdd = Spark.sc.parallelize(@links, options[:workers])
.add_library('open-uri')
.bind(query: options[:query])
.map(func)
.sort_by(lambda{|(_, value)| value}, false)
rdd.collect.each do |(url, count)|
puts "#{url} => #{count}"
end
================================================
FILE: ext/ruby_c/extconf.rb
================================================
require 'mkmf'
create_makefile("ruby_spark_ext")
================================================
FILE: ext/ruby_c/murmur.c
================================================
#include "murmur.h"
#if defined(_MSC_VER)
#define BIG_CONSTANT(x) (x)
#else
#define BIG_CONSTANT(x) (x##LLU)
#endif
/*-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
//
// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
//
// 64-bit hash for 64-bit platforms
*/
uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
{
const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
const int r = 47;
uint64_t h = seed ^ (len * m);
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
while(data != end)
{
uint64_t k = *data++;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
const unsigned char * data2 = (const unsigned char*)data;
switch(len & 7)
{
case 7: h ^= ((uint64_t) data2[6]) << 48;
case 6: h ^= ((uint64_t) data2[5]) << 40;
case 5: h ^= ((uint64_t) data2[4]) << 32;
case 4: h ^= ((uint64_t) data2[3]) << 24;
case 3: h ^= ((uint64_t) data2[2]) << 16;
case 2: h ^= ((uint64_t) data2[1]) << 8;
case 1: h ^= ((uint64_t) data2[0]);
h *= m;
};
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
/* 64-bit hash for 32-bit platforms */
uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
{
const uint32_t m = 0x5bd1e995;
const int r = 24;
uint32_t h1 = ((uint32_t) seed) ^ len;
uint32_t h2 = ((uint32_t) (seed >> 32));
const uint32_t * data = (const uint32_t *)key;
while(len >= 8)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
uint32_t k2 = *data++;
k2 *= m; k2 ^= k2 >> r; k2 *= m;
h2 *= m; h2 ^= k2;
len -= 4;
}
if(len >= 4)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
}
switch(len)
{
case 3: h2 ^= ((unsigned char*)data)[2] << 16;
case 2: h2 ^= ((unsigned char*)data)[1] << 8;
case 1: h2 ^= ((unsigned char*)data)[0];
h2 *= m;
};
h1 ^= h2 >> 18; h1 *= m;
h2 ^= h1 >> 22; h2 *= m;
h1 ^= h2 >> 17; h1 *= m;
h2 ^= h1 >> 19; h2 *= m;
uint64_t h = h1;
h = (h << 32) | h2;
return h;
}
// ================================================================================================
// Ruby methods
#define PORTABLE_HASH_SEED 16154832
VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
{
StringValue(rb_str);
void * key = RSTRING_PTR(rb_str);
long len = RSTRING_LEN(rb_str);
uint64_t result = MurmurHash64A(key, len, seed);
return LONG2FIX(result);
}
// ------------------------------------------------------------------------------------------------
// Spark::Digest::Murmur2.digest
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
{
if(argc == 0 || argc > 2){
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
}
uint64_t seed = (argc == 1 ? 0 : NUM2UINT(argv[1]));
return murmur2_digest(argv[0], seed);
}
// ------------------------------------------------------------------------------------------------
// Spark::Digest.portable_hash
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
{
if(argc != 1){
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
}
return murmur2_digest(argv[0], PORTABLE_HASH_SEED);
}
================================================
FILE: ext/ruby_c/murmur.h
================================================
#ifndef MURMUR_INCLUDED
#define MURMUR_INCLUDED
#include "ruby.h"
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass);
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass);
#endif
================================================
FILE: ext/ruby_c/ruby-spark.c
================================================
#include "ruby.h"
#include "murmur.h"
VALUE SparkModule;
VALUE SparkDigestModule;
VALUE SparkDigestMurmur2Class;
void Init_ruby_spark_ext()
{
SparkModule = rb_define_module("Spark");
SparkDigestModule = rb_define_module_under(SparkModule, "Digest");
SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, "Murmur2", rb_cObject);
rb_define_singleton_method(SparkDigestModule, "portable_hash", method_portable_hash, -1);
rb_define_singleton_method(SparkDigestMurmur2Class, "digest", method_murmur2_digest, -1);
}
================================================
FILE: ext/ruby_java/Digest.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyModule;
import org.jruby.RubyObject;
import org.jruby.RubyClass;
import org.jruby.RubyString;
import org.jruby.RubyFixnum;
import org.jruby.anno.JRubyModule;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
@JRubyModule(name="Spark::Digest")
public class Digest extends RubyObject{
// Have to be the same as in C extension
final static long PORTABLE_HASH_SEED = 16154832;
public Digest(final Ruby ruby, RubyClass rubyClass) {
super(ruby, rubyClass);
}
@JRubyMethod(module=true)
public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) {
Ruby ruby = self.getRuntime();
RubyString keyString = (RubyString)arg;
long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED);
RubyFixnum result = new RubyFixnum(ruby, hash);
return result;
}
}
================================================
FILE: ext/ruby_java/Murmur2.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
import org.jruby.RubyString;
import org.jruby.RubyFixnum;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
/** Murmur hash 2.0.
*
* The murmur hash is a relative fast hash function from
* http://murmurhash.googlepages.com/ for platforms with efficient
* multiplication.
*
* http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/
*
*/
@JRubyClass(name="Spark::Digest::Murmur2")
public class Murmur2 extends RubyObject {
public Murmur2(final Ruby ruby, RubyClass rubyClass) {
super(ruby, rubyClass);
}
@JRubyMethod(required=1, optional=1, module=true)
public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) {
Ruby ruby = context.getRuntime();
RubyString keyString = (RubyString)args[0];
long seed;
if(args.length > 1){
RubyFixnum rb_seed = (RubyFixnum)args[1];
seed = rb_seed.getLongValue();
}
else{
seed = 0;
}
long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed);
RubyFixnum result = new RubyFixnum(ruby, hash);
return result;
}
/** Generates 64 bit hash from byte array of the given length and seed.
*
* @param data byte array to hash
* @param length length of the array to hash
* @param seed initial seed value
* @return 64 bit hash of the given array
*/
public static long hash64(final byte[] data, int length, long seed) {
final long m = 0xc6a4a7935bd1e995L;
final int r = 47;
long h = (seed&0xffffffffl)^(length*m);
int length8 = length/8;
for (int i=0; i<length8; i++) {
int i8 = i*8;
long k = ((long)data[i8+0]&0xff) + (((long)data[i8+1]&0xff)<<8) + (((long)data[i8+2]&0xff)<<16) + (((long)data[i8+3]&0xff)<<24)
+ (((long)data[i8+4]&0xff)<<32) + (((long)data[i8+5]&0xff)<<40) + (((long)data[i8+6]&0xff)<<48) + (((long)data[i8+7]&0xff)<<56);
k *= m;
k ^= k >>> r;
k *= m;
h ^= k;
h *= m;
}
switch (length%8) {
case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48;
case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40;
case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32;
case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24;
case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16;
case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8;
case 1: h ^= (long)(data[length&~7]&0xff);
h *= m;
};
h ^= h >>> r;
h *= m;
h ^= h >>> r;
return h;
}
}
================================================
FILE: ext/ruby_java/RubySparkExtService.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyModule;
import org.jruby.runtime.ObjectAllocator;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.runtime.load.BasicLibraryService;
public class RubySparkExtService implements BasicLibraryService
{
public boolean basicLoad(final Ruby ruby) throws java.io.IOException {
RubyModule sparkModule = ruby.defineModule("Spark");
RubyModule sparkDigestModule = sparkModule.defineModuleUnder("Digest");
RubyClass sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder("Murmur2", ruby.getObject(), sparkDigestMurmur2Allocator);
sparkDigestModule.defineAnnotatedMethods(Digest.class);
sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class);
return true;
}
public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() {
public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) {
return new Murmur2(ruby, rubyClass);
}
};
}
================================================
FILE: ext/ruby_java/extconf.rb
================================================
require 'mkmf'
create_makefile("ruby_spark_ext")
================================================
FILE: ext/spark/build.sbt
================================================
import AssemblyKeys._
assemblySettings
// Default values
val defaultScalaVersion = "2.10.4"
val defaultSparkVersion = "1.6.0"
val defaultSparkCoreVersion = "2.10"
val defaultTargetDir = "target"
val defaultHadoopVersion = "1.0.4"
// Values
val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion)
val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion)
val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion)
val _sparkCoreVersion = scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion)
val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir)
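// The defaults above are only fallbacks; each value can be overridden from the
// environment, e.g. (illustrative values):
//   SPARK_VERSION=1.5.2 HADOOP_VERSION=2.6.0 sbt/sbt package assemblyPackageDependency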
// Project settings
name := "ruby-spark"
version := "1.0.0"
scalaVersion := _scalaVersion
javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
// Jar target folder
artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar")
outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar")
// Protocol buffer support
seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*)
// Additional libraries
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % _sparkVersion excludeAll(ExclusionRule(organization = "org.apache.hadoop")),
"org.apache.spark" %% "spark-graphx" % _sparkVersion,
"org.apache.spark" %% "spark-mllib" % _sparkVersion,
"org.apache.spark" %% "spark-sql" % _sparkVersion,
"org.apache.hadoop" % "hadoop-client" % _hadoopVersion,
"com.github.fommil.netlib" % "all" % "1.1.2",
"org.scalatest" % "scalatest_2.10" % "2.2.1" % "test"
)
// Repositories
resolvers ++= Seq(
"JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
"Spray Repository" at "http://repo.spray.io/",
"Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
"Akka Repository" at "http://repo.akka.io/releases/",
"Twitter4J Repository" at "http://twitter4j.org/maven2/",
"Apache HBase" at "https://repository.apache.org/content/repositories/releases",
"Twitter Maven Repo" at "http://maven.twttr.com/",
"scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools",
"Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/",
"Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/",
"Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven",
Resolver.sonatypeRepo("public")
)
// Merge strategy
mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
{
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.startsWith("META-INF") => MergeStrategy.discard
case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first
case PathList("org", "apache", xs @ _*) => MergeStrategy.first
case PathList("org", "jboss", xs @ _*) => MergeStrategy.first
case "about.html" => MergeStrategy.rename
case "reference.conf" => MergeStrategy.concat
case _ => MergeStrategy.first
}
}
================================================
FILE: ext/spark/project/plugins.sbt
================================================
resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
resolvers += "Spray Repository" at "http://repo.spray.io/"
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3")
================================================
FILE: ext/spark/sbt/sbt
================================================
#!/bin/bash
# This script launches sbt for this project. If present it uses the system
# version of sbt. If there is no system version of sbt it attempts to download
# sbt locally.
SBT_VERSION=0.13.9
URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
JAR=sbt/sbt-launch-${SBT_VERSION}.jar
# Download sbt launch jar if it hasn't been downloaded yet
if [ ! -f ${JAR} ]; then
# Download
printf "Attempting to fetch sbt\n"
JAR_DL=${JAR}.part
if hash wget 2>/dev/null; then
(wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
elif hash curl 2>/dev/null; then
(curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
else
printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
exit -1
fi
fi
if [ ! -f ${JAR} ]; then
# We failed to download
printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
exit -1
fi
printf "Launching sbt from ${JAR}\n"
java \
-Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
-jar ${JAR} \
"$@"
================================================
FILE: ext/spark/src/main/scala/Exec.scala
================================================
package org.apache.spark.api.ruby
import java.io.{File, FileOutputStream, InputStreamReader, BufferedReader}
import scala.collection.JavaConversions._
import org.apache.spark.{SparkEnv, Logging}
import org.apache.spark.util._
/* =================================================================================================
* class FileCommand
* =================================================================================================
*
* Save the command to a file and then execute it, because from Scala you cannot simply run
* something like "bash --norc -i -c 'source .zshrc; ruby master.rb'"
*/
class FileCommand(command: String) extends Logging {
var pb: ProcessBuilder = null
var file: File = null
// Command is complete.
def this(command: String, env: SparkEnv) = {
this(command)
create(env)
}
// Template must contain %s, which will be replaced by the command
def this(template: String, command: String, env: SparkEnv, envVars: Map[String, String]) = {
this(template.format(command), env)
setEnvVars(envVars)
}
private def create(env: SparkEnv) {
val dir = new File(env.sparkFilesDir)
val ext = if(Utils.isWindows) ".cmd" else ".sh"
val shell = if(Utils.isWindows) "cmd" else "bash"
file = File.createTempFile("command", ext, dir)
val out = new FileOutputStream(file)
out.write(command.getBytes)
out.close
logInfo(s"New FileCommand at ${file.getAbsolutePath}")
pb = new ProcessBuilder(shell, file.getAbsolutePath)
}
def setEnvVars(vars: Map[String, String]) {
pb.environment().putAll(vars)
}
def run = {
new ExecutedFileCommand(pb.start)
}
}
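// Usage sketch (mirrors RubyWorker.createMaster; the template, command and env map
// below are illustrative only):
//   val cmd = new FileCommand("bash --norc -i -c '%s'", "ruby master.rb", env, Map("KEY" -> "VALUE"))
//   val process = cmd.run          // ExecutedFileCommand wrapping the started process
//   val firstLine = process.readLine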
/* =================================================================================================
* class ExecutedFileCommand
* =================================================================================================
*
* Represents a process executed from a file.
*/
class ExecutedFileCommand(process: Process) {
var reader: BufferedReader = null
def readLine = {
openInput
reader.readLine.toString.trim
}
def openInput {
if(reader != null){
return
}
val input = new InputStreamReader(process.getInputStream)
reader = new BufferedReader(input)
}
// Delegation
def destroy = process.destroy
def getInputStream = process.getInputStream
def getErrorStream = process.getErrorStream
}
================================================
FILE: ext/spark/src/main/scala/MLLibAPI.scala
================================================
package org.apache.spark.mllib.api.python
// PythonMLLibAPI is private for python
class MLLibAPI extends PythonMLLibAPI {}
================================================
FILE: ext/spark/src/main/scala/Marshal.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
/* =================================================================================================
* object Marshal
* =================================================================================================
*/
object Marshal {
def load(bytes: Array[Byte]) = {
val is = new DataInputStream(new ByteArrayInputStream(bytes))
val majorVersion = is.readUnsignedByte // 4
val minorVersion = is.readUnsignedByte // 8
(new MarshalLoad(is)).load
}
def dump(data: Any) = {
val aos = new ByteArrayOutputStream
val os = new DataOutputStream(aos)
os.writeByte(4)
os.writeByte(8)
(new MarshalDump(os)).dump(data)
aos.toByteArray
}
}
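// Round-trip sketch (byte values taken from MarshalSpec):
//   Marshal.load(Array[Byte](4, 8, 105, 6))    // => 1
//   Marshal.dump(1)                            // => Array[Byte](4, 8, 105, 6)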
/* =================================================================================================
* class IterableMarshaller
* =================================================================================================
*/
class IterableMarshaller(iter: Iterator[Any]) extends Iterator[Array[Byte]] {
private val buffer = new ArrayBuffer[Any]
override def hasNext: Boolean = iter.hasNext
override def next(): Array[Byte] = {
while (iter.hasNext) {
buffer += iter.next()
}
Marshal.dump(buffer)
}
}
================================================
FILE: ext/spark/src/main/scala/MarshalDump.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}
/* =================================================================================================
* class MarshalDump
* =================================================================================================
*/
class MarshalDump(os: DataOutputStream) {
val NAN_BYTELIST = "nan".getBytes
val NEGATIVE_INFINITY_BYTELIST = "-inf".getBytes
val INFINITY_BYTELIST = "inf".getBytes
def dump(data: Any) {
data match {
case null =>
os.writeByte('0')
case item: Boolean =>
val char = if(item) 'T' else 'F'
os.writeByte(char)
case item: Int =>
os.writeByte('i')
dumpInt(item)
case item: Array[_] =>
os.writeByte('[')
dumpArray(item)
case item: Double =>
os.writeByte('f')
dumpFloat(item)
case item: ArrayBuffer[Any] => dump(item.toArray)
}
}
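// Integer encoding used below (Ruby Marshal format): 0 is a single zero byte,
// values 1..122 are written as value+5, values -123..-1 as value-5, and anything
// larger as a signed length byte followed by the value's little-endian bytes.
// For example, dumping the Fixnum 1 produces 'i', 6 (see MarshalSpec: 4, 8, 105, 6).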
def dumpInt(data: Int) {
if(data == 0){
os.writeByte(0)
}
else if (0 < data && data < 123) {
os.writeByte(data + 5)
}
else if (-124 < data && data < 0) {
os.writeByte((data - 5) & 0xff)
}
else {
val buffer = new Array[Byte](4)
var value = data
var i = 0
while(i != 4 && value != 0 && value != -1){
buffer(i) = (value & 0xff).toByte
value = value >> 8
i += 1
}
val length = i
if(value < 0){
os.writeByte(-length)
}
else{
os.writeByte(length)
}
os.write(buffer, 0, length)
}
}
def dumpArray(array: Array[_]) {
dumpInt(array.size)
for(item <- array) {
dump(item)
}
}
def dumpFloat(value: Double) {
if(value.isPosInfinity){
dumpString(INFINITY_BYTELIST)
}
else if(value.isNegInfinity){
dumpString(NEGATIVE_INFINITY_BYTELIST)
}
else if(value.isNaN){
dumpString(NAN_BYTELIST)
}
else{
// dumpString("%.17g".format(value))
dumpString(value.toString)
}
}
def dumpString(data: String) {
dumpString(data.getBytes)
}
def dumpString(data: Array[Byte]) {
dumpInt(data.size)
os.write(data)
}
}
================================================
FILE: ext/spark/src/main/scala/MarshalLoad.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}
/* =================================================================================================
* class MarshalLoad
* =================================================================================================
*/
class MarshalLoad(is: DataInputStream) {
case class WaitForObject()
val registeredSymbols = ArrayBuffer[String]()
val registeredLinks = ArrayBuffer[Any]()
def load: Any = {
load(is.readUnsignedByte.toChar)
}
def load(dataType: Char): Any = {
dataType match {
case '0' => null
case 'T' => true
case 'F' => false
case 'i' => loadInt
case 'f' => loadAndRegisterFloat
case ':' => loadAndRegisterSymbol
case '[' => loadAndRegisterArray
case 'U' => loadAndRegisterUserObject
case _ =>
throw new IllegalArgumentException(s"Format is not supported: $dataType.")
}
}
// ----------------------------------------------------------------------------------------------
// Load by type
def loadInt: Int = {
var c = is.readByte.toInt
if (c == 0) {
return 0
} else if (4 < c && c < 128) {
return c - 5
} else if (-129 < c && c < -4) {
return c + 5
}
var result: Long = 0
if (c > 0) {
result = 0
for( i <- 0 until c ) {
result |= (is.readUnsignedByte << (8 * i)).toLong
}
} else {
c = -c
result = -1
for( i <- 0 until c ) {
result &= ~((0xff << (8 * i)).toLong)
result |= (is.readUnsignedByte << (8 * i)).toLong
}
}
result.toInt
}
def loadAndRegisterFloat: Double = {
val result = loadFloat
registeredLinks += result
result
}
def loadFloat: Double = {
val string = loadString
string match {
case "nan" => Double.NaN
case "inf" => Double.PositiveInfinity
case "-inf" => Double.NegativeInfinity
case _ => string.toDouble
}
}
def loadString: String = {
new String(loadStringBytes)
}
def loadStringBytes: Array[Byte] = {
val size = loadInt
val buffer = new Array[Byte](size)
var readSize = 0
while(readSize < size){
val read = is.read(buffer, readSize, size-readSize)
if(read == -1){
throw new IllegalArgumentException("Marshal too short.")
}
readSize += read
}
buffer
}
def loadAndRegisterSymbol: String = {
val result = loadString
registeredSymbols += result
result
}
def loadAndRegisterArray: Array[Any] = {
val size = loadInt
val array = new Array[Any](size)
registeredLinks += array
for( i <- 0 until size ) {
array(i) = loadNextObject
}
array
}
def loadAndRegisterUserObject: Any = {
val klass = loadNextObject.asInstanceOf[String]
// Register future class before load the next object
registeredLinks += WaitForObject()
val index = registeredLinks.size - 1
val data = loadNextObject
val result = klass match {
case "Spark::Mllib::LabeledPoint" => createLabeledPoint(data)
case "Spark::Mllib::DenseVector" => createDenseVector(data)
case "Spark::Mllib::SparseVector" => createSparseVector(data)
case other =>
throw new IllegalArgumentException(s"Object $other is not supported.")
}
registeredLinks(index) = result
result
}
// ----------------------------------------------------------------------------------------------
// Other loads
def loadNextObject: Any = {
val dataType = is.readUnsignedByte.toChar
if(isLinkType(dataType)){
readLink(dataType)
}
else{
load(dataType)
}
}
// ----------------------------------------------------------------------------------------------
// To java objects
def createLabeledPoint(data: Any): LabeledPoint = {
val array = data.asInstanceOf[Array[_]]
new LabeledPoint(array(0).asInstanceOf[Double], array(1).asInstanceOf[Vector])
}
def createDenseVector(data: Any): DenseVector = {
new DenseVector(data.asInstanceOf[Array[_]].map(toDouble(_)))
}
def createSparseVector(data: Any): SparseVector = {
val array = data.asInstanceOf[Array[_]]
val size = array(0).asInstanceOf[Int]
val indices = array(1).asInstanceOf[Array[_]].map(_.asInstanceOf[Int])
val values = array(2).asInstanceOf[Array[_]].map(toDouble(_))
new SparseVector(size, indices, values)
}
// ----------------------------------------------------------------------------------------------
// Helpers
def toDouble(data: Any): Double = data match {
case x: Int => x.toDouble
case x: Double => x
case _ => 0.0
}
// ----------------------------------------------------------------------------------------------
// Cache
def readLink(dataType: Char): Any = {
val index = loadInt
dataType match {
case '@' => registeredLinks(index)
case ';' => registeredSymbols(index)
}
}
def isLinkType(dataType: Char): Boolean = {
dataType == ';' || dataType == '@'
}
}
================================================
FILE: ext/spark/src/main/scala/RubyAccumulatorParam.scala
================================================
package org.apache.spark.api.ruby
import java.io._
import java.net._
import java.util.{List, ArrayList}
import scala.collection.JavaConversions._
import scala.collection.immutable._
import org.apache.spark._
import org.apache.spark.util.Utils
/**
* Internal class that acts as an `AccumulatorParam` for Ruby accumulators. Inside, it
* collects a list of pickled strings that we pass to Ruby through a socket.
*/
private class RubyAccumulatorParam(serverHost: String, serverPort: Int)
extends AccumulatorParam[List[Array[Byte]]] {
// Utils.checkHost(serverHost, "Expected hostname")
val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536)
// Socket should not be serialized
// Otherwise: SparkException: Task not serializable
@transient var socket: Socket = null
@transient var socketOutputStream: DataOutputStream = null
@transient var socketInputStream: DataInputStream = null
def openSocket(){
synchronized {
if (socket == null || socket.isClosed) {
socket = new Socket(serverHost, serverPort)
socketInputStream = new DataInputStream(new BufferedInputStream(socket.getInputStream, bufferSize))
socketOutputStream = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize))
}
}
}
override def zero(value: List[Array[Byte]]): List[Array[Byte]] = new ArrayList
override def addInPlace(val1: List[Array[Byte]], val2: List[Array[Byte]]) : List[Array[Byte]] = synchronized {
if (serverHost == null) {
// This happens on the worker node, where we just want to remember all the updates
val1.addAll(val2)
val1
} else {
// This happens on the master, where we pass the updates to Ruby through a socket
openSocket()
socketOutputStream.writeInt(val2.size)
for (array <- val2) {
socketOutputStream.writeInt(array.length)
socketOutputStream.write(array)
}
socketOutputStream.flush()
// Wait for acknowledgement
// http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
//
// if(in.readInt() != RubyConstant.ACCUMULATOR_ACK){
// throw new SparkException("Accumulator was not acknowledged")
// }
new ArrayList
}
}
}
================================================
FILE: ext/spark/src/main/scala/RubyBroadcast.scala
================================================
package org.apache.spark.api.ruby
import org.apache.spark.api.python.PythonBroadcast
/**
* A wrapper for a Ruby Broadcast variable, which is written to disk by Ruby. It will also
* write the data to disk after deserialization, so Ruby can read it back from disk.
*
* The class reuses the Python logic - it exists only for semantics.
*/
class RubyBroadcast(@transient var _path: String, @transient var id: java.lang.Long) extends PythonBroadcast(_path) {
}
================================================
FILE: ext/spark/src/main/scala/RubyConstant.scala
================================================
package org.apache.spark.api.ruby
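// Control codes for the socket protocol between the JVM side (RubyWorker, RubyRDD)
// and the Ruby master/worker processes in lib/spark/worker: RubyWorker sends
// CREATE_WORKER / KILL_WORKER to the master, while RubyRDD's WriterThread terminates
// the data stream with DATA_EOF and its StreamReader reacts to WORKER_DONE and
// WORKER_ERROR coming back from a worker.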
object RubyConstant {
val DATA_EOF = -2
val WORKER_ERROR = -1
val WORKER_DONE = 0
val CREATE_WORKER = 1
val KILL_WORKER = 2
val KILL_WORKER_AND_WAIT = 3
val SUCCESSFULLY_KILLED = 4
val UNSUCCESSFUL_KILLING = 5
val ACCUMULATOR_ACK = 6
}
================================================
FILE: ext/spark/src/main/scala/RubyMLLibAPI.scala
================================================
package org.apache.spark.mllib.api.ruby
import java.util.ArrayList
import scala.collection.JavaConverters._
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.api.python.MLLibAPI
class RubyMLLibAPI extends MLLibAPI {
// trainLinearRegressionModelWithSGD
// trainLassoModelWithSGD
// trainRidgeModelWithSGD
// trainLogisticRegressionModelWithSGD
// trainLogisticRegressionModelWithLBFGS
// trainSVMModelWithSGD
// trainKMeansModel
// trainGaussianMixtureModel
// Rjb has a problem with theta: Array[Array[Double]]
override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {
val model = NaiveBayes.train(data.rdd, lambda)
List(
Vectors.dense(model.labels),
Vectors.dense(model.pi),
model.theta.toSeq
).map(_.asInstanceOf[Object]).asJava
}
// In Python, wt is just an Object
def predictSoftGMM(
data: JavaRDD[Vector],
wt: ArrayList[Object],
mu: ArrayList[Object],
si: ArrayList[Object]): RDD[Array[Double]] = {
// val weight = wt.asInstanceOf[Array[Double]]
val weight = wt.toArray.map(_.asInstanceOf[Double])
val mean = mu.toArray.map(_.asInstanceOf[DenseVector])
val sigma = si.toArray.map(_.asInstanceOf[DenseMatrix])
val gaussians = Array.tabulate(weight.length){
i => new MultivariateGaussian(mean(i), sigma(i))
}
val model = new GaussianMixtureModel(weight, gaussians)
model.predictSoft(data)
}
}
================================================
FILE: ext/spark/src/main/scala/RubyMLLibUtilAPI.scala
================================================
package org.apache.spark.mllib.api.ruby
import java.util.ArrayList
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
object RubyMLLibUtilAPI {
// Ruby does have a problem with creating Array[Double]
def generateLinearInput(
intercept: Double,
weights: ArrayList[String],
nPoints: Int,
seed: Int,
eps: Double = 0.1): Seq[LabeledPoint] = {
LinearDataGenerator.generateLinearInput(intercept, weights.toArray.map(_.toString.toDouble), nPoints, seed, eps)
}
}
================================================
FILE: ext/spark/src/main/scala/RubyPage.scala
================================================
package org.apache.spark.ui.ruby
// import javax.servlet.http.HttpServletRequest
// import scala.xml.Node
// import org.apache.spark.ui.{WebUIPage, UIUtils}
// import org.apache.spark.util.Utils
// private[ui] class RubyPage(parent: RubyTab, rbConfig: Array[Tuple2[String, String]]) extends WebUIPage("") {
// def render(request: HttpServletRequest): Seq[Node] = {
// val content = UIUtils.listingTable(header, row, rbConfig)
// UIUtils.headerSparkPage("Ruby Config", content, parent)
// }
// private def header = Seq(
// "Number"
// )
// private def row(keyValue: (String, String)): Seq[Node] = {
// // scalastyle:off
// keyValue match {
// case (key, value) =>
//
// | {key} |
// {value} |
//
// }
// // scalastyle:on
// }
// }
class RubyPage {}
================================================
FILE: ext/spark/src/main/scala/RubyRDD.scala
================================================
package org.apache.spark.api.ruby
import java.io._
import java.net._
import java.util.{List, ArrayList, Collections}
import scala.util.Try
import scala.reflect.ClassTag
import scala.collection.JavaConversions._
import org.apache.spark._
import org.apache.spark.{SparkEnv, Partition, SparkException, TaskContext}
import org.apache.spark.api.ruby._
import org.apache.spark.api.ruby.marshal._
import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
import org.apache.spark.InterruptibleIterator
/* =================================================================================================
* Class RubyRDD
* =================================================================================================
*/
class RubyRDD(
@transient parent: RDD[_],
command: Array[Byte],
broadcastVars: ArrayList[Broadcast[RubyBroadcast]],
accumulator: Accumulator[List[Array[Byte]]])
extends RDD[Array[Byte]](parent){
val bufferSize = conf.getInt("spark.buffer.size", 65536)
val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this)
override def getPartitions: Array[Partition] = firstParent.partitions
override val partitioner = None
/* ------------------------------------------------------------------------------------------ */
override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
val env = SparkEnv.get
// Get worker and id
val (worker, workerId) = RubyWorker.create(env)
// Start a thread to feed the process input from our parent's iterator
val writerThread = new WriterThread(env, worker, split, context)
context.addTaskCompletionListener { context =>
writerThread.shutdownOnTaskCompletion()
writerThread.join()
// Cleanup the worker socket. This will also cause the worker to exit.
try {
RubyWorker.remove(worker, workerId)
worker.close()
} catch {
case e: Exception => logWarning("Failed to close worker socket", e)
}
}
val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize))
// Send data
writerThread.start()
// For violent termination of worker
new MonitorThread(workerId, worker, context).start()
// Return an iterator that reads lines from the process's stdout
val stdoutIterator = new StreamReader(stream, writerThread, context)
// An iterator that wraps around an existing iterator to provide task killing functionality.
new InterruptibleIterator(context, stdoutIterator)
} // end compute
/* ------------------------------------------------------------------------------------------ */
class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext)
extends Thread("stdout writer for worker") {
@volatile private var _exception: Exception = null
setDaemon(true)
// Contains the exception thrown while writing the parent iterator to the process.
def exception: Option[Exception] = Option(_exception)
// Terminates the writer thread, ignoring any exceptions that may occur due to cleanup.
def shutdownOnTaskCompletion() {
assert(context.isCompleted)
this.interrupt()
}
// -------------------------------------------------------------------------------------------
// Send the necessary data for worker
// - split index
// - command
// - iterator
override def run(): Unit = Utils.logUncaughtExceptions {
try {
SparkEnv.set(env)
val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize)
val dataOut = new DataOutputStream(stream)
// Partition index
dataOut.writeInt(split.index)
// Spark files
PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut)
// Broadcast variables
dataOut.writeInt(broadcastVars.length)
for (broadcast <- broadcastVars) {
dataOut.writeLong(broadcast.value.id)
PythonRDD.writeUTF(broadcast.value.path, dataOut)
}
// Serialized command
dataOut.writeInt(command.length)
dataOut.write(command)
// Send it
dataOut.flush()
// Data
PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut)
dataOut.writeInt(RubyConstant.DATA_EOF)
dataOut.flush()
} catch {
case e: Exception if context.isCompleted || context.isInterrupted =>
logDebug("Exception thrown after task completion (likely due to cleanup)", e)
case e: Exception =>
// We must avoid throwing exceptions here, because the thread uncaught exception handler
// will kill the whole executor (see org.apache.spark.executor.Executor).
_exception = e
} finally {
Try(worker.shutdownOutput()) // kill worker process
}
}
} // end WriterThread
/* ------------------------------------------------------------------------------------------ */
class StreamReader(stream: DataInputStream, writerThread: WriterThread, context: TaskContext) extends Iterator[Array[Byte]] {
def hasNext = _nextObj != null
var _nextObj = read()
// -------------------------------------------------------------------------------------------
def next(): Array[Byte] = {
val obj = _nextObj
if (hasNext) {
_nextObj = read()
}
obj
}
// -------------------------------------------------------------------------------------------
private def read(): Array[Byte] = {
if (writerThread.exception.isDefined) {
throw writerThread.exception.get
}
try {
stream.readInt() match {
case length if length > 0 =>
val obj = new Array[Byte](length)
stream.readFully(obj)
obj
case RubyConstant.WORKER_DONE =>
val numAccumulatorUpdates = stream.readInt()
(1 to numAccumulatorUpdates).foreach { _ =>
val updateLen = stream.readInt()
val update = new Array[Byte](updateLen)
stream.readFully(update)
accumulator += Collections.singletonList(update)
}
null
case RubyConstant.WORKER_ERROR =>
// Exception from worker
// message
val length = stream.readInt()
val obj = new Array[Byte](length)
stream.readFully(obj)
// stackTrace
val stackTraceLen = stream.readInt()
val stackTrace = new Array[String](stackTraceLen)
(0 until stackTraceLen).foreach { i =>
val length = stream.readInt()
val obj = new Array[Byte](length)
stream.readFully(obj)
stackTrace(i) = new String(obj, "utf-8")
}
// Worker will be killed
stream.close
// exception
val exception = new RubyException(new String(obj, "utf-8"), writerThread.exception.getOrElse(null))
exception.appendToStackTrace(stackTrace)
throw exception
}
} catch {
case e: Exception if context.isInterrupted =>
logDebug("Exception thrown after task interruption", e)
throw new TaskKilledException
case e: Exception if writerThread.exception.isDefined =>
logError("Worker exited unexpectedly (crashed)", e)
throw writerThread.exception.get
case eof: EOFException =>
throw new SparkException("Worker exited unexpectedly (crashed)", eof)
}
}
} // end StreamReader
/* ---------------------------------------------------------------------------------------------
* Monitor thread to control the worker. Kills the worker if the task is interrupted.
*/
class MonitorThread(workerId: Long, worker: Socket, context: TaskContext)
extends Thread("Worker Monitor for worker") {
setDaemon(true)
override def run() {
// Kill the worker if it is interrupted, checking until task completion.
while (!context.isInterrupted && !context.isCompleted) {
Thread.sleep(2000)
}
if (!context.isCompleted) {
try {
logWarning("Incomplete task interrupted: Attempting to kill Worker "+workerId.toString())
RubyWorker.kill(workerId)
} catch {
case e: Exception =>
logError("Exception when trying to kill worker "+workerId.toString(), e)
}
}
}
} // end MonitorThread
} // end RubyRDD
/* =================================================================================================
* Class PairwiseRDD
* =================================================================================================
*
* Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Ruby.
* This is used by the Ruby shuffle operations.
* Borrowed from the Python package -> needs its own deserializeLongValue ->
* Marshal adds the same 4-byte header
*/
class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) {
override def getPartitions = prev.partitions
override def compute(split: Partition, context: TaskContext) =
prev.iterator(split, context).grouped(2).map {
case Seq(a, b) => (Utils.deserializeLongValue(a.reverse), b)
case x => throw new SparkException("PairwiseRDD: unexpected value: " + x)
}
val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this)
}
/* =================================================================================================
* Object RubyRDD
* =================================================================================================
*/
object RubyRDD extends Logging {
def runJob(
sc: SparkContext,
rdd: JavaRDD[Array[Byte]],
partitions: ArrayList[Int],
allowLocal: Boolean,
filename: String): String = {
type ByteArray = Array[Byte]
type UnrolledPartition = Array[ByteArray]
val allPartitions: Array[UnrolledPartition] =
sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
writeRDDToFile(flattenedPartition.iterator, filename)
}
def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
try {
while (true) {
val length = file.readInt()
val obj = new Array[Byte](length)
file.readFully(obj)
objs.append(obj)
}
} catch {
case eof: EOFException => {}
}
JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
}
def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
try {
PythonRDD.writeIteratorToStream(items, file)
} finally {
file.close()
}
filename
}
def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
writeRDDToFile(rdd.collect.iterator, filename)
}
def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
sc.broadcast(new RubyBroadcast(path, id))
}
/**
* Convert an RDD of serialized Ruby objects to an RDD of objects that are usable in Java.
*/
def toJava(rbRDD: JavaRDD[Array[Byte]], batched: Boolean): JavaRDD[Any] = {
rbRDD.rdd.mapPartitions { iter =>
iter.flatMap { item =>
val obj = Marshal.load(item)
if(batched){
obj.asInstanceOf[Array[_]]
}
else{
Seq(obj)
}
}
}.toJavaRDD()
}
/**
* Convert an RDD of Java objects to an RDD of serialized Ruby objects that are usable by Ruby.
*/
def toRuby(jRDD: JavaRDD[_]): JavaRDD[Array[Byte]] = {
jRDD.rdd.mapPartitions { iter => new IterableMarshaller(iter) }
}
}
/* =================================================================================================
* Class RubyException
* =================================================================================================
*/
class RubyException(msg: String, cause: Exception) extends RuntimeException(msg, cause) {
def appendToStackTrace(toAdded: Array[String]) {
val newStackTrace = getStackTrace.toBuffer
val regexpMatch = "(.*):([0-9]+):in `([a-z]+)'".r
for(item <- toAdded) {
item match {
case regexpMatch(fileName, lineNumber, methodName) =>
newStackTrace += new StackTraceElement("RubyWorker", methodName, fileName, lineNumber.toInt)
case _ => null
}
}
setStackTrace(newStackTrace.toArray)
}
}
================================================
FILE: ext/spark/src/main/scala/RubySerializer.scala
================================================
package org.apache.spark.api.ruby
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.ruby.marshal._
/* =================================================================================================
* object RubySerializer
* =================================================================================================
*/
object RubySerializer { }
================================================
FILE: ext/spark/src/main/scala/RubyTab.scala
================================================
package org.apache.spark.ui.ruby
import scala.collection.mutable.HashMap
import org.apache.spark.ui._
// class RubyTab(parent: SparkUI, rbConfig: HashMap[String, String]) extends SparkUITab(parent, "ruby"){
// attachPage(new RubyPage(this, rbConfig.toArray))
// }
class RubyTab {}
================================================
FILE: ext/spark/src/main/scala/RubyUtils.scala
================================================
package org.apache.spark.api.ruby
import org.apache.spark.util._
import org.apache.spark.{SparkConf, Logging}
object RubyUtils extends Logging {
def loadPropertiesFile(conf: SparkConf, path: String): String = {
Utils.getPropertiesFromFile(path).foreach {
case (key, value) => conf.set(key, value)
}
path
}
}
================================================
FILE: ext/spark/src/main/scala/RubyWorker.scala
================================================
package org.apache.spark.api.ruby
import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}
import java.net.{InetAddress, ServerSocket, Socket, SocketException}
import java.nio.file.Paths
import scala.collection.mutable
import scala.collection.JavaConversions._
import org.apache.spark._
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.util.Utils
import org.apache.spark.util.RedirectThread
/* =================================================================================================
* Object RubyWorker
* =================================================================================================
*
* Create and store server for creating workers.
*/
object RubyWorker extends Logging {
val PROCESS_WAIT_TIMEOUT = 10000
private var serverSocket: ServerSocket = null
private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
private var serverPort: Int = 0
private var master: ExecutedFileCommand = null
private var masterSocket: Socket = null
private var masterOutputStream: DataOutputStream = null
private var masterInputStream: DataInputStream = null
private var workers = new mutable.WeakHashMap[Socket, Long]()
/* ----------------------------------------------------------------------------------------------
* Create a new worker, but first check whether the SocketServer and master process exist.
* If not, they are created. Worker creation gets 2 chances.
*/
def create(env: SparkEnv): (Socket, Long) = {
synchronized {
// Create the server if it hasn't been started
createServer(env)
// Attempt to connect, restart and retry once if it fails
try {
createWorker
} catch {
case exc: SocketException =>
logWarning("Worker unexpectedly quit, attempting to restart")
createWorker
}
}
}
/* ----------------------------------------------------------------------------------------------
* Create a worker through the master process. Return the new socket and id.
* According to spark.ruby.worker.type the id will be:
* process: PID
* thread: thread object id
*/
def createWorker: (Socket, Long) = {
synchronized {
masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)
var socket = serverSocket.accept()
var id = new DataInputStream(socket.getInputStream).readLong()
workers.put(socket, id)
(socket, id)
}
}
/* ----------------------------------------------------------------------------------------------
* Create the SocketServer and bind it to localhost. The maximum number of queued connections
* is set to the default. If the server is created without an exception -> create the master.
*/
private def createServer(env: SparkEnv){
synchronized {
// Already running?
if(serverSocket != null && masterSocket != null) {
return
}
try {
// Start the SocketServer for communication
serverSocket = new ServerSocket(0, 0, serverHost)
serverPort = serverSocket.getLocalPort
// Create a master for worker creations
createMaster(env)
} catch {
case e: Exception =>
throw new SparkException("There was a problem with creating a server", e)
}
}
}
/* ----------------------------------------------------------------------------------------------
* At this point the SocketServer must already be created. The master process creates and kills workers.
* Creating workers from Java can be an expensive operation because the new process can
* get a copy of the address space.
*/
private def createMaster(env: SparkEnv){
synchronized {
val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER
val executorOptions = env.conf.get("spark.ruby.executor.options", "")
val commandTemplate = env.conf.get("spark.ruby.executor.command")
val workerType = env.conf.get("spark.ruby.worker.type")
// Where is root of ruby-spark
var executorLocation = ""
if(isDriver){
// Use worker from current active gem location
executorLocation = env.conf.get("spark.ruby.driver_home")
}
else{
// Use gem installed on the system
try {
val homeCommand = (new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))).run
executorLocation = homeCommand.readLine
} catch {
case e: Exception =>
throw new SparkException("Ruby-spark gem is not installed.", e)
}
}
// Master and worker are saved in GEM_ROOT/lib/spark/worker
executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString
// Create master command
// -C: change worker dir before execution
val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort"
val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))
// Start master
master = masterCommand.run
// Redirect master stdout and stderr
redirectStreamsToStderr(master.getInputStream, master.getErrorStream)
// Wait for it to connect to our socket
serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)
try {
// Use the socket for communication. Keep stdout and stdin for logging
masterSocket = serverSocket.accept()
masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)
masterInputStream = new DataInputStream(masterSocket.getInputStream)
PythonRDD.writeUTF(executorOptions, masterOutputStream)
} catch {
case e: Exception =>
throw new SparkException("Ruby master did not connect back in time", e)
}
}
}
/* ----------------------------------------------------------------------------------------------
* Get all environment variables for the executor
*/
def getEnvVars(env: SparkEnv): Map[String, String] = {
val prefix = "spark.ruby.executor.env."
env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}
.map{case (k, v) => (k.substring(prefix.length), v)}
.toMap
}
/* ------------------------------------------------------------------------------------------- */
def kill(workerId: Long){
masterOutputStream.writeInt(RubyConstant.KILL_WORKER)
masterOutputStream.writeLong(workerId)
}
/* ------------------------------------------------------------------------------------------- */
def killAndWait(workerId: Long){
masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)
masterOutputStream.writeLong(workerId)
// Wait for answer
masterInputStream.readInt() match {
case RubyConstant.SUCCESSFULLY_KILLED =>
logInfo(s"Worker $workerId was successfully killed")
case RubyConstant.UNSUCCESSFUL_KILLING =>
logInfo(s"Worker $workerId cannot be killed (maybe is already killed)")
}
}
/* ----------------------------------------------------------------------------------------------
* The workers HashMap is weak, which avoids keeping a long list of workers that cannot be killed (killAndWait)
*/
def remove(worker: Socket, workerId: Long){
try {
workers.remove(worker)
} catch {
case e: Exception => logWarning(s"Worker $workerId does not exist (maybe is already removed)")
}
}
/* ------------------------------------------------------------------------------------------- */
def stopServer{
synchronized {
// Kill workers
workers.foreach { case (socket, id) => killAndWait(id) }
// Kill master
master.destroy
// Stop SocketServer
serverSocket.close()
// Clean variables
serverSocket = null
serverPort = 0
master = null
masterSocket = null
masterOutputStream = null
masterInputStream = null
}
}
/* ------------------------------------------------------------------------------------------- */
private def redirectStreamsToStderr(streams: InputStream*) {
try {
for(stream <- streams) {
new RedirectThread(stream, System.err, "stream reader").start()
}
} catch {
case e: Exception =>
logError("Exception in redirecting streams", e)
}
}
/* ------------------------------------------------------------------------------------------- */
}
================================================
FILE: ext/spark/src/test/scala/MarshalSpec.scala
================================================
package org.apache.spark.api.ruby.marshal
import org.scalatest._
import org.apache.spark.api.ruby.marshal._
class MarshalSpec extends FunSpec with Matchers {
// ====================================================================================
// Load
describe("Marshal.load"){
describe("single value"){
it("int"){
val data = 1
val serialized = Array[Byte](4, 8, 105, 6)
Marshal.load(serialized) should equal(data)
}
it("double"){
val data = 1.2
val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)
Marshal.load(serialized) should equal(data)
}
}
describe("array"){
it("ints"){
val data = Array(1, 2, 3, 4, 5)
val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
Marshal.load(serialized) should equal(data)
}
it("doubles"){
val data = Array(1.1, 2.2, 3.3)
val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
Marshal.load(serialized) should equal(data)
}
}
}
// ====================================================================================
// Dump
describe("Marshal.dump"){
describe("single value"){
it("int"){
val data = 1
val serialized = Array(4, 8, 105, 6)
Marshal.dump(data) should equal(serialized)
}
it("double"){
val data = 1.2
val serialized = Array(4, 8, 102, 8, 49, 46, 50)
Marshal.dump(data) should equal(serialized)
}
}
describe("array"){
it("ints"){
val data = Array(1, 2, 3, 4, 5)
val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
Marshal.dump(data) should equal(serialized)
}
it("doubles"){
val data = Array(1.1, 2.2, 3.3)
val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
Marshal.dump(data) should equal(serialized)
}
}
}
}
================================================
FILE: lib/ruby-spark.rb
================================================
require_relative 'spark'
================================================
FILE: lib/spark/accumulator.rb
================================================
module Spark
##
# A shared variable that can be accumulated, i.e., has a commutative and associative "add"
# operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=`
# operator, but only the driver program is allowed to access its value, using the value method.
# Updates from the workers get propagated automatically to the driver program.
#
# == Arguments:
# value::
# Initial value for the accumulator. This value is stored only on the driver process
#
# accum_param::
# How to merge two values on the worker or driver process.
# Symbol or Proc (or String)
#
# zero_value::
# Initial value for the worker process
#
#
# == Examples:
#
# accum1 = $sc.accumulator(1)
# accum2 = $sc.accumulator(2, :*, 1)
# accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max})
#
# accum1 += 1
#
# accum2.add(2)
# accum2.add(2)
# accum2.add(2)
#
# accum3.add(9)
# accum3.add(6)
# accum3.add(7)
#
# accum1.value # => 2
# accum2.value # => 16
# accum3.value # => 9
#
# func = Proc.new do |_, index|
# accum1.add(1)
# accum2.add(2)
# accum3.add(index * 10)
# end
#
# rdd = $sc.parallelize(0..4, 4)
# rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
# rdd = rdd.map_partitions_with_index(func)
# rdd.collect
#
# accum1.value # => 6
# accum2.value # => 256
# accum3.value # => 30
#
class Accumulator
attr_reader :id, :value, :accum_param, :zero_value
@@instances = {}
@@changed = []
SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**]
# =========================================================================
# Creating and selecting Spark::Accumulator
def initialize(value, accum_param=:+, zero_value=0)
@id = object_id
@value = value
@accum_param = accum_param
@zero_value = zero_value
@driver = true
valid_accum_param
@@instances[@id] = self
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{ ID: #{@id}\n}
result << %{ Zero: #{@zero_value.to_s[0, 10]}\n}
result << %{Value: #{@value.to_s[0, 10]}>}
result
end
def self.changed
@@changed
end
def self.instances
@@instances
end
def valid_accum_param
if @accum_param.is_a?(Symbol)
raise Spark::AccumulatorError, "Unsupported symbol #{@accum_param}" unless SUPPORTED_SYMBOLS.include?(@accum_param)
@serialized_accum_param = @accum_param
return
end
if @accum_param.is_a?(Proc)
begin
@serialized_accum_param = @accum_param.to_source
return
rescue
raise Spark::SerializeError, 'Proc cannot be serialized. Use String instead.'
end
end
if @accum_param.is_a?(String)
@serialized_accum_param = @accum_param
@accum_param = eval(@accum_param)
unless @accum_param.is_a?(Proc)
raise Spark::SerializeError, 'Your param is not a Proc.'
end
return
end
raise Spark::AccumulatorError, 'Unsupported param. Use Symbol, Proc or String.'
end
# Driver process or worker
def driver?
@driver
end
# =========================================================================
# Operations
def add(term)
if !driver? && !@@changed.include?(self)
@@changed << self
end
if @accum_param.is_a?(Proc)
@value = @accum_param.call(@value, term)
else
add_by_symbol(term)
end
end
def +(term)
add(term)
self
end
def add_by_symbol(term)
case @accum_param
when :+
@value += term
when :-
@value -= term
when :*
@value *= term
when :/
@value /= term
when :**
@value **= term
end
end
# =========================================================================
# Dump and load
def marshal_dump
[@id, @zero_value, @serialized_accum_param]
end
def marshal_load(array)
@id, @zero_value, @serialized_accum_param = array
@value = @zero_value
@driver = false
load_accum_param
end
def load_accum_param
if @serialized_accum_param.is_a?(String)
@accum_param = eval(@serialized_accum_param)
else
@accum_param = @serialized_accum_param
end
end
end
end
# =============================================================================
# Server for handling Accumulator updates
#
module Spark
class Accumulator
class Server
attr_reader :server, :host, :port
def self.start
@instance ||= Spark::Accumulator::Server.new
end
def self.stop
@instance && @instance.stop
end
def self.host
start
@instance.host
end
def self.port
start
@instance.port
end
def initialize
@server = TCPServer.new(0)
@host = @server.hostname
@port = @server.port
@threads = []
handle_accept
end
def stop
@threads.each(&:kill)
rescue
nil
end
def handle_accept
@threads << Thread.new do
loop {
handle_connection(@server.accept)
}
end
end
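# Wire format handled below (see RubyAccumulatorParam#addInPlace on the Scala side):
# first an int with the number of updates, then one serialized update per
# accumulator; each update is expected to deserialize to an [id, value] pair.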
def handle_connection(socket)
@threads << Thread.new do
until socket.closed?
count = socket.read_int
count.times do
data = socket.read_data
accum = Spark::Accumulator.instances[data[0]]
if accum
accum.add(data[1])
else
Spark.logger.warn("Accumulator with id #{data[0]} does not exist.")
end
end
# http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
# socket.write_int(Spark::Constant::ACCUMULATOR_ACK)
end
end
end
end
end
end
================================================
FILE: lib/spark/broadcast.rb
================================================
module Spark
##
# Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
# object for reading it in distributed functions. The variable will
# be sent to each cluster only once.
#
# == Example:
#
# broadcast1 = $sc.broadcast('a')
# broadcast2 = $sc.broadcast('b')
# broadcast3 = $sc.broadcast([1,2,3])
#
# func = Proc.new do |part, index|
# [
# broadcast1.value * index,
# broadcast2.value * index,
# broadcast3.value.reduce(:+)
# ]
# end
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3)
# rdd = rdd.map_partitions_with_index(func)
# rdd.collect
# # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6]
#
class Broadcast
LOADED = 0 # id, value, path
NOT_LOADED = 1 # id, path
WITHOUT_PATH = 2 # id
attr_reader :id, :state, :path, :jbroadcast
@@registered = {}
# =========================================================================
# Creating broadcast for SparkContext
# Create new Broadcast and dump value to the disk
#
# b = $sc.broadcast('a')
#
# b.value # => 'a'
# b.path
# b.jbroadcast
#
def initialize(sc, value)
@id = object_id
@value = value
@state = LOADED
file = Tempfile.create('broadcast', sc.temp_dir)
file.binmode
file.write(Marshal.dump(value))
file.close
@path = file.path
@jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id))
ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{ ID: #{@id}\n}
result << %{Value: #{@value.to_s[0, 10]}>}
result
end
def self.register(id, path)
@@registered[id] = path
end
def value
case state
when LOADED
@value
when NOT_LOADED
@value = Marshal.load(File.read(@path))
@state = LOADED
@value
when WITHOUT_PATH
@path = @@registered[id]
if @path
@state = NOT_LOADED
value
else
raise Spark::BroadcastError, "Broadcast #{@id} does not have a registered path."
end
end
end
def marshal_dump
@id
end
def marshal_load(id)
@id = id
@state = WITHOUT_PATH
end
end
end
================================================
FILE: lib/spark/build.rb
================================================
module Spark
module Build
DEFAULT_SCALA_VERSION = '2.10.4'
DEFAULT_CORE_VERSION = '2.10'
DEFAULT_SPARK_VERSION = '1.6.0'
DEFAULT_HADOOP_VERSION = '1.0.4'
SBT = 'sbt/sbt'
SBT_DEPS = 'assemblyPackageDependency'
SBT_EXT = 'package'
SBT_CLEAN = 'clean'
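# Usage sketch (option keys as consumed below; the values are only examples):
#
#   Spark::Build.build(spark_version: '1.6.0', hadoop_version: '2.6.0', only_ext: false)
#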
def self.build(options={})
scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION
spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION
spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION
hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION
target = options[:target] || Spark.target_dir
only_ext = options[:only_ext] || false
env = {
'SCALA_VERSION' => scala_version,
'SPARK_VERSION' => spark_version,
'SPARK_CORE_VERSION' => spark_core_version,
'HADOOP_VERSION' => hadoop_version,
'TARGET_DIR' => target
}
cmd = [SBT]
cmd << SBT_EXT
cmd << SBT_DEPS unless only_ext
cmd << SBT_CLEAN unless $DEBUG
Dir.chdir(Spark.spark_ext_dir) do
unless Kernel.system(env, cmd.join(' '))
raise Spark::BuildError, 'Spark cannot be assembled.'
end
end
end
end
end
================================================
FILE: lib/spark/cli.rb
================================================
require 'commander'
module Commander
module UI
# Disable paging
# for 'classic' help
def self.enable_paging
end
end
end
module Spark
class CLI
include Commander::Methods
# IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
# IRB_HISTORY_SIZE = 100
def run
program :name, 'RubySpark'
program :version, Spark::VERSION
program :description, 'Ruby wrapper for Spark'
global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }
default_command :help
# Build ---------------------------------------------------------------
command :build do |c|
c.syntax = 'build [options]'
c.description = 'Build spark and gem extensions'
c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark'
c.option '--spark-core-version STRING', String, 'Version of Spark core'
c.option '--spark-version STRING', String, 'Version of Spark'
c.option '--scala-version STRING', String, 'Version of Scala'
c.option '--target STRING', String, 'Directory where Spark will be stored'
c.option '--only-ext', 'Build only extension for RubySpark'
c.action do |args, options|
Spark::Build.build(options.__hash__)
puts
puts 'Everything is OK'
end
end
alias_command :install, :build
# Shell -----------------------------------------------------------------
command :shell do |c|
c.syntax = 'shell [options]'
c.description = 'Start ruby shell for spark'
c.option '--target STRING', String, 'Directory where Spark is stored'
c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
c.option '--[no-]start', 'Start Spark immediately'
c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
c.option '--auto-reload', 'Autoreload changed files'
c.action do |args, options|
options.default start: true, logger: true
Spark.load_lib(options.target)
Spark.logger.disable unless options.logger
Spark.config do
set_app_name 'RubySpark'
end
Spark.config.from_file(options.properties_file)
if options.auto_reload
require 'listen'
listener = Listen.to(File.join(Spark.root, 'lib')) do |modified, added, removed|
(modified+added).each do |file|
silence_warnings { load(file) }
end
end
listener.start
end
if options.start
# Load Java and Spark
Spark.start
$sc = Spark.context
Spark.print_logo('Spark context is loaded as $sc')
else
Spark.print_logo('You can start Spark with Spark.start')
end
# Load Pry
require 'pry'
Pry.start
end
end
# # IRB -------------------------------------------------------------------
# command :irb do |c|
# c.syntax = 'irb [options]'
# c.description = 'Start ruby shell for spark'
# c.option '--spark-home STRING', String, 'Directory where Spark is stored'
# c.option '--[no-]start', 'Start Spark immediately'
# c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
#
# c.action do |args, options|
# options.default start: true, logger: true
#
# Spark.load_lib(options.spark_home)
# Spark::Logger.disable unless options.logger
#
# Spark.config do
# set_app_name 'Pry RubySpark'
# end
#
# if options.start
# # Load Java and Spark
# Spark.start
# $sc = Spark.context
#
# Spark.print_logo('Spark context is loaded as $sc')
# else
# Spark.print_logo('You can start Spark with Spark.start')
# end
#
# # Load IRB
# require 'irb'
# require 'irb/completion'
# require 'irb/ext/save-history'
#
# begin
# file = File.expand_path(IRB_HISTORY_FILE)
# if File.exists?(file)
# lines = IO.readlines(file).collect { |line| line.chomp }
# Readline::HISTORY.push(*lines)
# end
# Kernel.at_exit do
# lines = Readline::HISTORY.to_a.reverse.uniq.reverse
# lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
# File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
# end
# rescue
# end
#
# ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
# ARGV.concat ['--readline', '--prompt-mode', 'simple']
# IRB.start
# end
# end
# Home ------------------------------------------------------------------
command :home do |c|
c.action do |args, options|
puts Spark.home
exit(0)
end
end
# Ruby spark jar --------------------------------------------------------
command :ruby_spark_jar do |c|
c.action do |args, options|
puts Spark.ruby_spark_jar
exit(0)
end
end
run!
end
end
end
================================================
FILE: lib/spark/command/base.rb
================================================
##
# Spark::Command::Base
#
# Parent for all commands (Map, FlatMap, Sort, ...)
#
class Spark::Command::Base
DEFAULT_VARIABLE_OPTIONS = {
type: Hash,
function: true
}
def initialize(*args)
settings.variables.each do |name, options|
instance_variable_set("@#{name}", args.shift)
end
end
def to_s
self.class.name.split('::').last
end
def self.error(message)
raise Spark::CommandError, message
end
def error(message)
self.class.error(message)
end
def log(message=nil)
$stdout.puts %{==> #{Time.now.strftime("%H:%M:%S")} [#{self.class.name}] #{message}}
$stdout.flush
end
# ===============================================================================================
# Methods called during class loading
# This is not the nicest approach, but these methods set/get class variables for child classes
# Settings for command (variables)
def self.settings
init_settings
class_variable_get(:@@settings)
end
def settings
self.class.settings
end
# Init empty settings
def self.init_settings
if !class_variable_defined?(:@@settings)
struct = Struct.new(:variables)
class_variable_set(:@@settings, struct.new)
settings.variables = {}
end
end
# New variable for command
#
# == Example:
#
# class Map < Spark::Command::Base
# variable :map_function
# end
#
# command = Map.new(1)
#
# command.instance_variables
# # => [:@map_function]
# command.instance_variable_get(:@map_function)
# # => 1
#
def self.variable(name, options={})
if settings.variables.has_key?(name)
error "Function #{name} already exist."
end
settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options)
end
# ===============================================================================================
# Executing methods
# Execute command for data and split index
def execute(iterator, split_index)
# Implemented on Base but can be overridden
before_run
# Run has to be implemented on child
if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run)
return lazy_run(iterator, split_index)
end
iterator = iterator.to_a
run(iterator, split_index)
end
def prepared?
!!@prepared
end
# This is called before execution. Execution will be stopped if
# some command contains an error (e.g. a badly serialized lambda).
#
# == What it does:
# * evaluates lambdas
# * evaluates methods
# * makes new lambdas
#
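# == Example (sketch; the hash mirrors what CommandBuilder#serialize_function
# produces for a function given as a String):
#
#   command = Spark::Command::Map.new({type: 'proc', content: 'lambda{|x| x*x}'})
#   command.prepare
#   command.instance_variable_get(:@map_function).call(4)
#   # => 16
#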
def prepare
return if prepared?
to_function = settings.variables.select {|_, options| options[:function]}
to_function.each do |name, options|
name = "@#{name}"
data = instance_variable_get(name)
case data[:type]
when 'proc'
result = eval(data[:content])
when 'symbol'
result = lambda(&data[:content])
when 'method'
# Method must be added to the instance, not the class
instance_eval(data[:content])
# Method will be available as Proc
result = lambda(&method(data[:name]))
end
instance_variable_set(name, result)
end
@prepared = true
end
# This method is called before every execution.
def before_run
end
# ===============================================================================================
# Bound objects
attr_accessor :__objects__
def method_missing(method, *args, &block)
if __objects__ && __objects__.has_key?(method)
return __objects__[method]
end
super
end
end
================================================
FILE: lib/spark/command/basic.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Map
class Spark::Command::Map < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |item|
@map_function.call(item)
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
@map_function.call(item)
end
end
end
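# A minimal sketch of how a command object runs on the worker side
# (the function is passed as an already serialized String here):
#
#   map = Spark::Command::Map.new({type: 'proc', content: 'lambda{|x| x + 1}'})
#   map.prepare
#   map.execute([1, 2, 3], 0)
#   # => [2, 3, 4]
#   map.execute((1..3).lazy, 0).to_a   # lazy_run is used for Enumerator::Lazy
#   # => [2, 3, 4]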
# -------------------------------------------------------------------------------------------------
# FlatMap
class Spark::Command::FlatMap < Spark::Command::Map
def run(iterator, *)
iterator = super
iterator.flatten!(1)
iterator
end
def lazy_run(iterator, *)
iterator.flat_map do |item|
@map_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# MapPartitionsWithIndex
class Spark::Command::MapPartitionsWithIndex < _Base
variable :partition_function
def run(iterator, index)
iterator = @partition_function.call(iterator, index)
iterator
end
# The user should control whether there is an Enumerator or not
# alias_method :lazy_run, :run
end
# -------------------------------------------------------------------------------------------------
# MapPartitions
class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex
def run(iterator, *)
# Do not use `super` because `@partition_function` can be a method with 1 argument
iterator = @partition_function.call(iterator)
iterator
end
# alias_method :lazy_run, :run
end
# -------------------------------------------------------------------------------------------------
# Filter
class Spark::Command::Filter < _Base
variable :filter_function
def run(iterator, *)
iterator.select! do |item|
@filter_function.call(item)
end
iterator
end
def lazy_run(iterator, *)
iterator.select do |item|
@filter_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# Compact
class Spark::Command::Compact < _Base
def run(iterator, *)
iterator.compact!
iterator
end
def lazy_run(iterator, *)
iterator.select do |item|
!item.nil?
end
end
end
# -------------------------------------------------------------------------------------------------
# Glom
class Spark::Command::Glom < _Base
def run(iterator, *)
[iterator]
end
def lazy_run(iterator, *)
run(iterator.to_a)
end
end
# -------------------------------------------------------------------------------------------------
# Shuffle
class Spark::Command::Shuffle < _Base
variable :seed, function: false, type: Integer
def run(iterator, *)
iterator.shuffle!(random: rng)
iterator
end
def rng
Random.new(@seed)
end
end
# -------------------------------------------------------------------------------------------------
# PartitionBy
class Spark::Command::PartitionBy
class Base < Spark::Command::Base
include Spark::Helper::Serialize
def prepare
super
# Default. Keep it after super because Sorting has its own key_function.
@key_function ||= lambda{|x| x[0]}
end
def run(iterator, *)
iterator.map! do |item|
make_partition_item(item)
end
iterator.flatten!(1)
iterator
end
def lazy_run(iterator, *)
iterator.flat_map do |item|
make_partition_item(item)
end
end
private
def make_partition_item(item)
[
pack_long(@partition_func.call(@key_function[item])),
item
]
end
end
class Basic < Base
variable :partition_func
end
class Sorting < Base
variable :key_function
variable :bounds, function: false, type: Array
variable :ascending, function: false, type: [TrueClass, FalseClass]
variable :num_partitions, function: false, type: Numeric
def prepare
super
# Index by bisect algorithm
@partition_func ||= Proc.new do |key|
count = 0
@bounds.each{|i|
break if i >= key
count += 1
}
if @ascending
count
else
@num_partitions - 1 - count
end
end
end
end # Sorting
end # PartitionBy
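# == Example of the Sorting partition function (illustrative values only):
#
#   @bounds = [3, 7], @ascending = true, @num_partitions = 3
#
#   @partition_func.call(1)   # => 0
#   @partition_func.call(5)   # => 1
#   @partition_func.call(9)   # => 2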
# -------------------------------------------------------------------------------------------------
# Aggregate
class Spark::Command::Aggregate < _Base
variable :reduce_func
variable :zero_value, function: false, type: Object
def run(iterator, *)
[iterator.reduce(@zero_value, &@reduce_func)]
end
def lazy_run(iterator, *)
run(iterator)
end
end
# -------------------------------------------------------------------------------------------------
# Reduce
class Spark::Command::Reduce < Spark::Command::Aggregate
def run(iterator, *)
[iterator.reduce(&@reduce_func)]
end
end
# -------------------------------------------------------------------------------------------------
# Foreach
class Spark::Command::Foreach < _Base
variable :each_function
def run(iterator, *)
iterator.each do |item|
@each_function.call(item)
end
nil
end
end
# -------------------------------------------------------------------------------------------------
# ForeachPartition
class Spark::Command::ForeachPartition < _Base
variable :partition_function
def run(iterator, *)
@partition_function.call(iterator)
nil
end
end
# -------------------------------------------------------------------------------------------------
# KeyBy
class Spark::Command::KeyBy < _Base
variable :key_function
def run(iterator, *)
iterator.map! do |item|
[@key_function.call(item), item]
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
[@key_function.call(item), item]
end
end
end
# -------------------------------------------------------------------------------------------------
# Take
class Spark::Command::Take < _Base
variable :total, function: false, type: Numeric
variable :last_part, function: false, type: Numeric
def run(iterator, index)
if index == @last_part && iterator.size > @total
return iterator.slice!(0, @total)
end
iterator
end
end
# -------------------------------------------------------------------------------------------------
# Pipe
class Spark::Command::Pipe < _Base
variable :cmds, function: false, type: Array
def before_run
require 'open3'
@in, @out, @threads = Open3.pipeline_rw(*@cmds)
end
def run(iterator, *)
create_writing_thread(iterator)
new_iterator = []
# Read full input
begin
loop {
new_iterator << @out.readline.rstrip
}
rescue EOFError
end
new_iterator
end
def lazy_run(iterator, *)
create_writing_thread(iterator)
Enumerator::Lazy.new([nil]) do |yielder, _|
begin
loop {
yielder << @out.readline.rstrip
}
rescue EOFError
end
end
end
private
def create_writing_thread(iterator)
@writing_thread = Thread.new do
# Send complete iterator to the pipe
iterator.each do |item|
@in.puts(item.to_s.rstrip)
end
# Input must be closed for EOFError
@in.close
end
end
end
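# A rough sketch of the Pipe command in isolation (assumes the `cat`
# binary is available on the system):
#
#   pipe = Spark::Command::Pipe.new(['cat'])
#   pipe.before_run
#   pipe.run(['a', 'b'], 0)
#   # => ["a", "b"]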
================================================
FILE: lib/spark/command/pair.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# CombineByKey
class Spark::Command::CombineByKey
# ---------------
class Base < Spark::Command::Base
def run(iterator, *)
_run(iterator).to_a
end
def lazy_run(iterator, *)
_run(iterator).lazy
end
end
# ---------------
class Combine < Base
variable :create_combiner
variable :merge_value
def _run(iterator)
# Do not use combiners[key] ||= ..
# it tests for nil, not has_key?
combiners = {}
iterator.each do |key, value|
if combiners.has_key?(key)
combiners[key] = @merge_value.call(combiners[key], value)
else
combiners[key] = @create_combiner.call(value)
end
end
combiners
end
end
# ---------------
class Merge < Base
variable :merge_combiners
def _run(iterator, *)
combiners = {}
iterator.each do |key, value|
if combiners.has_key?(key)
combiners[key] = @merge_combiners.call(combiners[key], value)
else
combiners[key] = value
end
end
combiners
end
end
# ---------------
class CombineWithZero < Base
variable :zero_value, function: false, type: Object
variable :merge_value
def _run(iterator)
# Do not use combiners[key] ||= ..
# it tests for nil, not has_key?
combiners = {}
iterator.each do |key, value|
unless combiners.has_key?(key)
combiners[key] = @zero_value
end
combiners[key] = @merge_value.call(combiners[key], value)
end
combiners
end
end
# ---------------
end
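# A sketch of how the Combine step groups values by key (functions are
# passed as already serialized Strings, as CommandBuilder would produce):
#
#   combine = Spark::Command::CombineByKey::Combine.new(
#     {type: 'proc', content: 'lambda{|value| [value]}'},
#     {type: 'proc', content: 'lambda{|combiner, value| combiner << value}'}
#   )
#   combine.prepare
#   combine.run([['a', 1], ['a', 2], ['b', 3]], 0)
#   # => [["a", [1, 2]], ["b", [3]]]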
# -------------------------------------------------------------------------------------------------
# MapValues
class Spark::Command::MapValues < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |item|
item[1] = @map_function.call(item[1])
item
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
item[1] = @map_function.call(item[1])
item
end
end
end
# -------------------------------------------------------------------------------------------------
# FlatMapValues
class Spark::Command::FlatMapValues < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |(key, values)|
values = @map_function.call(values)
values.flatten!(1)
values.map! do |value|
[key, value]
end
end
iterator.flatten!(1)
iterator
end
end
================================================
FILE: lib/spark/command/sort.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Sort
class Spark::Command::SortByKey < _Base
variable :key_function
variable :ascending, function: false, type: [TrueClass, FalseClass]
variable :spilling, function: false, type: [TrueClass, FalseClass]
variable :memory, function: false, type: [Numeric, NilClass]
variable :serializer, function: false, type: Spark::Serializer::Base
# Currently disabled
def before_run
@spilling = false
end
def run(iterator, _)
if @spilling
iterator = run_with_spilling(iterator.each)
else
run_without_spilling(iterator)
end
iterator
end
def run_with_enum(iterator, _)
if @spilling
iterator = run_with_spilling(iterator)
else
iterator = iterator.to_a
run_without_spilling(iterator)
end
iterator
end
private
def run_with_spilling(iterator)
sorter = Spark::ExternalSorter.new(@memory, @serializer)
sorter.sort_by(iterator, @ascending, @key_function)
end
def run_without_spilling(iterator)
iterator.sort_by!(&@key_function)
iterator.reverse! unless @ascending
end
end
================================================
FILE: lib/spark/command/statistic.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Sample
class Spark::Command::Sample < _Base
variable :with_replacement, function: false, type: [TrueClass, FalseClass]
variable :fraction, function: false, type: Numeric
variable :seed, function: false, type: [NilClass, Numeric]
def run(iterator, _)
sampler.sample(iterator)
end
def lazy_run(iterator, _)
sampler.lazy_sample(iterator)
end
def sampler
@sampler ||= _sampler
end
def _sampler
if @with_replacement
sampler = Spark::Sampler::Poisson
else
sampler = Spark::Sampler::Uniform
end
sampler = sampler.new(@fraction, @seed)
end
end
# -------------------------------------------------------------------------------------------------
# Stats
class Spark::Command::Stats < _Base
def run(iterator, *)
[Spark::StatCounter.new(iterator)]
end
def lazy_run(iterator, *)
run(iterator)
end
end
# -------------------------------------------------------------------------------------------------
# Histogram
class Spark::Command::Histogram < _Base
include Spark::Helper::Statistic
variable :even, function: false, type: [TrueClass, FalseClass]
variable :buckets, function: false, type: Array
def run(iterator, *)
counters = Array.new(counter_size) { 0 }
iterator.each do |item|
if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min
next
end
x = bucket_function.call(item)
if x.nil?
# next
else
counters[x] += 1
end
end
[counters]
end
def lazy_run(iterator, *)
run(iterator)
end
private
def min
@buckets.first
end
def max
@buckets.last
end
def counter_size
@buckets.size-1
end
def increment
@buckets[1]-@buckets[0]
end
# Decide which bucket function to pass. We decide here rather than having
# a general function so that the decision need only be made once.
def bucket_function
@bucket_function ||= _bucket_function
end
def _bucket_function
if @even
fast_bucket_function
else
basic_bucket_function
end
end
# Determine the bucket function in constant time.
# Requires that buckets are evenly spaced
def fast_bucket_function
Proc.new do |item|
if item.is_a?(Float) && item.nan?
nil
else
bucket_number = (item - min)/increment
if bucket_number > counter_size || bucket_number < 0
nil
else
[bucket_number.to_i, counter_size-1].min
end
end
end
end
# Basic bucket function. Same as right bisect.
def basic_bucket_function
Proc.new do |item|
bucket_number = bisect_right(@buckets, item) - 1
# Counters is @buckets.size - 1
# [bucket_number, counter_size-1].min
if bucket_number > counter_size-1
counter_size-1
else
bucket_number
end
end
end
end
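# A minimal sketch of the Histogram command with evenly spaced buckets
# (numbers chosen only for this example):
#
#   histogram = Spark::Command::Histogram.new(true, [0, 5, 10])
#   histogram.run([1, 2, 6, 9, 11], 0)
#   # => [[2, 2]]    # 11 is ignored because it is above the last bucket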
================================================
FILE: lib/spark/command.rb
================================================
module Spark
##
# Container which includes all commands and other things for the worker
# Every RDD has its own copy of Command
#
class Command
attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects
def initialize
@serializer = nil
@deserializer = nil
@commands = []
@libraries = []
@bound_objects = {}
end
def execute(iterator, split_index)
# Require necessary libraries
libraries.each{|lib| require lib}
# Prepare bound objects
@commands.each do |command|
command.__objects__ = bound_objects
end
# Prepare for running
@commands.each(&:prepare)
# Run all tasks
@commands.each do |command|
iterator = command.execute(iterator, split_index)
end
# Return the changed iterator. This is not necessary for some tasks
# because they change data in place, but some tasks can return
# only one value (for example reduce).
iterator
end
def last
@commands.last
end
def bound_objects
# Objects from the user, or objects already deserialized on the worker
return @bound_objects if @bound_objects
if @serialized_bound_objects
# Still serialized
@bound_objects = Marshal.load(@serialized_bound_objects)
else
# Something else
@bound_objects = {}
end
end
# Bound objects can depend on a library which is loaded during execute.
# In that case the worker raises "undefined class/module".
def marshal_dump
[@serializer, @deserializer, @commands, @libraries, serialized_bound_objects]
end
def marshal_load(array)
@serializer = array.shift
@deserializer = array.shift
@commands = array.shift
@libraries = array.shift
@serialized_bound_objects = array.shift
end
private
def serialized_bound_objects
@serialized_bound_objects ||= Marshal.dump(@bound_objects)
end
end
end
require 'spark/command/base'
require 'spark/command/basic'
require 'spark/command/pair'
require 'spark/command/statistic'
require 'spark/command/sort'
================================================
FILE: lib/spark/command_builder.rb
================================================
require 'spark/command_validator'
module Spark
##
# Builder for building correct {Spark::Command}
#
class CommandBuilder
extend Forwardable
include Spark::Helper::Serialize
include Spark::Helper::System
include Spark::CommandValidator
attr_reader :command
def_delegators :@command, :serializer, :serializer=, :deserializer, :deserializer=, :commands,
:commands=, :libraries, :libraries=, :bound_objects, :bound_objects=
def initialize(serializer, deserializer=nil)
create_command
self.serializer = serializer
self.deserializer = deserializer || serializer.dup
end
def create_command
@command = Spark::Command.new
end
# Do not use Marshal.load(Marshal.dump(self)) because some variables
# have marshal_dump prepared for the worker.
def deep_copy
copy = self.dup
copy.create_command
copy.serializer = self.serializer.deep_copy
copy.deserializer = self.deserializer.deep_copy
copy.commands = self.commands.dup
copy.libraries = self.libraries.dup
copy.bound_objects = self.bound_objects.dup
copy
end
# Serialize the Command class for the worker
# Java uses signed numbers
def build
unpack_chars(Marshal.dump(@command))
end
def add_command(klass, *args)
variables = klass.settings.variables
validate_size(variables, args)
built_args = []
variables.values.zip(args) do |var, arg|
if var[:function]
arg = serialize_function(arg)
end
validate(arg, var)
built_args << arg
end
comm = klass.new(*built_args)
@command.commands << comm
self
end
def add_library(*libraries)
@command.libraries += libraries
end
def bind(objects)
objects.symbolize_keys!
@command.bound_objects.merge!(objects)
end
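# == Example (sketch; the serializer is any instance built by Spark::Serializer):
#
#   builder = Spark::CommandBuilder.new(Spark::Serializer.build { __marshal__ })
#   builder.add_command(Spark::Command::Map, 'lambda{|x| x * 2}')
#   builder.add_library('json')
#   builder.build   # => Array of signed chars ready to be sent to the worker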
private
# The serialized function can be given as a String, Symbol, Proc or Method
#
# === Func
# * *string:* already serialized proc
# * *proc:* proc
# * *symbol:* name of method
# * *method:* Method class
#
def serialize_function(func)
case func
when String
serialize_function_from_string(func)
when Symbol
serialize_function_from_symbol(func)
when Proc
serialize_function_from_proc(func)
when Method
serialize_function_from_method(func)
else
raise Spark::CommandError, 'You must enter String, Symbol, Proc or Method.'
end
end
def serialize_function_from_string(string)
{type: 'proc', content: string}
end
def serialize_function_from_symbol(symbol)
{type: 'symbol', content: symbol}
end
# Serialize Proc as String
#
# lambda{|x| x*x}.to_source
# # => "proc { |x| (x * x) }"
#
def serialize_function_from_proc(proc)
serialize_function_from_string(proc.to_source)
rescue
raise Spark::SerializeError, 'Proc cannot be serialized. Use a String instead.'
end
# Serialize method as string
#
# def test(x)
# x*x
# end
# serialize_function_from_method(method(:test))
#
# # => "def test(x)\n x*x\nend\n"
#
def serialize_function_from_method(meth)
if pry?
meth = Pry::Method.new(meth)
end
{type: 'method', name: meth.name, content: meth.source}
rescue
raise Spark::SerializeError, 'Method cannot be serialized. Use a full path or a Proc.'
end
end
end
================================================
FILE: lib/spark/command_validator.rb
================================================
module Spark
module CommandValidator
def validate(value, options)
validate_type(value, options[:type])
end
def valid?(value, options)
begin
validate(value, options)
return true
rescue
return false
end
end
def validate_type(value, types)
types = [types] if !types.is_a?(Array)
types.each do |type|
return if value.is_a?(type)
end
error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
end
def validate_size(array1, array2)
if array1.size != array2.size
error "Wrong number of arguments (#{array1.size} for #{array2.size})"
end
end
end
end
================================================
FILE: lib/spark/config.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
# Common configuration for RubySpark and Spark
class Config
include Spark::Helper::System
TYPES = {
'spark.shuffle.spill' => :boolean,
'spark.ruby.serializer.compress' => :boolean
}
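# == Example of typed access (sketch; requires a loaded Java bridge):
#
#   config = Spark::Config.new
#   config['spark.shuffle.spill'] = true
#   config['spark.shuffle.spill']
#   # => true            # parsed as a boolean via TYPES
#   config['spark.app.name']
#   # => "RubySpark"     # plain values are returned as strings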
# Initialize java SparkConf and load default configuration.
def initialize
@spark_conf = SparkConf.new(true)
set_default
from_file(Spark::DEFAULT_CONFIG_FILE)
end
def from_file(file)
check_read_only
if file && File.exist?(file)
file = File.expand_path(file)
RubyUtils.loadPropertiesFile(spark_conf, file)
end
end
def [](key)
get(key)
end
def []=(key, value)
set(key, value)
end
def spark_conf
if Spark.started?
# Get latest configuration
Spark.context.jcontext.conf
else
@spark_conf
end
end
def valid!
errors = []
if !contains?('spark.app.name')
errors << 'An application name must be set in your configuration.'
end
if !contains?('spark.master')
errors << 'A master URL must be set in your configuration.'
end
if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
errors << 'Unknown serializer.'
end
scanned = get('spark.ruby.executor.command').scan('%s')
if scanned.size == 0
errors << "Executor command must contain '%s'."
end
if scanned.size > 1
errors << "Executor command can contain only one '%s'."
end
if errors.any?
errors.map!{|error| "- #{error}"}
raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
end
end
def read_only?
Spark.started?
end
# Rescue from NoSuchElementException
def get(key)
value = spark_conf.get(key.to_s)
case TYPES[key]
when :boolean
parse_boolean(value)
when :integer
parse_integer(value)
else
value
end
rescue
nil
end
def get_all
Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
end
def contains?(key)
spark_conf.contains(key.to_s)
end
def set(key, value)
check_read_only
spark_conf.set(key.to_s, value.to_s)
end
def set_app_name(name)
set('spark.app.name', name)
end
def set_master(master)
set('spark.master', master)
end
def parse_boolean(value)
case value
when 'true'
true
when 'false'
false
end
end
def parse_integer(value)
value.to_i
end
# =============================================================================
# Defaults
def set_default
set_app_name('RubySpark')
set_master('local[*]')
set('spark.ruby.driver_home', Spark.home)
set('spark.ruby.serializer', default_serializer)
set('spark.ruby.serializer.compress', default_serializer_compress)
set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
set('spark.ruby.executor.command', default_executor_command)
set('spark.ruby.executor.options', default_executor_options)
set('spark.ruby.worker.type', default_worker_type)
load_executor_envs
# set('spark.ruby.executor.install', default_executor_install)
end
def default_serializer
ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
end
def default_serializer_compress
ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
end
def default_serializer_batch_size
ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
end
# Command template which is applied when Scala wants to create a Ruby
# process (e.g. master, home request). The command is represented by '%s'.
#
# == Example:
# bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
#
def default_executor_command
ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
end
# Options for every worker.
#
# == Example:
# -J-Xmx512m
#
def default_executor_options
ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
end
# # Install command which is triggered before on start.
# # This command using executor command template.
# #
# # == Example:
# # gem install ruby-spark -v 1.2.0
# #
# def default_executor_install
# ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''
# end
# Type of worker.
#
# == Options:
# process:: (default)
# thread:: (experimental)
#
def default_worker_type
ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
end
# Load environment variables for executor from ENV.
#
# == Examples:
# SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
# SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
#
def load_executor_envs
prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
envs = ENV.select{|key, _| key.start_with?(prefix)}
envs.each do |key, value|
key = key.dup # ENV keys are frozen
key.slice!(0, prefix.size)
set("spark.ruby.executor.env.#{key}", value)
end
end
# Aliases
alias_method :getAll, :get_all
alias_method :setAppName, :set_app_name
alias_method :setMaster, :set_master
private
def check_read_only
if read_only?
raise Spark::ConfigurationError, 'Configuration is read only'
end
end
end
end
================================================
FILE: lib/spark/constant.rb
================================================
module Spark
# Common constants for Ruby and Spark
module Constant
DATA_EOF = -2
WORKER_ERROR = -1
WORKER_DONE = 0
CREATE_WORKER = 1
KILL_WORKER = 2
KILL_WORKER_AND_WAIT = 3
SUCCESSFULLY_KILLED = 4
UNSUCCESSFUL_KILLING = 5
ACCUMULATOR_ACK = 6
end
end
================================================
FILE: lib/spark/context.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
##
# Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
# cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
#
class Context
include Spark::Helper::System
include Spark::Helper::Parser
include Spark::Helper::Logger
attr_reader :jcontext, :jaccumulator, :temp_dir
# Constructor for the Ruby context. Configuration is automatically taken
# from Spark. Config will be automatically set to defaults if the user starts
# the context first.
#
def initialize
Spark.config.valid!
@jcontext = JavaSparkContext.new(Spark.config.spark_conf)
@jcontext.addJar(Spark.ruby_spark_jar)
# Does not work on 1.2
# ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
spark_local_dir = JUtils.getLocalDir(sc.conf)
@temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
accum_server = Spark::Accumulator::Server
accum_server.start
@jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
log_info("Ruby accumulator server is running on port #{accum_server.port}")
set_call_site('Ruby') # description of stage
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{Tempdir: "#{temp_dir}">}
result
end
def stop
Spark::Accumulator::Server.stop
log_info('Ruby accumulator server was stopped')
@jcontext.stop
end
def sc
@jcontext.sc
end
def ui
sc.ui
end
# Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD)
#
def default_parallelism
sc.defaultParallelism
end
# Default serializer
#
# Batch -> Compress -> Basic
#
def default_serializer
# Basic
serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
# Compress
if config('spark.ruby.serializer.compress')
serializer = Spark::Serializer.compressed(serializer)
end
# Batching
batch_size = default_batch_size
if batch_size == 'auto'
serializer = Spark::Serializer.auto_batched(serializer)
else
serializer = Spark::Serializer.batched(serializer, batch_size)
end
# Finally, "container" contains serializers
serializer
end
def default_batch_size
size = config('spark.ruby.serializer.batch_size').to_i
if size >= 1
size
else
'auto'
end
end
# Set a local property that affects jobs submitted from this thread, such as the
# Spark fair scheduler pool.
#
def set_local_property(key, value)
jcontext.setLocalProperty(key, value)
end
# Get a local property set in this thread, or nil if it is missing
#
def get_local_property(key)
jcontext.getLocalProperty(key)
end
# Support function for API backtraces.
#
def set_call_site(site)
jcontext.setCallSite(site)
end
def clear_call_site
jcontext.clearCallSite
end
# Return a copy of this SparkContext's configuration. The configuration *cannot*
# be changed at runtime.
#
def config(key=nil)
if key
Spark.config.get(key)
else
Spark.config
end
end
# Add a file to be downloaded with this Spark job on every node.
# The path of file passed can be either a local file, a file in HDFS
# (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
#
# To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
# filename to find its download location.
#
# == Example:
# `echo 10 > test.txt`
#
# $sc.add_file('test.txt')
# $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
# # => [0, 10, 20, 30, 40, 50]
#
def add_file(*files)
files.each do |file|
sc.addFile(file)
end
end
# Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
# object for reading it in distributed functions. The variable will
# be sent to each cluster only once.
#
# == Example:
# broadcast1 = $sc.broadcast('a')
# broadcast2 = $sc.broadcast('b')
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
# rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
# rdd.collect
# # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
#
def broadcast(value)
Spark::Broadcast.new(self, value)
end
# Create an Accumulator with the given initial value, using a given
# accum_param helper object to define how to add values of the
# data type if provided.
#
# == Example:
# accum = $sc.accumulator(7)
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(accum: accum)
# rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
# rdd = rdd.collect
#
# accum.value
# # => 11
#
def accumulator(value, accum_param=:+, zero_value=0)
Spark::Accumulator.new(value, accum_param, zero_value)
end
# Distribute a local Ruby collection to form an RDD.
# The direct method can be slow, so be careful: this method updates data in place.
#
# == Parameters:
# data:: Range or Array
# num_slices:: number of slices
# serializer:: custom serializer (default: serializer based on configuration)
#
# == Examples:
# $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
# #=> [1, 2, 3]
#
# $sc.parallelize(1..3).map(:to_s).collect
# #=> ["1", "2", "3"]
#
def parallelize(data, num_slices=nil, serializer=nil)
num_slices ||= default_parallelism
serializer ||= default_serializer
serializer.check_each(data)
# Through file
file = Tempfile.new('to_parallelize', temp_dir)
serializer.dump_to_io(data, file)
file.close # not unlink
jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
Spark::RDD.new(jrdd, self, serializer)
ensure
file && file.unlink
end
# Read a text file from HDFS, a local file system (available on all nodes), or any
# Hadoop-supported file system URI, and return it as an RDD of Strings.
#
# == Example:
# f = Tempfile.new("test")
# f.puts("1")
# f.puts("2")
# f.close
#
# $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
# # => [1, 2]
#
def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
min_partitions ||= default_parallelism
serializer ||= default_serializer
deserializer = Spark::Serializer.build { __text__(encoding) }
Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
end
# Read a directory of text files from HDFS, a local file system (available on all nodes), or any
# Hadoop-supported file system URI. Each file is read as a single record and returned in a
# key-value pair, where the key is the path of each file, the value is the content of each file.
#
# == Example:
# dir = Dir.mktmpdir
# f1 = Tempfile.new("test1", dir)
# f2 = Tempfile.new("test2", dir)
# f1.puts("1"); f1.puts("2");
# f2.puts("3"); f2.puts("4");
# f1.close
# f2.close
#
# $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
# # => ["1", "2", "3", "4"]
#
def whole_text_files(path, min_partitions=nil, serializer=nil)
min_partitions ||= default_parallelism
serializer ||= default_serializer
deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
end
# Executes the given partition function f on the specified set of partitions,
# returning the result as an array of elements.
#
# If partitions is not specified, this will run over all partitions.
#
# == Example:
# rdd = $sc.parallelize(0..10, 5)
# $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
# # => ["[0, 1]", "[4, 5]"]
#
def run_job(rdd, f, partitions=nil, allow_local=false)
run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
end
# Execute the given command on specific set of partitions.
#
def run_job_with_command(rdd, partitions, allow_local, command, *args)
if !partitions.nil? && !partitions.is_a?(Array)
raise Spark::ContextError, 'Partitions must be nil or Array'
end
partitions_size = rdd.partitions_size
# Execute all parts
if partitions.nil?
partitions = (0...partitions_size).to_a
end
# Can happen when you use coalesce
partitions.delete_if {|part| part >= partitions_size}
# Rjb represents Fixnum as Integer but JRuby as Long
partitions = to_java_array_list(convert_to_java_int(partitions))
# File for result
file = Tempfile.new('collect', temp_dir)
mapped = rdd.new_rdd_from_command(command, *args)
RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
mapped.collect_from_file(file)
end
# Aliases
alias_method :textFile, :text_file
alias_method :wholeTextFiles, :whole_text_files
alias_method :defaultParallelism, :default_parallelism
alias_method :setLocalProperty, :set_local_property
alias_method :getLocalProperty, :get_local_property
alias_method :setCallSite, :set_call_site
alias_method :clearCallSite, :clear_call_site
alias_method :runJob, :run_job
alias_method :runJobWithCommand, :run_job_with_command
alias_method :addFile, :add_file
end
end
================================================
FILE: lib/spark/error.rb
================================================
module Spark
# Extension cannot be built
class BuildError < StandardError
end
# Proc.to_source
# Java object cannot be converted
class SerializeError < StandardError
end
# Serializer method
# Non-existing serializer
class NotImplemented < StandardError
end
# Missing app_name or master
class ConfigurationError < StandardError
end
# Wrong parameters
class RDDError < StandardError
end
# Validations
class CommandError < StandardError
end
# Parser helper
# SQL DataType
class ParseError < StandardError
end
# Validation in context
class ContextError < StandardError
end
# Broadcasts
# Missing path
class BroadcastError < StandardError
end
# Accumulators
# Existing keys
# Wrong ID
class AccumulatorError < StandardError
end
# Wrong instances
class MllibError < StandardError
end
# Wrong datatype
class SQLError < StandardError
end
# Missing Java class
class JavaBridgeError < StandardError
end
end
================================================
FILE: lib/spark/ext/hash.rb
================================================
module Spark
module CoreExtension
module Hash
module ClassMethods
end
module InstanceMethods
# Destructively convert all keys to strings.
def stringify_keys_with_spark!
transform_keys!{ |key| key.to_s }
end
# Destructively convert all keys to symbols, as long as they respond to to_sym.
def symbolize_keys_with_spark!
transform_keys!{ |key| key.to_sym rescue key }
end
# Destructively convert all keys using the block operations.
# Same as transform_keys but modifies +self+.
def transform_keys_with_spark!
keys.each do |key|
self[yield(key)] = delete(key)
end
self
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :stringify_keys!, :spark
patch_unless_exist :symbolize_keys!, :spark
patch_unless_exist :transform_keys!, :spark
end
end
end
end
end
Hash.__send__(:include, Spark::CoreExtension::Hash)
================================================
FILE: lib/spark/ext/integer.rb
================================================
module Spark
module CoreExtension
module Integer
module ClassMethods
end
module InstanceMethods
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
const_set :MAX_WITH_SPARK, (1 << (1.size * 8 - 2)) - 1
const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1
path_const_unless_exist :MAX, :SPARK
path_const_unless_exist :MIN, :SPARK
end
end
end
end
end
Integer.__send__(:include, Spark::CoreExtension::Integer)
================================================
FILE: lib/spark/ext/io.rb
================================================
module Spark
module CoreExtension
module IO
module ClassMethods
end
module InstanceMethods
# Reading
def read_int
unpack_int(read(4))
end
def read_int_or_eof
bytes = read(4)
return Spark::Constant::DATA_EOF if bytes.nil?
unpack_int(bytes)
end
def read_long
unpack_long(read(8))
end
def read_string
read(read_int)
end
def read_data
Marshal.load(read_string)
end
# Writing
def write_int(data)
write(pack_int(data))
end
def write_long(data)
write(pack_long(data))
end
# Size and data can have different encoding
# Marshal: both ASCII
# Oj: ASCII and UTF-8
def write_string(data)
write_int(data.bytesize)
write(data)
end
def write_data(data)
write_string(Marshal.dump(data))
end
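# == Example (round-trip through a StringIO, which also includes this extension):
#
#   io = StringIO.new
#   io.write_data([1, 2, 3])
#   io.rewind
#   io.read_data
#   # => [1, 2, 3]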
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, Spark::Helper::Serialize)
base.send(:include, InstanceMethods)
end
end
end
end
IO.__send__(:include, Spark::CoreExtension::IO)
StringIO.__send__(:include, Spark::CoreExtension::IO)
================================================
FILE: lib/spark/ext/ip_socket.rb
================================================
module Spark
module CoreExtension
module IPSocket
module ClassMethods
end
module InstanceMethods
def port
addr[1]
end
def hostname
addr(true)[2]
end
def numeric_address
addr[3]
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
end
end
end
end
IPSocket.__send__(:include, Spark::CoreExtension::IPSocket)
================================================
FILE: lib/spark/ext/module.rb
================================================
module Spark
module CoreExtension
module Module
# Patch a method into a class unless it already exists
#
# == Example:
#
# class Hash
# def a
# 1
# end
# end
#
# module HashExtension
# module InstanceMethods
# def a_with_spark
# 2
# end
#
# def b_with_spark
# 1
# end
# end
#
# def self.included(base)
# base.send(:include, InstanceMethods)
# base.class_eval do
# patch_unless_exist :a, :spark
# patch_unless_exist :b, :spark
# end
# end
# end
#
# Hash.include(HashExtension)
#
# Hash.new.a # => 1
# Hash.new.b # => 1
#
def patch_unless_exist(target, suffix)
unless method_defined?(target)
aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1
alias_method target, "#{aliased_target}_with_#{suffix}#{punctuation}"
end
end
def path_const_unless_exist(target, suffix)
unless const_defined?(target)
const_set(target, const_get("#{target}_WITH_#{suffix}"))
end
end
end
end
end
Module.__send__(:include, Spark::CoreExtension::Module)
================================================
FILE: lib/spark/ext/object.rb
================================================
module Spark
module CoreExtension
module Object
module ClassMethods
end
module InstanceMethods
def deep_copy_with_spark
Marshal.load(Marshal.dump(self))
end
def silence_warnings
old_verbose, $VERBOSE = $VERBOSE, nil
yield
ensure
$VERBOSE = old_verbose
end
def cattr_reader_with_spark(*syms)
syms.each do |sym|
raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
@@#{sym} = nil unless defined? @@#{sym}
def self.#{sym}
@@#{sym}
end
EOS
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
def #{sym}
@@#{sym}
end
EOS
end
end
def cattr_writer_with_spark(*syms)
syms.each do |sym|
raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
@@#{sym} = nil unless defined? @@#{sym}
def self.#{sym}=(obj)
@@#{sym} = obj
end
EOS
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
def #{sym}=(obj)
@@#{sym} = obj
end
EOS
end
end
def cattr_accessor_with_spark(*syms)
cattr_reader_with_spark(*syms)
cattr_writer_with_spark(*syms)
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :deep_copy, :spark
patch_unless_exist :silence_warnings, :spark
patch_unless_exist :cattr_accessor, :spark
end
end
end
end
end
Object.__send__(:include, Spark::CoreExtension::Object)
================================================
FILE: lib/spark/ext/string.rb
================================================
module Spark
module CoreExtension
module String
module ClassMethods
end
module InstanceMethods
def camelize_with_spark
self.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :camelize, :spark
end
end
end
end
end
String.__send__(:include, Spark::CoreExtension::String)
================================================
FILE: lib/spark/helper/logger.rb
================================================
module Spark
module Helper
module Logger
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def log_info(message)
Spark.logger.info(message)
end
def log_debug(message)
Spark.logger.debug(message)
end
def log_trace(message)
Spark.logger.trace(message)
end
def log_warning(message)
Spark.logger.warning(message)
end
def log_error(message)
Spark.logger.error(message)
end
alias_method :logInfo, :log_info
alias_method :logDebug, :log_debug
alias_method :logTrace, :log_trace
alias_method :logWarning, :log_warning
alias_method :logError, :log_error
end # Methods
end # Logger
end # Helper
end # Spark
================================================
FILE: lib/spark/helper/parser.rb
================================================
module Spark
module Helper
module Parser
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def to_java_hash(hash)
hash_map = HashMap.new
hash.each_pair do |key, value|
begin
# RJB raises 'Object is NULL' (but the new record is put correctly)
hash_map.put(key, value)
rescue RuntimeError
end
end
hash_map
end
def convert_to_java_int(data)
if data.is_a?(Array)
data.map{|x| JInteger.new(x)}
else
JInteger.new(data)
end
end
def to_java_array_list(array)
array_list = ArrayList.new
array.each do |item|
array_list.add(item)
end
array_list
end
# Parse and convert memory size. Shifting would be better but Float doesn't support it.
#
# == Examples:
# to_memory_size("512mb")
# # => 524288
#
# to_memory_size("512 MB")
# # => 524288
#
# to_memory_size("512mb", "GB")
# # => 0.5
#
def to_memory_size(memory, result_unit="KB")
match = memory.match(/([\d]+)[\s]*([\w]*)/)
if match.nil?
raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'"
end
size = match[1].to_f
unit = match[2]
size *= memory_multiplier_based_kb(unit)
size /= memory_multiplier_based_kb(result_unit)
size.round(2)
end
# Multiplier relative to KB
def memory_multiplier_based_kb(type)
case type.to_s.upcase
when "G", "GB"
1048576
when "M", "MB"
1024
when "K", "KB"
1
else
raise Spark::ParseError, "Unsupported type #{type}"
end
end
end # Methods
end # Parser
end # Helper
end # Spark
================================================
FILE: lib/spark/helper/serialize.rb
================================================
module Spark
module Helper
module Serialize
DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'
DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*'
DIRECTIVE_LONG_BIG_ENDIAN = 'q>'
DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*'
DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G'
DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*'
DIRECTIVE_UNSIGNED_CHARS = 'C*'
DIRECTIVE_CHARS = 'c*'
# Packing
def pack_int(data)
[data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN)
end
def pack_long(data)
[data].pack(DIRECTIVE_LONG_BIG_ENDIAN)
end
def pack_double(data)
[data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN)
end
def pack_unsigned_chars(data)
data.pack(DIRECTIVE_UNSIGNED_CHARS)
end
def pack_ints(data)
__check_array(data)
data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN)
end
def pack_longs(data)
__check_array(data)
data.pack(DIRECTIVE_LONGS_BIG_ENDIAN)
end
def pack_doubles(data)
__check_array(data)
data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN)
end
# Unpacking
def unpack_int(data)
data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0]
end
def unpack_long(data)
data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0]
end
def unpack_chars(data)
data.unpack(DIRECTIVE_CHARS)
end
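# == Examples (illustrative round-trips; all directives are big-endian):
#
#   pack_int(1)
#   # => "\x00\x00\x00\x01"
#
#   unpack_long(pack_long(2**40))
#   # => 1099511627776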
private
def __check_array(data)
unless data.is_a?(Array)
raise ArgumentError, 'Data must be an Array.'
end
end
end
end
end
================================================
FILE: lib/spark/helper/statistic.rb
================================================
module Spark
module Helper
module Statistic
# Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
#
# == How the sampling rate is determined:
# Let p = num / total, where num is the sample size and total is the total number of
# datapoints in the RDD. We're trying to compute q > p such that
# * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q),
# where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total),
# i.e. the failure rate of not having a sufficiently large sample < 0.0001.
# Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for
# num > 12, but we need a slightly larger q (9 empirically determined).
# * when sampling without replacement, we're drawing each datapoint with prob_i
# ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success
# rate, where success rate is defined the same as in sampling with replacement.
#
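# == Example (illustrative numbers; sampling without replacement):
#
#   compute_fraction(10, 1000, false)
#   # => ~0.04 (noticeably above the naive 10/1000 = 0.01)
#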
def compute_fraction(lower_bound, total, with_replacement)
lower_bound = lower_bound.to_f
if with_replacement
upper_poisson_bound(lower_bound) / total
else
fraction = lower_bound / total
upper_binomial_bound(0.00001, total, fraction)
end
end
def upper_poisson_bound(bound)
num_std = if bound < 6
12
elsif bound < 16
9
else
6
end.to_f
[bound + num_std * Math.sqrt(bound), 1e-10].max
end
def upper_binomial_bound(delta, total, fraction)
gamma = -Math.log(delta) / total
[1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min
end
# Bisect right
#
# == Examples:
# data = [1,5,6,8,96,120,133]
#
# bisect_right(data, 0) # => 0
# bisect_right(data, 1) # => 1
# bisect_right(data, 5) # => 2
# bisect_right(data, 9) # => 4
# bisect_right(data, 150) # => 7
#
def bisect_right(data, value, low=0, high=data.size)
if low < 0
raise ArgumentError, 'Low must be >= 0.'
end
while low < high
mid = (low + high) / 2
if value < data[mid]
high = mid
else
low = mid + 1
end
end
low
end
# Determine bound of partitioning
#
# == Example:
# data = [0,1,2,3,4,5,6,7,8,9,10]
# determine_bounds(data, 3)
# # => [3, 7]
#
def determine_bounds(data, num_partitions)
if num_partitions > data.size
return data
end
bounds = []
count = data.size
(0...(num_partitions-1)).each do |index|
bounds << data[count * (index+1) / num_partitions]
end
bounds
end
end
end
end
================================================
FILE: lib/spark/helper/system.rb
================================================
module Spark
module Helper
module System
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def windows?
RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
end
def mri?
RbConfig::CONFIG['ruby_install_name'] == 'ruby'
end
def jruby?
RbConfig::CONFIG['ruby_install_name'] == 'jruby'
end
def pry?
!!Thread.current[:__pry__]
end
# Memory usage in kb
def memory_usage
if jruby?
runtime = java.lang.Runtime.getRuntime
(runtime.totalMemory - runtime.freeMemory) >> 10
elsif windows?
# not yet
else
`ps -o rss= -p #{Process.pid}`.to_i
end
end
end # Methods
end # System
end # Helper
end # Spark
================================================
FILE: lib/spark/helper.rb
================================================
module Spark
module Helper
autoload :System, "spark/helper/system"
autoload :Logger, "spark/helper/logger"
autoload :Statistic, "spark/helper/statistic"
autoload :Serialize, "spark/helper/serialize"
autoload :Partition, "spark/helper/partition"
autoload :Parser, "spark/helper/parser"
end
end
================================================
FILE: lib/spark/java_bridge/base.rb
================================================
##
# Spark::JavaBridge::Base
#
# Parent for all adapters (Ruby - Java)
#
module Spark
module JavaBridge
class Base
include Spark::Helper::System
JAVA_OBJECTS = [
'java.util.ArrayList',
'scala.collection.mutable.HashMap',
'org.apache.spark.SparkConf',
'org.apache.spark.api.java.JavaSparkContext',
'org.apache.spark.api.ruby.RubyRDD',
'org.apache.spark.api.ruby.RubyUtils',
'org.apache.spark.api.ruby.RubyWorker',
'org.apache.spark.api.ruby.PairwiseRDD',
'org.apache.spark.api.ruby.RubyAccumulatorParam',
'org.apache.spark.api.ruby.RubySerializer',
'org.apache.spark.api.python.PythonRDD',
'org.apache.spark.api.python.PythonPartitioner',
'org.apache.spark.api.python.PythonUtils',
'org.apache.spark.ui.ruby.RubyTab',
'org.apache.spark.mllib.api.ruby.RubyMLLibAPI',
:JInteger => 'java.lang.Integer',
:JLong => 'java.lang.Long',
:JLogger => 'org.apache.log4j.Logger',
:JLevel => 'org.apache.log4j.Level',
:JPriority => 'org.apache.log4j.Priority',
:JUtils => 'org.apache.spark.util.Utils',
:JDataType => 'org.apache.spark.sql.types.DataType',
:JSQLContext => 'org.apache.spark.sql.SQLContext',
:JDenseVector => 'org.apache.spark.mllib.linalg.DenseVector',
:JDenseMatrix => 'org.apache.spark.mllib.linalg.DenseMatrix',
:JStorageLevel => 'org.apache.spark.storage.StorageLevel',
:JSQLFunctions => 'org.apache.spark.sql.functions'
]
JAVA_TEST_OBJECTS = [
'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI'
]
RUBY_TO_JAVA_SKIP = [Fixnum, Integer]
def initialize(target)
@target = target
end
# Import all important classes into Object
def import_all
return if @imported
java_objects.each do |name, klass|
import(name, klass)
end
@imported = true
nil
end
# Import classes for testing
def import_all_test
return if @imported_test
java_test_objects.each do |name, klass|
import(name, klass)
end
@imported_test = true
nil
end
# Call java object
def call(klass, method, *args)
# To java
args.map!{|item| to_java(item)}
# Call java
result = klass.__send__(method, *args)
# To ruby
to_ruby(result)
end
def to_array_list(array)
array_list = ArrayList.new
array.each do |item|
array_list.add(to_java(item))
end
array_list
end
def to_seq(array)
PythonUtils.toSeq(to_array_list(array))
end
def to_long(number)
return nil if number.nil?
JLong.new(number)
end
def to_java(object)
if RUBY_TO_JAVA_SKIP.include?(object.class)
# Some objects are converted automatically.
# This prevents errors.
# For example: JRuby stores an Integer as a Long, so 1.to_java is a Long.
object
elsif object.respond_to?(:to_java)
object.to_java
elsif object.is_a?(Array)
to_array_list(object)
else
object
end
end
# Array problem:
# Rjb: object.toArray -> Array
# JRuby: object.toArray -> java.lang.Object
#
def to_ruby(object)
# Java object
if java_object?(object)
class_name = object.getClass.getSimpleName
case class_name
when 'ArraySeq'
result = []
iterator = object.iterator
while iterator.hasNext
result << to_ruby(iterator.next)
end
result
when 'Map2', 'Map3', 'Map4', 'HashTrieMap'
Hash[
object.toSeq.array.to_a.map!{|item| [item._1, item._2]}
]
when 'SeqWrapper'; object.toArray.to_a.map!{|item| to_ruby(item)}
when 'ofRef'; object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef
when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object)
when 'DenseVector'; Spark::Mllib::DenseVector.from_java(object)
when 'KMeansModel'; Spark::Mllib::KMeansModel.from_java(object)
when 'DenseMatrix'; Spark::Mllib::DenseMatrix.from_java(object)
when 'GenericRowWithSchema'; Spark::SQL::Row.from_java(object, true)
else
# Some RDD
if class_name != 'JavaRDD' && class_name.end_with?('RDD')
object = object.toJavaRDD
class_name = 'JavaRDD'
end
# JavaRDD
if class_name == 'JavaRDD'
jrdd = RubyRDD.toRuby(object)
serializer = Spark::Serializer.build { __batched__(__marshal__) }
deserializer = Spark::Serializer.build { __batched__(__marshal__, 2) }
return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
end
# Unknown
Spark.logger.warn("Java object '#{object.getClass.name}' was not converted.")
object
end
# Arrays are transferred automatically, but their contents are not
elsif object.is_a?(Array)
object.map! do |item|
to_ruby(item)
end
object
# Already transferred
else
object
end
end
alias_method :java_to_ruby, :to_ruby
alias_method :ruby_to_java, :to_java
private
def jars
result = Dir.glob(File.join(@target, '*.jar'))
result.flatten!
result
end
def objects_with_names(objects)
hash = {}
objects.each do |object|
if object.is_a?(Hash)
hash.merge!(object)
else
key = object.split('.').last.to_sym
hash[key] = object
end
end
hash
end
def java_objects
objects_with_names(JAVA_OBJECTS)
end
def java_test_objects
objects_with_names(JAVA_TEST_OBJECTS)
end
def raise_missing_class(klass)
raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark are assembled."
end
end
end
end
================================================
FILE: lib/spark/java_bridge/jruby.rb
================================================
require 'java'
module Spark
module JavaBridge
class JRuby < Base
def initialize(*args)
super
jars.each {|jar| require jar}
end
def import(name, klass)
klass = "Java::#{klass}"
Object.const_set(name, eval(klass))
rescue NameError
raise_missing_class(klass)
end
def java_object?(object)
object.is_a?(JavaProxy)
end
end
end
end
================================================
FILE: lib/spark/java_bridge/rjb.rb
================================================
if !ENV.has_key?('JAVA_HOME')
raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set'
end
require 'rjb'
module Spark
module JavaBridge
class RJB < Base
def initialize(*args)
super
Rjb.load(jars)
Rjb.primitive_conversion = true
end
def import(name, klass)
Object.const_set(name, silence_warnings { Rjb.import(klass) })
rescue NoClassDefFoundError
raise_missing_class(klass)
end
def java_object?(object)
object.is_a?(Rjb::Rjb_JavaProxy)
end
private
def jars
separator = windows? ? ';' : ':'
super.join(separator)
end
end
end
end
================================================
FILE: lib/spark/java_bridge.rb
================================================
module Spark
module JavaBridge
autoload :Base, 'spark/java_bridge/base'
autoload :JRuby, 'spark/java_bridge/jruby'
autoload :RJB, 'spark/java_bridge/rjb'
include Spark::Helper::System
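# Build a bridge instance: the JRuby backend when running on JRuby, otherwise RJB.
#
# == Example (illustrative sketch; the path is hypothetical and must contain the assembled jars):
# bridge = Spark::JavaBridge.init('/path/to/jars')
# bridge.import_all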
def self.init(*args)
if jruby?
klass = JRuby
else
klass = RJB
end
klass.new(*args)
end
end
end
================================================
FILE: lib/spark/library.rb
================================================
module Spark
module Library
def autoload(klass, location, import=true)
if import
@for_importing ||= []
@for_importing << klass
end
super(klass, location)
end
def autoload_without_import(klass, location)
autoload(klass, location, false)
end
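# Define the autoloaded constants also on the target namespace (Object by default).
#
# == Example (illustrative sketch):
# Spark::Mllib.import
# # DenseVector, LabeledPoint and the other imported classes are now resolvable at the top level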
def import(to=Object)
@for_importing.each do |klass|
to.const_set(klass, const_get(klass))
end
nil
end
end
end
================================================
FILE: lib/spark/logger.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
class Logger
attr_reader :jlogger
def initialize
@jlogger = JLogger.getLogger('Ruby')
end
def level_off
JLevel.toLevel('OFF')
end
# Disable all Spark logs
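#
# == Example (a minimal sketch; assumes Spark has been started):
# Spark.logger.disable
# Spark.logger.info?
# # => false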
def disable
jlogger.setLevel(level_off)
JLogger.getLogger('org').setLevel(level_off)
JLogger.getLogger('akka').setLevel(level_off)
JLogger.getRootLogger.setLevel(level_off)
end
def enabled?
!disabled?
end
def info(message)
jlogger.info(message) if info?
end
def debug(message)
jlogger.debug(message) if debug?
end
def trace(message)
jlogger.trace(message) if trace?
end
def warning(message)
jlogger.warn(message) if warning?
end
def error(message)
jlogger.error(message) if error?
end
def info?
level_enabled?('info')
end
def debug?
level_enabled?('debug')
end
def trace?
level_enabled?('trace')
end
def warning?
level_enabled?('warn')
end
def error?
level_enabled?('error')
end
def level_enabled?(type)
jlogger.isEnabledFor(JPriority.toPriority(type.upcase))
end
alias_method :warn, :warning
end
end
================================================
FILE: lib/spark/mllib/classification/common.rb
================================================
module Spark
module Mllib
class ClassificationModel
attr_reader :weights, :intercept, :threshold
def initialize(weights, intercept)
@weights = Spark::Mllib::Vectors.to_vector(weights)
@intercept = intercept.to_f
@threshold = nil
end
def threshold=(value)
@threshold = value.to_f
end
def clear_threshold
@threshold = nil
end
end
end
end
module Spark
module Mllib
class ClassificationMethodBase < RegressionMethodBase
end
end
end
================================================
FILE: lib/spark/mllib/classification/logistic_regression.rb
================================================
module Spark
module Mllib
##
# LogisticRegressionModel
#
# A linear binary classification model derived from logistic regression.
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# LabeledPoint.new(0.0, [0.0, 1.0]),
# LabeledPoint.new(1.0, [1.0, 0.0]),
# ]
# lrm = LogisticRegressionWithSGD.train($sc.parallelize(data))
#
# lrm.predict([1.0, 0.0])
# # => 1
# lrm.predict([0.0, 1.0])
# # => 0
#
# lrm.clear_threshold
# lrm.predict([0.0, 1.0])
# # => 0.123...
#
#
# # Sparse vectors
# data = [
# LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})),
# LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})),
# LabeledPoint.new(0.0, SparseVector.new(2, {0 => 1.0})),
# LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0}))
# ]
# lrm = LogisticRegressionWithSGD.train($sc.parallelize(data))
#
# lrm.predict([0.0, 1.0])
# # => 1
# lrm.predict([1.0, 0.0])
# # => 0
# lrm.predict(SparseVector.new(2, {1 => 1.0}))
# # => 1
# lrm.predict(SparseVector.new(2, {0 => 1.0}))
# # => 0
#
#
# # LogisticRegressionWithLBFGS
# data = [
# LabeledPoint.new(0.0, [0.0, 1.0]),
# LabeledPoint.new(1.0, [1.0, 0.0]),
# ]
# lrm = LogisticRegressionWithLBFGS.train($sc.parallelize(data))
#
# lrm.predict([1.0, 0.0])
# # => 1
# lrm.predict([0.0, 1.0])
# # => 0
#
class LogisticRegressionModel < ClassificationModel
def initialize(*args)
super
@threshold = 0.5
end
# Predict values for a single data point or an RDD of points using
# the model trained.
def predict(vector)
vector = Spark::Mllib::Vectors.to_vector(vector)
margin = weights.dot(vector) + intercept
score = 1.0 / (1.0 + Math.exp(-margin))
if threshold.nil?
return score
end
if score > threshold
1
else
0
end
end
end
end
end
module Spark
module Mllib
class LogisticRegressionWithSGD < ClassificationMethodBase
DEFAULT_OPTIONS = {
iterations: 100,
step: 1.0,
mini_batch_fraction: 1.0,
initial_weights: nil,
reg_param: 0.01,
reg_type: 'l2',
intercept: false,
validate: true,
convergence_tol: 0.001
}
# Train a logistic regression model on the given data.
#
# == Arguments:
# rdd::
# The training data, an RDD of LabeledPoint.
#
# iterations::
# The number of iterations (default: 100).
#
# step::
# The step parameter used in SGD (default: 1.0).
#
# mini_batch_fraction::
# Fraction of data to be used for each SGD iteration.
#
# initial_weights::
# The initial weights (default: nil).
#
# reg_param::
# The regularizer parameter (default: 0.01).
#
# reg_type::
# The type of regularizer used for training our model (default: "l2").
#
# Allowed values:
# - "l1" for using L1 regularization
# - "l2" for using L2 regularization
# - nil for no regularization
#
# intercept::
# Boolean parameter which indicates the use
# or not of the augmented representation for
# training data (i.e. whether bias features
# are activated or not).
# (default: false)
#
# validate::
# Boolean parameter which indicates if the
# algorithm should validate data before training.
# (default: true)
#
# convergence_tol::
# A condition which decides iteration termination.
# (default: 0.001)
#
def self.train(rdd, options={})
super
weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithSGD', rdd,
options[:iterations].to_i,
options[:step].to_f,
options[:mini_batch_fraction].to_f,
options[:initial_weights],
options[:reg_param].to_f,
options[:reg_type],
options[:intercept],
options[:validate],
options[:convergence_tol])
LogisticRegressionModel.new(weights, intercept)
end
end
end
end
module Spark
module Mllib
class LogisticRegressionWithLBFGS < ClassificationMethodBase
DEFAULT_OPTIONS = {
iterations: 100,
initial_weights: nil,
reg_param: 0.01,
reg_type: 'l2',
intercept: false,
corrections: 10,
tolerance: 0.0001
}
# Train a logistic regression model on the given data.
#
# == Arguments:
# rdd::
# The training data, an RDD of LabeledPoint.
#
# iterations::
# The number of iterations (default: 100).
#
# initial_weights::
# The initial weights (default: nil).
#
# reg_param::
# The regularizer parameter (default: 0.01).
#
# reg_type::
# The type of regularizer used for training our model (default: "l2").
#
# Allowed values:
# - "l1" for using L1 regularization
# - "l2" for using L2 regularization
# - nil for no regularization
#
# intercept::
# Boolean parameter which indicates the use
# or not of the augmented representation for
# training data (i.e. whether bias features
# are activated or not).
#
# corrections::
# The number of corrections used in the LBFGS update (default: 10).
#
# tolerance::
# The convergence tolerance of iterations for L-BFGS (default: 0.0001).
#
def self.train(rdd, options={})
super
weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithLBFGS', rdd,
options[:iterations].to_i,
options[:initial_weights],
options[:reg_param].to_f,
options[:reg_type],
options[:intercept],
options[:corrections].to_i,
options[:tolerance].to_f)
LogisticRegressionModel.new(weights, intercept)
end
end
end
end
================================================
FILE: lib/spark/mllib/classification/naive_bayes.rb
================================================
module Spark
module Mllib
##
# NaiveBayesModel
#
# Model for Naive Bayes classifiers.
#
# Contains two parameters:
# pi:: vector of logs of class priors (dimension C)
# theta:: matrix of logs of class conditional probabilities (CxD)
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# LabeledPoint.new(0.0, [0.0, 0.0]),
# LabeledPoint.new(0.0, [0.0, 1.0]),
# LabeledPoint.new(1.0, [1.0, 0.0])
# ]
# model = NaiveBayes.train($sc.parallelize(data))
#
# model.predict([0.0, 1.0])
# # => 0.0
# model.predict([1.0, 0.0])
# # => 1.0
#
#
# # Sparse vectors
# data = [
# LabeledPoint.new(0.0, SparseVector.new(2, {1 => 0.0})),
# LabeledPoint.new(0.0, SparseVector.new(2, {1 => 1.0})),
# LabeledPoint.new(1.0, SparseVector.new(2, {0 => 1.0}))
# ]
# model = NaiveBayes.train($sc.parallelize(data))
#
# model.predict(SparseVector.new(2, {1 => 1.0}))
# # => 0.0
# model.predict(SparseVector.new(2, {0 => 1.0}))
# # => 1.0
#
class NaiveBayesModel
attr_reader :labels, :pi, :theta
def initialize(labels, pi, theta)
@labels = labels
@pi = pi
@theta = theta
end
# Predict values for a single data point or an RDD of points using
# the model trained.
def predict(vector)
vector = Spark::Mllib::Vectors.to_vector(vector)
array = (vector.dot(theta) + pi).to_a
index = array.index(array.max)
labels[index]
end
end
end
end
module Spark
module Mllib
class NaiveBayes
# Trains a Naive Bayes model given an RDD of (label, features) pairs.
#
# This is the Multinomial NB (http://tinyurl.com/lsdw6p) which can handle all kinds of
# discrete data. For example, by converting documents into TF-IDF vectors, it can be used for
# document classification. By making every vector a 0-1 vector, it can also be used as
# Bernoulli NB (http://tinyurl.com/p7c96j6). The input feature values must be nonnegative.
#
# == Arguments:
# rdd:: RDD of LabeledPoint.
# lambda:: The smoothing parameter.
#
def self.train(rdd, lambda=1.0)
# Validation
first = rdd.first
unless first.is_a?(LabeledPoint)
raise Spark::MllibError, "RDD should contain LabeledPoint, got #{first.class}"
end
labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda)
theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta)
NaiveBayesModel.new(labels, pi, theta)
end
end
end
end
================================================
FILE: lib/spark/mllib/classification/svm.rb
================================================
module Spark
module Mllib
##
# SVMModel
#
# A support vector machine.
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# LabeledPoint.new(0.0, [0.0]),
# LabeledPoint.new(1.0, [1.0]),
# LabeledPoint.new(1.0, [2.0]),
# LabeledPoint.new(1.0, [3.0])
# ]
# svm = SVMWithSGD.train($sc.parallelize(data))
#
# svm.predict([1.0])
# # => 1
# svm.clear_threshold
# svm.predict([1.0])
# # => 1.25...
#
#
# # Sparse vectors
# data = [
# LabeledPoint.new(0.0, SparseVector.new(2, {0 => -1.0})),
# LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})),
# LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})),
# LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0}))
# ]
# svm = SVMWithSGD.train($sc.parallelize(data))
#
# svm.predict(SparseVector.new(2, {1 => 1.0}))
# # => 1
# svm.predict(SparseVector.new(2, {0 => -1.0}))
# # => 0
#
class SVMModel < ClassificationModel
def initialize(*args)
super
@threshold = 0.0
end
# Predict values for a single data point or an RDD of points using
# the model trained.
def predict(vector)
vector = Spark::Mllib::Vectors.to_vector(vector)
margin = weights.dot(vector) + intercept
if threshold.nil?
return margin
end
if margin > threshold
1
else
0
end
end
end
end
end
module Spark
module Mllib
class SVMWithSGD < ClassificationMethodBase
DEFAULT_OPTIONS = {
iterations: 100,
step: 1.0,
reg_param: 0.01,
mini_batch_fraction: 1.0,
initial_weights: nil,
reg_type: 'l2',
intercept: false,
validate: true,
convergence_tol: 0.001
}
# Train a support vector machine on the given data.
#
# rdd::
# The training data, an RDD of LabeledPoint.
#
# iterations::
# The number of iterations (default: 100).
#
# step::
# The step parameter used in SGD (default: 1.0).
#
# reg_param::
# The regularizer parameter (default: 0.01).
#
# mini_batch_fraction::
# Fraction of data to be used for each SGD iteration.
#
# initial_weights::
# The initial weights (default: nil).
#
# reg_type::
# The type of regularizer used for training our model (default: "l2").
#
# Allowed values:
# - "l1" for using L1 regularization
# - "l2" for using L2 regularization
# - nil for no regularization
#
# intercept::
# Boolean parameter which indicates the use
# or not of the augmented representation for
# training data (i.e. whether bias features
# are activated or not).
# (default: false)
#
# validateData::
# Boolean parameter which indicates if the
# algorithm should validate data before training.
# (default: true)
#
# convergence_tol::
# A condition which decides iteration termination.
# (default: 0.001)
#
def self.train(rdd, options={})
super
weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainSVMModelWithSGD', rdd,
options[:iterations].to_i,
options[:step].to_f,
options[:reg_param].to_f,
options[:mini_batch_fraction].to_f,
options[:initial_weights],
options[:reg_type],
options[:intercept],
options[:validate],
options[:convergence_tol])
SVMModel.new(weights, intercept)
end
end
end
end
================================================
FILE: lib/spark/mllib/clustering/gaussian_mixture.rb
================================================
module Spark
module Mllib
##
# GaussianMixtureModel
#
# A clustering model derived from the Gaussian Mixture Model method.
#
# == Examples:
#
# Spark::Mllib.import
#
# data = [
# DenseVector.new([-0.1, -0.05]),
# DenseVector.new([-0.01, -0.1]),
# DenseVector.new([0.9, 0.8]),
# DenseVector.new([0.75, 0.935]),
# DenseVector.new([-0.83, -0.68]),
# DenseVector.new([-0.91, -0.76])
# ]
#
# model = GaussianMixture.train($sc.parallelize(data), 3, convergence_tol: 0.0001, max_iterations: 50, seed: 10)
#
# labels = model.predict($sc.parallelize(data)).collect
#
class GaussianMixtureModel
attr_reader :weights, :gaussians, :k
def initialize(weights, gaussians)
@weights = weights
@gaussians = gaussians
@k = weights.size
end
# Find the cluster to which the points in 'x' have maximum membership
# in this model.
def predict(rdd)
if rdd.is_a?(Spark::RDD)
predict_soft(rdd).map('lambda{|x| x.index(x.max)}')
else
raise ArgumentError, 'Argument must be a RDD.'
end
end
# Find the membership of each point in 'x' to all mixture components.
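#
# == Example (illustrative sketch; assumes a trained model and a running $sc):
# memberships = model.predict_soft($sc.parallelize(data)).collect
# # => per-point membership weights, e.g. [[0.98, 0.01, 0.01], ...]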
def predict_soft(rdd)
Spark.jb.call(RubyMLLibAPI.new, 'predictSoftGMM', rdd, weights, means, sigmas)
end
def means
@means ||= @gaussians.map(&:mu)
end
def sigmas
@sigmas ||= @gaussians.map(&:sigma)
end
end
end
end
module Spark
module Mllib
class GaussianMixture
def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil)
weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd,
k, convergence_tol, max_iterations, Spark.jb.to_long(seed))
means.map! {|mu| Spark.jb.java_to_ruby(mu)}
sigmas.map!{|sigma| Spark.jb.java_to_ruby(sigma)}
mvgs = Array.new(k) do |i|
MultivariateGaussian.new(means[i], sigmas[i])
end
GaussianMixtureModel.new(weights, mvgs)
end
end
end
end
================================================
FILE: lib/spark/mllib/clustering/kmeans.rb
================================================
module Spark
module Mllib
##
# KMeansModel
#
# A clustering model derived from the k-means method.
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# DenseVector.new([0.0,0.0]),
# DenseVector.new([1.0,1.0]),
# DenseVector.new([9.0,8.0]),
# DenseVector.new([8.0,9.0])
# ]
#
# model = KMeans.train($sc.parallelize(data), 2, max_iterations: 10,
# runs: 30, initialization_mode: "random")
#
# model.predict([0.0, 0.0]) == model.predict([1.0, 1.0])
# # => true
# model.predict([8.0, 9.0]) == model.predict([9.0, 8.0])
# # => true
#
#
# # Sparse vectors
# data = [
# SparseVector.new(3, {1 => 1.0}),
# SparseVector.new(3, {1 => 1.1}),
# SparseVector.new(3, {2 => 1.0}),
# SparseVector.new(3, {2 => 1.1})
# ]
# model = KMeans.train($sc.parallelize(data), 2, initialization_mode: "k-means||")
#
# model.predict([0.0, 1.0, 0.0]) == model.predict([0, 1.1, 0.0])
# # => true
# model.predict([0.0, 0.0, 1.0]) == model.predict([0, 0, 1.1])
# # => true
# model.predict(data[0]) == model.predict(data[1])
# # => true
# model.predict(data[2]) == model.predict(data[3])
# # => true
#
class KMeansModel
attr_reader :centers
def initialize(centers)
@centers = centers
end
# Find the cluster to which x belongs in this model.
def predict(vector)
vector = Spark::Mllib::Vectors.to_vector(vector)
best = 0
best_distance = Float::INFINITY
@centers.each_with_index do |center, index|
distance = vector.squared_distance(center)
if distance < best_distance
best = index
best_distance = distance
end
end
best
end
def self.from_java(object)
centers = object.clusterCenters
centers.map! do |center|
Spark.jb.java_to_ruby(center)
end
KMeansModel.new(centers)
end
end
end
end
module Spark
module Mllib
class KMeans
# Trains a k-means model using the given set of parameters.
#
# == Arguments:
# rdd::
# The training data, an RDD of Vectors.
#
# k::
# Number of clusters.
#
# max_iterations::
# Max number of iterations.
#
# runs::
# Number of parallel runs, defaults to 1. The best model is returned.
#
# initialization_mode::
# Initialization model, either "random" or "k-means||" (default).
#
# seed::
# Random seed value for cluster initialization.
#
# epsilon::
# The distance threshold within which we consider centers to have converged.
#
def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil,
initialization_steps: 5, epsilon: 0.0001)
cluster_initial_model = []
# Call returns KMeansModel
Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd,
k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon, cluster_initial_model)
end
end
end
end
================================================
FILE: lib/spark/mllib/matrix.rb
================================================
module Spark
module Mllib
module Matrices
def self.dense(*args)
DenseMatrix.new(*args)
end
def self.sparse(*args)
SparseMatrix.new(*args)
end
def self.to_matrix(data)
if data.is_a?(SparseMatrix) || data.is_a?(DenseMatrix)
data
elsif data.is_a?(Array)
DenseMatrix.new(data)
end
end
end
end
end
module Spark
module Mllib
# @abstract Parent for all type of matrices
class MatrixBase < MatrixAdapter
end
end
end
module Spark
module Mllib
##
# DenseMatrix
#
# DenseMatrix.new(2, 3, [[1,2,3], [4,5,6]]).values
# # => [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
#
class DenseMatrix < MatrixBase
def initialize(rows, cols, values)
super(:dense, rows, cols, values.to_a)
end
def to_java
JDenseMatrix.new(shape[0], shape[1], values.flatten)
end
def self.from_java(object)
rows = object.numRows
cols = object.numCols
values = object.values
DenseMatrix.new(rows, cols, values)
end
end
end
end
module Spark
module Mllib
##
# SparseMatrix
#
# == Arguments:
# rows::
# Number of rows.
#
# cols::
# Number of columns.
#
# col_pointers::
# The index corresponding to the start of a new column.
#
# row_indices::
# The row index of the entry. They must be in strictly
# increasing order for each column.
#
# values::
# Nonzero matrix entries in column major.
#
# == Examples:
#
# SparseMatrix.new(3, 3, [0, 2, 3, 6], [0, 2, 1, 0, 1, 2], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).values
#
# # => [
# # [1.0, 0.0, 4.0],
# # [0.0, 3.0, 5.0],
# # [2.0, 0.0, 6.0]
# # ]
#
class SparseMatrix < MatrixBase
attr_reader :col_pointers, :row_indices
def initialize(rows, cols, col_pointers, row_indices, values)
super(:sparse, rows, cols)
@col_pointers = col_pointers
@row_indices = row_indices
@values = values
j = 0
while j < cols
idx = col_pointers[j]
idx_end = col_pointers[j+1]
while idx < idx_end
self[row_indices[idx], j] = values[idx]
idx += 1
end
j += 1
end
end
end
end
end
================================================
FILE: lib/spark/mllib/regression/common.rb
================================================
module Spark
module Mllib
##
# RegressionModel
#
# A linear model that has a vector of coefficients and an intercept.
#
class RegressionModel
attr_reader :weights, :intercept
def initialize(weights, intercept)
@weights = Spark::Mllib::Vectors.to_vector(weights)
@intercept = intercept.to_f
end
# Predict the value of the dependent variable given a vector of data
# containing values for the independent variables.
#
# == Examples:
# lm = RegressionModel.new([1.0, 2.0], 0.1)
#
# lm.predict([-1.03, 7.777]) - 14.624 < 1e-6
# # => true
#
# lm.predict(SparseVector.new(2, {0 => -1.03, 1 => 7.777})) - 14.624 < 1e-6
# # => true
#
def predict(data)
data = Spark::Mllib::Vectors.to_vector(data)
@weights.dot(data) + @intercept
end
end
end
end
module Spark
module Mllib
##
# RegressionMethodBase
#
# Parent for regression methods
#
class RegressionMethodBase
def self.train(rdd, options)
# String keys to symbols
options.symbolize_keys!
# Reverse merge: keep values from the user, fill in defaults
self::DEFAULT_OPTIONS.each do |key, value|
options[key] = value unless options.has_key?(key)
end
# Validation
first = rdd.first
unless first.is_a?(LabeledPoint)
raise Spark::MllibError, "RDD should contain LabeledPoint, got #{first.class}"
end
# Initial weights are optional for the user (not for Spark)
options[:initial_weights] = Vectors.to_vector(options[:initial_weights] || [0.0] * first.features.size)
end
end
end
end
================================================
FILE: lib/spark/mllib/regression/labeled_point.rb
================================================
module Spark
module Mllib
##
# LabeledPoint
#
# The features and labels of a data point.
#
# == Parameters:
# label::
# Label for this data point.
#
# features::
# Vector of features for this point
#
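# == Example (illustrative sketch):
# point = LabeledPoint.new(1.0, [0.5, 2.0])
# point.label # => 1.0
# point.features.values # => [0.5, 2.0]
#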
class LabeledPoint
attr_reader :label, :features
def initialize(label, features)
@label = label.to_f
@features = Spark::Mllib::Vectors.to_vector(features)
end
def self.from_java(object)
LabeledPoint.new(
object.label,
Spark.jb.java_to_ruby(object.features)
)
end
def marshal_dump
[@label, @features]
end
def marshal_load(array)
initialize(array[0], array[1])
end
end
end
end
================================================
FILE: lib/spark/mllib/regression/lasso.rb
================================================
##
# LassoModel
#
# Train a regression model with L1-regularization using Stochastic Gradient Descent.
# This solves the l1-regularized least squares regression formulation
# f(weights) = 1/2n ||A weights-y||^2^ + regParam ||weights||_1
# Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
# its corresponding right hand side label y.
# See also the documentation for the precise formulation.
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# LabeledPoint.new(0.0, [0.0]),
# LabeledPoint.new(1.0, [1.0]),
# LabeledPoint.new(3.0, [2.0]),
# LabeledPoint.new(2.0, [3.0])
# ]
# lrm = LassoWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
# lrm.predict([0.0]) - 0 < 0.5
# # => true
#
# lrm.predict([1.0]) - 1 < 0.5
# # => true
#
# lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
# # => true
#
#
# # Sparse vectors
# data = [
# LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})),
# LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})),
# LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})),
# LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0}))
# ]
# lrm = LassoWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
# lrm.predict([0.0]) - 0 < 0.5
# # => true
#
# lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
# # => true
#
class Spark::Mllib::LassoModel < Spark::Mllib::RegressionModel
end
module Spark
module Mllib
class LassoWithSGD < RegressionMethodBase
DEFAULT_OPTIONS = {
iterations: 100,
step: 1.0,
reg_param: 0.01,
mini_batch_fraction: 1.0,
initial_weights: nil,
intercept: false,
validate: true,
convergence_tol: 0.001
}
# Train a Lasso regression model on the given data.
#
# == Parameters:
# rdd::
# The training data (RDD instance).
#
# iterations::
# The number of iterations (default: 100).
#
# step::
# The step parameter used in SGD (default: 1.0).
#
# reg_param::
# The regularizer parameter (default: 0.01).
#
# mini_batch_fraction::
# Fraction of data to be used for each SGD iteration (default: 1.0).
#
# initial_weights::
# The initial weights (default: nil).
#
# intercept::
# Boolean parameter which indicates the use
# or not of the augmented representation for
# training data (i.e. whether bias features
# are activated or not).
# (default: false)
#
# validate::
# Boolean parameter which indicates if the
# algorithm should validate data before training.
# (default: true)
#
# convergence_tol::
# A condition which decides iteration termination.
# (default: 0.001)
#
def self.train(rdd, options={})
super
weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLassoModelWithSGD', rdd,
options[:iterations].to_i,
options[:step].to_f,
options[:reg_param].to_f,
options[:mini_batch_fraction].to_f,
options[:initial_weights],
options[:intercept],
options[:validate],
options[:convergence_tol])
LassoModel.new(weights, intercept)
end
end
end
end
================================================
FILE: lib/spark/mllib/regression/linear.rb
================================================
##
# LinearRegressionModel
#
# Train a linear regression model with no regularization using Stochastic Gradient Descent.
# This solves the least squares regression formulation
# f(weights) = 1/n ||A weights-y||^2^
# (which is the mean squared error).
# Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
# its corresponding right hand side label y.
# See also the documentation for the precise formulation.
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# LabeledPoint.new(0.0, [0.0]),
# LabeledPoint.new(1.0, [1.0]),
# LabeledPoint.new(3.0, [2.0]),
# LabeledPoint.new(2.0, [3.0])
# ]
# lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
# lrm.intercept # => 0.0
# lrm.weights # => [0.9285714285714286]
#
# lrm.predict([0.0]) < 0.5
# # => true
#
# lrm.predict([1.0]) - 1 < 0.5
# # => true
#
# lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
# # => true
#
# # Sparse vectors
# data = [
# LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})),
# LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})),
# LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})),
# LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0}))
# ]
# lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
# lrm.intercept # => 0.0
# lrm.weights # => [0.9285714285714286]
#
# lrm.predict([0.0]) < 0.5
# # => true
#
# lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
# # => true
#
class Spark::Mllib::LinearRegressionModel < Spark::Mllib::RegressionModel
end
module Spark
module Mllib
class LinearRegressionWithSGD < RegressionMethodBase
DEFAULT_OPTIONS = {
iterations: 100,
step: 1.0,
mini_batch_fraction: 1.0,
initial_weights: nil,
reg_param: 0.0,
reg_type: nil,
intercept: false,
validate: true,
convergence_tol: 0.001
}
# Train a linear regression model on the given data.
#
# == Parameters:
# rdd::
# The training data (RDD instance).
#
# iterations::
# The number of iterations (default: 100).
#
# step::
# The step parameter used in SGD (default: 1.0).
#
# mini_batch_fraction::
# Fraction of data to be used for each SGD iteration (default: 1.0).
#
# initial_weights::
# The initial weights (default: nil).
#
# reg_param::
# The regularizer parameter (default: 0.0).
#
# reg_type::
# The type of regularizer used for training our model (default: nil).
#
# Allowed values:
# - "l1" for using L1 regularization (lasso),
# - "l2" for using L2 regularization (ridge),
# - None for no regularization
#
# intercept::
# Boolean parameter which indicates the use
# or not of the augmented representation for
# training data (i.e. whether bias features
# are activated or not).
# (default: false)
#
# validate::
# Boolean parameter which indicates if the
# algorithm should validate data before training.
# (default: true)
#
# convergence_tol::
# A condition which decides iteration termination.
# (default: 0.001)
#
def self.train(rdd, options={})
super
weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLinearRegressionModelWithSGD', rdd,
options[:iterations].to_i,
options[:step].to_f,
options[:mini_batch_fraction].to_f,
options[:initial_weights],
options[:reg_param].to_f,
options[:reg_type],
options[:intercept],
options[:validate],
options[:convergence_tol])
LinearRegressionModel.new(weights, intercept)
end
end
end
end
================================================
FILE: lib/spark/mllib/regression/ridge.rb
================================================
##
# RidgeRegressionModel
#
# Train a regression model with L2-regularization using Stochastic Gradient Descent.
# This solves the l2-regularized least squares regression formulation
# f(weights) = 1/2n ||A weights-y||^2^ + regParam/2 ||weights||^2^
# Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
# its corresponding right hand side label y.
# See also the documentation for the precise formulation.
#
# == Examples:
#
# Spark::Mllib.import
#
# data = [
# LabeledPoint.new(0.0, [0.0]),
# LabeledPoint.new(1.0, [1.0]),
# LabeledPoint.new(3.0, [2.0]),
# LabeledPoint.new(2.0, [3.0])
# ]
# lrm = RidgeRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
# lrm.predict([0.0]) - 0 < 0.5
# # => true
#
# lrm.predict([1.0]) - 1 < 0.5
# # => true
#
# lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
# # => true
#
# data = [
# LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})),
# LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})),
# LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})),
# LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0}))
# ]
# lrm = RidgeRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
# lrm.predict([0.0]) - 0 < 0.5
# # => true
#
# lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
# # => true
#
class Spark::Mllib::RidgeRegressionModel < Spark::Mllib::RegressionModel
end
module Spark
module Mllib
class RidgeRegressionWithSGD < RegressionMethodBase
DEFAULT_OPTIONS = {
iterations: 100,
step: 1.0,
reg_param: 0.01,
mini_batch_fraction: 1.0,
initial_weights: nil,
intercept: false,
validate: true,
convergence_tol: 0.001
}
# Train a ridge regression model on the given data.
#
# == Parameters:
# rdd::
# The training data (RDD instance).
#
# iterations::
# The number of iterations (default: 100).
#
# step::
# The step parameter used in SGD (default: 1.0).
#
# reg_param::
# The regularizer parameter (default: 0.01).
#
# mini_batch_fraction::
# Fraction of data to be used for each SGD iteration (default: 1.0).
#
# initial_weights::
# The initial weights (default: nil).
#
# intercept::
# Boolean parameter which indicates the use
# or not of the augmented representation for
# training data (i.e. whether bias features
# are activated or not).
# (default: false)
#
# validate::
# Boolean parameter which indicates if the
# algorithm should validate data before training.
# (default: true)
#
# convergence_tol::
# A condition which decides iteration termination.
# (default: 0.001)
#
def self.train(rdd, options={})
super
weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainRidgeModelWithSGD', rdd,
options[:iterations].to_i,
options[:step].to_f,
options[:reg_param].to_f,
options[:mini_batch_fraction].to_f,
options[:initial_weights],
options[:intercept],
options[:validate],
options[:convergence_tol])
RidgeRegressionModel.new(weights, intercept)
end
end
end
end
================================================
FILE: lib/spark/mllib/ruby_matrix/matrix_adapter.rb
================================================
require 'matrix'
module Spark
module Mllib
class MatrixAdapter < ::Matrix
def self.new(*args)
object = self.allocate
if args.size == 2
# Matrix is being initialized internally by ::Matrix
# Arguments: rows, column count
object.__send__(:original_initialize, *args)
else
object.__send__(:initialize, *args)
end
object
end
alias_method :original_initialize, :initialize
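# Build a matrix of the given type. For :dense, values may be passed either as a
# flat (row-major) array or already nested by rows.
#
# == Example (illustrative sketch):
# MatrixAdapter.new(:dense, 2, 3, [1, 2, 3, 4, 5, 6]).values
# # => [[1, 2, 3], [4, 5, 6]]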
def initialize(type, rows, cols, values=nil)
case type
when :dense
values = values.dup
if rows * cols == values.size
# Values were given as a flat array
# e.g. 2x2 => [1,2,3,4]
values = values.each_slice(cols).to_a
else
# Values are already nested by rows, e.g. 2x2 => [[1,2], [3,4]]
end
when :sparse
values = Array.new(rows) { Array.new(cols) { 0.0 } }
else
raise Spark::MllibError, 'Unknown matrix type.'
end
super(values, cols)
end
def shape
[row_count, column_count]
end
def values
@values || to_a
end
end
end
end
================================================
FILE: lib/spark/mllib/ruby_matrix/vector_adapter.rb
================================================
require 'matrix'
# Based on ruby 2.1
class Vector
def self.elements(array, copy=true)
DenseVector.new(convert_to_array(array, copy))
end
end
module Spark
module Mllib
class VectorAdapter < ::Vector
def self.new(*args)
object = self.allocate
object.__send__(:initialize, *args)
object
end
def initialize(*args)
case args.shift
when :dense
values = args.shift.dup
when :sparse
values = [0.0] * args.shift.to_i
else
raise Spark::MllibError, 'Unknown vector type.'
end
super(values)
end
def []=(index, value)
@elements[index] = value
end
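# == Example (illustrative sketch):
# a = VectorAdapter.new(:dense, [1.0, 2.0])
# b = VectorAdapter.new(:dense, [3.0, 4.0])
# a.dot(b) # => 11.0
# a.squared_distance(b) # => 8.0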
def dot(other)
if other.is_a?(Spark::Mllib::MatrixBase)
other * self
else
inner_product(other)
end
end
def squared_distance(other)
diff = self - other
diff.dot(diff)
end
def values
@values || to_a
end
end
end
end
================================================
FILE: lib/spark/mllib/stat/distribution.rb
================================================
##
# MultivariateGaussian
#
# This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
# the event that the covariance matrix is singular, the density will be computed in a
# reduced dimensional subspace under which the distribution is supported.
#
# == Arguments:
# mu:: The mean vector of the distribution
# sigma:: The covariance matrix of the distribution
#
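# == Example (illustrative sketch; assumes Spark::Mllib.import has been called):
# gaussian = MultivariateGaussian.new(DenseVector.new([0.0, 0.0]), DenseMatrix.new(2, 2, [1.0, 0.0, 0.0, 1.0]))
# gaussian.mu # => the mean vector
# gaussian.sigma # => the covariance matrix
#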
Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)
================================================
FILE: lib/spark/mllib/vector.rb
================================================
module Spark
module Mllib
module Vectors
def self.dense(*args)
DenseVector.new(*args)
end
def self.sparse(*args)
SparseVector.new(*args)
end
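# Parse a string in dense or sparse notation.
#
# == Example (illustrative sketch):
# Vectors.parse("[1.0,2.0,3.0]") # => DenseVector
# Vectors.parse("(3,[0,2],[1.0,3.0])") # => SparseVector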
def self.parse(data)
if data.start_with?('[') && data.end_with?(']')
DenseVector.parse(data)
elsif data.start_with?('(') && data.end_with?(')')
SparseVector.parse(data)
else
raise ArgumentError, 'Unknown vector.'
end
end
def self.to_vector(data)
if data.is_a?(SparseVector) || data.is_a?(DenseVector)
data
elsif data.is_a?(Array)
DenseVector.new(data)
end
end
end
end
end
module Spark
module Mllib
# @abstract Parent for all type of vectors
class VectorBase < VectorAdapter
end
end
end
module Spark
module Mllib
##
# A dense vector represented by a value array.
#
# A dense vector is a vector in which most of the elements are non-zero.
#
# == Example:
# DenseVector.new([1,2,3,4,5]).values
# # => [1, 2, 3, 4, 5]
#
# DenseVector.new(1..5).values
# # => [1, 2, 3, 4, 5]
#
class DenseVector < VectorBase
def initialize(values)
super(:dense, values.to_a)
end
# Convert a string to a vector
#
# DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]")
#
def self.parse(data)
unless data =~ /\[[0-9., ]+\]/
raise ArgumentError, 'Unknown format for DenseVector.'
end
data.sub!('[', '')
data.sub!(']', '')
data = data.split(',')
data.map!(&:to_f)
DenseVector.new(data)
end
# Convert vector to string
#
# DenseVector.new([1,2,3,4,5]).to_s
# # => "[1.0,2.0,3.0,4.0,5.0]"
#
def to_s
"[#{values.join(',')}]"
end
def to_java
JDenseVector.new(values)
end
def self.from_java(object)
DenseVector.new(object.values)
end
def marshal_dump
values
end
def marshal_load(array)
initialize(array)
end
end
end
end
module Spark
module Mllib
##
# A sparse vector represented by an index array and a value array.
#
# A sparse vector is a vector in which most of the elements are zero.
#
# == Example:
# SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values
# # => [0, 1.0, 0, 5.5]
#
# SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values
# # => [0, 1.0, 0, 5.5]
#
# SparseVector.new(4, [1, 3], [1.0, 5.5]).values
# # => [0, 1.0, 0, 5.5]
#
class SparseVector < VectorBase
attr_reader :indices
def initialize(arg1, arg2=nil, arg3=nil)
super(:sparse, arg1)
if arg2.is_a?(Hash)
@indices = arg2.keys
@values = arg2.values
else
@indices = arg2
@values = arg3
end
@indices.zip(@values).each do |(index, value)|
self[index] = value
end
end
# Convert a string to a vector
#
# SparseVector.parse("(5,[1,4],[3.0,5.0])")
#
def self.parse(data)
data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/)
if data
size = data[1].to_i
indices = data[2].split(',')
indices.map!(&:to_i)
values = data[3].split(',')
values.map!(&:to_f)
SparseVector.new(size, indices, values)
else
raise ArgumentError, 'Unknown format for SparseVector.'
end
end
# Convert vector to string
#
# SparseVector.new(5, {1 => 3, 4 => 5}).to_s
# # => "(5,[1,4],[3.0,5.0])"
#
def to_s
"(#{size},[#{indices.join(',')}],[#{values.join(',')}])"
end
def marshal_dump
[size, indices, values]
end
def marshal_load(array)
initialize(array[0], array[1], array[2])
end
end
end
end
================================================
FILE: lib/spark/mllib.rb
================================================
module Spark
# MLlib is Spark’s scalable machine learning library consisting of common learning algorithms and utilities,
# including classification, regression, clustering, collaborative filtering, dimensionality reduction,
# as well as underlying optimization primitives.
module Mllib
extend Spark::Library
# Base classes
autoload_without_import :VectorBase, 'spark/mllib/vector'
autoload_without_import :MatrixBase, 'spark/mllib/matrix'
autoload_without_import :RegressionMethodBase, 'spark/mllib/regression/common'
autoload_without_import :ClassificationMethodBase, 'spark/mllib/classification/common'
# Linear algebra
autoload :Vectors, 'spark/mllib/vector'
autoload :DenseVector, 'spark/mllib/vector'
autoload :SparseVector, 'spark/mllib/vector'
autoload :Matrices, 'spark/mllib/matrix'
autoload :DenseMatrix, 'spark/mllib/matrix'
autoload :SparseMatrix, 'spark/mllib/matrix'
# Regression
autoload :LabeledPoint, 'spark/mllib/regression/labeled_point'
autoload :RegressionModel, 'spark/mllib/regression/common'
autoload :LinearRegressionModel, 'spark/mllib/regression/linear'
autoload :LinearRegressionWithSGD, 'spark/mllib/regression/linear'
autoload :LassoModel, 'spark/mllib/regression/lasso'
autoload :LassoWithSGD, 'spark/mllib/regression/lasso'
autoload :RidgeRegressionModel, 'spark/mllib/regression/ridge'
autoload :RidgeRegressionWithSGD, 'spark/mllib/regression/ridge'
# Classification
autoload :ClassificationModel, 'spark/mllib/classification/common'
autoload :LogisticRegressionWithSGD, 'spark/mllib/classification/logistic_regression'
autoload :LogisticRegressionWithLBFGS, 'spark/mllib/classification/logistic_regression'
autoload :SVMModel, 'spark/mllib/classification/svm'
autoload :SVMWithSGD, 'spark/mllib/classification/svm'
autoload :NaiveBayesModel, 'spark/mllib/classification/naive_bayes'
autoload :NaiveBayes, 'spark/mllib/classification/naive_bayes'
# Clustering
autoload :KMeans, 'spark/mllib/clustering/kmeans'
autoload :KMeansModel, 'spark/mllib/clustering/kmeans'
autoload :GaussianMixture, 'spark/mllib/clustering/gaussian_mixture'
autoload :GaussianMixtureModel, 'spark/mllib/clustering/gaussian_mixture'
# Stat
autoload :MultivariateGaussian, 'spark/mllib/stat/distribution'
def self.prepare
return if @prepared
# if narray?
# require 'spark/mllib/narray/vector'
# require 'spark/mllib/narray/matrix'
# elsif mdarray?
# require 'spark/mllib/mdarray/vector'
# require 'spark/mllib/mdarray/matrix'
# else
# require 'spark/mllib/matrix/vector'
# require 'spark/mllib/matrix/matrix'
# end
require 'spark/mllib/ruby_matrix/vector_adapter'
require 'spark/mllib/ruby_matrix/matrix_adapter'
@prepared = true
nil
end
def self.narray?
Gem::Specification::find_all_by_name('narray').any?
end
def self.mdarray?
Gem::Specification::find_all_by_name('mdarray').any?
end
end
end
Spark::Mllib.prepare
================================================
FILE: lib/spark/rdd.rb
================================================
module Spark
##
# A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
# partitioned collection of elements that can be operated on in parallel. This class contains the
# basic operations available on all RDDs, such as `map`, `filter`, and `persist`.
#
class RDD
extend Forwardable
attr_reader :jrdd, :context, :command
include Spark::Helper::Logger
include Spark::Helper::Parser
include Spark::Helper::Statistic
def_delegators :@command, :serializer, :deserializer, :libraries, :files
# Initialize an RDD. This method is the root of every PipelinedRDD - it is unique.
# If you call some operations directly on this class, they are computed in Java.
#
# == Parameters:
# jrdd:: org.apache.spark.api.java.JavaRDD
# context:: {Spark::Context}
# serializer:: {Spark::Serializer}
#
def initialize(jrdd, context, serializer, deserializer=nil)
@jrdd = jrdd
@context = context
@cached = false
@checkpointed = false
@command = Spark::CommandBuilder.new(serializer, deserializer)
end
def inspect
comms = @command.commands.join(' -> ')
result = %{#<#{self.class.name}:0x#{object_id}}
result << %{ (#{comms})} unless comms.empty?
result << %{ (cached)} if cached?
result << %{\n}
result << %{ Serializer: "#{serializer}"\n}
result << %{Deserializer: "#{deserializer}"}
result << %{>}
result
end
# =============================================================================
# Operators
def +(other)
self.union(other)
end
# =============================================================================
# Command and serializer
def add_command(klass, *args)
@command.deep_copy.add_command(klass, *args)
end
# Add ruby library
# Libraries will be included before computing
#
# == Example:
# rdd.add_library('pry').add_library('nio4r', 'distribution')
#
def add_library(*libraries)
@command.add_library(*libraries)
self
end
# Bind object to RDD
#
# == Example:
# text = "test"
#
# rdd = $sc.parallelize(0..5)
# rdd = rdd.map(lambda{|x| x.to_s + " " + text})
# rdd = rdd.bind(text: text)
#
# rdd.collect
# # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"]
#
def bind(objects)
unless objects.is_a?(Hash)
raise ArgumentError, 'Argument must be a Hash.'
end
@command.bind(objects)
self
end
def new_rdd_from_command(klass, *args)
comm = add_command(klass, *args)
PipelinedRDD.new(self, comm)
end
# =============================================================================
# Variables and non-computing functions
def config
@context.config
end
def default_reduce_partitions
config['spark.default.parallelism'] || partitions_size
end
# Count of ParallelCollectionPartition
def partitions_size
jrdd.rdd.partitions.size
end
# A unique ID for this RDD (within its SparkContext).
def id
jrdd.id
end
# Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.
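#
# == Example (illustrative sketch; assumes a running $sc):
# rdd = $sc.parallelize(0..10)
# rdd.cache
# rdd.cached?
# # => true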
def cache
persist('memory_only_ser')
end
# Set this RDD's storage level to persist its values across operations after the first time
# it is computed. This can only be used to assign a new storage level if the RDD does not
# have a storage level set yet.
#
# See StorageLevel for type of new_level
#
def persist(new_level)
@cached = true
jrdd.persist(Spark::StorageLevel.java_get(new_level))
self
end
# Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
#
# == Parameters:
# blocking:: whether to block until all blocks are deleted.
#
def unpersist(blocking=true)
@cached = false
jrdd.unpersist(blocking)
self
end
def cached?
@cached
end
def checkpointed?
@checkpointed
end
# Return the name of this RDD.
#
def name
_name = jrdd.name
_name && _name.encode(Encoding::UTF_8)
end
# Assign a name to this RDD.
#
def set_name(value)
jrdd.setName(value)
value
end
def name=(value)
set_name(value)
end
def to_java
marshal = Spark::Serializer.marshal
if deserializer.batched?
ser = deserializer.deep_copy
ser.serializer = marshal
else
ser = Spark::Serializer.batched(marshal)
end
rdd = self.reserialize(ser)
RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
end
# =============================================================================
# Actions which return value
# Return an array that contains all of the elements in this RDD.
# RJB raise an error if stage is killed.
def collect(as_enum=false)
file = Tempfile.new('collect', context.temp_dir)
context.set_call_site(caller.first)
RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
collect_from_file(file, as_enum)
rescue => e
raise Spark::RDDError, e.message
ensure
context.clear_call_site
end
def collect_from_file(file, as_enum=false)
if self.is_a?(PipelinedRDD)
klass = @command.serializer
else
klass = @command.deserializer
end
if as_enum
result = klass.load_from_file(file)
else
result = klass.load_from_io(file).to_a
file.close
file.unlink
end
result
end
# Convert an Array to Hash
#
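# == Example (illustrative sketch; assumes a running $sc):
# rdd = $sc.parallelize([[:a, 1], [:b, 2]])
# rdd.collect_as_hash
# # => {a: 1, b: 2}
#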
def collect_as_hash
Hash[collect]
end
# Take the first num elements of the RDD.
#
# It works by first scanning one partition, and use the results from
# that partition to estimate the number of additional partitions needed
# to satisfy the limit.
#
# == Example:
# rdd = $sc.parallelize(0..100, 20)
# rdd.take(5)
# # => [0, 1, 2, 3, 4]
#
def take(count)
buffer = []
parts_count = self.partitions_size
# No parts have been scanned yet
last_scanned = -1
while buffer.empty?
last_scanned += 1
buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)
end
# Assumption. Depends on batch_size and how Spark divides the data.
items_per_part = buffer.size
left = count - buffer.size
while left > 0 && last_scanned < parts_count
parts_to_take = (left.to_f/items_per_part).ceil
parts_for_scanned = Array.new(parts_to_take) do
last_scanned += 1
end
# We cannot take an exact number of items because workers are isolated from each other.
# => once you take e.g. 50% from the last part and left is still > 0, it is very
# difficult to merge the new items
items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)
buffer += items
left = count - buffer.size
# Average size of all parts
items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}
end
buffer.slice!(0, count)
end
# Return the first element in this RDD.
#
# == Example:
# rdd = $sc.parallelize(0..100)
# rdd.first
# # => 0
#
def first
self.take(1)[0]
end
# Reduces the elements of this RDD using the specified lambda or method.
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd.reduce(lambda{|sum, x| sum+x})
# # => 55
#
def reduce(f)
_reduce(Spark::Command::Reduce, f, f)
end
# Aggregate the elements of each partition, and then the results for all the partitions, using a
# given associative function and a neutral "zero value".
#
# The function f(x, y) is allowed to modify x and return it as its result value to avoid
# object allocation; however, it should not modify y.
#
# Be careful, zero_value is applied to all stages. See example.
#
# == Example:
# rdd = $sc.parallelize(0..10, 2)
# rdd.fold(1, lambda{|sum, x| sum+x})
# # => 58
#
def fold(zero_value, f)
self.aggregate(zero_value, f, f)
end
# Aggregate the elements of each partition, and then the results for all the partitions, using
# given combine functions and a neutral "zero value".
#
# This function can return a different result type. We need one operation for merging.
#
# Result must be an Array, otherwise an Array zero value will be sent by the serializer
# as multiple values and not just one.
#
# == Example:
# # 1 2 3 4 5 => 15 + 1 = 16
# # 6 7 8 9 10 => 40 + 1 = 41
# # 16 * 41 = 656
#
# seq = lambda{|x,y| x+y}
# com = lambda{|x,y| x*y}
#
# rdd = $sc.parallelize(1..10, 2)
# rdd.aggregate(1, seq, com)
# # => 656
#
def aggregate(zero_value, seq_op, comb_op)
_reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)
end
# Return the max of this RDD
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd.max
# # => 10
#
def max
self.reduce('lambda{|memo, item| memo > item ? memo : item }')
end
# Return the min of this RDD
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd.min
# # => 0
#
def min
self.reduce('lambda{|memo, item| memo < item ? memo : item }')
end
# Return the sum of this RDD
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd.sum
# # => 55
#
def sum
self.reduce('lambda{|sum, item| sum + item}')
end
# Return the number of values in this RDD
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd.count
# # => 11
#
def count
# nil is for seq_op => it means all results go directly to one worker for the combine step
@count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')
.aggregate(0, nil, 'lambda{|sum, item| sum + item }')
end
# Return a {Spark::StatCounter} object that captures the mean, variance
# and count of the RDD's elements in one operation.
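#
# == Example (illustrative sketch; assumes a running $sc):
# stat = $sc.parallelize([1, 2, 3]).stats
# stat.mean # => 2.0
# stat.variance # => 0.666...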
def stats
@stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')
end
# Compute the mean of this RDD's elements.
#
# == Example:
# $sc.parallelize([1, 2, 3]).mean
# # => 2.0
#
def mean
stats.mean
end
# Compute the variance of this RDD's elements.
#
# == Example:
# $sc.parallelize([1, 2, 3]).variance
# # => 0.666...
#
def variance
stats.variance
end
# Compute the standard deviation of this RDD's elements.
#
# == Example:
# $sc.parallelize([1, 2, 3]).stdev
# # => 0.816...
#
def stdev
stats.stdev
end
# Compute the sample standard deviation of this RDD's elements (which
# corrects for bias in estimating the standard deviation by dividing by
# N-1 instead of N).
#
# == Example:
# $sc.parallelize([1, 2, 3]).sample_stdev
# # => 1.0
#
def sample_stdev
stats.sample_stdev
end
# Compute the sample variance of this RDD's elements (which corrects
# for bias in estimating the variance by dividing by N-1 instead of N).
#
# == Example:
# $sc.parallelize([1, 2, 3]).sample_variance
# # => 1.0
#
def sample_variance
stats.sample_variance
end
# Compute a histogram using the provided buckets. The buckets
# are all open to the right except for the last which is closed.
# e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
# which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
# and 50 we would have a histogram of 1,0,1.
#
# If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
# this can be switched from an O(log n) insertion to O(1) per
# element (where n = number of buckets).
#
# Buckets must be sorted, must not contain any duplicates, and must
# contain at least two elements.
#
# == Examples:
# rdd = $sc.parallelize(0..50)
#
# rdd.histogram(2)
# # => [[0.0, 25.0, 50], [25, 26]]
#
# rdd.histogram([0, 5, 25, 50])
# # => [[0, 5, 25, 50], [5, 20, 26]]
#
# rdd.histogram([0, 15, 30, 45, 60])
# # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]
#
def histogram(buckets)
# -----------------------------------------------------------------------
# Integer
#
if buckets.is_a?(Integer)
# Validation
if buckets < 1
raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted."
end
# Filter invalid values
# Nil and NaN
func = 'lambda{|x|
if x.nil? || (x.is_a?(Float) && x.nan?)
false
else
true
end
}'
filtered = self.filter(func)
# Compute the minimum and the maximum
func = 'lambda{|memo, item|
[memo[0] < item[0] ? memo[0] : item[0],
memo[1] > item[1] ? memo[1] : item[1]]
}'
min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)
# Min, max must be valid numbers
if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)
raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'
end
# Already finished
if min == max || buckets == 1
return [min, max], [filtered.count]
end
# Custom range
begin
span = max - min # increment
buckets = (0...buckets).map do |x|
min + (x * span) / buckets.to_f
end
buckets << max
rescue NoMethodError
raise Spark::RDDError, 'Cannot generate buckets with non-numeric values in RDD'
end
even = true
# -----------------------------------------------------------------------
# Array
#
elsif buckets.is_a?(Array)
if buckets.size < 2
raise ArgumentError, 'Buckets should have more than one value.'
end
if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}
raise ArgumentError, 'Cannot have nil or NaN values in buckets.'
end
if buckets.detect{|x| buckets.count(x) > 1}
raise ArgumentError, 'Buckets should not contain duplicated values.'
end
if buckets.sort != buckets
raise ArgumentError, 'Buckets must be sorted.'
end
even = false
# -----------------------------------------------------------------------
# Other
#
else
raise Spark::RDDError, 'Buckets should be a number or an array.'
end
reduce_func = 'lambda{|memo, item|
memo.size.times do |i|
memo[i] += item[i]
end
memo
}'
return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)
end
# Applies a function f to all elements of this RDD.
#
# == Example:
# rdd = $sc.parallelize(0..5)
# rdd.foreach(lambda{|x| puts x})
# # => nil
#
def foreach(f, options={})
new_rdd_from_command(Spark::Command::Foreach, f).collect
nil
end
# Applies a function f to each partition of this RDD.
#
# == Example:
# rdd = $sc.parallelize(0..5)
# rdd.foreachPartition(lambda{|x| puts x.to_s})
# # => nil
#
def foreach_partition(f, options={})
new_rdd_from_command(Spark::Command::ForeachPartition, f).collect
nil
end
# =============================================================================
# Transformations of RDD
# Return a new RDD by applying a function to all elements of this RDD.
#
# == Example:
# rdd = $sc.parallelize(0..5)
# rdd.map(lambda {|x| x*2}).collect
# # => [0, 2, 4, 6, 8, 10]
#
def map(f)
new_rdd_from_command(Spark::Command::Map, f)
end
# Return a new RDD by first applying a function to all elements of this
# RDD, and then flattening the results.
#
# == Example:
# rdd = $sc.parallelize(0..5)
# rdd.flat_map(lambda {|x| [x, 1]}).collect
# # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]
#
def flat_map(f)
new_rdd_from_command(Spark::Command::FlatMap, f)
end
# Return a new RDD by applying a function to each partition of this RDD.
#
# == Example:
# rdd = $sc.parallelize(0..10, 2)
# rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
# # => [15, 40]
#
def map_partitions(f)
new_rdd_from_command(Spark::Command::MapPartitions, f)
end
# Return a new RDD by applying a function to each partition of this RDD, while tracking the index
# of the original partition.
#
# == Example:
# rdd = $sc.parallelize(0...4, 4)
# rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
# # => [0, 1, 4, 9]
#
def map_partitions_with_index(f, options={})
new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)
end
# Return a new RDD containing only the elements that satisfy a predicate.
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd.filter(lambda{|x| x.even?}).collect
# # => [0, 2, 4, 6, 8, 10]
#
def filter(f)
new_rdd_from_command(Spark::Command::Filter, f)
end
# Return a new RDD containing non-nil elements.
#
# == Example:
# rdd = $sc.parallelize([1, nil, 2, nil, 3])
# rdd.compact.collect
# # => [1, 2, 3]
#
def compact
new_rdd_from_command(Spark::Command::Compact)
end
# Return an RDD created by coalescing all elements within each partition into an array.
#
# == Example:
# rdd = $sc.parallelize(0..10, 3)
# rdd.glom.collect
# # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
#
def glom
new_rdd_from_command(Spark::Command::Glom)
end
# Return a new RDD that is reduced into num_partitions partitions.
#
# == Example:
# rdd = $sc.parallelize(0..10, 3)
# rdd.coalesce(2).glom.collect
# # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
#
def coalesce(num_partitions)
if self.is_a?(PipelinedRDD)
deser = @command.serializer
else
deser = @command.deserializer
end
new_jrdd = jrdd.coalesce(num_partitions)
RDD.new(new_jrdd, context, @command.serializer, deser)
end
# Return the Cartesian product of this RDD and another one, that is, the
# RDD of all pairs of elements `(a, b)` where `a` is in `self` and
# `b` is in `other`.
#
# == Example:
# rdd1 = $sc.parallelize([1,2,3])
# rdd2 = $sc.parallelize([4,5,6])
#
# rdd1.cartesian(rdd2).collect
# # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
#
def cartesian(other)
_deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
new_jrdd = jrdd.cartesian(other.jrdd)
RDD.new(new_jrdd, context, serializer, _deserializer)
end
# Return a new RDD containing the distinct elements in this RDD.
# Ordering is not preserved because of reducing
#
# == Example:
# rdd = $sc.parallelize([1,1,1,2,3])
# rdd.distinct.collect
# # => [1, 2, 3]
#
def distinct
self.map('lambda{|x| [x, nil]}')
.reduce_by_key('lambda{|x,_| x}')
.map('lambda{|x| x[0]}')
end
# Return a shuffled RDD.
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd.shuffle.collect
# # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]
#
def shuffle(seed=nil)
seed ||= Random.new_seed
new_rdd_from_command(Spark::Command::Shuffle, seed)
end
# Return the union of this RDD and another one. Any identical elements will appear multiple
# times (use .distinct to eliminate them).
#
# == Example:
# rdd = $sc.parallelize([1, 2, 3])
# rdd.union(rdd).collect
# # => [1, 2, 3, 1, 2, 3]
#
def union(other)
if self.serializer != other.serializer
other = other.reserialize(serializer)
end
new_jrdd = jrdd.union(other.jrdd)
RDD.new(new_jrdd, context, serializer, deserializer)
end
# Return a new RDD with a different serializer. This method is useful during union
# and join operations.
#
# == Example:
# rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal")
# rdd = rdd.map(lambda{|x| x.to_s})
# rdd.reserialize("oj").collect
# # => ["1", "2", "3"]
#
def reserialize(new_serializer)
if serializer == new_serializer
return self
end
new_command = @command.deep_copy
new_command.serializer = new_serializer
PipelinedRDD.new(self, new_command)
end
# Return the intersection of this RDD and another one. The output will not contain
# any duplicate elements, even if the input RDDs did.
#
# == Example:
# rdd1 = $sc.parallelize([1,2,3,4,5])
# rdd2 = $sc.parallelize([1,4,5,6,7])
# rdd1.intersection(rdd2).collect
# # => [1, 4, 5]
#
def intersection(other)
mapping_function = 'lambda{|item| [item, nil]}'
filter_function = 'lambda{|(key, values)| values.size > 1}'
self.map(mapping_function)
.cogroup(other.map(mapping_function))
.filter(filter_function)
.keys
end
# Return a copy of the RDD partitioned using the specified partitioner.
#
# == Example:
# rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]})
# rdd.partitionBy(2).glom.collect
# # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]]
#
def partition_by(num_partitions, partition_func=nil)
num_partitions ||= default_reduce_partitions
partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'
_partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)
end
# Return a sampled subset of this RDD. Operations are based on Poisson and Uniform
# distributions.
# TODO: Replace Uniform with Bernoulli
#
# == Examples:
# rdd = $sc.parallelize(0..100)
#
# rdd.sample(true, 10).collect
# # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]
#
# rdd.sample(false, 0.1).collect
# # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]
#
def sample(with_replacement, fraction, seed=nil)
new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)
end
# Return a fixed-size sampled subset of this RDD in an array
#
# == Examples:
# rdd = $sc.parallelize(0..100)
#
# rdd.take_sample(true, 10)
# # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]
#
# rdd.take_sample(false, 10)
# # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]
#
def take_sample(with_replacement, num, seed=nil)
if num < 0
raise Spark::RDDError, 'Size must not be negative'
elsif num == 0
return []
end
# Taken from scala
num_st_dev = 10.0
# Number of items
initial_count = self.count
return [] if initial_count == 0
# Create new generator
seed ||= Random.new_seed
rng = Random.new(seed)
# Shuffle elements if the requested num is greater than the collection size
if !with_replacement && num >= initial_count
return self.shuffle(seed).collect
end
# Max num
max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i
if num > max_sample_size
raise Spark::RDDError, "Size can not be greater than #{max_sample_size}"
end
# Approximate fraction with tolerance
fraction = compute_fraction(num, initial_count, with_replacement)
# Compute the first sampled subset
samples = self.sample(with_replacement, fraction, seed).collect
# If the first sample didn't turn out large enough, keep trying to take samples;
# this shouldn't happen often because we use a big multiplier for their initial size.
index = 0
while samples.size < num
log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}")
samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect
index += 1
end
samples.shuffle!(random: rng)
samples[0, num]
end
# Return an RDD created by piping elements to a forked external process.
#
# == Cmds:
# cmd = [env,] command... [,options]
#
# env: hash
# name => val : set the environment variable
# name => nil : unset the environment variable
# command...:
# commandline : command line string which is passed to the standard shell
# cmdname, arg1, ... : command name and one or more arguments (This form does
# not use the shell. See below for caveats.)
# [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)
# options: hash
#
# See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn
#
# == Examples:
# $sc.parallelize(0..5).pipe('cat').collect
# # => ["0", "1", "2", "3", "4", "5"]
#
# rdd = $sc.parallelize(0..5)
# rdd = rdd.pipe('cat', "awk '{print $1*10}'")
# rdd = rdd.map(lambda{|x| x.to_i + 1})
# rdd.collect
# # => [1, 11, 21, 31, 41, 51]
#
def pipe(*cmds)
new_rdd_from_command(Spark::Command::Pipe, cmds)
end
# =============================================================================
# Pair functions
# Merge the values for each key using an associative reduce function. This will also perform
# the merging locally on each mapper before sending results to a reducer, similarly to a
# "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
# parallelism level.
#
# == Example:
# rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]})
# rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash
# # => {"a"=>3, "b"=>2, "c"=>3}
#
def reduce_by_key(f, num_partitions=nil)
combine_by_key('lambda {|x| x}', f, f, num_partitions)
end
# Generic function to combine the elements for each key using a custom set of aggregation
# functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
# "combined type" C. Note that V and C can be different -- for example, one might group an
# RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
# functions:
#
# == Parameters:
# create_combiner:: which turns a V into a C (e.g., creates a one-element list)
# merge_value:: to merge a V into a C (e.g., adds it to the end of a list)
# merge_combiners:: to combine two C's into a single one.
#
# == Example:
# def combiner(x)
# x
# end
#
# def merge(x,y)
# x+y
# end
#
# rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
# rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
# # => {"a"=>3, "b"=>2, "c"=>3}
#
def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)
_combine_by_key(
[Spark::Command::CombineByKey::Combine, create_combiner, merge_value],
[Spark::Command::CombineByKey::Merge, merge_combiners],
num_partitions
)
end
# Return an RDD of grouped items.
#
# == Example:
# rdd = $sc.parallelize(0..5)
# rdd.group_by(lambda{|x| x%2}).collect
# # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]
#
def group_by(f, num_partitions=nil)
self.key_by(f).group_by_key(num_partitions)
end
# Group the values for each key in the RDD into a single sequence. Allows controlling the
# partitioning of the resulting key-value pair RDD by passing a Partitioner.
#
# Note: If you are grouping in order to perform an aggregation (such as a sum or average)
# over each key, using reduce_by_key or combine_by_key will provide much better performance.
#
# == Example:
# rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
# rdd.group_by_key.collect
# # => [["a", [1, 2]], ["b", [3]]]
#
def group_by_key(num_partitions=nil)
create_combiner = 'lambda{|item| [item]}'
merge_value = 'lambda{|combiner, item| combiner << item; combiner}'
merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
end
# Merge the values for each key using an associative function f
# and a neutral `zero_value` which may be added to the result an
# arbitrary number of times, and must not change the result
# (e.g., 0 for addition, or 1 for multiplication.).
#
# == Example:
# rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]])
# rdd.fold_by_key(1, lambda{|x,y| x+y}).collect
# # => [["a", 9], ["c", 6], ["b", 3]]
#
def fold_by_key(zero_value, f, num_partitions=nil)
self.aggregate_by_key(zero_value, f, f, num_partitions)
end
# Aggregate the values of each key, using given combine functions and a neutral zero value.
#
# == Example:
# def combine(x,y)
# x+y
# end
#
# def merge(x,y)
# x*y
# end
#
# rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
# rdd.aggregate_by_key(1, method(:combine), method(:merge)).collect
# # => [["b", 3], ["a", 16], ["c", 6]]
#
def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)
_combine_by_key(
[Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],
[Spark::Command::CombineByKey::Merge, comb_func],
num_partitions
)
end
# The same functionality as cogroup, but it can group only two RDDs and you
# can change num_partitions.
#
# == Example:
# rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
# rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
# rdd1.group_with(rdd2).collect
# # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]]
#
def group_with(other, num_partitions=nil)
self.union(other).group_by_key(num_partitions)
end
# For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
# list of values for that key in `this` as well as `other`.
#
# == Example:
# rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
# rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
# rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]])
# rdd1.cogroup(rdd2, rdd3).collect
# # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]]
#
def cogroup(*others)
unioned = self
others.each do |other|
unioned = unioned.union(other)
end
unioned.group_by_key
end
# Return each (key, value) pair in self RDD that has no pair with matching
# key in other RDD.
#
# == Example:
# rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
# rdd2 = $sc.parallelize([["b", 5], ["c", 6]])
# rdd1.subtract_by_key(rdd2).collect
# # => [["a", 1], ["a", 2]]
#
def subtract_by_key(other, num_partitions=nil)
create_combiner = 'lambda{|item| [[item]]}'
merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}'
merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
self.union(other)
.combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
.filter('lambda{|(key,values)| values.size == 1}')
.flat_map_values('lambda{|item| item.first}')
end
# Return an RDD with the elements from self that are not in other.
#
# == Example:
# rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
# rdd2 = $sc.parallelize([["a", 2], ["c", 6]])
# rdd1.subtract(rdd2).collect
# # => [["a", 1], ["b", 3], ["c", 4]]
#
def subtract(other, num_partitions=nil)
mapping_function = 'lambda{|x| [x,nil]}'
self.map(mapping_function)
.subtract_by_key(other.map(mapping_function), num_partitions)
.keys
end
# Sort the RDD by key
#
# == Example:
# rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]])
# rdd.sort_by_key.collect
# # => [["a", 3], ["b", 2], ["c", 1]]
#
def sort_by_key(ascending=true, num_partitions=nil)
self.sort_by('lambda{|(key, _)| key}', ascending, num_partitions)
end
# Sort the RDD by value
#
# == Example:
# rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
# rdd.sort_by_value.collect
# # => [["b", 1], ["c", 2], ["a", 3]]
#
def sort_by_value(ascending=true, num_partitions=nil)
self.sort_by('lambda{|(_, value)| value}', ascending, num_partitions)
end
# Sorts this RDD by the given key_function
#
# This is a different implementation than Spark's: sort_by does not use the
# key_by method first. It can be slower but uses less memory, and
# you can always use map.sort_by_key instead.
#
# == Example:
# rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"])
#
# rdd.sort_by.collect
# # => ["aaaaaaa", "b", "cc", "ddd", "eeee"]
#
# rdd.sort_by(lambda{|x| x.size}).collect
# # => ["b", "cc", "ddd", "eeee", "aaaaaaa"]
#
def sort_by(key_function=nil, ascending=true, num_partitions=nil)
key_function ||= 'lambda{|x| x}'
num_partitions ||= default_reduce_partitions
command_klass = Spark::Command::SortByKey
# Allow spill data to disk due to memory limit
# spilling = config['spark.shuffle.spill'] || false
spilling = false
memory = ''
# Set spilling to false if worker has unlimited memory
if memory.empty?
spilling = false
memory = nil
else
memory = to_memory_size(memory)
end
# Sorting should do one worker
if num_partitions == 1
rdd = self
rdd = rdd.coalesce(1) if partitions_size > 1
return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
end
# Compute boundary of collection
# Collection should be evenly distributed
# 20.0 is from scala RangePartitioner (for roughly balanced output partitions)
count = self.count
sample_size = num_partitions * 20.0
fraction = [sample_size / [count, 1].max, 1.0].min
samples = self.sample(false, fraction, 1).map(key_function).collect
samples.sort!
# Reverse is much faster than reverse sort_by
samples.reverse! if !ascending
# Determine part bounds
bounds = determine_bounds(samples, num_partitions)
shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)
shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
end
# Creates tuples of the elements in this RDD by applying function f and using its result as the key.
#
# == Example:
# rdd = $sc.parallelize(0..5)
# rdd.key_by(lambda{|x| x%2}).collect
# # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]
#
def key_by(f)
new_rdd_from_command(Spark::Command::KeyBy, f)
end
# Pass each value in the key-value pair RDD through a map function without changing
# the keys. This also retains the original RDD's partitioning.
#
# == Example:
# rdd = $sc.parallelize(["ruby", "scala", "java"])
# rdd = rdd.map(lambda{|x| [x, x]})
# rdd = rdd.map_values(lambda{|x| x.upcase})
# rdd.collect
# # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
#
def map_values(f)
new_rdd_from_command(Spark::Command::MapValues, f)
end
# Pass each value in the key-value pair RDD through a flat_map function
# without changing the keys; this also retains the original RDD's
# partitioning.
#
# == Example:
# rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
# rdd = rdd.flat_map_values(lambda{|x| x*2})
# rdd.collect
# # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
#
def flat_map_values(f)
new_rdd_from_command(Spark::Command::FlatMapValues, f)
end
# Return an RDD with the first element (key) of each pair in this PairRDD
#
# == Example:
# rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
# rdd.keys.collect
# # => [1, 3, 5]
#
def keys
self.map('lambda{|(key, _)| key}')
end
# Return an RDD with the second element (value) of each pair in this PairRDD
#
# == Example:
# rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
# rdd.values.collect
# # => [2, 4, 6]
#
def values
self.map('lambda{|(_, value)| value}')
end
# Return the list of values in the RDD for key `key`.
# TODO: add Partitioner for efficiently searching
#
# == Example:
# rdd = $sc.parallelize(0..10)
# rdd = rdd.group_by(lambda {|x| x%3})
# rdd.lookup(2)
# # => [[2, 5, 8]]
#
# rdd = $sc.parallelize(0..10)
# rdd = rdd.key_by(lambda{|x| x.even?})
# rdd.lookup(true)
# # => [0, 2, 4, 6, 8, 10]
#
def lookup(key)
lookup_key = "lookup_key_#{object_id}"
self.filter("lambda{|(key, _)| key == #{lookup_key}}")
.bind(lookup_key => key)
.values
.collect
end
# Aliases
alias_method :partitionsSize, :partitions_size
alias_method :defaultReducePartitions, :default_reduce_partitions
alias_method :setName, :set_name
alias_method :addLibrary, :add_library
alias_method :require, :add_library
alias_method :flatMap, :flat_map
alias_method :mapPartitions, :map_partitions
alias_method :mapPartitionsWithIndex, :map_partitions_with_index
alias_method :reduceByKey, :reduce_by_key
alias_method :combineByKey, :combine_by_key
alias_method :groupByKey, :group_by_key
alias_method :groupWith, :group_with
alias_method :partitionBy, :partition_by
alias_method :defaultReducePartitions, :default_reduce_partitions
alias_method :foreachPartition, :foreach_partition
alias_method :mapValues, :map_values
alias_method :takeSample, :take_sample
alias_method :sortBy, :sort_by
alias_method :sortByKey, :sort_by_key
alias_method :keyBy, :key_by
alias_method :groupBy, :group_by
alias_method :foldByKey, :fold_by_key
alias_method :aggregateByKey, :aggregate_by_key
alias_method :subtractByKey, :subtract_by_key
alias_method :sampleStdev, :sample_stdev
alias_method :sampleVariance, :sample_variance
private
# This is the base method for reduce operations. It is used by reduce, fold and aggregate.
# The only difference is that fold has a zero value.
#
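# A minimal sketch of how the public methods are expected to delegate here
# (illustrative only; assuming reduce/fold command classes analogous to those in lib/spark/command/):
#   reduce(f)           # ~> _reduce(Spark::Command::Reduce, f, f)
#   fold(zero_value, f) # ~> _reduce(Spark::Command::Fold, f, f, zero_value)
#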
def _reduce(klass, seq_op, comb_op, zero_value=nil)
if seq_op.nil?
# Partitions are already reduced
rdd = self
else
rdd = new_rdd_from_command(klass, seq_op, zero_value)
end
# Send all results to one worker and combine results
rdd = rdd.coalesce(1).compact
# Add the same function to new RDD
comm = rdd.add_command(klass, comb_op, zero_value)
comm.deserializer = @command.serializer
# Value is returned in array
PipelinedRDD.new(rdd, comm).collect[0]
end
def _partition_by(num_partitions, klass, *args)
# The RDD is transformed from [key, value] to [hash, [key, value]]
keyed = new_rdd_from_command(klass, *args)
keyed.serializer.unbatch!
# PairwiseRDD and PythonPartitioner are borrowed from Python
# but work great for Ruby too
pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD
partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)
new_jrdd = pairwise_rdd.partitionBy(partitioner).values
# Reset deserializer
RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)
end
# For using a different combine_by_key
#
# == Used for:
# * combine_by_key
# * fold_by_key (with zero value)
#
def _combine_by_key(combine, merge, num_partitions)
num_partitions ||= default_reduce_partitions
# Combine key
combined = new_rdd_from_command(combine.shift, *combine)
# Merge items
shuffled = combined.partition_by(num_partitions)
merge_comm = shuffled.add_command(merge.shift, *merge)
PipelinedRDD.new(shuffled, merge_comm)
end
end
# Pipelined Resilient Distributed Dataset: operations are pipelined and sent to the worker
#
# RDD
# `-- map
# `-- map
# `-- map
#
# Code is executed from top to bottom
#
class PipelinedRDD < RDD
attr_reader :prev_jrdd, :command
def initialize(prev, command)
if prev.is_a?(PipelinedRDD) && prev.pipelinable?
# Second, ... stages
@prev_jrdd = prev.prev_jrdd
else
# First stage
@prev_jrdd = prev.jrdd
end
@cached = false
@checkpointed = false
@context = prev.context
@command = command
end
def pipelinable?
!(cached? || checkpointed?)
end
# Serializes the necessary objects and sends them to RubyRDD (Scala extension)
def jrdd
@jrdd ||= _jrdd
end
private
def _jrdd
command = @command.build
broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values
broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))
ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)
ruby_rdd.asJavaRDD
end
end
end
================================================
FILE: lib/spark/sampler.rb
================================================
require 'distribution'
# Random Generators
module Spark
module RandomGenerator
class Poisson
def initialize(mean, seed)
generator = Random.new(seed)
@exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator)
end
def rand
t = 0.0
number = 0
loop{
t += @exp_rng.call
if t > 1
return number
end
number += 1
}
end
end
end
end
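# Illustrative usage of the Poisson generator above (counts depend on the seed;
# the 2.0 and 42 below are arbitrary example values):
#   rng = Spark::RandomGenerator::Poisson.new(2.0, 42)
#   5.times.map { rng.rand } # => five non-negative integer counts with mean ~2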
# Samplers
module Spark
module Sampler
class Base
attr_reader :fraction, :seed
def initialize(fraction, seed=nil)
@fraction = fraction
@seed = seed || Random.new_seed
end
end
# Poisson Sampler
# -------------------------------------------------------------------------
class Poisson < Base
def sample(iterator)
iterator.map! do |item|
count = rng.rand
Array.new(count) { item }
end
iterator.flatten!
iterator.compact!
iterator
end
def lazy_sample(iterator)
Enumerator::Lazy.new(iterator) do |yielder, value|
count = rng.rand
count.times { yielder << value }
end
end
def rng
@rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed)
end
end
# Uniform Sampler
# -------------------------------------------------------------------------
class Uniform < Base
def sample(iterator)
iterator.select!{|item| rng.rand <= fraction}
iterator
end
def lazy_sample(iterator)
iterator.select do |item|
rng.rand <= fraction
end
end
def rng
@rng ||= Random.new(seed)
end
end
end
end
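# Illustrative usage of the Uniform sampler (fraction and seed are arbitrary example values):
#   sampler = Spark::Sampler::Uniform.new(0.1, 42)
#   sampler.sample((1..1000).to_a).size # => roughly 100, depending on the seed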
================================================
FILE: lib/spark/serializer/auto_batched.rb
================================================
module Spark
module Serializer
##
# AutoBatched serializer
#
# Batch size is computed automatically. Similar to Python's AutoBatchedSerializer.
#
class AutoBatched < Batched
MAX_RATIO = 10
def initialize(serializer, best_size=65536)
@serializer = serializer
@best_size = best_size.to_i
error('Batch size must be greater than 1') if @best_size < 2
end
def batched?
true
end
def unbatch!
end
def name
"AutoBatched(#{@best_size})"
end
def dump_to_io(data, io)
check_each(data)
# Only Array has .slice
data = data.to_a
index = 0
batch = 2
max = @best_size * MAX_RATIO
loop do
chunk = data.slice(index, batch)
if chunk.nil? || chunk.empty?
break
end
serialized = @serializer.dump(chunk)
io.write_string(serialized)
index += batch
size = serialized.bytesize
if size < @best_size
batch *= 2
elsif size > max && batch > 1
batch /= 2
end
end
io.flush
end
end
end
end
Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
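# Illustrative usage (assuming the Marshal serializer from lib/spark/serializer/marshal.rb is loaded):
#   serializer = Spark::Serializer::AutoBatched.new(Spark::Serializer::Marshal.new)
#   serializer.name # => "AutoBatched(65536)"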
================================================
FILE: lib/spark/serializer/base.rb
================================================
module Spark
module Serializer
# @abstract Parent for all serializers
class Base
def load_from_io(io)
return to_enum(__callee__, io) unless block_given?
loop do
size = io.read_int_or_eof
break if size == Spark::Constant::DATA_EOF
yield load(io.read(size))
end
end
def load_from_file(file, *args)
return to_enum(__callee__, file, *args) unless block_given?
load_from_io(file, *args).each do |item|
yield item
end
file.close
file.unlink
end
def ==(other)
self.to_s == other.to_s
end
def batched?
false
end
def unbatch!
end
def check_each(data)
unless data.respond_to?(:each)
error('Data must be iterable.')
end
end
def error(message)
raise Spark::SerializeError, message
end
def name
self.class.name.split('::').last
end
def to_s
name
end
def inspect
%{#<Spark::Serializer::#{name}>}
end
end
end
end
================================================
FILE: lib/spark/serializer/batched.rb
================================================
module Spark
module Serializer
class Batched < Base
attr_writer :serializer
def initialize(serializer, batch_size=nil)
batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE
@serializer = serializer
@batch_size = batch_size.to_i
error('Batch size must be greater than 0') if @batch_size < 1
end
# Whether batching is actually applied (batch size > 1)
def batched?
@batch_size > 1
end
def unbatch!
@batch_size = 1
end
def load(data)
@serializer.load(data)
end
def dump(data)
@serializer.dump(data)
end
def name
"Batched(#{@batch_size})"
end
def to_s
"#{name} -> #{@serializer}"
end
# === Dump ==============================================================
def dump_to_io(data, io)
check_each(data)
if batched?
data = data.each_slice(@batch_size)
end
data.each do |item|
serialized = dump(item)
io.write_string(serialized)
end
io.flush
end
# === Load ==============================================================
def load_from_io(io)
return to_enum(__callee__, io) unless block_given?
loop do
size = io.read_int_or_eof
break if size == Spark::Constant::DATA_EOF
data = io.read(size)
data = load(data)
if batched?
data.each{|item| yield item }
else
yield data
end
end
end
end
end
end
Spark::Serializer.register('batched', Spark::Serializer::Batched)
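# Illustrative usage (assuming the Marshal serializer is loaded):
#   batched = Spark::Serializer::Batched.new(Spark::Serializer::Marshal.new, 2)
#   batched.to_s     # => "Batched(2) -> Marshal"
#   batched.batched? # => true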
================================================
FILE: lib/spark/serializer/cartesian.rb
================================================
module Spark
module Serializer
class Cartesian < Pair
def aggregate(item1, item2)
item1.product(item2)
end
end
end
end
Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)
================================================
FILE: lib/spark/serializer/compressed.rb
================================================
module Spark
module Serializer
class Compressed < Base
def initialize(serializer)
@serializer = serializer
end
def dump(data)
Zlib::Deflate.deflate(@serializer.dump(data))
end
def load(data)
@serializer.load(Zlib::Inflate.inflate(data))
end
end
end
end
begin
# TODO: require only if it is necessary
require 'zlib'
Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed)
rescue LoadError
end
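# Illustrative round-trip (zlib must be available; wraps the Marshal serializer):
#   compressed = Spark::Serializer::Compressed.new(Spark::Serializer::Marshal.new)
#   compressed.load(compressed.dump([1, 2, 3])) # => [1, 2, 3]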
================================================
FILE: lib/spark/serializer/marshal.rb
================================================
module Spark
module Serializer
class Marshal < Base
def dump(data)
::Marshal.dump(data)
end
def load(data)
::Marshal.load(data)
end
end
end
end
Spark::Serializer.register('marshal', Spark::Serializer::Marshal)
================================================
FILE: lib/spark/serializer/message_pack.rb
================================================
module Spark
module Serializer
class MessagePack < Base
def dump(data)
::MessagePack.dump(data)
end
def load(data)
::MessagePack.load(data)
end
end
end
end
begin
# TODO: require only if it is necessary
require 'msgpack'
Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack)
rescue LoadError
end
================================================
FILE: lib/spark/serializer/oj.rb
================================================
module Spark
module Serializer
class Oj < Base
def dump(data)
::Oj.dump(data)
end
def load(data)
::Oj.load(data)
end
end
end
end
begin
# TODO: require only if it is necessary
require 'oj'
Spark::Serializer.register('oj', Spark::Serializer::Oj)
rescue LoadError
end
================================================
FILE: lib/spark/serializer/pair.rb
================================================
module Spark
module Serializer
class Pair < Base
def initialize(serializer1, serializer2)
@serializer1 = serializer1
@serializer2 = serializer2
end
def to_s
"#{name}(#{@serializer1}, #{@serializer2})"
end
def aggregate(item1, item2)
item1.zip(item2)
end
def load_from_io(io)
return to_enum(__callee__, io) unless block_given?
loop do
size = io.read_int_or_eof
break if size == Spark::Constant::DATA_EOF
item1 = @serializer1.load(io.read(size))
item2 = @serializer2.load(io.read_string)
item1 = [item1] unless @serializer1.batched?
item2 = [item2] unless @serializer2.batched?
aggregate(item1, item2).each do |item|
yield item
end
end
end
end
end
end
Spark::Serializer.register('pair', Spark::Serializer::Pair)
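# Illustrative usage: aggregate zips the two deserialized collections
#   pair = Spark::Serializer::Pair.new(Spark::Serializer::Marshal.new, Spark::Serializer::Marshal.new)
#   pair.to_s                      # => "Pair(Marshal, Marshal)"
#   pair.aggregate([1, 2], [3, 4]) # => [[1, 3], [2, 4]]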
================================================
FILE: lib/spark/serializer/text.rb
================================================
module Spark
module Serializer
class Text < Base
attr_reader :encoding
def initialize(encoding=Encoding::UTF_8)
error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding)
@encoding = encoding
end
def load(data)
data.to_s.force_encoding(@encoding)
end
def to_s
"Text(#{@encoding})"
end
end
end
end
Spark::Serializer.register('string', 'text', Spark::Serializer::Text)
================================================
FILE: lib/spark/serializer.rb
================================================
module Spark
##
# Serializer
#
module Serializer
DEFAULT_COMPRESS = false
DEFAULT_BATCH_SIZE = 1024
DEFAULT_SERIALIZER_NAME = 'marshal'
@@registered = {}
# Register a class and create a method for quick access.
# The class will also be available as __name__ for use
# in the build method (Proc binding problem).
#
# == Examples:
# register('test1', 'test2', Class)
#
# Spark::Serializer.test1
# Spark::Serializer.test2
#
# # Proc binding problem
# build { marshal } # => Spark::Serializer::Marshal
#
# marshal = 1
# build { marshal } # => 1
#
# build { __marshal__ } # => Spark::Serializer::Marshal
#
def self.register(*args)
klass = args.pop
args.each do |arg|
@@registered[arg] = klass
define_singleton_method(arg.to_sym){|*args| klass.new(*args) }
define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) }
end
end
def self.find(name)
@@registered[name.to_s.downcase]
end
def self.find!(name)
klass = find(name)
if klass.nil?
raise Spark::SerializeError, "Unknown serializer: #{name}."
end
klass
end
def self.build(text=nil, &block)
if block_given?
class_eval(&block)
else
class_eval(text.to_s.downcase)
end
end
end
end
# Parent
require 'spark/serializer/base'
# Basic
require 'spark/serializer/oj'
require 'spark/serializer/marshal'
require 'spark/serializer/message_pack'
require 'spark/serializer/text'
# Others
require 'spark/serializer/batched'
require 'spark/serializer/auto_batched'
require 'spark/serializer/compressed'
require 'spark/serializer/pair'
require 'spark/serializer/cartesian'
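# Illustrative usage, once the serializers above have registered themselves:
#   Spark::Serializer.find('marshal') # => Spark::Serializer::Marshal
#   Spark::Serializer.build { __batched__(__marshal__, 2) }.to_s # => "Batched(2) -> Marshal"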
================================================
FILE: lib/spark/sort.rb
================================================
module Spark
module InternalSorter
class Base
def initialize(key_function)
@key_function = key_function
end
end
class Ascending < Base
def sort(data)
data.sort_by!(&@key_function)
end
end
class Descending < Ascending
def sort(data)
super
data.reverse!
end
end
def self.get(ascending, key_function)
if ascending
type = Ascending
else
type = Descending
end
type.new(key_function)
end
end
end
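# Illustrative usage of InternalSorter (descending sort by string length):
#   sorter = Spark::InternalSorter.get(false, lambda{|x| x.size})
#   data = ["bb", "a", "ccc"]
#   sorter.sort(data)
#   data # => ["ccc", "bb", "a"]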
module Spark
class ExternalSorter
include Spark::Helper::System
# Items cannot be destroyed by GC immediately, so #make_parts needs some memory reserve
MEMORY_RESERVE = 50 # %
# Size of the chunk by which the memory limit grows, because GC does not
# immediately clean up un-referenced variables
MEMORY_FREE_CHUNK = 10 # %
# How many items will be evaluated from the iterator at the start
START_SLICE_SIZE = 10
# Maximum slice size. A very large value would bypass the memory control.
MAX_SLICE_SIZE = 10_000
# How many values will be taken from each enumerator.
EVAL_N_VALUES = 10
# Default key function
KEY_FUNCTION = lambda{|item| item}
attr_reader :total_memory, :memory_limit, :memory_chunk, :serializer
def initialize(total_memory, serializer)
@total_memory = total_memory
@memory_limit = total_memory * (100-MEMORY_RESERVE) / 100
@memory_chunk = total_memory * (100-MEMORY_FREE_CHUNK) / 100
@serializer = serializer
end
def add_memory!
@memory_limit += memory_chunk
end
def sort_by(iterator, ascending=true, key_function=KEY_FUNCTION)
return to_enum(__callee__, iterator, ascending, key_function) unless block_given?
create_temp_folder
internal_sorter = Spark::InternalSorter.get(ascending, key_function)
# Make N sorted enumerators
parts = make_parts(iterator, internal_sorter)
return [] if parts.empty?
# Need new key function because items have new structure
# From: [1,2,3] to [[1, Enumerator],[2, Enumerator],[3, Enumerator]]
key_function_with_enum = lambda{|(key, _)| key_function[key]}
internal_sorter = Spark::InternalSorter.get(ascending, key_function_with_enum)
heap = []
enums = []
# Load first items to heap
parts.each do |part|
EVAL_N_VALUES.times {
begin
heap << [part.next, part]
rescue StopIteration
break
end
}
end
# Parts can be empty but heap not
while parts.any? || heap.any?
internal_sorter.sort(heap)
# Since the parts are sorted and the heap holds EVAL_N_VALUES items per part,
# the method can add EVAL_N_VALUES items to the result
EVAL_N_VALUES.times {
break if heap.empty?
item, enum = heap.shift
enums << enum
yield item
}
# Refill the heap from the parts whose items were just yielded
while (enum = enums.shift)
begin
heap << [enum.next, enum]
rescue StopIteration
parts.delete(enum)
enums.delete(enum)
end
end
end
ensure
destroy_temp_folder
end
private
def create_temp_folder
@dir = Dir.mktmpdir
end
def destroy_temp_folder
FileUtils.remove_entry_secure(@dir) if @dir
end
# A new part is created when the current part exceeds the memory limit (which is variable).
# Every new part gets more memory because of Ruby's GC.
def make_parts(iterator, internal_sorter)
slice = START_SLICE_SIZE
parts = []
part = []
loop do
begin
# Enumerator does not have slice method
slice.times { part << iterator.next }
rescue StopIteration
break
end
# Careful: memory_limit is variable
if memory_usage > memory_limit
# Sort current part with origin key_function
internal_sorter.sort(part)
# Tempfile for current part
# will be destroyed on #destroy_temp_folder
file = Tempfile.new("part", @dir)
serializer.dump(part, file)
# The file position is now at the end, so rewind before reading
file.seek(0)
parts << serializer.load(file)
# Some memory will be released, but not immediately;
# allow some extra memory before the next part
part.clear
add_memory!
else
slice = [slice*2, MAX_SLICE_SIZE].min
end
end
# Last part which is not in the file
if part.any?
internal_sorter.sort(part)
parts << part.each
end
parts
end
end # ExternalSorter
end # Spark
================================================
FILE: lib/spark/sql/column.rb
================================================
module Spark
module SQL
class Column
# =============================================================================
# Creating
def self.to_java(col)
if col.is_a?(Column)
col.jcolumn
else
from_name(col)
end
end
def self.from_literal(literal)
JSQLFunctions.lit(literal)
end
def self.from_name(name)
JSQLFunctions.col(name)
end
# =============================================================================
# Functions for virtual columns
# Evaluates a list of conditions and returns one of multiple possible result expressions.
# If {Column.otherwise} is not invoked, nil is returned for unmatched conditions.
#
# == Parameters:
# condition:: a boolean {Column} expression
# value:: a literal value, or a {Column} expression
#
# == Example:
# df.select(Column.when(df['age'] == 2, 3).otherwise(4).alias("age")).collect
# # => [#<Row {"age"=>3}>, #<Row {"age"=>4}>]
#
# df.select(Column.when(df.age == 2, df.age + 1).alias("age")).collect
# # => [#<Row {"age"=>3}>, #<Row {"age"=>nil}>]
#
def self.when(condition, value)
Column.new(JSQLFunctions).when(condition, value)
end
# =============================================================================
# Initialized column
attr_reader :jcolumn
def initialize(jcolumn)
@jcolumn = jcolumn
end
FUNC_OPERATORS = {
'!' => 'not',
'~' => 'negate',
'-@' => 'negate'
}
BIN_OPERATORS = {
'[]' => 'apply',
'+' => 'plus',
'-' => 'minus',
'*' => 'multiply',
'/' => 'divide',
'%' => 'mod',
'==' => 'equalTo',
'!=' => 'notEqual',
'<' => 'lt',
'<=' => 'leq',
'>' => 'gt',
'>=' => 'geq',
'&' => 'and',
'|' => 'or',
'like' => 'like',
'starts_with' => 'startsWith',
'ends_with' => 'endsWith',
'bitwiseOR' => 'bitwiseOR',
'bitwiseAND' => 'bitwiseAND',
'bitwiseXOR' => 'bitwiseXOR',
}
UNARY_OPERATORS = {
'asc' => 'asc',
'desc' => 'desc',
'is_null' => 'isNull',
'is_not_null' => 'isNotNull'
}
FUNC_OPERATORS.each do |op, func|
eval <<-METHOD
def #{op}
func_op('#{func}')
end
METHOD
end
BIN_OPERATORS.each do |op, func|
eval <<-METHOD
def #{op}(item)
bin_op('#{func}', item)
end
METHOD
end
UNARY_OPERATORS.each do |op, func|
eval <<-METHOD
def #{op}
unary_op('#{func}')
end
METHOD
end
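# Illustrative dispatch for the operator methods generated above:
#   df.age + 1     # => bin_op('plus', 1)
#   df.age >= 21   # => bin_op('geq', 21)
#   df.age.is_null # => unary_op('isNull')
#   !df.age        # => func_op('not')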
# An expression that gets an item at position ordinal out of a list,
# or gets an item by key out of a Hash.
#
# == Example:
# df.select(df.l.get_item(0), df.d.get_item("key")).show
# # +----+------+
# # |l[0]|d[key]|
# # +----+------+
# # | 1| value|
# # +----+------+
#
# df.select(df.l[0], df.d["key"]).show
# # +----+------+
# # |l[0]|d[key]|
# # +----+------+
# # | 1| value|
# # +----+------+
#
def get_item(key)
self[key]
end
# An expression that gets a field by name in a StructField.
#
# == Example:
# df.select(df.r.get_field("b")).show
# # +----+
# # |r[b]|
# # +----+
# # | b|
# # +----+
#
# df.select(df.r.a).show
# # +----+
# # |r[a]|
# # +----+
# # | 1|
# # +----+
#
def get_field(name)
self[name]
end
# Return a {Column} which is a substring of the column.
#
# == Parameters:
# start:: start position (Integer or Column)
# length:: length of the substring (Integer or Column)
#
# == Example:
# df.select(df.name.substr(1, 3).alias("col")).collect
# # => [#<Row(col: "Ali")>, #<Row(col: "Bob")>]
#
def substr(start, length)
if start.is_a?(Integer) && length.is_a?(Integer)
new_jcolumn = jcolumn.substr(start, length)
elsif start.is_a?(Column) && length.is_a?(Column)
new_jcolumn = jcolumn.substr(start.jcolumn, length.jcolumn)
else
raise ArgumentError, "Unsupported type: #{start.class} and #{length.class}."
end
Column.new(new_jcolumn)
end
# A boolean expression that is evaluated to true if the value of this
# expression is contained by the evaluated values of the arguments.
#
# == Example:
# df[df.name.isin("Bob", "Mike")].collect
# # => [#<Row(age: "5", name: "Bob")>]
#
# df[df.age.isin(1, 2, 3)].collect
# # => [#<Row(age: "2", name: "Alice")>]
#
def isin(*cols)
if cols.size == 1 && cols.first.is_a?(Array)
cols = cols.first
end
cols = cols.map do |col|
Column.from_literal(col)
end
new_jcolumn = jcolumn.isin(Spark.jb.to_seq(cols))
Column.new(new_jcolumn)
end
# Returns this column aliased with a new name or names (in the case of expressions that
# return more than one column, such as explode).
#
# == Example:
# df.select(df.age.alias("age2")).collect
# # => [#<Row(age2: "2")>, #<Row(age2: "5")>]
#
def alias(name)
Column.new(jcolumn.as(name))
end
# Convert the column into type data_type.
#
# == Example:
# df.select(df.age.cast("string").alias('ages')).collect
# # => [#<Row(ages: "2")>, #<Row(ages: "5")>]
#
# df.select(df.age.cast(StringType.new).alias('ages')).collect
# # => [#<Row(ages: "2")>, #<Row(ages: "5")>]
#
def cast(data_type)
case data_type
when String
new_jcolumn = jcolumn.cast(data_type)
when DataType
jdata_type = JDataType.fromJson(data_type.json)
new_jcolumn = jcolumn.cast(jdata_type)
else
raise ArgumentError, "Unsupported type: #{data_type.class}"
end
Column.new(new_jcolumn)
end
# A boolean expression that is evaluated to true if the value of this
# expression is between the given columns.
#
# == Example:
# df.select(df.name, df.age.between(2, 4)).show
# # +-----+--------------------------+
# # | name|((age >= 2) && (age <= 4))|
# # +-----+--------------------------+
# # |Alice| true|
# # | Bob| false|
# # +-----+--------------------------+
#
def between(lower, upper)
(self >= lower) & (self <= upper)
end
# Evaluates a list of conditions and returns one of multiple possible result expressions.
# If {Column.otherwise} is not invoked, nil is returned for unmatched conditions.
#
# == Parameters:
# condition:: a boolean {Column} expression.
# value:: a literal value, or a {Column} expression.
#
# == Example:
# df.select(df.name, Column.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show
# # +-----+--------------------------------------------------------+
# # | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0|
# # +-----+--------------------------------------------------------+
# # |Alice| -1|
# # | Bob| 1|
# # +-----+--------------------------------------------------------+
#
def when(condition, value)
unless condition.is_a?(Column)
raise ArgumentError, "Condition must be a Column"
end
if value.is_a?(Column)
value = value.jcolumn
end
new_jcolumn = jcolumn.when(condition.jcolumn, value)
Column.new(new_jcolumn)
end
# Evaluates a list of conditions and returns one of multiple possible result expressions.
# If {Column.otherwise} is not invoked, nil is returned for unmatched conditions.
#
# == Example:
# df.select(df.name, Column.when(df.age > 3, 1).otherwise(0)).show
# # +-----+---------------------------------+
# # | name|CASE WHEN (age > 3) THEN 1 ELSE 0|
# # +-----+---------------------------------+
# # |Alice| 0|
# # | Bob| 1|
# # +-----+---------------------------------+
#
def otherwise(value)
if value.is_a?(Column)
value = value.jcolumn
end
new_jcolumn = jcolumn.otherwise(value)
Column.new(new_jcolumn)
end
def over(*)
raise Spark::NotImplemented
end
def method_missing(method, *args)
get_field(method.to_s)
end
def to_s
"Column(\"#{jcolumn.toString}\")"
end
def inspect
"#<#{to_s}>"
end
alias_method :as, :alias
alias_method :slice, :substr
alias_method :astype, :cast
private
def func_op(name)
new_jcolumn = JSQLFunctions.__send__(name, jcolumn)
Column.new(new_jcolumn)
end
def bin_op(name, item)
if item.is_a?(Column)
col = item.jcolumn
else
col = item
end
new_jcolumn = jcolumn.__send__(name, col)
Column.new(new_jcolumn)
end
def unary_op(name)
new_jcolumn = jcolumn.__send__(name)
Column.new(new_jcolumn)
end
end
end
end
================================================
FILE: lib/spark/sql/context.rb
================================================
module Spark
module SQL
class Context
attr_reader :spark_context, :jsql_context
def initialize(spark_context)
@spark_context = spark_context
@jsql_context = JSQLContext.new(spark_context.sc)
end
def read
DataFrameReader.new(self)
end
end
end
end
================================================
FILE: lib/spark/sql/data_frame.rb
================================================
module Spark
module SQL
##
# Spark::SQL::DataFrame
#
# All examples are based on people.json
#
class DataFrame
attr_reader :jdf, :sql_context
def initialize(jdf, sql_context)
@jdf = jdf
@sql_context = sql_context
end
# Returns the column as a {Column}.
#
# == Examples:
# df.select(df['age']).collect
# # => [#<Row {"age"=>2}>, #<Row {"age"=>5}>]
#
# df[ ["name", "age"] ].collect
# # => [#<Row {"name"=>"Alice", "age"=>2}>, #<Row {"name"=>"Bob", "age"=>5}>]
#
# df[ df.age > 3 ].collect
# # => [#<Row {"age"=>5, "name"=>"Bob"}>]
#
# df[df[0] > 3].collect
# # => [#<Row {"age"=>5, "name"=>"Bob"}>]
#
def [](item)
case item
when String
jcolumn = jdf.apply(item)
Column.new(jcolumn)
when Array
select(*item)
when Numeric
jcolumn = jdf.apply(columns[item])
Column.new(jcolumn)
when Column
where(item)
else
raise ArgumentError, "Unsupported type: #{item.class}"
end
end
# Returns all column names as a Array.
#
# == Example:
# df.columns
# # => ['age', 'name']
#
def columns
schema.fields.map(&:name)
end
# Returns the schema of this {DataFrame} as a {StructType}.
def schema
return @schema if @schema
begin
@schema = DataType.parse(JSON.parse(jdf.schema.json))
rescue => e
raise Spark::ParseError, 'Unable to parse datatype from schema'
end
end
def show_string(n=20, truncate=true)
jdf.showString(n, truncate)
end
# Prints the first n rows to the console.
#
# == Parameters:
# n:: Number of rows to show.
# truncate:: Whether to truncate long strings and align cells right.
#
def show(n=20, truncate=true)
puts show_string(n, truncate)
end
# Prints out the schema in the tree format.
#
# == Example:
# df.print_schema
# # root
# # |-- age: integer (nullable = true)
# # |-- name: string (nullable = true)
#
def print_schema
puts jdf.schema.treeString
end
def explain(extended=false)
if extended
jdf.queryExecution.toString
else
jdf.queryExecution.executedPlan.toString
end
end
# Prints the (logical and physical) plans to the console for debugging purpose.
#
# == Example:
# df.print_explain
# # Scan PhysicalRDD[age#0,name#1]
#
# df.print_explain(true)
# # == Parsed Logical Plan ==
# # ...
# # == Analyzed Logical Plan ==
# # ...
# # == Optimized Logical Plan ==
# # ...
# # == Physical Plan ==
# # ...
#
def print_explain(extended=false)
puts explain(extended)
end
# Returns all column names and their data types as a list.
#
# == Example:
# df.dtypes
# # => [["age", "int"], ["name", "string"]]
#
def dtypes
schema.fields.map do |field|
[field.name, field.data_type.simple_string]
end
end
def inspect
types = dtypes.map do |(name, type)|
"#{name}: #{type}"
end
"#<DataFrame [#{types.join(', ')}]>"
end
# Get column by name
def method_missing(method, *args, &block)
name = method.to_s
if columns.include?(name)
self[name]
else
super
end
end
# =============================================================================
# Collect
# Returns all the records as a list of {Row}.
#
# == Example:
# df.collect
# # => [#<Row {"age"=>2, "name"=>"Alice"}>, #<Row {"age"=>5, "name"=>"Bob"}>]
#
def collect
Spark.jb.call(jdf, 'collect')
end
def collect_as_hash
result = collect
result.map!(&:to_h)
result
end
def values
result = collect
result.map! do |item|
item.to_h.values
end
result
end
# Returns the number of rows in this {DataFrame}.
def count
jdf.count.to_i
end
# Returns the first num rows as an Array of {Row}.
def take(num)
limit(num).collect
end
# Return first {Row}.
def first
take(1).first
end
# =============================================================================
# Queries
# Projects a set of expressions and returns a new {DataFrame}
#
# == Parameters:
# *cols::
# List of column names (string) or expressions {Column}.
# If one of the column names is '*', that column is expanded to include all columns
# in the current DataFrame.
#
# == Example:
# df.select('*').collect
# # => [#<Row {"age"=>2, "name"=>"Alice"}>, #<Row {"age"=>5, "name"=>"Bob"}>]
#
# df.select('name', 'age').collect
# # => [#<Row {"name"=>"Alice", "age"=>2}>, #<Row {"name"=>"Bob", "age"=>5}>]
#
# df.select(df.name, (df.age + 10).alias('age')).collect
# # => [#<Row {"name"=>"Alice", "age"=>12}>, #<Row {"name"=>"Bob", "age"=>15}>]
#
def select(*cols)
jcols = cols.map do |col|
Column.to_java(col)
end
new_jdf = jdf.select(jcols)
DataFrame.new(new_jdf, sql_context)
end
# Filters rows using the given condition.
#
# == Examples:
# df.filter(df.age > 3).collect
# # => [#<Row {"age"=>5, "name"=>"Bob"}>]
#
# df.where(df.age == 2).collect
# # => [#<Row {"age"=>2, "name"=>"Alice"}>]
#
# df.filter("age > 3").collect
# # => [#<Row {"age"=>5, "name"=>"Bob"}>]
#
# df.where("age = 2").collect
# # => [#<Row {"age"=>2, "name"=>"Alice"}>]
#
def filter(condition)
case condition
when String
new_jdf = jdf.filter(condition)
when Column
new_jdf = jdf.filter(condition.jcolumn)
else
raise ArgumentError, 'Condition must be String or Column'
end
DataFrame.new(new_jdf, sql_context)
end
# Limits the result count to the number specified.
def limit(num)
new_jdf = jdf.limit(num)
DataFrame.new(new_jdf, sql_context)
end
alias_method :where, :filter
end
end
end
================================================
FILE: lib/spark/sql/data_frame_reader.rb
================================================
module Spark
module SQL
class DataFrameReader
attr_reader :sql_context, :jreader
def initialize(sql_context)
@sql_context = sql_context
@jreader = sql_context.jsql_context.read
end
def df(jdf)
DataFrame.new(jdf, sql_context)
end
# Specifies the input data source format.
# Parameter is name of the data source, e.g. 'json', 'parquet'.
def format(source)
jreader.format(source)
self
end
# Adds an input option for the underlying data source.
def option(key, value)
jreader.option(key, value.to_s)
self
end
# Adds input options for the underlying data source.
def options(options)
options.each do |key, value|
jreader.option(key, value.to_s)
end
self
end
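# Illustrative chaining (assuming `sql` is a Spark::SQL::Context and people.json exists;
# the samplingRatio option and the data source class name mirror the json method below):
#   sql.read
#      .format('org.apache.spark.sql.execution.datasources.json')
#      .option('samplingRatio', '1.0')
#      .load('people.json')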
# Loads data from a data source and returns it as a {DataFrame}.
#
# == Parameters:
# path:: Optional string for file-system backed data sources.
# format:: Optional string for the format of the data source. Defaults to 'parquet'.
# schema:: Optional {StructType} for the input schema.
# options:: All other string options.
#
def load(path=nil, new_format=nil, new_schema=nil, new_options=nil)
new_format && format(new_format)
new_schema && schema(new_schema)
new_options && options(new_options)
if path.nil?
df(jreader.load)
else
df(jreader.load(path))
end
end
# Specifies the input schema.
#
# Some data sources (e.g. JSON) can infer the input schema automatically from data.
# By specifying the schema here, the underlying data source can skip the schema
# inference step, and thus speed up data loading.
#
# Parameter schema must be StructType object.
#
def schema(new_schema)
unless new_schema.is_a?(StructType)
raise ArgumentError, 'Schema must be a StructType.'
end
jschema = sql_context.jsql_context.parseDataType(new_schema.json)
jreader.schema(jschema)
self
end
# Loads a JSON file (one object per line) and returns the result as {DataFrame}
#
# If the schema parameter is not specified, this function goes
# through the input once to determine the input schema.
#
# == Parameters:
# path:: string, path to the JSON dataset
# schema:: an optional {StructType} for the input schema.
#
# == Example:
# df = sql.read.json('people.json')
# df.dtypes
# # => [["age", "bigint"], ["name", "string"]]
#
def json(path, new_schema=nil)
# ClassNotFoundException: Failed to load class for data source: json
# df(jreader.json(path))
load(path, 'org.apache.spark.sql.execution.datasources.json', new_schema)
end
end
end
end
================================================
FILE: lib/spark/sql/data_type.rb
================================================
module Spark
module SQL
##
# Spark::SQL::DataType
#
class DataType
cattr_accessor :atomic_types
self.atomic_types = {}
cattr_accessor :complex_types
self.complex_types = {}
def self.parse(data)
if data.is_a?(Hash)
type = data['type']
if complex_types.has_key?(type)
complex_types[type].from_json(data)
# elsif type == 'udt'
# UserDefinedType.from_json(data)
else
raise Spark::SQLError, "Unsupported type: #{type}"
end
else
if atomic_types.has_key?(data)
atomic_types[data].new
else
raise Spark::SQLError, "Unsupported type: #{data}"
end
end
end
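# Illustrative parse examples (atomic types register themselves below):
#   DataType.parse('string') # => #<StringType>
#   DataType.parse('long')   # => #<LongType>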
def self.class_name
name.split('::').last
end
def self.type_name
class_name.sub('Type', '').downcase
end
def self.complex
complex_types[type_name] = self
end
def self.atomic
atomic_types[type_name] = self
end
def ==(other)
self.class == other.class && self.to_s == other.to_s
end
def type_name
self.class.type_name
end
def simple_string
type_name
end
def json_value
type_name
end
def json
json_value.to_json
end
def to_s
self.class.class_name
end
def inspect
"#<#{to_s}>"
end
end
##
# Spark::SQL::StructType
#
# Struct type, consisting of a list of {StructField}.
# This is the data type representing a {Row}.
#
# == Example:
# struct1 = StructType.new([StructField.new('f1', StringType.new, true)])
# struct2 = StructType.new([StructField.new('f2', StringType.new, true)])
# struct1 == struct2
# # => true
#
class StructType < DataType
complex
attr_reader :fields
def self.from_json(json)
fields = json['fields'].map do |field|
StructField.from_json(field)
end
StructType.new(fields)
end
def initialize(fields=[])
@fields = fields
@names = fields.map(&:name)
end
def json_value
{
'type' => type_name,
'fields' => fields.map(&:json_value)
}
end
def to_s
"StructType(#{fields.join(', ')})"
end
end
##
# Spark::SQL::StructField
#
class StructField < DataType
attr_reader :name, :data_type, :nullable, :metadata
def self.from_json(json)
StructField.new(json['name'], DataType.parse(json['type']), json['nullable'], json['metadata'])
end
# A field in {StructType}.
#
# == Parameters:
# name:: string, name of the field.
# data_type:: {DataType} of the field.
# nullable:: boolean, whether the field can be null (nil) or not.
# metadata:: a Hash from String to simple types that can be automatically serialized to JSON
#
# == Example:
# f1 = StructField.new('f1', StringType.new, true)
# f2 = StructField.new('f2', StringType.new, true)
# f1 == f2
# # => true
#
def initialize(name, data_type, nullable=true, metadata={})
@name = name
@data_type = data_type
@nullable = nullable
@metadata = metadata
end
def json_value
{
'name' => name,
'type' => data_type.json_value,
'nullable' => nullable,
'metadata' => metadata,
}
end
def to_s
%{StructField(#{name}, #{data_type}, #{nullable})}
end
end
##
# Spark::SQL::AtomicType
#
# An internal type used to represent everything that is not
# null, UDTs, arrays, structs, and maps.
#
class AtomicType < DataType
end
##
# Spark::SQL::BooleanType
#
# Boolean data type.
#
class BooleanType < AtomicType
atomic
end
##
# Spark::SQL::NumericType
#
# Numeric data types.
#
class NumericType < AtomicType
end
##
# Spark::SQL::IntegralType
#
# Integral data types.
#
class IntegralType < NumericType
end
##
# Spark::SQL::StringType
#
# String data type.
#
class StringType < AtomicType
atomic
end
##
# Spark::SQL::LongType
#
# Long data type, i.e. a signed 64-bit integer.
#
# If the values are beyond the range of [-9223372036854775808, 9223372036854775807],
# please use {DecimalType}.
#
class LongType < IntegralType
atomic
end
end
end
================================================
FILE: lib/spark/sql/row.rb
================================================
module Spark
module SQL
##
# Spark::SQL::Row
#
class Row
attr_reader :data
def self.from_java(object, with_schema=true)
if with_schema
fields = object.schema.fieldNames
else
# Create virtual schema (t0, t1, t2, ...)
raise Spark::NotImplemented, 'Row must have a schema'
end
if object.anyNull
data = {}
object.size.times do |i|
if object.isNullAt(i)
value = nil
else
value = Spark.jb.to_ruby(object.get(i))
end
data[ fields[i] ] = value
end
else
data = fields.zip(Spark.jb.to_ruby(object.values))
end
Row.new(data)
end
def initialize(data={})
@data = data.to_h
end
def [](item)
@data[item]
end
def to_h
@data
end
def inspect
formated = data.map do |key, value|
"#{key}: \"#{value}\""
end
%{#<Row(#{formated.join(', ')})>}
end
end
end
end
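# Illustrative usage:
#   row = Spark::SQL::Row.new('age' => 2, 'name' => 'Alice')
#   row['name'] # => "Alice"
#   row.to_h    # => {"age"=>2, "name"=>"Alice"}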
================================================
FILE: lib/spark/sql.rb
================================================
module Spark
module SQL
extend Spark::Library
autoload_without_import :Context, 'spark/sql/context'
autoload_without_import :DataType, 'spark/sql/data_type'
autoload_without_import :DataFrame, 'spark/sql/data_frame'
autoload_without_import :DataFrameReader, 'spark/sql/data_frame_reader'
autoload :Row, 'spark/sql/row'
autoload :Column, 'spark/sql/column'
# Types
autoload :StructType, 'spark/sql/data_type'
autoload :StructField, 'spark/sql/data_type'
autoload :AtomicType, 'spark/sql/data_type'
autoload :NumericType, 'spark/sql/data_type'
autoload :IntegralType, 'spark/sql/data_type'
autoload :StringType, 'spark/sql/data_type'
autoload :LongType, 'spark/sql/data_type'
end
SQLContext = Spark::SQL::Context
end
================================================
FILE: lib/spark/stat_counter.rb
================================================
module Spark
class StatCounter
attr_reader :n # count of our values
attr_reader :mu # mean of our values
attr_reader :m2 # variance numerator (sum of (x - mean)^2)
attr_reader :max # max of our values
attr_reader :min # min of our values
def initialize(iterator)
@n = 0
@mu = 0.0
@m2 = 0.0
@max = -Float::INFINITY
@min = Float::INFINITY
merge(iterator)
end
def merge(other)
if other.is_a?(Spark::StatCounter)
merge_stat_counter(other)
elsif other.respond_to?(:each)
merge_array(other)
else
merge_value(other)
end
self
end
def sum
@n * @mu
end
# Return the variance of the values.
def variance
if @n == 0
Float::NAN
else
@m2 / @n
end
end
# Return the sample variance, which corrects for bias in estimating the variance by dividing
# by N-1 instead of N.
def sample_variance
if @n <= 1
Float::NAN
else
@m2 / (@n - 1)
end
end
# Return the standard deviation of the values.
def stdev
Math.sqrt(variance)
end
# Return the sample standard deviation of the values, which corrects for bias in estimating the
# variance by dividing by N-1 instead of N.
def sample_stdev
Math.sqrt(sample_variance)
end
def to_s
"(count: #{count}, mean: #{mean}, stdev: #{stdev}, max: #{max}, min: #{min})"
end
alias_method :count, :n
alias_method :mean, :mu
alias_method :max_value, :max
alias_method :min_value, :min
alias_method :sampleStdev, :sample_stdev
alias_method :sampleVariance, :sample_variance
private
def merge_stat_counter(other)
if other == self
other = self.deep_copy
end
if @n == 0
@n = other.n
@mu = other.mu
@m2 = other.m2
@max = other.max
@min = other.min
elsif other.n != 0
delta = other.mu - @mu
if other.n * 10 < @n
@mu = @mu + (delta * other.n) / (@n + other.n)
elsif @n * 10 < other.n
@mu = other.mu - (delta * @n) / (@n + other.n)
else
@mu = (@mu * @n + other.mu * other.n) / (@n + other.n)
end
@max = [@max, other.max].max
@min = [@min, other.min].min
@m2 += other.m2 + (delta * delta * @n * other.n) / (@n + other.n)
@n += other.n
end
end
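# The merge above uses the standard parallel-variance formula: with
# delta = other.mu - @mu, the combined numerator is
# m2_a + m2_b + delta**2 * n_a * n_b / (n_a + n_b), so merging per-partition
# counters yields the same statistics as a single pass over all values.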
def merge_array(array)
array.each do |item|
merge_value(item)
end
end
def merge_value(value)
delta = value - @mu
@n += 1
@mu += delta / @n
@m2 += delta * (value - @mu)
@max = [@max, value].max
@min = [@min, value].min
end
end
end
================================================
FILE: lib/spark/storage_level.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
class StorageLevel
def self.reload
return if @reloaded
reload!
@reloaded = true
end
def self.reload!
self.const_set(:NONE, JStorageLevel.NONE)
self.const_set(:DISK_ONLY, JStorageLevel.DISK_ONLY)
self.const_set(:DISK_ONLY_2, JStorageLevel.DISK_ONLY_2)
self.const_set(:MEMORY_ONLY, JStorageLevel.MEMORY_ONLY)
self.const_set(:MEMORY_ONLY_SER, JStorageLevel.MEMORY_ONLY_SER)
self.const_set(:MEMORY_ONLY_2, JStorageLevel.MEMORY_ONLY_2)
self.const_set(:MEMORY_ONLY_SER_2, JStorageLevel.MEMORY_ONLY_SER_2)
self.const_set(:MEMORY_AND_DISK, JStorageLevel.MEMORY_AND_DISK)
self.const_set(:MEMORY_AND_DISK_2, JStorageLevel.MEMORY_AND_DISK_2)
self.const_set(:MEMORY_AND_DISK_SER, JStorageLevel.MEMORY_AND_DISK_SER)
self.const_set(:MEMORY_AND_DISK_SER_2, JStorageLevel.MEMORY_AND_DISK_SER_2)
self.const_set(:OFF_HEAP, JStorageLevel.OFF_HEAP)
end
def self.java_get(arg)
reload
if arg.is_a?(String)
const_get(arg.upcase)
else
arg
end
end
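# A minimal usage sketch (assuming the JVM bridge has been loaded):
# string arguments are upcased and resolved to the Java constants set above,
# anything else is passed through unchanged.
#
#   Spark::StorageLevel.java_get('memory_only') # => JStorageLevel.MEMORY_ONLY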
end
end
================================================
FILE: lib/spark/version.rb
================================================
module Spark
VERSION = '1.2.1'
end
================================================
FILE: lib/spark/worker/master.rb
================================================
#!/usr/bin/env ruby
$PROGRAM_NAME = 'RubySparkMaster'
require 'socket'
require 'io/wait'
require 'nio'
require_relative 'worker'
# Start a new process group,
# otherwise the master could be killed from the pry console
Process.setsid
# =================================================================================================
# Master
#
module Master
def self.create
case ARGV[0].to_s.strip
when 'thread'
Master::Thread.new
else
Master::Process.new
end
end
class Base
include Spark::Constant
def initialize
@port = ARGV[1].to_s.strip.to_i
@socket = TCPSocket.open('localhost', @port)
@worker_arguments = @socket.read_string
end
def run
selector = NIO::Selector.new
monitor = selector.register(@socket, :r)
monitor.value = Proc.new { receive_message }
loop {
selector.select {|monitor| monitor.value.call}
}
end
def receive_message
command = @socket.read_int
case command
when CREATE_WORKER
create_worker
when KILL_WORKER
kill_worker
when KILL_WORKER_AND_WAIT
kill_worker_and_wait
end
end
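# The master keeps a single socket open to the JVM side: each message starts
# with an int command (CREATE_WORKER, KILL_WORKER or KILL_WORKER_AND_WAIT);
# the kill commands are followed by a long worker id, which the subclass
# handlers below read from the same socket.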
def kill_worker_and_wait
if kill_worker
@socket.write_int(SUCCESSFULLY_KILLED)
else
@socket.write_int(UNSUCCESSFUL_KILLING)
end
end
end
# ===============================================================================================
# Worker::Process
#
class Process < Base
def create_worker
if fork?
pid = ::Process.fork do
Worker::Process.new(@port).run
end
else
pid = ::Process.spawn("ruby #{@worker_arguments} worker.rb #{@port}")
end
# Detach child from master to avoid zombie process
::Process.detach(pid)
end
def kill_worker
worker_id = @socket.read_long
::Process.kill('TERM', worker_id)
rescue
nil
end
def fork?
@can_fork ||= _fork?
end
def _fork?
return false if !::Process.respond_to?(:fork)
pid = ::Process.fork
exit unless pid # exit the child immediately
true
rescue NotImplementedError
false
end
end
# ===============================================================================================
# Worker::Thread
#
class Thread < Base
def initialize
::Thread.abort_on_exception = true
# For synchronous access to socket IO
$mutex_for_command = Mutex.new
$mutex_for_iterator = Mutex.new
super
end
def create_worker
::Thread.new do
Worker::Thread.new(@port).run
end
end
def kill_worker
worker_id = @socket.read_long
thread = ObjectSpace._id2ref(worker_id)
thread.kill
rescue
nil
end
end
end
# Create the proper master according to the worker type
Master.create.run
================================================
FILE: lib/spark/worker/spark_files.rb
================================================
class SparkFiles
class << self
attr_accessor :root_directory
end
def self.get(file_name)
File.join(root_directory, file_name)
end
def self.get_content(file_name)
File.read(get(file_name))
end
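# A minimal usage sketch with a hypothetical file name: paths are resolved
# against root_directory, which the worker sets from the data it receives
# over the socket.
#
#   SparkFiles.root_directory = '/tmp/spark-files'
#   SparkFiles.get('data.txt')         # => "/tmp/spark-files/data.txt"
#   SparkFiles.get_content('data.txt') # reads that file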
end
================================================
FILE: lib/spark/worker/worker.rb
================================================
#!/usr/bin/env ruby
# Load root of the gem
lib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__))
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark.rb'
require 'socket'
require_relative 'spark_files'
# =================================================================================================
# Worker
#
# Iterator is LAZY !!!
#
module Worker
class Base
include Spark::Helper::Serialize
include Spark::Helper::System
include Spark::Constant
attr_accessor :socket
def initialize(port)
# Open socket to Spark
@socket = TCPSocket.open('localhost', port)
# Send back worker ID
socket.write_long(id)
end
def run
begin
compute
rescue => e
send_error(e)
else
successful_finish
end
end
private
def before_start
# Should be implemented in sub-classes
end
def before_end
# Should be implemented in sub-classes
end
# These steps must stay in one method because the iterator is lazy,
# which means an exception can be raised at `serializer` or `compute`
def compute
before_start
# Load split index
@split_index = socket.read_int
# Load files
SparkFiles.root_directory = socket.read_string
# Load broadcast
count = socket.read_int
count.times do
Spark::Broadcast.register(socket.read_long, socket.read_string)
end
# Load command
@command = socket.read_data
# Load iterator
@iterator = @command.deserializer.load_from_io(socket).lazy
# Compute
@iterator = @command.execute(@iterator, @split_index)
# Wrap the result if it is not iterable
@iterator = [@iterator] unless @iterator.respond_to?(:each)
# Send result
@command.serializer.dump_to_io(@iterator, socket)
end
def send_error(e)
# Flag
socket.write_int(WORKER_ERROR)
# Message
socket.write_string(e.message)
# Backtrace
socket.write_int(e.backtrace.size)
e.backtrace.each do |item|
socket.write_string(item)
end
socket.flush
# Wait for Spark: the socket is closed before an exception is thrown,
# which signals that the Ruby exception was fully received
until socket.closed?
sleep(0.1)
end
# Depend on type of worker
kill_worker
end
def successful_finish
# Finish
socket.write_int(WORKER_DONE)
# Send changed accumulator
changed = Spark::Accumulator.changed
socket.write_int(changed.size)
changed.each do |accumulator|
socket.write_data([accumulator.id, accumulator.value])
end
# Send it
socket.flush
before_end
end
def log(message=nil)
return if !$DEBUG
$stdout.puts %{==> #{Time.now.strftime('%H:%M:%S')} [#{id}] #{message}}
$stdout.flush
end
end
# ===============================================================================================
# Worker::Process
#
class Process < Base
def id
::Process.pid
end
private
def before_start
$PROGRAM_NAME = 'RubySparkWorker'
end
def kill_worker
::Process.exit(false)
end
end
# ===============================================================================================
# Worker::Thread
#
class Thread < Base
def id
::Thread.current.object_id
end
private
def load_command
$mutex_for_command.synchronize { super }
end
# Switching between threads while reading is very slow.
# It is faster to do it one by one.
def load_iterator
# Wait for incoming data to prevent a deadlock
if jruby?
socket.io_wait
else
socket.wait_readable
end
$mutex_for_iterator.synchronize { super }
end
def kill_worker
::Thread.current.kill
end
end
end
# Worker was loaded as a standalone script
if $PROGRAM_NAME == __FILE__
worker = Worker::Process.new(ARGV[0])
worker.run
end
================================================
FILE: lib/spark.rb
================================================
# Gems and libraries
require 'method_source'
require 'securerandom'
require 'forwardable'
require 'sourcify'
require 'socket'
require 'tempfile'
require 'tmpdir'
require 'json'
module Spark
autoload :Context, 'spark/context'
autoload :Config, 'spark/config'
autoload :RDD, 'spark/rdd'
autoload :CLI, 'spark/cli'
autoload :Build, 'spark/build'
autoload :Serializer, 'spark/serializer'
autoload :Helper, 'spark/helper'
autoload :StorageLevel, 'spark/storage_level'
autoload :Command, 'spark/command'
autoload :CommandBuilder, 'spark/command_builder'
autoload :Sampler, 'spark/sampler'
autoload :Logger, 'spark/logger'
autoload :JavaBridge, 'spark/java_bridge'
autoload :ExternalSorter, 'spark/sort'
autoload :Constant, 'spark/constant'
autoload :Broadcast, 'spark/broadcast'
autoload :Accumulator, 'spark/accumulator'
autoload :StatCounter, 'spark/stat_counter'
autoload :Library, 'spark/library'
# Mllib
autoload :Mllib, 'spark/mllib'
# SQL
autoload :SQL, 'spark/sql'
autoload :SQLContext, 'spark/sql'
include Helper::System
DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf')
def self.print_logo(message=nil)
puts <<-STRING
Welcome to
__ ____ __
______ __/ / __ __ / __/__ ___ _____/ /__
/ __/ // / _ \\/ // / _\\ \\/ _ \\/ _ `/ __/ '_/
/_/ \\_,_/_.__/\\_, / /___/ .__/\\_,_/_/ /_/\\_\\ version #{Spark::VERSION}
/___/ /_/
#{message}
STRING
end
# Returns the current configuration. The configuration can be changed until
# the context is initialized; after that, the config is locked for reading only.
#
# == Configuration can be changed:
#
# Spark.config.set('spark.app.name', 'RubySpark')
#
# Spark.config['spark.app.name'] = 'RubySpark'
#
# Spark.config do
# set 'spark.app.name', 'RubySpark'
# end
#
def self.config(&block)
@config ||= Spark::Config.new
if block_given?
@config.instance_eval(&block)
else
@config
end
end
# Destroy the current configuration. This can be useful for resetting the config
# in order to set new values. It has no effect if the context is already started.
def self.clear_config
@config = nil
end
# Return the currently active context or nil.
def self.context
@context
end
# Current active SQLContext or nil.
def self.sql_context
@sql_context
end
# Initialize the Spark context if it is not running already. The config is
# loaded automatically in the constructor. From that point `config` uses the
# configuration of the running Spark and is locked for reading only.
def self.start
@context ||= Spark::Context.new
end
def self.start_sql
@sql_context ||= Spark::SQL::Context.new(start)
end
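# A minimal lifecycle sketch (assuming Spark has been installed and built
# for this gem):
#
#   Spark.config { set 'spark.app.name', 'RubySpark' }
#   Spark.start
#   Spark.sc   # the running Spark::Context
#   Spark.stop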
def self.stop
@context.stop
RubyWorker.stopServer
logger.info('Workers were stopped')
rescue
nil
ensure
@context = nil
@sql_context = nil
clear_config
end
def self.started?
!!@context
end
# ===============================================================================
# Defaults
# Load the default configuration for Spark and RubySpark.
# By default, values are stored in ~/.ruby-spark.conf.
# The file is created automatically.
def self.load_defaults
unless File.exist?(DEFAULT_CONFIG_FILE)
save_defaults_to(DEFAULT_CONFIG_FILE)
end
load_defaults_from(DEFAULT_CONFIG_FILE)
end
# Clear previous settings and load new ones from the file
def self.load_defaults_from(file_path)
# Parse values
values = File.readlines(file_path)
values.map!(&:strip)
values.select!{|value| value.start_with?('gem.')}
values.map!{|value| value.split(nil, 2)}
values = Hash[values]
# Clear prev values
@target_dir = nil
@ruby_spark_jar = nil
@spark_home = nil
# Load new
@target_dir = values['gem.target']
end
# Create target dir and new config file
def self.save_defaults_to(file_path)
dir = File.join(Dir.home, ".ruby-spark.#{SecureRandom.uuid}")
if Dir.exist?(dir)
save_defaults_to(file_path)
else
Dir.mkdir(dir, 0700)
file = File.open(file_path, 'w')
file.puts "# Directory where will be Spark saved"
file.puts "gem.target #{dir}"
file.puts ""
file.puts "# You can also defined spark properties"
file.puts "# spark.master spark://master:7077"
file.puts "# spark.ruby.serializer marshal"
file.puts "# spark.ruby.serializer.batch_size 2048"
file.close
end
end
# ===============================================================================
# Global settings and variables
def self.logger
@logger ||= Spark::Logger.new
end
# Root of the gem
def self.root
@root ||= File.expand_path('..', File.dirname(__FILE__))
end
# Default directory for java extensions
def self.target_dir
@target_dir ||= File.join(root, 'target')
end
# Directory where worker.rb is located
def self.worker_dir
@worker_dir ||= File.join(root, 'lib', 'spark', 'worker')
end
def self.ruby_spark_jar
@ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar')
end
def self.spark_ext_dir
@spark_ext_dir ||= File.join(root, 'ext', 'spark')
end
# ===============================================================================
# Load JVM and jars
# Load dependent libraries; can be used only once.
# Cannot be loaded before CLI::install.
#
# == Parameters:
# target::
#   path to the directory with Spark's .jar files, or a single Spark jar
#
def self.load_lib(target=nil)
return if @java_bridge
target ||= Spark.target_dir
@java_bridge = JavaBridge.init(target)
@java_bridge.import_all
nil
end
def self.java_bridge
@java_bridge
end
# Aliases
class << self
alias_method :sc, :context
alias_method :jb, :java_bridge
alias_method :home, :root
end
end
# C/Java extensions
require 'ruby_spark_ext'
# Ruby core extensions
require 'spark/ext/module'
require 'spark/ext/object'
require 'spark/ext/hash'
require 'spark/ext/string'
require 'spark/ext/integer'
require 'spark/ext/ip_socket'
require 'spark/ext/io'
# Other requirements
require 'spark/version'
require 'spark/error'
# Load default settings for gem and Spark
Spark.load_defaults
# Make sure that Spark is always stopped
Kernel.at_exit do
begin
Spark.started? && Spark.stop
rescue
end
end
================================================
FILE: ruby-spark.gemspec
================================================
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'spark/version'
Gem::Specification.new do |spec|
spec.name = 'ruby-spark'
spec.version = Spark::VERSION
spec.authors = ['Ondřej Moravčík']
spec.email = ['moravcik.ondrej@gmail.com']
spec.summary = %q{Ruby wrapper for Apache Spark}
spec.description = %q{}
spec.homepage = ''
spec.license = 'MIT'
spec.files = `git ls-files -z`.split("\x0")
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
spec.require_paths = ['lib']
if RUBY_PLATFORM =~ /java/
spec.platform = 'java'
extensions = ['ext/ruby_java/extconf.rb']
else
extensions = ['ext/ruby_c/extconf.rb']
spec.add_dependency 'rjb'
end
spec.extensions = extensions
spec.required_ruby_version = '>= 2.0'
spec.requirements << 'java, scala'
spec.add_dependency 'sourcify', '0.6.0.rc4'
spec.add_dependency 'method_source'
spec.add_dependency 'commander'
spec.add_dependency 'pry'
spec.add_dependency 'nio4r'
spec.add_dependency 'distribution'
spec.add_development_dependency 'bundler', '~> 1.6'
spec.add_development_dependency 'rake'
end
================================================
FILE: spec/generator.rb
================================================
class Generator
def self.numbers(size=1000)
Array.new(size){ rand(1..1000) }
end
def self.numbers_with_zero(size=1000)
Array.new(size){ rand(0..1000) }
end
def self.words(size=1000)
Array.new(size) { word }
end
def self.word(size=10)
Array.new(rand(1..size)){(97+rand(26)).chr}.join
end
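# A quick illustration (output is random; the values shown are hypothetical):
#
#   Generator.word(3)    # => "qx"  (1 to 3 random lowercase letters)
#   Generator.numbers(5) # => [421, 7, 998, 53, 640]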
def self.lines(size=1000, letters=3)
Array.new(size) do
Array.new(rand(50..100)){
(97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0))
}.join
end
end
def self.hash(size=1000)
Array.new(size) do
[word(2), rand(1..10)]
end
end
def self.hash_with_values(size=1000, values_count=10)
Array.new(size) do
[word(2), Array.new(values_count) { rand(1..10) }]
end
end
end
================================================
FILE: spec/inputs/lorem_300.txt
================================================
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ligula neque, ultricies et lorem
vel, accumsan cursus felis. Maecenas ullamcorper, magna eu lobortis gravida, diam leo rutrum diam,
eget elementum sapien felis non magna. Etiam scelerisque, mauris et cursus fermentum, ipsum nisl
vulputate nisl, sit amet pulvinar libero sem at lectus. Vivamus nibh lectus, elementum eget dui non,
fermentum volutpat orci. Nam imperdiet, dui id placerat pellentesque, purus sem semper augue, id
dictum est ipsum et erat. Integer arcu tortor, ullamcorper ac libero a, iaculis sollicitudin orci.
Sed dapibus hendrerit neque, ac aliquet arcu elementum sed. Phasellus ornare interdum erat, eget
fringilla sapien ornare vitae. In condimentum, mi sed condimentum viverra, nisl sapien scelerisque
mi, vel varius metus dolor eu lorem. Nulla pulvinar ac metus eu volutpat. Suspendisse potenti. Duis
vitae mauris arcu. Proin et dignissim dolor, eget congue purus. Ut malesuada neque massa. Ut viverra
faucibus turpis, in pharetra nulla iaculis quis. Morbi imperdiet risus eu eros varius facilisis.
Aenean nec dapibus sapien. Fusce tempus, risus vitae volutpat faucibus, dolor diam cursus risus, sit
amet faucibus mauris mauris quis orci. Aliquam massa ante, accumsan non sapien quis, ullamcorper
fermentum elit. Pellentesque risus orci, rhoncus ac mi sed, volutpat vehicula sem. Mauris suscipit
odio vel mi scelerisque, at cursus libero ullamcorper. Nulla aliquam metus arcu, in vestibulum sem
ullamcorper eu. Pellentesque laoreet venenatis metus ut accumsan. Quisque ut enim interdum,
fringilla lorem nec, dignissim orci. Fusce vel diam sed ante dictum scelerisque. Vestibulum lectus
enim, gravida sit amet ullamcorper sit amet, rhoncus nec dui. Praesent eget molestie tellus, quis
iaculis sapien. Sed ut rutrum velit. Pellentesque habitant morbi tristique senectus et netus et
malesuada fames ac turpis egestas. Donec tortor quam, venenatis ac rhoncus et, gravida non orci. Ut
lacus dolor, auctor id ante varius, pharetra placerat nulla. Nulla facilisi. Nam quis feugiat nibh,
ut ultrices est. Nulla at mi nec metus porttitor tempor. Donec leo lorem, rhoncus ut arcu eu,
venenatis eleifend risus. Phasellus non porttitor neque, sit amet accumsan nisl. Pellentesque non
urna tempor, interdum orci non, gravida enim. Sed in urna et dolor cursus aliquet et vel magna.
Quisque vestibulum tortor scelerisque orci mattis, eu aliquet sem condimentum. Proin ac ultricies
erat. Integer sodales, turpis quis volutpat pretium, justo lacus lobortis mauris, nec commodo orci
leo sit amet metus. Ut ornare ipsum vitae malesuada aliquam. Quisque lobortis semper elit id
consectetur. Aenean facilisis sapien eu ipsum adipiscing mattis. Praesent malesuada aliquet
venenatis. Ut aliquet vel sapien nec euismod. Morbi eros urna, rutrum ut iaculis sed, vulputate sit
amet nunc. Nulla facilisi. Morbi sagittis nec magna sed scelerisque. Maecenas a euismod eros.
Vestibulum suscipit pharetra velit porta fermentum. Phasellus euismod auctor metus ut interdum.
Quisque lectus lorem, tristique ut libero vel, rhoncus tincidunt tellus. Sed malesuada vestibulum
purus, at tincidunt massa imperdiet vitae. Ut mollis eleifend elit, et sodales nisl facilisis eu.
Fusce ligula ligula, porta id est sed, tincidunt malesuada odio. Maecenas ultricies dignissim nunc,
quis adipiscing urna auctor commodo. Phasellus tincidunt odio non nulla luctus sollicitudin. Mauris
pharetra porttitor est iaculis sollicitudin. Curabitur quam sem, fringilla id tellus vitae,
elementum convallis eros. Morbi sollicitudin eleifend leo, ut euismod ligula ornare sagittis. Nullam
luctus, mi eget dapibus elementum, diam purus fringilla lectus, sit amet sodales neque turpis sed
mi. Sed volutpat sem euismod posuere mollis. Integer viverra egestas lacinia. Quisque viverra metus
massa, in condimentum sem tincidunt a. Proin ac ipsum non leo sollicitudin consectetur id a sem.
Cras tempus venenatis nisl sit amet venenatis. Nulla facilisi. Morbi scelerisque mi est, vitae
lobortis sem ultricies faucibus. In urna ante, faucibus ac eros et, dignissim mollis justo. Quisque
aliquet tortor sem, ac mattis tortor faucibus sed. Donec tortor lacus, egestas in convallis at,
vulputate eu nibh. Aenean ligula augue, imperdiet in tempor id, consequat vitae erat. Sed id eros a
justo semper ultricies. Curabitur nunc nisi, placerat at leo sed, vehicula pulvinar velit. Nullam ut
ipsum augue. Fusce condimentum quam commodo, venenatis massa eleifend, dignissim neque. Curabitur
sit amet hendrerit tortor, a condimentum sem. Morbi lobortis porta porttitor. Maecenas mollis ipsum
ac est venenatis auctor at vel lectus. Mauris luctus euismod dolor. Cras vitae nibh eget sem
placerat adipiscing. Pellentesque ac molestie ligula. Vivamus sit amet lectus odio. Duis lacinia
rutrum faucibus. Curabitur luctus ultricies enim, id imperdiet ipsum viverra vitae. Mauris et
iaculis erat, vel faucibus purus. Fusce non nisl tristique, dignissim lacus id, fermentum velit. Sed
facilisis sapien at interdum viverra. Aliquam erat volutpat. Maecenas suscipit diam vitae velit
vulputate tincidunt. Nulla facilisi. Sed eget tortor et ante mollis cursus. Nullam vitae porttitor
magna. Quisque iaculis massa dui, id rutrum purus blandit eu. Duis convallis ipsum id commodo
iaculis. Praesent sagittis ut tortor ut varius. Curabitur consequat volutpat scelerisque. Cras
pharetra lectus eget urna imperdiet ullamcorper. Sed lacinia ut eros non malesuada. Quisque
hendrerit suscipit convallis. Vivamus posuere vestibulum massa, non accumsan diam tincidunt eu.
Nulla bibendum dictum mi sit amet faucibus. Nullam egestas lorem nunc, vel malesuada elit imperdiet
vitae. Sed luctus ligula at erat tempus tristique. Proin varius mi quis libero sollicitudin
ullamcorper. In hac habitasse platea dictumst. Praesent auctor arcu vel luctus consequat. Curabitur
consequat magna sit amet ante feugiat dictum. Morbi scelerisque faucibus urna, ac dapibus sem
ultricies eu. Pellentesque rhoncus sapien nec eros facilisis consectetur. Duis eleifend vestibulum
suscipit. Morbi orci metus, malesuada sit amet urna ac, laoreet vehicula lacus. Quisque gravida,
nunc fringilla tincidunt vestibulum, lacus urna commodo nisl, quis sodales lectus ipsum et augue. Ut
non erat sit amet neque fermentum ultricies. Vestibulum tincidunt est elit, ac dapibus velit
faucibus id. Praesent in viverra libero. Proin eleifend, odio eget sodales dignissim, nunc arcu
ullamcorper libero, sit amet sodales diam ipsum in tellus. Suspendisse enim nunc, accumsan non
ligula et, vulputate viverra ante. Ut id elit eu dui dictum malesuada at id orci. Vivamus sed felis
aliquam metus consequat euismod nec eu libero. Phasellus mattis malesuada ipsum eu posuere. Nullam
at massa enim. Duis vitae urna blandit, ultricies nisi in, consequat elit. Quisque nec nibh ut
tortor pulvinar euismod. Praesent molestie felis ac risus elementum sollicitudin. Donec eu leo in
augue convallis mattis. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur
ridiculus mus. Integer ut dignissim lectus. Vivamus eros felis, gravida et auctor ut, volutpat vitae
dui. Nunc adipiscing sapien et lectus rutrum vestibulum. Mauris fermentum, metus eu sollicitudin
malesuada, lorem diam vestibulum metus, ut elementum metus nibh sed augue. Cras lectus risus,
feugiat eget fringilla a, cursus et eros. Praesent aliquam justo vel condimentum lacinia. Sed
condimentum dui nec leo blandit, vel elementum odio laoreet. Quisque suscipit molestie iaculis.
Nullam dignissim, mauris sit amet condimentum aliquet, magna sapien scelerisque nisl, tincidunt
auctor purus libero at lectus. Nulla facilisi. Sed egestas erat at dictum egestas. Cras non mauris
ut dolor interdum condimentum. Fusce quis hendrerit purus, dictum cursus mi. Maecenas mattis, turpis
sit amet mollis ultricies, mi turpis ornare velit, eget suscipit magna eros sit amet purus. Integer
ut viverra elit. Praesent eu augue viverra nunc convallis porta. Etiam venenatis dignissim nisl et
semper. Cras eu nisl vitae justo ornare porttitor vel nec augue. Pellentesque faucibus mollis neque,
nec ullamcorper purus mollis sed. Suspendisse ut molestie lectus, faucibus aliquet libero. Aliquam
tristique, neque ut lobortis ultricies, tellus elit ultrices risus, sodales dapibus sem mauris et
magna. Sed et sem porttitor, fringilla mauris vestibulum, porttitor dui. Proin vitae viverra elit.
Integer nec adipiscing velit. Nunc quis urna tristique, ultrices orci eget, aliquet lorem. Curabitur
consequat adipiscing sodales. In elementum condimentum ante id placerat. Cras ac turpis tristique
lacus vulputate dictum vel nec libero. Curabitur fringilla interdum tempus. Integer placerat dolor
ut magna aliquet bibendum. Cras ac metus magna. Curabitur vehicula magna ut sapien viverra ornare.
Donec risus nisi, imperdiet eu laoreet in, tempor lobortis urna. Etiam malesuada et lacus ac
consectetur. Morbi facilisis sapien quis nisl laoreet semper. Suspendisse volutpat sapien vel quam
blandit faucibus. Nam sagittis velit eros, vitae suscipit tortor elementum ac. Pellentesque habitant
morbi tristique senectus et netus et malesuada fames ac turpis egestas. Donec nec nibh dictum,
pretium nulla eu, pharetra mauris. Vestibulum leo mi, convallis et euismod ac, molestie in ligula.
Vestibulum tempor tincidunt porttitor. Integer nisl orci, dignissim ac volutpat a, auctor eget
augue. Suspendisse eget euismod nunc, eu elementum ipsum. Cras libero tortor, gravida quis
vestibulum a, tincidunt aliquam mauris. Integer elementum pellentesque posuere. Donec accumsan
feugiat pulvinar. Aliquam eros justo, dictum non elementum nec, tristique vel massa. Nulla a velit
porttitor, aliquam turpis nec, ultricies ligula. Nam id dignissim dui. Ut placerat arcu nec accumsan
varius. Sed quis accumsan nunc, in dapibus lorem. Morbi egestas sagittis pulvinar. Morbi id mauris
ante. Sed magna nibh, venenatis quis lacinia in, congue quis metus. Nunc lacus lectus, adipiscing
sed consequat id, luctus vel dui. Mauris eu nisi erat. Proin eleifend lectus sit amet ligula
fringilla semper. Suspendisse tristique, quam ac pharetra dictum, libero risus rutrum ipsum, eget
tristique arcu neque vel nisi. Ut auctor nulla vitae porta faucibus. Suspendisse ut tellus enim.
Morbi commodo posuere quam. Proin consequat in quam pulvinar posuere. Nunc id ullamcorper est. Cras
ac molestie massa. Cras leo tellus, tempus id nibh quis, porttitor laoreet elit. Mauris in ornare
nisi. Duis vel velit felis. Suspendisse gravida felis nec nulla hendrerit pretium. Cras at orci
neque. Phasellus vehicula, ipsum at tempus sodales, mauris est condimentum metus, a vehicula ante
tellus sit amet diam. Suspendisse fermentum elit in volutpat viverra. Nullam gravida in augue sed
mollis. Curabitur aliquam diam non quam aliquam ultrices. Quisque pretium semper diam eget
malesuada. Suspendisse porttitor sagittis sem at malesuada. Donec euismod elementum nulla, sit amet
eleifend enim adipiscing nec. Nullam porta, enim ac tincidunt molestie, turpis mi porta justo,
ornare tristique sem orci quis turpis. Nullam leo dolor, pellentesque ac hendrerit et, tempus quis
nisi. Fusce pretium mattis tortor sagittis suscipit. Vestibulum vitae suscipit libero. Mauris
consequat sagittis mi, id tempus est condimentum et. In eget condimentum odio, a malesuada quam.
Vivamus id turpis non nulla eleifend cursus ut sit amet tellus. Proin ultrices luctus nibh, eget
condimentum ligula vestibulum in. Aliquam pharetra aliquet erat nec lacinia. Cras fringilla est
fringilla ante tristique, vitae bibendum dolor malesuada. Praesent ut dui pulvinar, suscipit velit
gravida, malesuada nunc. Cras tempus feugiat interdum. Vivamus lectus lorem, rutrum ut neque at,
sollicitudin euismod nulla. Vestibulum ac ligula suscipit, ultricies felis eget, adipiscing lectus.
Maecenas nec enim vel eros molestie lobortis faucibus sit amet urna. Sed ac consequat nulla. Nulla
et libero nisi. Pellentesque euismod nunc quis ipsum tristique, suscipit elementum magna aliquam.
Praesent sit amet tincidunt leo. Duis tempor arcu eget est posuere imperdiet. Quisque vel dui
adipiscing, auctor nibh vel, vulputate sapien. Curabitur eu sodales lacus. Aliquam felis eros,
mattis a diam eu, ullamcorper vestibulum turpis. Vivamus vitae vulputate lacus, sed convallis lorem.
Vestibulum mattis sollicitudin vulputate. Mauris cursus erat eget nisi accumsan, nec commodo tellus
blandit. Etiam gravida nulla et lorem molestie auctor. Mauris venenatis iaculis nulla vel mollis.
Morbi pretium sed eros at commodo. Aliquam eu justo turpis. Pellentesque lobortis, nisl eget
ultricies dictum, augue sem placerat elit, vitae pretium lectus massa eget tortor. Nulla accumsan,
massa eu rutrum pharetra, mi sapien aliquam massa, viverra facilisis metus nisi in dolor. Duis felis
velit, interdum a elit non, cursus pellentesque libero. Cum sociis natoque penatibus et magnis dis
parturient montes, nascetur ridiculus mus. Nunc vel nisi quis augue accumsan aliquam. Suspendisse
ante lectus, lobortis nec suscipit at, ullamcorper at diam. Aliquam hendrerit, eros ac egestas
condimentum, enim metus lobortis nibh, sit amet convallis augue nulla nec lorem. Lorem ipsum dolor
sit amet, consectetur adipiscing elit. Ut ac ligula eget est blandit scelerisque at vitae nunc. Sed
venenatis eros non quam auctor posuere. Curabitur convallis dapibus semper. Fusce et leo sed massa
posuere porta. Morbi convallis lobortis eros. Quisque ac nisl dictum, sagittis eros et, pellentesque
metus. Quisque mattis sodales lorem quis malesuada. Aenean neque sapien, rutrum vitae euismod quis,
euismod eu mi. Etiam ante tellus, auctor vitae pulvinar a, mattis nec tellus. Morbi libero lectus,
mattis sit amet convallis at, viverra et nisi. Proin a ante tristique, blandit urna at, lobortis
leo. Praesent nec odio sit amet ligula adipiscing pretium at rhoncus felis. Ut ut velit turpis. Sed
tempor lectus massa, vel gravida libero gravida a. Nunc mollis, lorem id dapibus hendrerit, mi orci
gravida orci, at vehicula neque nisl quis nibh. Mauris feugiat, ligula sit amet interdum laoreet,
lectus leo accumsan dolor, eu cursus tortor quam eget lectus. Sed commodo, est in bibendum
condimentum, magna neque dictum sapien, at lacinia sem ipsum ut eros. In eget erat eu nulla
hendrerit tincidunt id vulputate nibh. Nunc sed imperdiet urna, eu tempor orci. Phasellus
pellentesque sapien eu risus tincidunt, ut iaculis risus fermentum. Suspendisse condimentum erat
vitae porta malesuada. Ut a vulputate lorem. Nulla ullamcorper, neque in posuere vulputate, neque
magna tempor erat, sit amet luctus nisi nibh quis ligula. Duis porta urna et fermentum interdum. Sed
pellentesque odio euismod nisi auctor rutrum. Suspendisse mi nibh, dignissim eget porttitor quis,
commodo a massa. Nunc vel eleifend turpis. Sed iaculis, massa quis egestas pellentesque, nibh ante
feugiat ante, a euismod lacus nunc et felis. Nam in aliquet odio. Nulla eget enim aliquam, faucibus
est at, fringilla tellus. Duis molestie massa ornare, sodales leo eget, lobortis nibh. Nam bibendum
mi a facilisis mattis. Duis ultrices arcu tellus, vitae interdum tortor dictum et. Sed id luctus
lectus, eu tempus quam. Duis mi nisl, iaculis vel tortor sit amet, vulputate sodales risus. Cras
vitae lobortis nisi, eu adipiscing ante. Nam eget scelerisque libero. Nulla pulvinar, velit et
posuere sagittis, odio risus venenatis sapien, at tristique enim augue quis sem. Integer rutrum
blandit eros eu faucibus. Etiam eget iaculis felis, in fermentum ante. Nullam a placerat risus, id
accumsan quam. Donec est orci, elementum eu sapien non, ultricies ullamcorper leo. Praesent
tincidunt, mauris in viverra hendrerit, dolor nisi cursus orci, vel lacinia neque ante eu magna. Nam
facilisis massa at nisi accumsan, non condimentum turpis facilisis. Cras quis ipsum at orci ornare
venenatis vitae et ante. Morbi vitae luctus lacus. Nullam eu felis at mi hendrerit commodo a eu
diam. Maecenas ultricies, urna sit amet egestas tempor, dolor ligula dictum nibh, vehicula commodo
ipsum diam at nunc. Proin facilisis tincidunt elit, sed vulputate leo lobortis sed. In tincidunt
risus lorem, venenatis pellentesque tellus accumsan vitae. Integer ullamcorper mi ut risus
consectetur dictum in quis dui. Pellentesque sed diam sed purus egestas mollis id at sapien. Nunc
cursus mi nec accumsan porta. Nullam pulvinar pharetra felis. Etiam porta massa et diam scelerisque,
ut iaculis nisl luctus. Curabitur vel metus id lacus faucibus tempus. Nullam ornare neque orci, nec
scelerisque erat mattis nec. Phasellus ultrices ultrices nisi quis venenatis. Sed ultrices iaculis
diam a faucibus. Phasellus quis suscipit nulla. Nulla ultricies, turpis et dictum ullamcorper, urna
metus porta tellus, quis congue dolor libero quis sem. Nam tempus metus risus, sed rutrum nibh
cursus malesuada. Vivamus bibendum odio eget mi aliquet, sed tempor eros tincidunt. Suspendisse eu
ultricies ligula, non commodo sem. Ut aliquet elit sed leo laoreet aliquam. Vivamus feugiat a justo
non auctor. Sed rhoncus orci ut dictum dignissim. Duis eros libero, tempus non venenatis quis,
suscipit eget turpis. Aliquam sed ullamcorper velit, in tincidunt tellus. Ut dapibus erat vel nunc
feugiat elementum. Cras congue, erat sit amet lacinia venenatis, nisi magna rhoncus nulla, eu
blandit eros neque ac eros. Donec vulputate placerat dapibus. Integer dignissim odio eget iaculis
ultrices. Vestibulum ligula neque, tincidunt at pretium ac, tincidunt sit amet tellus. Sed fermentum
egestas tortor, non volutpat sapien. Aliquam erat volutpat. Duis semper placerat sapien at placerat.
Praesent facilisis pharetra dignissim. Morbi laoreet sed tortor eu rhoncus. Vivamus eleifend felis
eu dui ornare ornare sed at urna. Nulla nulla justo, hendrerit id enim vitae, blandit consequat
nibh. Aliquam mattis diam mattis fringilla tempor. Suspendisse suscipit est sed pulvinar commodo.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. In in scelerisque enim. Phasellus ornare
nisl consequat volutpat bibendum. Vivamus et nunc viverra, ultrices lorem a, cursus purus. Curabitur
nibh libero, hendrerit lobortis malesuada sit amet, fringilla et augue. Vestibulum est lacus,
fringilla sit amet dictum pulvinar, lacinia at leo. Proin iaculis felis vitae metus viverra blandit.
Mauris accumsan sagittis semper. Quisque non diam a quam volutpat faucibus. Pellentesque eros orci,
commodo eget fringilla eu, euismod et turpis. Duis molestie et eros ac ullamcorper. Phasellus
consequat risus eget elementum semper. Donec at mi a justo laoreet condimentum porttitor in purus.
Nulla sit amet libero consectetur, iaculis neque nec, scelerisque turpis. Aliquam interdum nibh eget
accumsan dictum. Ut lobortis, mi non eleifend lobortis, lorem mauris pretium urna, at fermentum
tellus felis eu nunc. Aliquam in nibh tristique, tempus purus a, cursus massa. Suspendisse potenti.
Maecenas porttitor et erat in sollicitudin. Cum sociis natoque penatibus et magnis dis parturient
montes, nascetur ridiculus mus. Vestibulum commodo placerat velit, vel pellentesque neque sagittis
eget. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nullam eu massa placerat,
iaculis eros eget, viverra orci. Aliquam ac lacus porttitor, eleifend elit id, vehicula mauris. Sed
ac interdum libero. Sed laoreet suscipit mi, ac accumsan massa condimentum nec. Suspendisse sodales
libero sollicitudin, malesuada quam ac, viverra enim. Sed sapien libero, egestas sit amet orci non,
venenatis interdum augue. In hac habitasse platea dictumst. Fusce gravida orci at ligula fringilla
adipiscing. Nunc quis ipsum quis nibh egestas porta. Proin et faucibus elit. Etiam in neque at nunc
pharetra adipiscing nec vel magna. Donec at nunc scelerisque, tincidunt risus ut, bibendum nisi.
Donec pulvinar fermentum purus, ac adipiscing urna iaculis at. Nulla ut nunc vitae lorem dapibus
fringilla. Ut placerat dignissim nulla ornare mattis. Mauris rutrum tellus quis odio dictum, ac
tempor velit scelerisque. Quisque ligula elit, convallis nec volutpat vitae, pulvinar id mauris.
Vivamus vel accumsan tortor. Donec eu sollicitudin dolor. Pellentesque egestas congue tristique.
Phasellus ut sollicitudin nisl. Praesent diam neque, malesuada id tincidunt id, malesuada in eros.
Phasellus adipiscing ipsum vel justo molestie vulputate. Praesent ultricies dapibus lacus pulvinar
gravida. Donec consequat, orci et mattis ultrices, nibh enim sagittis metus, vitae eleifend enim
tellus vitae augue. Suspendisse placerat iaculis risus nec iaculis. Ut ullamcorper ultrices dui, sed
blandit mauris hendrerit vitae. Nulla ac dolor lectus. Etiam pellentesque neque at odio bibendum, at
venenatis tellus fermentum. Maecenas a condimentum metus. Phasellus semper scelerisque feugiat.
Fusce varius varius tincidunt. Ut vel auctor magna. Cras dui turpis, euismod in enim a, scelerisque
adipiscing lectus. Duis mollis pharetra risus, sed ultrices nulla blandit non. Integer ac pulvinar
magna. Aenean fermentum auctor magna. Ut in viverra sapien. Proin ac bibendum magna, cursus gravida
elit. Phasellus vehicula facilisis nibh, tempor sagittis mauris accumsan et. Vestibulum sed lacus
luctus diam ornare venenatis non vel felis. Morbi posuere sit amet nisl quis pulvinar. Suspendisse
blandit tempus risus quis pretium. Nullam gravida libero vel aliquam suscipit. Nunc vel nunc at leo
pharetra tempor et ut mi. Aliquam erat volutpat. Nulla placerat odio tellus. Nam adipiscing massa
nec varius posuere. Proin placerat tellus posuere lorem suscipit, sit amet sagittis sem condimentum.
Ut pharetra odio quis tellus mattis facilisis. Quisque eget interdum est. Quisque mattis, felis eu
semper feugiat, quam augue interdum mauris, eget sodales nisi neque quis erat. Curabitur semper, mi
posuere luctus molestie, neque ante sagittis nulla, sit amet vehicula eros eros in justo. Integer
aliquet vehicula arcu, quis iaculis justo. Sed tincidunt sem id est porta volutpat. Mauris varius
felis ut est venenatis, ornare porttitor arcu adipiscing. Sed luctus rutrum ante, consectetur
sollicitudin sapien accumsan vulputate. Vivamus id diam vehicula, fermentum nunc id, viverra justo.
Quisque porttitor, odio in molestie hendrerit, libero eros vehicula odio, id vestibulum sapien neque
quis nibh. Donec vel faucibus est. Ut nec sapien vitae nibh congue egestas vel euismod tellus. Lorem
ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum quis lacus lorem. Integer egestas
euismod ante, vitae condimentum neque eleifend non. Sed posuere bibendum ante, ut facilisis dui
condimentum at. In ut varius augue. Vivamus bibendum eu odio vel convallis. Vivamus cursus sodales
iaculis. Nullam convallis facilisis blandit. Phasellus iaculis porttitor elit, eget vestibulum ipsum
convallis eu. Quisque volutpat justo ipsum, eleifend cursus urna facilisis a. Sed at diam nec sem
semper scelerisque. Aliquam euismod erat quis nisi dictum, at sodales leo fermentum. Nam at nisl
metus. Proin luctus porttitor ante in tincidunt. Maecenas laoreet vitae enim eget elementum. Nulla
id sagittis enim, nec ultrices tortor. Nam rutrum ipsum sit amet erat auctor, eu venenatis libero
ultricies. Ut condimentum neque non diam ullamcorper, ultrices feugiat neque egestas. Pellentesque
at lobortis est, in blandit mi. Maecenas tincidunt eros id massa pulvinar, quis varius eros
lobortis. Curabitur vitae sodales orci. Suspendisse potenti. Pellentesque eu fringilla nibh. Etiam
sed pretium enim, lacinia consequat lectus. Quisque sed mi risus. Praesent posuere dolor sed mauris
dapibus, id tristique mi mattis. Quisque nec urna rutrum, consectetur mauris ut, egestas libero.
Fusce a justo orci. Etiam vitae aliquet ipsum. Curabitur consequat tempor eros, ut placerat lectus
tempus et. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis
egestas. Sed ligula mi, laoreet sit amet nunc id, ullamcorper fermentum magna. Maecenas enim dui,
viverra at nulla ut, lacinia pretium nunc. Donec at ultricies nulla, nec cursus odio. Donec
ullamcorper nec turpis imperdiet hendrerit. Sed euismod aliquam vehicula. Nunc sed enim eleifend
turpis venenatis sagittis. Sed laoreet velit erat. Proin nisl erat, vulputate et fermentum iaculis,
mollis suscipit magna. Sed porta, augue ut accumsan fermentum, arcu tortor rutrum tellus, sit amet
sollicitudin lectus turpis non felis. Vestibulum ante leo, interdum sed venenatis non, porttitor ut
nibh. Sed sit amet luctus erat. Duis id rhoncus justo, non rutrum lorem. Mauris ut laoreet elit.
Praesent sed diam porta, rhoncus massa a, tincidunt lorem. Mauris bibendum nunc nec est ullamcorper
bibendum. Nullam venenatis libero sed ligula scelerisque euismod quis at dui. Donec ac velit luctus,
molestie mi at, tempor leo. Pellentesque a ultricies risus. Maecenas malesuada faucibus nulla quis
consectetur. Phasellus pretium interdum risus sit amet aliquet. Nullam eleifend sem id magna
laoreet, ut lobortis mi tincidunt. Maecenas in justo tempor, viverra ipsum eu, tincidunt nulla. Sed
sed molestie turpis. Pellentesque imperdiet, eros non vulputate fringilla, turpis odio luctus
lectus, eu lacinia purus nisl vitae justo. Etiam non dapibus dolor. Fusce non urna scelerisque,
interdum massa vitae, venenatis metus. Vestibulum scelerisque dolor ac lectus sollicitudin, eget
fringilla sapien fringilla. Suspendisse non quam massa. Donec a sollicitudin eros, ut mollis turpis.
Nullam gravida congue semper. Phasellus vitae tellus vitae nulla cursus tempor et non elit.
Vestibulum pharetra in ligula a venenatis. Maecenas at erat sed nulla vulputate pulvinar et eu
libero. Donec pulvinar arcu nisi, sed posuere turpis cursus a. In nec turpis interdum, condimentum
velit in, consectetur lacus. Duis porta, felis a rhoncus ornare, ligula est elementum nunc, eu
adipiscing massa lorem in nibh. In consequat gravida eros. Phasellus condimentum malesuada sapien
ultrices tempor. Suspendisse sit amet diam in est pulvinar iaculis nec vitae nibh. Vivamus rhoncus
enim lorem, elementum posuere est pretium ut. Duis lectus lorem, ultricies ac dignissim in, egestas
et ipsum. Proin nec est ac dui sagittis dictum. Cras dictum augue ipsum, sit amet gravida ligula
scelerisque nec. Ut congue blandit porta. Nunc porta vitae risus at sagittis. Donec viverra, ante id
porta consectetur, felis turpis fringilla dui, ut vulputate nulla eros sit amet augue. Donec
aliquet, felis ut tempor pretium, enim leo suscipit risus, eget mollis justo ipsum ut augue. Nullam
at lacus eu orci dapibus laoreet nec convallis leo. Fusce rhoncus sed neque sit amet viverra. Donec
arcu nisl, hendrerit non pulvinar eu, blandit ac neque. Curabitur porta velit metus, non ullamcorper
nibh volutpat non. Proin tristique orci nec pretium lobortis. Curabitur quam neque, lacinia vitae
massa id, molestie pellentesque risus. Praesent vitae lectus bibendum, tincidunt augue vel, volutpat
magna. Curabitur quis feugiat magna. In libero risus, commodo eu mauris vitae, euismod ullamcorper
libero. Cras elementum rutrum lacus eu euismod. Morbi purus metus, rutrum nec varius sed, dignissim
eget nisi. Vivamus mauris nibh, hendrerit eu massa sed, ultrices suscipit est. Cras id odio dui.
Nulla condimentum luctus ipsum, eu molestie turpis commodo sed. Aliquam erat volutpat. Ut sodales
urna sit amet est dapibus pharetra. In nec vestibulum mi. Nullam mattis fringilla venenatis. Sed
risus sem, tempor vitae suscipit a, viverra in quam. In malesuada odio nec laoreet accumsan. Donec
justo diam, lacinia eu ante eget, pulvinar molestie mauris. Interdum et malesuada fames ac ante
ipsum primis in faucibus. Sed vulputate ornare dolor a tempor. Maecenas egestas, augue et semper
egestas, elit ipsum varius sem, a dapibus eros velit in sapien. Nulla sit amet eros ullamcorper,
hendrerit nunc eu, aliquet ipsum. Sed sit amet lacus enim. Curabitur faucibus rutrum dui, a tempor
velit vestibulum sed. Curabitur sed nunc id lorem semper malesuada. Maecenas semper eros eu
pellentesque vulputate. Nulla accumsan dolor placerat eros euismod facilisis. Nam vitae velit
tortor. Fusce tincidunt felis luctus, scelerisque dui in, rutrum nulla. Proin a pharetra tellus.
Aenean varius dolor nec risus eleifend fringilla. Proin at tellus ligula. Cras imperdiet mollis nisi
eget auctor. Etiam libero nunc, dictum at fermentum vitae, vehicula tincidunt justo. Proin tempor
risus elit, vestibulum auctor erat tristique vel. Etiam varius dui ante, a fringilla erat
ullamcorper vel. Quisque cursus quam imperdiet ornare dictum. Suspendisse turpis nunc, scelerisque a
congue eget, faucibus ut mauris. Suspendisse venenatis nisi nec dolor pharetra, id euismod sem
accumsan. Quisque et accumsan justo, elementum vulputate nulla. Etiam et sapien scelerisque,
malesuada lacus non, pretium enim. Curabitur ultrices, ipsum hendrerit pulvinar volutpat, dui tortor
mattis tortor, sed tincidunt magna lectus non eros. Ut hendrerit velit non metus pellentesque
mattis. Nullam velit nisi, ornare sit amet ipsum id, commodo tincidunt nisi. Aliquam egestas, ante
non placerat convallis, mi mauris posuere ligula, nec auctor lectus mi quis quam. In auctor
facilisis ante id elementum. Donec interdum ipsum vitae lorem sollicitudin rutrum. Etiam congue
pharetra lorem ac dictum. Donec feugiat interdum vulputate. Curabitur mollis suscipit nisi, vel
tincidunt risus fringilla at. Phasellus tincidunt, nulla a tincidunt tempor, libero turpis imperdiet
tortor, vel convallis orci neque vitae nisi. Nunc euismod massa quis mollis ultricies. Proin non
ante elit. Pellentesque et convallis massa. Curabitur blandit mattis metus, non aliquam erat iaculis
ut. Nam vestibulum ipsum vitae nulla varius, sit amet sodales ipsum congue. Nullam eget mauris ut
est blandit rhoncus sit amet ac arcu. Nulla at purus consequat, lobortis massa sit amet, posuere
ante. Nam bibendum laoreet tempus. Fusce ac nulla consequat, placerat sem vitae, condimentum enim.
Vestibulum sed tellus nec elit varius venenatis. Donec et dapibus dui. Nullam est metus, ultrices
nec lectus vel, fermentum elementum lacus. Curabitur imperdiet vestibulum enim. Aenean sollicitudin
at leo quis ullamcorper. Suspendisse in posuere risus. In quis mattis sem, eu facilisis arcu.
Vestibulum faucibus auctor accumsan. Morbi mattis sit amet augue ac sodales. Integer varius eget
orci iaculis aliquet. Suspendisse a auctor turpis. Fusce vestibulum vestibulum ante sed mattis.
Mauris ornare rhoncus enim ac egestas. Donec turpis eros, interdum non placerat nec, adipiscing eu
urna. Integer feugiat mi quis eros fringilla vehicula. Proin suscipit magna ultricies laoreet
dignissim. Donec vehicula ac lacus non vehicula. Sed euismod mattis facilisis. Etiam nec risus vitae
risus iaculis lobortis. Duis eu dui sit amet turpis tincidunt vulputate. Nunc tortor diam, egestas
in ante ac, scelerisque placerat ante. Nullam interdum ultricies nisl a vehicula. Integer id nunc
elit. Sed rutrum sit amet neque quis tristique.
================================================
FILE: spec/inputs/numbers/1.txt
================================================
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
================================================
FILE: spec/inputs/numbers/10.txt
================================================
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
================================================
FILE: spec/inputs/numbers/11.txt
================================================
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
================================================
FILE: spec/inputs/numbers/12.txt
================================================
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
================================================
FILE: spec/inputs/numbers/13.txt
================================================
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
================================================
FILE: spec/inputs/numbers/14.txt
================================================
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
================================================
FILE: spec/inputs/numbers/15.txt
================================================
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
================================================
FILE: spec/inputs/numbers/16.txt
================================================
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
================================================
FILE: spec/inputs/numbers/17.txt
================================================
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
================================================
FILE: spec/inputs/numbers/18.txt
================================================
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
================================================
FILE: spec/inputs/numbers/19.txt
================================================
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
================================================
FILE: spec/inputs/numbers/2.txt
================================================
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
================================================
FILE: spec/inputs/numbers/20.txt
================================================
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
================================================
FILE: spec/inputs/numbers/3.txt
================================================
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
================================================
FILE: spec/inputs/numbers/4.txt
================================================
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
================================================
FILE: spec/inputs/numbers/5.txt
================================================
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
================================================
FILE: spec/inputs/numbers/6.txt
================================================
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
================================================
FILE: spec/inputs/numbers/7.txt
================================================
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
================================================
FILE: spec/inputs/numbers/8.txt
================================================
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
================================================
FILE: spec/inputs/numbers/9.txt
================================================
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
================================================
FILE: spec/inputs/numbers_0_100.txt
================================================
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
================================================
FILE: spec/inputs/numbers_1_100.txt
================================================
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
================================================
FILE: spec/inputs/people.json
================================================
{"id":1,"name":"Matthew Fuller","age":49,"email":"mfuller0@blogger.com","active":false}
{"id":2,"name":"Pamela Thomas","age":58,"email":"pthomas1@apache.org","address":"92 Beilfuss Lane","active":false,"ip_address":"41.52.54.168"}
{"id":3,"name":"Joan Stevens","age":33,"email":"jstevens2@xrea.com","address":"1 Wayridge Circle","active":true,"ip_address":"159.204.170.10"}
{"id":4,"name":"Laura Reynolds","email":"lreynolds3@admin.ch","address":"431 Spenser Court","active":true,"ip_address":"164.254.150.90"}
{"id":5,"name":"Daniel Baker","email":"dbaker4@blinklist.com","active":true,"ip_address":"165.138.63.70"}
{"id":6,"name":"Christina Lane","email":"clane5@cnbc.com","address":"7 Chinook Park","active":true,"ip_address":"46.240.67.103"}
{"id":7,"name":"Carlos Washington","age":50,"email":"cwashington6@issuu.com","address":"6487 Memorial Trail","active":false,"ip_address":"152.45.154.18"}
{"id":8,"name":"Harold Reid","age":53,"email":"hreid7@seesaa.net","active":true}
{"id":9,"name":"Earl Harris","age":37,"email":"eharris8@homestead.com","active":false}
{"id":10,"name":"Jack Hernandez","age":30,"email":"jhernandez9@adobe.com","address":"29407 Memorial Alley","active":false,"ip_address":"129.222.144.1"}
{"id":11,"name":"Nicole Torres","age":25,"email":"ntorresa@amazon.de","address":"34804 Havey Point","active":false,"ip_address":"5.114.113.83"}
{"id":12,"name":"Theresa Gordon","age":19,"email":"tgordonb@xinhuanet.com","active":false}
{"id":13,"name":"Emily Schmidt","age":25,"email":"eschmidtc@arstechnica.com","address":"115 Bluestem Pass","active":true}
{"id":14,"name":"Dennis Ford","age":50,"email":"dfordd@hc360.com","address":"4107 Kim Avenue","active":true,"ip_address":"44.170.237.89"}
{"id":15,"name":"Deborah Williams","age":28,"email":"dwilliamse@cmu.edu","address":"7 Kipling Pass","active":false}
{"id":16,"name":"Rachel Sullivan","age":31,"email":"rsullivanf@pagesperso-orange.fr","address":"8196 Harbort Park","active":true,"ip_address":"216.142.141.210"}
{"id":17,"name":"Phillip Jordan","email":"pjordang@liveinternet.ru","active":false}
{"id":18,"name":"Fred Mitchell","email":"fmitchellh@shinystat.com","address":"279 Gateway Parkway","active":false}
{"id":19,"name":"Antonio Dunn","age":23,"email":"adunni@mediafire.com","address":"71 Maple Place","active":true,"ip_address":"39.50.250.70"}
{"id":20,"name":"Alan Boyd","age":59,"email":"aboydj@sbwire.com","address":"4302 Warner Road","active":false,"ip_address":"106.253.236.0"}
{"id":21,"name":"Louise Wright","age":19,"email":"lwrightk@so-net.ne.jp","address":"5 Maryland Hill","active":false,"ip_address":"51.0.99.116"}
{"id":22,"name":"Diane Greene","age":39,"email":"dgreenel@jugem.jp","address":"38 Merrick Lane","active":false,"ip_address":"146.124.156.180"}
{"id":23,"name":"Emily Richardson","age":23,"email":"erichardsonm@csmonitor.com","active":true}
{"id":24,"name":"Joseph Henderson","age":36,"email":"jhendersonn@drupal.org","address":"55 Morningstar Lane","active":true,"ip_address":"54.187.254.99"}
{"id":25,"name":"Chris Fowler","age":31,"email":"cfowlero@msu.edu","address":"4 Oakridge Center","active":false}
{"id":26,"name":"Helen West","age":38,"email":"hwestp@time.com","address":"93 Blaine Parkway","active":true,"ip_address":"159.131.255.177"}
{"id":27,"name":"Jimmy Black","age":46,"email":"jblackq@house.gov","address":"80157 Bay Drive","active":true,"ip_address":"163.137.84.52"}
{"id":28,"name":"Melissa Allen","age":56,"email":"mallenr@upenn.edu","address":"381 Merrick Way","active":false}
{"id":29,"name":"Scott Walker","age":48,"email":"swalkers@etsy.com","active":true}
{"id":30,"name":"Jimmy Wood","email":"jwoodt@bloomberg.com","address":"1041 Claremont Lane","active":true}
{"id":31,"name":"Betty Jacobs","email":"bjacobsu@ihg.com","address":"6520 Anderson Junction","active":false,"ip_address":"166.45.58.141"}
{"id":32,"name":"Richard Stone","age":34,"email":"rstonev@rakuten.co.jp","address":"51 Bay Pass","active":true,"ip_address":"9.35.132.204"}
{"id":33,"name":"Melissa Henderson","age":21,"email":"mhendersonw@washington.edu","address":"06 Delaware Avenue","active":false}
{"id":34,"name":"David Stanley","age":57,"email":"dstanleyx@ucoz.com","address":"692 Lien Avenue","active":true,"ip_address":"194.251.38.0"}
{"id":35,"name":"Cynthia Murphy","age":20,"email":"cmurphyy@xinhuanet.com","active":false}
{"id":36,"name":"Todd Henry","age":38,"address":"589 Katie Center","active":true,"ip_address":"177.233.117.222"}
{"id":37,"name":"Christina Stephens","age":40,"email":"cstephens10@illinois.edu","address":"51039 Hermina Point","active":true}
{"id":38,"name":"Sharon Gomez","email":"sgomez11@parallels.com","address":"57089 Texas Way","active":true,"ip_address":"149.85.104.141"}
{"id":39,"name":"Benjamin Fisher","age":30,"email":"bfisher12@gmpg.org","address":"3 Welch Plaza","active":false,"ip_address":"116.184.105.191"}
{"id":40,"name":"Mark Stewart","age":38,"email":"mstewart13@uiuc.edu","active":false,"ip_address":"167.115.237.197"}
{"id":41,"name":"Mark Black","age":45,"email":"mblack14@tuttocitta.it","address":"9 Rutledge Pass","active":false,"ip_address":"108.90.166.239"}
{"id":42,"name":"Christina Lawrence","age":47,"email":"clawrence15@simplemachines.org","address":"239 Eggendart Junction","active":true,"ip_address":"8.118.127.22"}
{"id":43,"name":"Howard Lynch","age":52,"email":"hlynch16@slideshare.net","active":true}
{"id":44,"name":"Heather Perez","age":60,"email":"hperez17@techcrunch.com","address":"1 Almo Court","active":false,"ip_address":"110.184.153.36"}
{"id":45,"name":"Michael Howell","age":57,"email":"mhowell18@wufoo.com","address":"341 Shelley Alley","active":false}
{"id":46,"name":"Gregory Johnson","age":57,"email":"gjohnson19@japanpost.jp","address":"4 Basil Plaza","active":true,"ip_address":"249.29.102.40"}
{"id":47,"name":"Christopher Miller","age":50,"email":"cmiller1a@google.es","address":"76 Granby Way","active":true}
{"id":48,"name":"Beverly Hall","age":60,"email":"bhall1b@cam.ac.uk","address":"9 Novick Place","active":true}
{"id":49,"name":"Todd Adams","age":58,"email":"tadams1c@yahoo.co.jp","active":false}
{"id":50,"name":"Judith Watkins","age":30,"email":"jwatkins1d@comcast.net","address":"5874 Esker Parkway","active":true,"ip_address":"229.176.89.163"}
{"id":51,"name":"Cheryl Howard","age":34,"email":"choward1e@cam.ac.uk","address":"492 Mandrake Lane","active":false,"ip_address":"255.117.98.35"}
{"id":52,"name":"Mary West","email":"mwest1f@cnn.com","address":"4 Vera Avenue","active":false,"ip_address":"118.130.207.177"}
{"id":53,"name":"Carol Welch","age":39,"email":"cwelch1g@sun.com","address":"794 Burrows Pass","active":true,"ip_address":"205.98.9.218"}
{"id":54,"name":"Donald Reed","age":23,"email":"dreed1h@wsj.com","address":"0769 Dryden Trail","active":true,"ip_address":"35.72.239.99"}
{"id":55,"name":"Michael Wells","age":29,"email":"mwells1i@deviantart.com","address":"9033 Crescent Oaks Way","active":false,"ip_address":"33.18.26.152"}
{"id":56,"name":"Joyce Montgomery","age":34,"email":"jmontgomery1j@sciencedaily.com","address":"29093 Lyons Circle","active":true,"ip_address":"85.155.89.174"}
{"id":57,"name":"Angela Garza","age":24,"email":"agarza1k@hc360.com","address":"388 Kenwood Street","active":false,"ip_address":"204.191.24.172"}
{"id":58,"name":"Rose Green","age":26,"email":"rgreen1l@businessinsider.com","address":"3 Mesta Pass","active":true}
{"id":59,"name":"Wanda Williamson","age":39,"email":"wwilliamson1m@cafepress.com","address":"18596 Westridge Crossing","active":true,"ip_address":"215.98.196.209"}
{"id":60,"name":"Irene Washington","age":49,"email":"iwashington1n@ameblo.jp","address":"83 Monica Crossing","active":false,"ip_address":"141.46.156.186"}
{"id":61,"name":"Anna Freeman","age":50,"email":"afreeman1o@blogs.com","address":"3 Gulseth Way","active":true}
{"id":62,"name":"Kathleen Romero","age":23,"email":"kromero1p@craigslist.org","address":"419 Leroy Court","active":true}
{"id":63,"name":"Matthew Alexander","age":58,"email":"malexander1q@gnu.org","active":false}
{"id":64,"name":"Louis Moore","age":50,"email":"lmoore1r@salon.com","address":"671 Buhler Hill","active":true,"ip_address":"21.247.160.104"}
{"id":65,"name":"Christina Brooks","age":27,"email":"cbrooks1s@google.cn","address":"80405 Jana Circle","active":true,"ip_address":"121.100.200.46"}
{"id":66,"name":"Sarah Moreno","age":30,"address":"03 Cottonwood Way","active":true,"ip_address":"111.174.142.117"}
{"id":67,"name":"Harold Rodriguez","age":24,"email":"hrodriguez1u@squidoo.com","address":"76 Green Circle","active":true}
{"id":68,"name":"Louise Black","age":18,"email":"lblack1v@yale.edu","address":"951 Blackbird Junction","active":false,"ip_address":"212.47.220.126"}
{"id":69,"name":"Adam Montgomery","email":"amontgomery1w@mlb.com","address":"1 Mesta Terrace","active":false}
{"id":70,"name":"Jacqueline Pierce","age":58,"email":"jpierce1x@google.com.au","address":"0161 Village Plaza","active":false,"ip_address":"116.164.88.112"}
{"id":71,"name":"Ann Stone","age":45,"email":"astone1y@yelp.com","address":"1011 Heath Terrace","active":false}
{"id":72,"name":"Teresa Arnold","age":33,"email":"tarnold1z@mayoclinic.com","active":false,"ip_address":"81.165.73.142"}
{"id":73,"name":"Arthur Shaw","age":27,"email":"ashaw20@latimes.com","address":"9956 Hooker Road","active":true}
{"id":74,"name":"Wayne Garrett","age":41,"email":"wgarrett21@adobe.com","address":"34 Grasskamp Street","active":true,"ip_address":"29.26.28.17"}
{"id":75,"name":"Russell Castillo","age":46,"email":"rcastillo22@printfriendly.com","address":"444 South Avenue","active":false}
{"id":76,"name":"Shirley Burke","age":47,"email":"sburke23@lulu.com","address":"70 Florence Drive","active":false}
{"id":77,"name":"Tammy Washington","age":46,"email":"twashington24@youtube.com","address":"559 Hollow Ridge Road","active":true,"ip_address":"230.169.245.123"}
{"id":78,"name":"Diane Freeman","age":49,"email":"dfreeman25@github.com","address":"04 Transport Center","active":false,"ip_address":"138.200.234.169"}
{"id":79,"name":"Anne Morrison","email":"amorrison26@telegraph.co.uk","address":"525 Shasta Junction","active":true}
{"id":80,"name":"Paul Johnston","age":51,"email":"pjohnston27@youku.com","address":"16254 Ryan Center","active":false,"ip_address":"214.38.125.121"}
{"id":81,"name":"Virginia Welch","age":58,"email":"vwelch28@china.com.cn","address":"2 Michigan Hill","active":true}
{"id":82,"name":"Louis Hughes","age":44,"email":"lhughes29@mysql.com","address":"423 Meadow Valley Pass","active":false,"ip_address":"213.45.167.91"}
{"id":83,"name":"Betty Reynolds","age":57,"email":"breynolds2a@furl.net","address":"4486 Kedzie Road","active":true}
{"id":84,"name":"Norma Olson","age":18,"email":"nolson2b@goo.gl","active":true}
{"id":85,"name":"David Ward","age":28,"email":"dward2c@ibm.com","address":"3 Kings Place","active":true}
{"id":86,"name":"Phyllis Williamson","age":26,"email":"pwilliamson2d@nationalgeographic.com","address":"7 Northview Street","active":false,"ip_address":"234.86.8.89"}
{"id":87,"name":"Kathleen Holmes","age":46,"email":"kholmes2e@zdnet.com","address":"4814 Colorado Place","active":false}
{"id":88,"name":"George King","age":23,"email":"gking2f@ask.com","address":"966 Morrow Junction","active":false,"ip_address":"89.94.24.41"}
{"id":89,"name":"Raymond Garcia","age":47,"email":"rgarcia2g@quantcast.com","active":true,"ip_address":"135.10.187.167"}
{"id":90,"name":"Rose Meyer","age":38,"active":true,"ip_address":"228.216.201.80"}
{"id":91,"name":"Jennifer Gray","age":50,"email":"jgray2i@princeton.edu","address":"58241 Calypso Court","active":true,"ip_address":"158.144.236.158"}
{"id":92,"name":"Bonnie Franklin","age":24,"email":"bfranklin2j@slideshare.net","address":"629 Prairieview Center","active":false}
{"id":93,"name":"Sarah Martin","age":52,"email":"smartin2k@cnn.com","address":"997 Kensington Lane","active":false}
{"id":94,"name":"Shirley Hamilton","age":39,"email":"shamilton2l@nih.gov","address":"934 Clarendon Lane","active":false}
{"id":95,"name":"Gregory Kim","age":37,"email":"gkim2m@tinyurl.com","active":true,"ip_address":"216.24.238.78"}
{"id":96,"name":"Betty Sanchez","age":46,"email":"bsanchez2n@washington.edu","active":true}
{"id":97,"name":"Ann Cooper","age":41,"email":"acooper2o@issuu.com","active":false}
{"id":98,"name":"Christopher Cole","active":true}
{"id":99,"name":"Debra Lopez","age":36,"address":"4 Grim Drive","active":false,"ip_address":"1.217.64.60"}
{"id":100,"name":"Shawn Moore","age":35,"email":"smoore2r@mayoclinic.com","active":true}
================================================
FILE: spec/lib/collect_spec.rb
================================================
require 'spec_helper'
RSpec.describe Spark::RDD do
let(:mapping) { lambda{|x| [x, 1]} }
let(:numbers) { Generator.numbers }
it '.collect_as_hash' do
rdd = $sc.parallelize(numbers)
rdd = rdd.map(mapping)
expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)])
end
context '.take' do
let(:size) { 1000 }
let(:numbers) { Generator.numbers(size) }
let(:rdd) { $sc.parallelize(numbers) }
it 'nothing' do
expect(rdd.take(0)).to eql([])
end
it 'first' do
expect(rdd.first).to eql(numbers.first)
end
it 'less than limit' do
_size = size / 2
expect(rdd.take(_size)).to eql(numbers.take(_size))
end
it 'all' do
expect(rdd.take(size)).to eql(numbers)
end
it 'more than limit' do
expect(rdd.take(size*2)).to eql(numbers)
end
end
end
================================================
FILE: spec/lib/command_spec.rb
================================================
require 'spec_helper'
def to_s_method(x)
x.to_s
end
RSpec::describe Spark::CommandBuilder do
let(:numbers) { Generator.numbers }
let(:rdd) { $sc.parallelize(numbers, 1) }
context '.serialize_function' do
let(:result) { numbers.map(&:to_s) }
it 'string' do
expect(rdd.map('lambda{|x| x.to_s}').collect).to eql(result)
end
it 'symbol' do
expect(rdd.map(:to_s).collect).to eql(result)
end
it 'lambda' do
expect(rdd.map(lambda{|x| x.to_s}).collect).to eql(result)
end
it 'method' do
expect(rdd.map(method(:to_s_method)).collect).to eql(result)
end
end
context '.bind' do
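# `bind` makes the given objects available inside serialized lambdas under
# the supplied names, as the 'different naming' example below shows.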
it 'number' do
number = rand(0..10000000)
rdd2 = rdd.map(lambda{|x| x * number}).bind(number: number)
expect(rdd2.collect).to eq(numbers.map{|x| x * number})
end
it 'open struct' do
require 'ostruct'
struct = OpenStruct.new
struct.number = 3
struct.string = '3'
struct.array = [1, 2, 3]
func = lambda{|item|
item * struct.number + struct.string.to_i + struct.array[0]
}
rdd2 = rdd.add_library('ostruct')
rdd2 = rdd2.map(func)
rdd2 = rdd2.bind(struct: struct)
expect(rdd2.collect).to eq(numbers.map(&func))
end
it 'different naming' do
array = [1, 2, 3]
rdd2 = rdd.map(lambda{|_| my_array.size})
rdd2 = rdd2.bind(my_array: array)
expect(rdd2.sum).to eq(numbers.size * array.size)
end
end
end
================================================
FILE: spec/lib/config_spec.rb
================================================
require 'spec_helper'
RSpec.describe Spark::Config do
before(:context) do
Spark.stop
end
after(:context) do
spark_start
end
it 'should be stopped' do
expect(Spark.started?).to be_falsy
end
context 'new config' do
let(:configuration) do
{
'test.test1' => 'test1',
'test.test2' => 'test2',
'test.test3' => 'test3'
}
end
before(:each) do
Spark.clear_config
end
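# The same configuration can be set through explicit set/get calls,
# hash-style access, or the block DSL, as the three examples below show.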
it 'through methods' do
configuration.each do |key, value|
Spark.config.set(key, value)
end
configuration.each do |key, value|
expect(Spark.config.get(key)).to eql(value)
end
end
it 'through hash style' do
configuration.each do |key, value|
Spark.config[key] = value
end
configuration.each do |key, value|
expect(Spark.config[key]).to eql(value)
end
end
it 'through dsl' do
configuration.each do |key, value|
Spark.config {
set key, value
}
end
configuration.each do |key, value|
expect(Spark.config[key]).to eql(value)
end
end
end
end
================================================
FILE: spec/lib/context_spec.rb
================================================
require 'spec_helper'
RSpec.describe Spark::Context do
it '.run_job' do
workers = 5
numbers = (0...100).to_a
func = lambda{|part| part.size}
ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
rdd = $sc.parallelize(numbers, workers, ser)
rdd_result = $sc.run_job(rdd, func)
result = numbers.each_slice(numbers.size/workers).map(&func)
expect(rdd_result).to eql(result)
parts = [0, 2]
func = lambda{|part| part.to_s}
rdd_result = $sc.run_job(rdd, func, parts)
result = []
sliced_numbers = numbers.each_slice(numbers.size/workers).to_a
parts.each do |part|
result << func.call(sliced_numbers[part])
end
expect(rdd_result).to eql(result)
end
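# Broadcast values must be bound to the RDD like any other referenced object;
# the workers then read the payload through `.value`.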
it '.broadcast' do
workers = rand(1..5)
values1 = [1,2,3]
values2 = [4,5,6]
broadcast1 = $sc.broadcast(values1)
broadcast2 = $sc.broadcast(values2)
rdd = $sc.parallelize(0..5, workers)
rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
rdd = rdd.map_partitions(lambda{|_| broadcast1.value + broadcast2.value })
expect(rdd.sum).to eql(
(values1 + values2).reduce(:+) * workers
)
end
# context '.accumulator' do
# it 'test' do
# accum1 = $sc.accumulator(0,)
# accum2 = $sc.accumulator(1, :*, 1)
# accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
# accum1 += 1
# accum2.add(2)
# accum2.add(2)
# accum2.add(2)
# accum3.add(9)
# accum3.add(6)
# accum3.add(7)
# expect(accum1.value).to eql(1)
# expect(accum2.value).to eql(8)
# expect(accum3.value).to eql(9)
# func = Proc.new do |_, index|
# accum1.add(1)
# accum2.add(2)
# accum3.add(index * 10)
# end
# rdd = $sc.parallelize(0..4, 4)
# rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
# rdd = rdd.map_partitions_with_index(func)
# rdd.collect
# # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
# sleep(1)
# expect(accum1.value).to eql(5)
# expect(accum2.value).to eql(128)
# expect(accum3.value).to eql(30)
# end
# context 'accum param' do
# it 'symbol' do
# accum1 = $sc.accumulator(1, :+, 0)
# accum2 = $sc.accumulator(5, :-, 3)
# accum3 = $sc.accumulator(1, :*, 1)
# accum4 = $sc.accumulator(1.0, :/, 1.0)
# accum5 = $sc.accumulator(2, :**, 2)
# func = Proc.new do |_|
# accum1.add(1)
# accum2.add(1)
# accum3.add(2)
# accum4.add(2)
# accum5.add(2)
# end
# rdd = $sc.parallelize(0..4, 2)
# rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
# rdd = rdd.map_partitions(func)
# rdd.collect
# # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
# sleep(1)
# expect(accum1.value).to eq(3)
# expect(accum2.value).to eq(1)
# expect(accum3.value).to eq(4)
# expect(accum4.value).to eq(4)
# expect(accum5.value).to eq(65536)
# end
# it 'proc' do
# accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
# accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
# accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
# func = Proc.new do |_|
# accum1.add(1)
# accum2.add('a')
# accum3.add(1)
# end
# rdd = $sc.parallelize(0..4, 2)
# rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
# rdd = rdd.map_partitions(func)
# rdd.collect
# # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
# sleep(1)
# expect(accum1.value).to eq(3)
# expect(accum2.value).to eq('aaa')
# expect(accum3.value).to eq([[1], [1]])
# end
# it 'string' do
# expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
# accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
# func = Proc.new do |_|
# accum.add(1)
# end
# rdd = $sc.parallelize(0..4, 2)
# rdd = rdd.bind(accum: accum)
# rdd = rdd.map_partitions(func)
# rdd.collect
# # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
# sleep(1)
# expect(accum.value).to eq(3)
# end
# end
# end
end
================================================
FILE: spec/lib/ext_spec.rb
================================================
require 'spec_helper'
RSpec.describe Array do
it '.deep_copy' do
data = ['a', 'b', 'c']
new_data = data.dup
data[0] << 'a'
expect(data).to eql(new_data)
new_data = data.deep_copy
data[1] << 'b'
expect(data).to_not eql(new_data)
end
end
RSpec.describe Hash do
it '.stringify_keys!' do
data = {
a: 'a',
b: 'b',
c: 'c'
}
data.stringify_keys!
expect(data).to eql({
'a' => 'a',
'b' => 'b',
'c' => 'c'
})
end
end
RSpec.describe String do
it '.camelize' do
data = 'aaa_bbb_ccc'.camelize
expect(data).to eql('AaaBbbCcc')
end
end
RSpec.describe IO do
it 'serialize' do
file = Tempfile.new('serialize')
file.binmode
file.write_int(1)
file.write_long(2)
file.write_string('3')
file.write_data([4])
file.rewind
expect(file.read_int).to eq(1)
expect(file.read_long).to eq(2)
expect(file.read_string).to eq('3')
expect(file.read_data).to eq([4])
file.unlink
end
end
================================================
FILE: spec/lib/external_apps_spec.rb
================================================
require 'spec_helper'
RSpec.describe Spark::RDD do
context '.pipe' do
let(:words) { Generator.words }
let(:numbers) { Generator.numbers }
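# `pipe` streams the elements of each partition through the given shell
# command(s) and returns the command output back as an RDD of strings.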
it 'single program' do
skip if windows?
rdd = $sc.parallelize(words, 1)
rdd = rdd.pipe('tr a b')
result = words.dup
result.map! do |x|
x.gsub('a', 'b')
end
expect(rdd.collect).to eql(result)
end
it 'multiple programs' do
skip if windows?
rdd = $sc.parallelize(numbers, 1)
rdd = rdd.pipe("tr 1 5", "awk '{print $1*10}'")
rdd = rdd.map(lambda{|x| x.to_i * 100})
result = numbers.dup
result.map! do |x|
x.to_s.gsub('1', '5')
end
result.map! do |x|
x.to_i * 10
end
result.map! do |x|
x * 100
end
expect(rdd.collect).to eql(result)
end
end
end
================================================
FILE: spec/lib/filter_spec.rb
================================================
require 'spec_helper'
def func4(item)
item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106
end
RSpec.shared_examples 'a filtering' do |workers|
context "with #{workers || 'default'} worker" do
it 'when numbers' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.filter(func1)
result = numbers.select(&func1)
expect(rdd2.collect).to eql(result)
rdd3 = rdd_numbers(workers)
rdd3 = rdd3.filter(func1)
rdd3 = rdd3.filter(func2)
expect(rdd3.collect).to eql([])
end
it 'when words' do
rdd2 = rdd_words(workers)
rdd2 = rdd2.filter(func3)
result = words.select{|x| func3.call(x)}
expect(rdd2.collect).to eql(result)
rdd3 = rdd_words(workers)
rdd3 = rdd3.filter(method(:func4))
result = words.select{|x| func4(x)}
expect(rdd3.collect).to eql(result)
end
end
end
RSpec.describe 'Spark::RDD.filter' do
let(:func1) { lambda{|x| x.to_i.even?} }
let(:func2) { lambda{|x| x.to_i.odd?} }
let(:func3) { lambda{|x| x.to_s.start_with?('b')} }
context 'through parallelize' do
let(:numbers) { Generator.numbers_with_zero }
let(:words) { Generator.words }
def rdd_numbers(workers)
$sc.parallelize(numbers, workers)
end
def rdd_words(workers)
$sc.parallelize(words, workers)
end
it_behaves_like 'a filtering', 2
# it_behaves_like 'a filtering', nil
# it_behaves_like 'a filtering', rand(2..10)
end
context 'through text_file' do
let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
let(:file_words) { File.join('spec', 'inputs', 'lorem_300.txt') }
let(:numbers) { File.readlines(file_numbers).map(&:strip) }
let(:words) { File.readlines(file_words).map(&:strip) }
def rdd_numbers(workers)
$sc.text_file(file_numbers, workers)
end
def rdd_words(workers)
$sc.text_file(file_words, workers)
end
it_behaves_like 'a filtering', 2
# it_behaves_like 'a filtering', nil
# it_behaves_like 'a filtering', rand(2..10)
end
end
================================================
FILE: spec/lib/flat_map_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'a flat mapping' do |workers|
it "with #{workers || 'default'} worker" do
rdd2 = rdd(workers).flat_map(func1)
result = numbers.flat_map(&func1)
expect(rdd2.collect).to eql(result)
rdd3 = rdd(workers)
rdd3 = rdd3.flat_map(func1)
rdd3 = rdd3.flat_map(func2)
rdd3 = rdd3.flat_map(func3)
result = numbers.flat_map(&func1).flat_map(&func2).flat_map(&func3)
expect(rdd3.collect).to eql(result)
rdd4 = rdd(workers)
rdd4 = rdd4.flat_map(func1)
rdd4 = rdd4.flat_map(func2)
rdd4 = rdd4.flat_map(func3)
expect(rdd4.collect).to eql(rdd3.collect)
end
end
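# flat_map_values applies the function to each value, flattens the result and
# pairs every produced element with the original key, which is how the
# expected result below is computed.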
RSpec.shared_examples 'a flat mapping values' do |workers|
it "with #{workers || 'default'} worker" do
rdd2 = rdd(workers).flat_map_values(func1)
result = []
hash_with_values.each do |(key, values)|
values = func1.call(values).flatten
values.each do |value|
result << [key, value]
end
end
expect(rdd2.collect).to eql(result)
rdd2 = rdd(workers).flat_map_values(func2)
result = []
hash_with_values.each do |(key, values)|
values = func2.call(values).flatten
values.each do |value|
result << [key, value]
end
end
expect(rdd2.collect).to eql(result)
end
end
RSpec.describe 'Spark::RDD' do
let(:func1) { lambda{|x| x*2} }
let(:func2) { lambda{|x| [x*3, 1, 1]} }
let(:func3) { lambda{|x| [x*4, 2, 2]} }
context 'through parallelize' do
context '.flat_map' do
let(:numbers) { Generator.numbers_with_zero }
def rdd(workers)
$sc.parallelize(numbers, workers)
end
it_behaves_like 'a flat mapping', 1
it_behaves_like 'a flat mapping', 2
# it_behaves_like 'a flat mapping', nil
# it_behaves_like 'a flat mapping', rand(2..10)
end
context '.flat_map_values' do
let(:func1) { lambda{|x| x*2} }
let(:func2) { lambda{|x| [x.first]} }
let(:hash_with_values) { Generator.hash_with_values }
def rdd(workers)
$sc.parallelize(hash_with_values, workers)
end
it_behaves_like 'a flat mapping values', 1
it_behaves_like 'a flat mapping values', 2
# it_behaves_like 'a flat mapping values', nil
# it_behaves_like 'a flat mapping values', rand(2..10)
end
end
context 'through text_file' do
context '.flat_map' do
let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
let(:numbers) { File.readlines(file).map(&:strip) }
def rdd(workers)
$sc.text_file(file, workers)
end
it_behaves_like 'a flat mapping', 1
it_behaves_like 'a flat mapping', 2
# it_behaves_like 'a flat mapping', nil
# it_behaves_like 'a flat mapping', rand(2..10)
end
end
end
================================================
FILE: spec/lib/group_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'a grouping by key' do |workers|
it "with #{workers || 'default'} worker" do
expect(rdd_result(workers)).to eql(result)
end
end
RSpec.shared_examples 'a cogrouping by key' do |workers|
context "with #{workers || 'default'} worker" do
it '.group_with' do
rdd = rdd_1(workers).group_with(rdd_2(workers))
expect(rdd.collect_as_hash).to eql(result_12)
end
it '.cogroup' do
rdd = rdd_1(workers).cogroup(rdd_2(workers), rdd_3(workers))
expect(rdd.collect_as_hash).to eql(result_123)
end
end
end
RSpec.shared_examples 'a grouping by' do |workers|
it "with #{workers || 'default'} worker" do
rdd = rdd_numbers(workers)
rdd = rdd.group_by(key_function1)
expect(rdd.collect_as_hash).to eql(numbers.group_by(&key_function1))
rdd = rdd_words(workers)
rdd = rdd.group_by(key_function2)
expect(rdd.collect_as_hash).to eql(words.group_by(&key_function2))
end
end
RSpec.describe 'Spark::RDD' do
def make_result(*hashes)
_result = {}
hashes.each do |data|
data.each do |key, value|
_result[key] ||= []
_result[key] << value
end
end
_result
end
context '.group_by_key' do
let(:hash) { Generator.hash }
let(:result) { make_result(hash) }
def rdd_result(workers)
rdd = $sc.parallelize(hash)
rdd.group_by_key.collect_as_hash
end
it_behaves_like 'a grouping by key', 1
it_behaves_like 'a grouping by key', 2
# it_behaves_like 'a grouping by key', nil
# it_behaves_like 'a grouping by key', rand(2..10)
end
context 'cogroup' do
let(:hash1) { Generator.hash }
let(:hash2) { Generator.hash }
let(:hash3) { Generator.hash }
let(:result_12) { make_result(hash1, hash2) }
let(:result_123) { make_result(hash1, hash2, hash3) }
def rdd_1(workers)
$sc.parallelize(hash1)
end
def rdd_2(workers)
$sc.parallelize(hash2)
end
def rdd_3(workers)
$sc.parallelize(hash3)
end
it_behaves_like 'a cogrouping by key', 1
it_behaves_like 'a cogrouping by key', 2
# it_behaves_like 'a cogrouping by key', nil
# it_behaves_like 'a cogrouping by key', rand(2..10)
end
context 'group_by' do
let(:key_function1) { lambda{|x| x%2} }
let(:key_function2) { lambda{|x| x.size} }
let(:numbers) { Generator.numbers }
let(:words) { Generator.words }
def rdd_numbers(workers)
$sc.parallelize(numbers)
end
def rdd_words(workers)
$sc.parallelize(words)
end
it_behaves_like 'a grouping by', 1
it_behaves_like 'a grouping by', 2
# it_behaves_like 'a grouping by', nil
# it_behaves_like 'a grouping by', rand(2..10)
end
end
================================================
FILE: spec/lib/helper_spec.rb
================================================
require 'spec_helper'
RSpec.configure do |c|
c.include Spark::Helper::Parser
c.include Spark::Helper::Statistic
end
RSpec.describe Spark::Helper do
it 'memory size' do
expect(to_memory_size('512mb')).to eql(524288.0)
expect(to_memory_size('1586 mb')).to eql(1624064.0)
expect(to_memory_size('3 MB')).to eql(3072.0)
expect(to_memory_size('9gb')).to eql(9437184.0)
expect(to_memory_size('9gb', 'mb')).to eql(9216.0)
expect(to_memory_size('9mb', 'gb')).to eql(0.01)
expect(to_memory_size('6652548796kb', 'mb')).to eql(6496629.68)
end
context 'statistic' do
it 'compute_fraction' do
expect(compute_fraction(1, 1000, true)).to be_within(0.001).of(0.013)
expect(compute_fraction(2, 1000, true)).to be_within(0.001).of(0.018)
expect(compute_fraction(3, 1000, true)).to be_within(0.001).of(0.023)
expect(compute_fraction(4, 1000, true)).to be_within(0.001).of(0.028)
expect(compute_fraction(5, 1000, true)).to be_within(0.001).of(0.031)
expect(compute_fraction(1, 1000, false)).to be_within(0.001).of(0.0249)
expect(compute_fraction(2, 1000, false)).to be_within(0.001).of(0.0268)
expect(compute_fraction(3, 1000, false)).to be_within(0.001).of(0.0287)
expect(compute_fraction(4, 1000, false)).to be_within(0.001).of(0.0305)
expect(compute_fraction(5, 1000, false)).to be_within(0.001).of(0.0322)
end
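# bisect_right returns the index at which the value would be inserted to keep
# the array sorted, placing it to the right of any equal elements; the third
# argument appears to act as a lower bound for the search.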
it 'bisect_right' do
data = [10, 20, 30, 40, 50, 60, 70, 80, 90]
expect(bisect_right(data, 0)).to eq(0)
expect(bisect_right(data, 1)).to eq(0)
expect(bisect_right(data, 1, 2)).to eq(2)
expect(bisect_right(data, 1, 3)).to eq(3)
expect(bisect_right(data, 1, 4)).to eq(4)
expect(bisect_right(data, 9)).to eq(0)
expect(bisect_right(data, 10)).to eq(1)
expect(bisect_right(data, 40)).to eq(4)
expect(bisect_right(data, 42)).to eq(4)
expect(bisect_right(data, 72)).to eq(7)
expect(bisect_right(data, 80, 4)).to eq(8)
expect(bisect_right(data, 80, 5)).to eq(8)
expect(bisect_right(data, 80, 8)).to eq(8)
expect(bisect_right(data, 80, 9)).to eq(9)
expect(bisect_right(data, 200)).to eq(9)
end
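# determine_bounds picks (partitions - 1) evenly spaced split points from the
# sorted sample; fewer than two partitions yields no bounds, and asking for
# more partitions than there are elements returns the whole sample.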
it 'determine_bounds' do
data = [10, 20, 30, 40, 50, 60, 70, 80, 90]
expect(determine_bounds(data, 0)).to eq([])
expect(determine_bounds(data, 1)).to eq([])
expect(determine_bounds(data, 2)).to eq([50])
expect(determine_bounds(data, 3)).to eq([40, 70])
expect(determine_bounds(data, 4)).to eq([30, 50, 70])
expect(determine_bounds(data, 20)).to eq(data)
end
end
end
================================================
FILE: spec/lib/key_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'a keying by' do |workers|
it "with #{workers || 'default'} worker" do
rdd = rdd_numbers(workers)
rdd = rdd.key_by(key_function1)
result = numbers.map{|item| [key_function1.call(item), item]}
expect(rdd.collect).to eql(result)
rdd = rdd_words(workers)
rdd = rdd.key_by(key_function2)
result = words.map{|item| [key_function2.call(item), item]}
expect(rdd.collect).to eql(result)
end
end
RSpec.describe 'Spark::RDD' do
context 'key_by' do
let(:key_function1) { lambda{|x| x.even?} }
let(:key_function2) { lambda{|x| x.include?('a')} }
let(:numbers) { Generator.numbers }
let(:words) { Generator.words }
def rdd_numbers(workers)
$sc.parallelize(numbers)
end
def rdd_words(workers)
$sc.parallelize(words)
end
it_behaves_like 'a keying by', 1
it_behaves_like 'a keying by', 2
# it_behaves_like 'a keying by', nil
# it_behaves_like 'a keying by', rand(2..10)
end
it 'lookup' do
numbers = Generator.numbers
rdd_numbers = $sc.parallelize(numbers, 2)
rdd = rdd_numbers.group_by(lambda {|x| x%3})
rdd.lookup(2)
expect(rdd.lookup(2).first).to eq(
numbers.group_by{|x| x%3}[2]
)
rdd = rdd_numbers.key_by(lambda{|x| x.even?})
expect(rdd.lookup(true)).to eq(
numbers.select(&:even?)
)
end
end
================================================
FILE: spec/lib/manipulation_spec.rb
================================================
require 'spec_helper'
RSpec.describe 'Spark::RDD' do
let(:numbers) { 1..100 }
let(:rand_numbers) { Generator.numbers }
it '.glom' do
rdd = $sc.parallelize(numbers, 1).glom
expect(rdd.collect).to eql([numbers.to_a])
ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
rdd = $sc.parallelize(numbers, 5, ser).glom
expect(rdd.collect).to eql(numbers.each_slice(20).to_a)
end
it '.coalesce' do
rdd = $sc.parallelize(numbers, 5)
rdd2 = rdd.glom
expect(rdd2.collect.size).to eql(5)
rdd3 = rdd.coalesce(4).glom
expect(rdd3.collect.size).to eql(4)
end
it '.distinct' do
rdd = $sc.parallelize(rand_numbers, 5)
rdd = rdd.distinct
expect(rdd.collect.sort).to eql(rand_numbers.uniq.sort)
rdd = $sc.parallelize(numbers, 5)
rdd = rdd.map(lambda{|x| 1})
rdd = rdd.distinct
expect(rdd.collect).to eql([1])
end
context '.union' do
it 'classic method' do
rdd = $sc.parallelize(numbers, 5)
rdd = rdd.union(rdd)
expect(rdd.collect.sort).to eql((numbers.to_a+numbers.to_a).sort)
end
it 'with a different serializer' do
rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) })
rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) })
expect { rdd1.union(rdd2).collect }.to_not raise_error
end
it 'as operator' do
rdd1 = $sc.parallelize(numbers)
rdd2 = $sc.parallelize(rand_numbers)
expect((rdd1+rdd2).sum).to eql((numbers.to_a+rand_numbers).reduce(:+))
end
end
it '.compact' do
data = [nil, nil , 0, 0, 1, 2, nil, 6]
result = data.compact
ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
rdd = $sc.parallelize(data, 1).compact
expect(rdd.collect).to eql(result)
rdd = $sc.parallelize(data, 5, ser).compact
expect(rdd.collect).to eql(result)
rdd = $sc.parallelize(data, 1, ser).compact
expect(rdd.collect).to eql(result)
end
it '.intersection' do
data1 = [0,1,2,3,4,5,6,7,8,9,10]
data2 = [5,6,7,8,9,10,11,12,13,14,15]
rdd1 = $sc.parallelize(data1)
rdd2 = $sc.parallelize(data2)
expect(rdd1.intersection(rdd2).collect.sort).to eql(data1 & data2)
end
it '.shuffle' do
data = Generator.numbers
rdd = $sc.parallelize(data)
expect(rdd.shuffle.collect).to_not eql(data)
end
context '.cartesian' do
let(:data1) { Generator.numbers(100) }
let(:data2) { Generator.numbers(100) }
let(:result) { data1.product(data2).map(&:to_s).sort }
it 'unbatched' do
ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
rdd1 = $sc.parallelize(data1, 2, ser)
rdd2 = $sc.parallelize(data2, 2, ser)
rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})
expect(rdd.collect.sort).to eql(result)
end
it 'batched' do
ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
rdd1 = $sc.parallelize(data1, 2, ser1)
rdd2 = $sc.parallelize(data2, 2, ser2)
rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})
expect(rdd.collect.sort).to eql(result)
end
end
end
================================================
FILE: spec/lib/map_partitions_spec.rb
================================================
require 'spec_helper'
def func3(x)
x.map(&:to_i).reduce(:+)
end
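# func4_with_index returns a one-element array holding a hash that maps the
# partition index to the product of the partition's items.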
def func4_with_index(data, index)
[{
index => data.map(&:to_i).reduce(:*)
}]
end
RSpec.shared_examples 'a map partitions' do |workers|
context "with #{workers || 'default'} worker" do
it 'without index' do
rdd2 = rdd(workers).map_partitions(func1)
result = func1.call(numbers)
expect(func1.call(rdd2.collect)).to eql(result)
rdd3 = rdd(workers)
rdd3 = rdd3.map_partitions(func1)
rdd3 = rdd3.map_partitions(func2)
rdd3 = rdd3.map_partitions(method(:func3))
result = func3(func2.call(func1.call(numbers)))
# func3 collapses each partition to a single value, so the result size depends
# on the number of workers; just check that something was produced
expect(rdd3.collect.size).to be >= 1
rdd4 = rdd(workers)
rdd4 = rdd4.map_partitions(func1)
rdd4 = rdd4.map_partitions(func2)
rdd4 = rdd4.map_partitions(method(:func3))
expect(rdd4.collect).to eql(rdd3.collect)
end
it 'with index' do
rdd2 = rdd(workers).map_partitions_with_index(method(:func4_with_index))
result = rdd2.collect
expect(result).to be_a(Array)
result.each do |x|
expect(x).to be_a(Hash)
end
# The products include 0 (the first partition contains the value 0);
# some other values are 0 as well because of batched serialization
expect(result.map(&:values).flatten.compact.uniq.first).to eql(0)
end
end
end
RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do
let(:func1) { lambda{|x| x.map(&:to_i)} }
let(:func2) {
lambda{|x|
x.map{|y| y*2}
}
}
context 'through parallelize' do
let(:numbers) { 0..1000 }
def rdd(workers)
$sc.parallelize(numbers, workers)
end
it_behaves_like 'a map partitions', 1
it_behaves_like 'a map partitions', 2
# it_behaves_like 'a map partitions', nil
# it_behaves_like 'a map partitions', rand(2..10)
end
context 'through text_file' do
let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
let(:numbers) { File.readlines(file).map(&:strip) }
def rdd(workers)
$sc.text_file(file, workers)
end
it_behaves_like 'a map partitions', 1
it_behaves_like 'a map partitions', 2
# it_behaves_like 'a map partitions', nil
# it_behaves_like 'a map partitions', rand(2..10)
end
end
================================================
FILE: spec/lib/map_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'a mapping' do |workers|
it "with #{workers || 'default'} worker" do
rdd2 = rdd(workers).map(func1)
result = numbers.map(&func1)
expect(rdd2.collect).to eql(result)
rdd3 = rdd(workers)
rdd3 = rdd3.map(func1)
rdd3 = rdd3.map(func2)
rdd3 = rdd3.map(func3)
result = numbers.map(&func1).map(&func2).map(&func3)
expect(rdd3.collect).to eql(result)
rdd4 = rdd(workers)
rdd4 = rdd4.map(func3)
rdd4 = rdd4.map(func2)
rdd4 = rdd4.map(func1)
expect(rdd4.collect).to eql(rdd3.collect)
end
end
RSpec.shared_examples 'a mapping values' do |workers|
it "with #{workers || 'default'} worker" do
rdd2 = rdd(workers).map_values(func1)
result = hash.map{|key, value| [key, func1.call(value)]}
expect(rdd2.collect).to eql(result)
rdd3 = rdd(workers)
rdd3 = rdd3.map_values(func1)
rdd3 = rdd3.map_values(func2)
rdd3 = rdd3.map_values(func3)
result = hash.map{|key, value| [key, func1.call(value)]}
.map{|key, value| [key, func2.call(value)]}
.map{|key, value| [key, func3.call(value)]}
expect(rdd3.collect).to eql(result)
end
end
RSpec.describe 'Spark::RDD' do
let(:func1) { lambda{|x| x*2} }
let(:func2) { lambda{|x| x*3} }
let(:func3) { lambda{|x| x*4} }
context 'through parallelize' do
context '.map' do
let(:numbers) { Generator.numbers }
def rdd(workers)
$sc.parallelize(numbers, workers)
end
it_behaves_like 'a mapping', 1
it_behaves_like 'a mapping', 2
# it_behaves_like 'a mapping', nil
# it_behaves_like 'a mapping', rand(2..10)
end
context '.map_values' do
let!(:hash) { Generator.hash }
def rdd(workers)
$sc.parallelize(hash, workers)
end
it_behaves_like 'a mapping values', 1
it_behaves_like 'a mapping values', 2
# it_behaves_like 'a mapping values', nil
# it_behaves_like 'a mapping values', rand(2..10)
end
end
context 'through text_file' do
context '.map' do
let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
let(:numbers) { File.readlines(file).map(&:strip) }
def rdd(workers)
$sc.text_file(file, workers)
end
it_behaves_like 'a mapping', 1
it_behaves_like 'a mapping', 2
# it_behaves_like 'a mapping', nil
# it_behaves_like 'a mapping', rand(2..10)
end
end
end
================================================
FILE: spec/lib/mllib/classification_spec.rb
================================================
require 'spec_helper'
RSpec.describe 'Spark::Mllib classification' do
let(:data1) do
[
LabeledPoint.new(0.0, [1, 0, 0]),
LabeledPoint.new(1.0, [0, 1, 1]),
LabeledPoint.new(0.0, [2, 0, 0]),
LabeledPoint.new(1.0, [0, 2, 1])
]
end
let(:values1) do
data1.map do |lp|
lp.features.values
end
end
let(:rdd1) { $sc.parallelize(data1) }
context 'logistic regression' do
it 'test' do
lrm = LogisticRegressionWithSGD.train(rdd1)
expect(lrm.predict(values1[0])).to be <= 0
expect(lrm.predict(values1[1])).to be > 0
expect(lrm.predict(values1[2])).to be <= 0
expect(lrm.predict(values1[3])).to be > 0
end
end
context 'svm' do
it 'test' do
lrm = SVMWithSGD.train(rdd1)
expect(lrm.predict(values1[0])).to be <= 0
expect(lrm.predict(values1[1])).to be > 0
expect(lrm.predict(values1[2])).to be <= 0
expect(lrm.predict(values1[3])).to be > 0
end
end
context 'naive bayes' do
it 'test' do
lrm = NaiveBayes.train(rdd1)
expect(lrm.predict(values1[0])).to be <= 0
expect(lrm.predict(values1[1])).to be > 0
expect(lrm.predict(values1[2])).to be <= 0
expect(lrm.predict(values1[3])).to be > 0
end
end
end
================================================
FILE: spec/lib/mllib/clustering_spec.rb
================================================
require 'spec_helper'
RSpec.describe 'Spark::Mllib clustering' do
context 'kmeans' do
it 'test' do
data = [
DenseVector.new([0, 1.1]),
DenseVector.new([0, 1.2]),
DenseVector.new([1.1, 0]),
DenseVector.new([1.2, 0])
]
model = KMeans.train($sc.parallelize(data), 2, initialization_mode: 'k-means||')
expect(model.predict(data[0])).to eq(model.predict(data[1]))
expect(model.predict(data[2])).to eq(model.predict(data[3]))
end
it 'deterministic' do
data = Array.new(10) do |i|
i *= 10
DenseVector.new([i, i])
end
clusters1 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42)
clusters2 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42)
centers1 = clusters1.centers.to_a
centers2 = clusters2.centers.to_a
centers1.zip(centers2).each do |c1, c2|
expect(c1).to eq(c2)
end
end
end
end
================================================
FILE: spec/lib/mllib/matrix_spec.rb
================================================
require 'spec_helper'
RSpec.describe 'Spark::Mllib::Matrix' do
context 'dense' do
it 'construct' do
values = [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]
matrix = DenseMatrix.new(3, 3, [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]])
expect(matrix.shape).to eq([3, 3])
expect(matrix.values).to eq([[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]])
end
end
context 'sparse' do
it 'construct' do
values = [1.0, 2.0, 4.0, 5.0]
column_pointers = [0, 2, 2, 4, 4]
row_indices = [1, 2, 1, 2]
matrix = SparseMatrix.new(3, 4, column_pointers, row_indices, values)
expect(matrix.shape).to eq([3, 4])
expect(matrix.to_a).to eq(
[
[0.0, 0.0, 0.0, 0.0],
[1.0, 0.0, 4.0, 0.0],
[2.0, 0.0, 5.0, 0.0]
]
)
end
end
end
================================================
FILE: spec/lib/mllib/regression_spec.rb
================================================
require 'spec_helper'
# MLlib functions are tested on the Spark side.
# These specs just check that Ruby calls the proper methods.
RSpec.describe 'Spark::Mllib regression' do
let(:data1) do
[
LabeledPoint.new(-1.0, [0, -1]),
LabeledPoint.new(1.0, [0, 1]),
LabeledPoint.new(-1.0, [0, -2]),
LabeledPoint.new(1.0, [0, 2])
]
end
let(:values1) do
data1.map do |lp|
lp.features.values
end
end
let(:rdd1) { $sc.parallelize(data1) }
context 'labeled point' do
let(:lp) { LabeledPoint.new(1, [1,2,3]) }
it 'from array' do
expect(lp.label).to eql(1.0)
expect(lp.features).to be_a(DenseVector)
end
it 'serialize' do
lp2 = Marshal.load(Marshal.dump(lp))
expect(lp2.label).to eql(lp.label)
expect(lp2.features.values).to eql(lp.features.values)
end
end
context 'linear regression' do
context 'test' do
let(:lrm) { LinearRegressionWithSGD.train(rdd1) }
it 'test' do
expect(lrm.predict(values1[0])).to be <= 0
expect(lrm.predict(values1[1])).to be > 0
expect(lrm.predict(values1[2])).to be <= 0
expect(lrm.predict(values1[3])).to be > 0
end
it 'test via rdd' do
rdd = $sc.parallelize(values1, 1)
rdd = rdd.map(lambda{|value| model.predict(value)})
rdd = rdd.bind(model: lrm)
result = rdd.collect
expect(result[0]).to be <= 0
expect(result[1]).to be > 0
expect(result[2]).to be <= 0
expect(result[3]).to be > 0
end
end
# Y = 3 + 10*X1 + 10*X2
it 'linear regression' do
data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 3.0, ['10.0', '10.0'], 100, 42, 0.1)
rdd = $sc.parallelize(data)
lrm = LinearRegressionWithSGD.train(rdd, iterations: 1000, intercept: true, step: 1.0)
expect(lrm.intercept).to be_between(2.5, 3.5)
expect(lrm.weights.size).to eq(2)
expect(lrm.weights[0]).to be_between(9.0, 11.0)
expect(lrm.weights[1]).to be_between(9.0, 11.0)
end
end
context 'lasso' do
it 'test' do
lrm = LassoWithSGD.train(rdd1)
expect(lrm.predict(values1[0])).to be <= 0
expect(lrm.predict(values1[1])).to be > 0
expect(lrm.predict(values1[2])).to be <= 0
expect(lrm.predict(values1[3])).to be > 0
end
it 'local random SGD with initial weights' do
data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 2.0, ['-1.5', '0.01'], 1000, 42, 0.1)
data.map! do |lp|
LabeledPoint.new(lp.label, [1.0] + lp.features.values)
end
rdd = $sc.parallelize(data);
lrm = LassoWithSGD.train(rdd, step: 1.0, reg_param: 0.01, iterations: 40, initial_weights: [-1.0, -1.0, -1.0])
expect(lrm.weights[0]).to be_between(1.9, 2.1)
expect(lrm.weights[1]).to be_between(-1.60, -1.40)
expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2)
end
end
context 'ridge' do
it 'test' do
lrm = RidgeRegressionWithSGD.train(rdd1)
expect(lrm.predict(values1[0])).to be <= 0
expect(lrm.predict(values1[1])).to be > 0
expect(lrm.predict(values1[2])).to be <= 0
expect(lrm.predict(values1[3])).to be > 0
end
end
end
================================================
FILE: spec/lib/mllib/vector_spec.rb
================================================
require 'spec_helper'
RSpec.describe 'Spark::Mllib::Vector' do
context 'parsing' do
it 'dense vector' do
dv = DenseVector.new([1.0, 2.0, 3.0, 4.0, 5.0])
dv2 = DenseVector.parse(dv.to_s)
dv3 = Vectors.parse(dv.to_s)
expect(dv.to_s).to eq("[1.0,2.0,3.0,4.0,5.0]")
expect(dv2.values).to eq(dv.values)
expect(dv3.values).to eq(dv.values)
end
it 'sparse vector' do
sv = SparseVector.new(5, {1 => 3, 4 => 5})
sv2 = SparseVector.parse(sv.to_s)
sv3 = Vectors.parse(sv.to_s)
expect(sv.to_s).to eq("(5,[1,4],[3,5])")
expect(sv2.size).to eq(sv.size)
expect(sv2.indices).to eq(sv.indices)
expect(sv2.values).to eq(sv.values)
expect(sv3.size).to eq(sv.size)
expect(sv3.indices).to eq(sv.indices)
expect(sv3.values).to eq(sv.values)
end
end
it 'dot' do
sv = SparseVector.new(4, {1 => 1, 3 => 2})
dv = DenseVector.new([1.0, 2.0, 3.0, 4.0])
lst = DenseVector.new([1, 2, 3, 4])
expect(sv.dot(dv)).to eq(10.0)
expect(dv.dot(dv)).to eq(30.0)
expect(lst.dot(dv)).to eq(30.0)
end
it 'squared distance' do
sv = SparseVector.new(4, {1 => 1, 3 => 2})
dv = DenseVector.new([1.0, 2.0, 3.0, 4.0])
lst = DenseVector.new([4, 3, 2, 1])
expect(sv.squared_distance(dv)).to eq(15)
expect(sv.squared_distance(lst)).to eq(25)
expect(dv.squared_distance(lst)).to eq(20)
expect(dv.squared_distance(sv)).to eq(15)
expect(lst.squared_distance(sv)).to eq(25)
expect(lst.squared_distance(dv)).to eq(20)
expect(sv.squared_distance(sv)).to eq(0)
expect(dv.squared_distance(dv)).to eq(0)
expect(lst.squared_distance(lst)).to eq(0)
end
it 'sparse vector indexing' do
sv1 = SparseVector.new(4, {1 => 1, 3 => 2})
sv2 = SparseVector.new(4, [1, 3], [1, 2])
expect(sv1[0]).to eq(0)
expect(sv1[3]).to eq(2)
expect(sv1[1]).to eq(1)
expect(sv1[2]).to eq(0)
expect(sv1[-1]).to eq(2)
expect(sv1[-2]).to eq(0)
expect(sv1[-4]).to eq(0)
expect(sv2[0]).to eq(0)
expect(sv2[3]).to eq(2)
expect(sv2[1]).to eq(1)
expect(sv2[2]).to eq(0)
expect(sv2[-1]).to eq(2)
expect(sv2[-2]).to eq(0)
expect(sv2[-4]).to eq(0)
end
end
================================================
FILE: spec/lib/reduce_by_key_spec.rb
================================================
require 'spec_helper'
def flat_map(line)
line.split
end
def map(item)
[item, 1]
end
def reduce(x,y)
x+y
end
RSpec.shared_examples 'a words counting' do |workers|
context "with #{workers || 'default'} worker" do
let(:result) do
keyed = lines.flat_map{|x| x.split}.map{|x| [x,1]}
result = keyed.reduce({}){|memo, item|
key = item[0]
value = item[1]
memo[key] ||= 0
memo[key] += value
memo
}
result
end
it 'when lambda' do
rdd2 = rdd(workers)
rdd2 = rdd2.flat_map(lambda{|line| line.split})
rdd2 = rdd2.map(lambda{|word| [word, 1]})
rdd2 = rdd2.reduce_by_key(lambda{|x,y| x+y})
expect(rdd2.collect_as_hash).to eql(result)
end
it 'when method' do
rdd2 = rdd(workers)
rdd2 = rdd2.flat_map(method(:flat_map))
rdd2 = rdd2.map(method(:map))
rdd2 = rdd2.reduce_by_key(method(:reduce))
expect(rdd2.collect_as_hash).to eql(result)
end
it 'keys, values' do
rdd2 = rdd(workers)
rdd2 = rdd2.flat_map(method(:flat_map))
rdd2 = rdd2.map(method(:map))
rdd2 = rdd2.reduce_by_key(method(:reduce))
expect(rdd2.keys.collect.sort).to eql(result.keys.sort)
expect { rdd2.values.collect.reduce(:+) }.to_not raise_error
end
end
end
RSpec.describe 'Spark::RDD' do
context '.reduce_by_key' do
context 'through parallelize' do
let(:lines) { Generator.lines }
def rdd(workers)
$sc.parallelize(lines, workers)
end
it_behaves_like 'a words counting', 2
# it_behaves_like 'a words counting', nil
# it_behaves_like 'a words counting', rand(2..10)
end
context 'through text_file' do
let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') }
let(:lines) { File.readlines(file).map(&:strip) }
def rdd(workers)
$sc.text_file(file, workers)
end
it_behaves_like 'a words counting', 2
# it_behaves_like 'a words counting', nil
# it_behaves_like 'a words counting', rand(2..10)
end
end
context '.fold_by_key' do
let(:numbers) { Generator.numbers }
let(:zero_value) { 0 }
let(:rdd) { $sc.parallelize(numbers) }
let(:map) { lambda{|x| [x, 1]} }
let(:add) { lambda{|x,y| x+y} }
let(:result) do
_result = {}
numbers.map(&map).each do |key, value|
_result[key] ||= zero_value
_result[key] = add.call(_result[key], value)
end
_result
end
def fold_by_key(num_partitions=nil)
rdd.map(map).fold_by_key(zero_value, add, num_partitions).collect_as_hash
end
it 'default num_partitions' do
expect(fold_by_key).to eq(result)
end
it 'random num_partitions' do
expect(
fold_by_key rand(1..10)
).to eq(result)
end
end
end
================================================
FILE: spec/lib/reduce_spec.rb
================================================
require 'spec_helper'
def longest_words(memo, word)
memo.length > word.length ? memo : word
end
RSpec.shared_examples 'a reducing' do |workers|
context "with #{workers || 'default'} worker" do
it '.reduce' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.map(to_i)
rdd2 = rdd2.reduce(func1)
result = numbers.map(&:to_i).reduce(&func1)
expect(rdd2).to eql(result)
rdd3 = rdd_numbers(workers)
rdd3 = rdd3.map(to_i)
rdd3 = rdd3.reduce(func2)
result = numbers.map(&:to_i).reduce(&func2)
expect(rdd3).to eql(result)
rdd4 = rdd_lines(workers)
rdd4 = rdd4.flat_map(split)
rdd4 = rdd4.reduce(method(:longest_words))
result = lines.flat_map(&split).reduce(&lambda(&method(:longest_words)))
expect(rdd4).to eql(result)
end
it '.fold' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.map(to_i)
rdd_result = rdd2.fold(1, func1)
# each partition's fold adds the zero value 1, and the final reducing phase adds 1 more
result = numbers.map(&:to_i).reduce(&func1) + rdd2.partitions_size + 1
expect(rdd_result).to eql(result)
end
it '.aggregate' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.map(to_i)
# Sum of items + their count
seq = lambda{|x,y| [x[0] + y, x[1] + 1]}
com = lambda{|x,y| [x[0] + y[0], x[1] + y[1]]}
rdd_result = rdd2.aggregate([0,0], seq, com)
result = [numbers.reduce(:+), numbers.size]
expect(rdd_result).to eql(result)
end
it '.max' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.map(to_i)
expect(rdd2.max).to eql(numbers.map(&:to_i).max)
end
it '.min' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.map(to_i)
expect(rdd2.min).to eql(numbers.map(&:to_i).min)
end
it '.sum' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.map(to_i)
expect(rdd2.sum).to eql(numbers.map(&:to_i).reduce(:+))
end
it '.count' do
rdd2 = rdd_numbers(workers)
rdd2 = rdd2.map(to_i)
expect(rdd2.count).to eql(numbers.size)
end
end
end
RSpec.describe 'Spark::RDD' do
let(:func1) { lambda{|sum, x| sum+x} }
let(:func2) { lambda{|product, x| product*x} }
let(:to_i) { lambda{|item| item.to_i} }
let(:split) { lambda{|item| item.split} }
context 'through parallelize' do
let(:numbers) { Generator.numbers }
let(:lines) { Generator.lines }
def rdd_numbers(workers)
$sc.parallelize(numbers, workers)
end
def rdd_lines(workers)
$sc.parallelize(lines, workers)
end
it_behaves_like 'a reducing', 1
it_behaves_like 'a reducing', 2
# it_behaves_like 'a reducing', nil
# it_behaves_like 'a reducing', rand(2..10)
end
context 'through text_file' do
let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
let(:file_lines) { File.join('spec', 'inputs', 'lorem_300.txt') }
let(:numbers) { File.readlines(file).map(&:strip).map(&:to_i) }
let(:lines) { File.readlines(file_lines).map(&:strip) }
def rdd_numbers(workers)
$sc.text_file(file, workers)
end
def rdd_lines(workers)
$sc.text_file(file_lines, workers)
end
it_behaves_like 'a reducing', 1
it_behaves_like 'a reducing', 2
# it_behaves_like 'a reducing', nil
# it_behaves_like 'a reducing', rand(2..10)
end
end
================================================
FILE: spec/lib/sample_spec.rb
================================================
require 'spec_helper'
# The sample method cannot be tested deterministically because of the random generator,
# so just check that it does not raise an error.
RSpec.shared_examples 'a sampler' do |workers|
context "with #{workers || 'default'} worker" do
context '.sample' do
it 'with replacement' do
rdd2 = rdd(workers).sample(true, rand)
expect { rdd2.collect }.to_not raise_error
end
it 'without replacement' do
rdd2 = rdd(workers).sample(false, rand)
expect { rdd2.collect }.to_not raise_error
end
end
context '.take_sample' do
it 'with replacement' do
size = rand(10..999)
expect(rdd(workers).take_sample(true, size).size).to eql(size)
end
it 'without replacement' do
size = rand(10..999)
expect(rdd(workers).take_sample(false, size).size).to eql(size)
end
end
end
end
RSpec.describe 'Spark::RDD' do
let(:numbers) { Generator.numbers(1000) }
def rdd(workers)
$sc.parallelize(numbers, workers)
end
it_behaves_like 'a sampler', 1
it_behaves_like 'a sampler', 2
# it_behaves_like 'a sampler', nil
# it_behaves_like 'a sampler', rand(2..10)
end
================================================
FILE: spec/lib/serializer_spec.rb
================================================
require 'spec_helper'
require 'zlib'
RSpec.describe Spark::Serializer do
let(:data) { [1, 'test', 2.0, [3], {key: 'value'}, :test, String] }
it 'find' do
expect(described_class.find('not_existed_class')).to eql(nil)
expect(described_class.find('Marshal')).to eq(described_class::Marshal)
expect(described_class.find('marshal')).to eq(described_class::Marshal)
expect(described_class.find(:marshal)).to eq(described_class::Marshal)
expect(described_class.find('batched')).to eq(described_class::Batched)
end
it 'find!' do
expect { expect(described_class.find!('not_existed_class')) }.to raise_error(Spark::SerializeError)
expect { expect(described_class.find!('marshal')) }.to_not raise_error
expect { expect(described_class.find!('batched')) }.to_not raise_error
end
it 'register' do
NewSerializer = Class.new
expect(described_class.find('new_serializer_1')).to eql(nil)
expect(described_class.find('new_serializer_2')).to eql(nil)
expect(described_class.find('new_serializer_3')).to eql(nil)
described_class.register('new_serializer_1', 'new_serializer_2', 'new_serializer_3', NewSerializer)
expect(described_class.find('new_serializer_1')).to eql(NewSerializer)
expect(described_class.find('new_serializer_2')).to eql(NewSerializer)
expect(described_class.find('new_serializer_3')).to eql(NewSerializer)
end
it '==' do
# One class
marshal1 = described_class::Marshal.new
marshal2 = described_class::Marshal.new
expect(marshal1).to eq(marshal1)
expect(marshal1).to eq(marshal2)
# Two classes
compressed1 = described_class::Compressed.new(marshal1)
compressed2 = described_class::Compressed.new(marshal2)
expect(compressed1).to eq(compressed1)
expect(compressed1).to eq(compressed2)
# Three classes
batched1 = described_class::Batched.new(compressed1, 1)
batched2 = described_class::Batched.new(compressed2, 1)
batched3 = described_class::Batched.new(compressed1, 2)
expect(batched1).to eq(batched2)
expect(batched1).to_not eq(batched3)
end
context 'build' do
let(:marshal1) { described_class::Marshal.new }
let(:compressed1) { described_class::Compressed.new(marshal1) }
let(:batched1) { described_class::Batched.new(compressed1, 1) }
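# Serializer.build accepts either a DSL block or the equivalent expression as
# a string; names such as `marshal`, `compressed` and `batched` resolve to
# serializer classes, and the double-underscored aliases (`__marshal__`)
# behave the same, presumably to avoid clashes with local method names.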
it 'block' do
expect(described_class.build{ marshal }).to eq(marshal1)
expect(described_class.build{ marshal }).to eq(described_class.build{ __marshal__ })
expect(described_class.build{ compressed(marshal) }).to eq(compressed1)
expect(described_class.build{ batched(compressed(marshal), 1) }).to eq(batched1)
end
it 'text' do
expect(described_class.build('marshal')).to eq(marshal1)
expect(described_class.build('compressed(marshal)')).to eq(compressed1)
expect(described_class.build('batched(compressed(marshal), 1)')).to eq(batched1)
end
end
it 'serialization' do
marshal1 = described_class.build{ marshal }
compressed1 = described_class.build{ compressed(marshal) }
expect(marshal1.dump(data)).to eq(Marshal.dump(data))
expect(compressed1.dump(data)).to eq(
Zlib::Deflate.deflate(Marshal.dump(data))
)
end
context 'Auto batched' do
let(:klass) { Spark::Serializer::AutoBatched }
let(:marshal) { Spark::Serializer::Marshal.new }
let(:numbers) { Generator.numbers }
it 'initialize' do
expect { klass.new }.to raise_error(ArgumentError)
expect { klass.new(marshal) }.to_not raise_error
expect { klass.new(marshal, 1) }.to raise_error(Spark::SerializeError)
end
it 'serialization' do
serializer1 = klass.new(marshal)
serializer2 = klass.new(marshal, 2)
rdd1 = Spark.sc.parallelize(numbers, 2, serializer1)
rdd2 = Spark.sc.parallelize(numbers, 2, serializer2).map(:to_i)
result = rdd1.collect
expect(rdd1.serializer).to eq(serializer1)
expect(result).to eq(numbers)
expect(result).to eq(rdd2.collect)
end
end
end
================================================
FILE: spec/lib/sort_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'a sorting' do |workers|
it "with #{workers || 'default'} worker" do
rdd2 = rdd(workers)
rdd2 = rdd2.flat_map(split)
result = lines.flat_map(&split)
# Sort by self
rdd3 = rdd2.map(map).sort_by_key
result2 = result.map(&map).sort_by{|(key, _)| key}
expect(rdd3.collect).to eql(result2)
# Sort by len
rdd3 = rdd2.map(len_map).sort_by_key
result2 = result.map(&len_map).sort_by{|(key, _)| key}
expect(rdd3.collect).to eql(result2)
end
end
RSpec.describe 'Spark::RDD' do
let(:split) { lambda{|x| x.split} }
let(:map) { lambda{|x| [x.to_s, 1]} }
let(:len_map) { lambda{|x| [x.size, x]} }
context 'through parallelize' do
context '.map' do
let(:lines) { Generator.lines }
def rdd(workers)
$sc.parallelize(lines, workers)
end
it_behaves_like 'a sorting', 1
it_behaves_like 'a sorting', 2
# it_behaves_like 'a sorting', nil
# it_behaves_like 'a sorting', rand(2..10)
end
end
context 'through text_file' do
context '.map' do
let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') }
let(:lines) { File.readlines(file).map(&:strip) }
def rdd(workers)
$sc.text_file(file, workers)
end
it_behaves_like 'a sorting', 1
it_behaves_like 'a sorting', 2
# it_behaves_like 'a sorting', nil
# it_behaves_like 'a sorting', rand(2..10)
end
end
end
================================================
FILE: spec/lib/sql/column_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'binary comparison' do |op|
it "#{op}" do
to_test = 20
result = df.select('age').where( df.age.__send__(op, to_test) ).values.flatten
result.each do |item|
if op == '!='
expect(item).to_not eq(to_test)
else
expect(item).to be.__send__(op, to_test)
end
end
end
end
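# In the shared example above, `df.age.__send__(op, to_test)` builds the SQL
# predicate (e.g. `df.age < 20`) while `be.__send__(op, to_test)` builds the
# matching RSpec comparison matcher (e.g. `be < 20`); `!=` gets its own branch
# because there is no `be != n` matcher to send to.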
RSpec.describe Spark::SQL::Column do
let(:file) { File.join('spec', 'inputs', 'people.json') }
let(:df) { $sql.read.json(file) }
let(:data) do
# Data is line-delimited (one JSON object per line)
result = []
File.readlines(file).each do |line|
result << JSON.parse(line)
end
result
end
context 'operators' do
it 'func' do
result = df.select( df.id, df.active, ~df.id, !df.active ).collect_as_hash.map(&:values)
result.each do |item|
expect(item[0]).to eq(-item[2])
expect(item[1]).to eq(!item[3])
end
end
context 'binary' do
it 'arithmetic' do
result = df.select( df.id, df.id+1, df.id-1, df.id*2, df.id/2, df.id%2 ).collect_as_hash.map(&:values)
result.each do |item|
expect(item[1]).to eq(item[0]+1)
expect(item[2]).to eq(item[0]-1)
expect(item[3]).to eq(item[0]*2)
expect(item[4]).to eq(item[0]/2.0)
expect(item[5]).to eq(item[0]%2)
end
end
# comparison
it_behaves_like 'binary comparison', '=='
it_behaves_like 'binary comparison', '!='
it_behaves_like 'binary comparison', '<'
it_behaves_like 'binary comparison', '<='
it_behaves_like 'binary comparison', '>'
it_behaves_like 'binary comparison', '>='
it 'logical' do
result = df.select('id').where( (df.id >= 20) & (df.id <= 30) ).values.flatten
expect(result).to all( be_between(20, 30) )
result = df.select('id').where( (df.id == 1) | (df.id == 2) ).values.flatten
expect(result).to eq([1, 2])
end
it 'like' do
result = df.select('email').where( df.email.like('%com%') ).values.flatten
expect(result).to all( include('com') )
end
it 'null' do
result1 = df.select('address').where( df.address.is_null ).values.flatten
result2 = df.select('address').where( df.address.is_not_null ).values.flatten
expect(result1).to all( be_nil )
expect(result2).to all( be_a(String) )
end
end
end
it 'substr' do
result = df.select( df.name.substr(1, 3) ).values.flatten
result.each do |item|
expect(item.size).to eq(3)
end
end
it 'isin' do
result = df.select('age').where( df.age.isin(20, 21, 22) ).values.flatten
expect(result).to all( eq(20).or eq(21).or eq(22) )
end
it 'alias' do
result = df.select( df.id.as('id2') ).collect_as_hash.map(&:keys).flatten
expect(result).to all( eq('id2') )
end
it 'cast' do
result = df.select( df.id, df.id.cast('string').alias('age2') ).values
result.each do |item|
expect(item[0]).to be_an(Integer)
expect(item[0].to_s).to eq(item[1])
end
end
it 'when, otherwise' do
result = df.select(df.id, Spark::SQL::Column.when(df.id <= 20, 1).when(df.id >= 30, 3).otherwise(2)).values
result.each do |item|
id = item[0]
value = item[1]
if id <= 20
expect(value).to eq(1)
elsif id >= 30
expect(value).to eq(3)
else
expect(value).to eq(2)
end
end
end
end
================================================
FILE: spec/lib/sql/data_frame_spec.rb
================================================
require 'spec_helper'
RSpec.describe Spark::SQL::DataFrame do
let(:file) { File.join('spec', 'inputs', 'people.json') }
let(:df) { $sql.read.json(file) }
context '[]' do
it 'String' do
value = df['age']
expect(value).to be_a(Spark::SQL::Column)
expect(value.to_s).to eq('Column("age")')
end
it 'Array' do
value = df[ ['name', 'age'] ]
expect(value).to be_a(Spark::SQL::DataFrame)
expect(value.columns).to eq(['name', 'age'])
end
it 'Numeric' do
value = df[0]
expect(value).to be_a(Spark::SQL::Column)
expect(value.to_s).to eq('Column("active")')
end
it 'Column' do
value = df[ df[0] == true ]
expect(value).to be_a(Spark::SQL::DataFrame)
end
end
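# Summarising the indexing behaviour exercised above: a String or an Integer
# index returns a Column (by name or by position), an Array of names returns
# a DataFrame restricted to those columns, and a boolean Column expression
# returns the filtered DataFrame.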
it 'columns' do
expect(df.columns).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name'])
end
it 'schema' do
schema = df.schema
expect(schema).to be_a(Spark::SQL::StructType)
expect(schema.type_name).to eq('struct')
end
it 'show_string' do
expect(df.show_string).to start_with('+--')
end
it 'dtypes' do
expect(df.dtypes).to eq([['active', 'boolean'], ['address', 'string'], ['age', 'long'], ['email', 'string'], ['id', 'long'], ['ip_address', 'string'], ['name', 'string']])
end
it 'take' do
expect(df.take(10).size).to eq(10)
end
it 'count' do
expect(df.count).to eq(100)
end
context 'select' do
it '*' do
row = df.select('*').first
expect(row.data.keys).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name'])
end
it 'with string' do
row = df.select('name', 'age').first
expect(row.data.keys).to eq(['name', 'age'])
end
it 'with column' do
row = df.select(df.name, df.age).first
expect(row.data.keys).to eq(['name', 'age'])
end
end
context 'where' do
it 'with string' do
eq_20 = df.filter('age = 20').collect
expect(eq_20.map{|c| c['age']}).to all(be == 20)
end
it 'with column' do
nil_values = df.where(df.age.is_null).collect
greater_or_eq_20 = df.where(df.age >= 20).collect
lesser_than_20 = df.where(df.age < 20).collect
expect(nil_values.size + greater_or_eq_20.size + lesser_than_20.size).to eq(df.count)
expect(nil_values.map{|c| c['age']}).to all(be_nil)
expect(greater_or_eq_20.map{|c| c['age']}).to all(be >= 20)
expect(lesser_than_20.map{|c| c['age']}).to all(be < 20)
end
end
end
================================================
FILE: spec/lib/statistic_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'a stats' do |workers|
let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }
context "with #{workers || 'default'} worker" do
it 'stats class' do
stats = $sc.parallelize(numbers, workers).stats
expect(stats.sum).to be_within(0.1).of(20)
expect(stats.mean).to be_within(0.1).of(20/6.0)
expect(stats.max).to be_within(0.1).of(8.0)
expect(stats.min).to be_within(0.1).of(1.0)
expect(stats.variance).to be_within(0.1).of(6.22222)
expect(stats.sample_variance).to be_within(0.1).of(7.46667)
expect(stats.stdev).to be_within(0.1).of(2.49444)
expect(stats.sample_stdev).to be_within(0.1).of(2.73252)
end
it 'rdd methods' do
rdd = $sc.parallelize([1, 2, 3], workers)
expect(rdd.mean).to be_within(0.1).of(2.0)
expect(rdd.variance).to be_within(0.1).of(0.666)
expect(rdd.stdev).to be_within(0.1).of(0.816)
expect(rdd.sample_stdev).to be_within(0.1).of(1.0)
expect(rdd.sample_variance).to be_within(0.1).of(1.0)
end
end
end
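# For the numbers above ([1.0, 1.0, 2.0, 3.0, 5.0, 8.0]): sum = 20,
# mean = 20/6 ~ 3.333, population variance = sum((x - mean)^2) / 6 ~ 6.222
# (stdev ~ 2.494), while the sample variants divide by n - 1 = 5 instead,
# giving ~ 7.467 and ~ 2.733.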
RSpec.shared_examples 'a histogram' do |workers|
context "with #{workers || 'default'} worker" do
it 'empty' do
rdd = $sc.parallelize([], workers, ser)
expect( rdd.histogram([0, 10])[1] ).to eq([0])
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
end
it 'validation' do
rdd = $sc.parallelize([], workers, ser)
expect { rdd.histogram(0) }.to raise_error(ArgumentError)
end
it 'double' do
rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser)
buckets, counts = rdd.histogram(2)
expect(buckets).to eq([1.0, 2.5, 4.0])
expect(counts).to eq([2, 2])
end
it 'out of range' do
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
expect( rdd.histogram([0, 10])[1] ).to eq([0])
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
end
it 'in range with one bucket' do
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
expect( rdd.histogram([0, 10])[1] ).to eq([4])
expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1])
end
it 'in range with one bucket exact match' do
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
expect( rdd.histogram([1, 4])[1] ).to eq([4])
end
it 'out of range with two buckets' do
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0])
end
it 'out of range with two uneven buckets' do
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
end
it 'in range with two buckets' do
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
end
it 'in range with two buckets, nil and NaN' do
rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser)
expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
end
it 'in range with two uneven buckets' do
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2])
end
it 'mixed range with two uneven buckets' do
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser)
expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3])
end
it 'mixed range with four uneven buckets' do
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser)
expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
end
it 'mixed range with uneven buckets and NaN' do
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser)
expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
end
it 'out of range with infinite buckets' do
rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser)
expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1])
end
it 'without buckets' do
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
expect( rdd.histogram(1) ).to eq([[1, 4], [4]])
end
it 'without buckets single element' do
rdd = $sc.parallelize([1], workers, ser)
expect( rdd.histogram(1) ).to eq([[1, 1], [1]])
end
it 'without buckets, no range' do
rdd = $sc.parallelize([1, 1, 1, 1], workers, ser)
expect( rdd.histogram(1) ).to eq([[1, 1], [4]])
end
it 'without buckets basic two' do
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]])
end
it 'without buckets, more buckets requested than elements' do
rdd = $sc.parallelize([1, 2], workers, ser)
buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
expect( rdd.histogram(10) ).to eq([buckets, hist])
end
it 'string' do
rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser)
expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2])
expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]])
expect { rdd.histogram(2) }.to raise_error(Spark::RDDError)
end
end
end
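# Reading the bucket expectations above: an explicit bucket array such as
# [0, 4, 10] defines the ranges [0, 4) and [4, 10] (only the last bucket is
# closed on the right); values outside the range, as well as nil and NaN, are
# ignored; and an integer argument asks for that many evenly sized buckets
# between the RDD's minimum and maximum.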
RSpec.describe Spark::RDD do
let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } }
context '.stats' do
it_behaves_like 'a stats', 1
it_behaves_like 'a stats', 2
# it_behaves_like 'a stats', rand(2..5)
end
context '.histogram' do
it_behaves_like 'a histogram', 1
it_behaves_like 'a histogram', 2
# it_behaves_like 'a histogram', rand(2..5)
end
end
================================================
FILE: spec/lib/whole_text_files_spec.rb
================================================
require 'spec_helper'
RSpec.shared_examples 'a whole_text_files' do |workers|
it "with #{workers || 'default'} worker" do
rdd2 = rdd(workers).map(get_numbers)
result = files.size
expect(rdd2.collect.size).to eql(result)
rdd3 = rdd(workers)
rdd3 = rdd3.flat_map(get_numbers)
result = 0
files.each{|f| result += File.read(f).split.map(&:to_i).reduce(:+)}
expect(rdd3.sum).to eql(result)
end
end
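# Note: the `get_numbers` lambda below receives two arguments per record (the
# file path and the whole file content), which is the shape of records
# produced by whole_text_files.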
RSpec.describe 'Spark::Context' do
let(:get_numbers) { lambda{|file, content| content.split.map(&:to_i)} }
let(:dir) { File.join('spec', 'inputs', 'numbers') }
let(:files) { Dir.glob(File.join(dir, '*')) }
def rdd(workers)
$sc.whole_text_files(dir, workers)
end
it_behaves_like 'a whole_text_files', 1
it_behaves_like 'a whole_text_files', 2
# it_behaves_like 'a whole_text_files', nil
# it_behaves_like 'a whole_text_files', rand(2..10)
end
================================================
FILE: spec/spec_helper.rb
================================================
require 'simplecov'
SimpleCov.start
$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
require 'ruby-spark'
require 'generator'
# Load Spark and import the Java classes used by the specs
Spark.load_lib
Spark.jb.import_all_test
Spark::Mllib.import
# Keep this in a method because it's called from the config test
def spark_start
Spark.logger.disable
Spark.config do
set 'spark.ruby.serializer.batch_size', 100
end
$sc = Spark.start
$sql = Spark.start_sql
end
def windows?
RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
end
RSpec.configure do |config|
config.default_formatter = 'doc'
config.color = true
config.tty = true
config.before(:suite) do
spark_start
end
config.after(:suite) do
Spark.stop
end
end