Repository: ondra-m/ruby-spark Branch: master Commit: d1b9787642fe Files: 191 Total size: 440.0 KB Directory structure: gitextract_h83fh3m2/ ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── Guardfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── TODO.md ├── benchmark/ │ ├── aggregate.rb │ ├── bisect.rb │ ├── comparison/ │ │ ├── prepare.sh │ │ ├── python.py │ │ ├── r.r │ │ ├── ruby.rb │ │ ├── run-all.sh │ │ └── scala.scala │ ├── custom_marshal.rb │ ├── digest.rb │ ├── enumerator.rb │ ├── serializer.rb │ ├── sort.rb │ ├── sort2.rb │ └── take.rb ├── bin/ │ └── ruby-spark ├── example/ │ ├── pi.rb │ └── website_search.rb ├── ext/ │ ├── ruby_c/ │ │ ├── extconf.rb │ │ ├── murmur.c │ │ ├── murmur.h │ │ └── ruby-spark.c │ ├── ruby_java/ │ │ ├── Digest.java │ │ ├── Murmur2.java │ │ ├── RubySparkExtService.java │ │ └── extconf.rb │ └── spark/ │ ├── build.sbt │ ├── project/ │ │ └── plugins.sbt │ ├── sbt/ │ │ └── sbt │ └── src/ │ ├── main/ │ │ └── scala/ │ │ ├── Exec.scala │ │ ├── MLLibAPI.scala │ │ ├── Marshal.scala │ │ ├── MarshalDump.scala │ │ ├── MarshalLoad.scala │ │ ├── RubyAccumulatorParam.scala │ │ ├── RubyBroadcast.scala │ │ ├── RubyConstant.scala │ │ ├── RubyMLLibAPI.scala │ │ ├── RubyMLLibUtilAPI.scala │ │ ├── RubyPage.scala │ │ ├── RubyRDD.scala │ │ ├── RubySerializer.scala │ │ ├── RubyTab.scala │ │ ├── RubyUtils.scala │ │ └── RubyWorker.scala │ └── test/ │ └── scala/ │ └── MarshalSpec.scala ├── lib/ │ ├── ruby-spark.rb │ ├── spark/ │ │ ├── accumulator.rb │ │ ├── broadcast.rb │ │ ├── build.rb │ │ ├── cli.rb │ │ ├── command/ │ │ │ ├── base.rb │ │ │ ├── basic.rb │ │ │ ├── pair.rb │ │ │ ├── sort.rb │ │ │ └── statistic.rb │ │ ├── command.rb │ │ ├── command_builder.rb │ │ ├── command_validator.rb │ │ ├── config.rb │ │ ├── constant.rb │ │ ├── context.rb │ │ ├── error.rb │ │ ├── ext/ │ │ │ ├── hash.rb │ │ │ ├── integer.rb │ │ │ ├── io.rb │ │ │ ├── ip_socket.rb │ │ │ ├── module.rb │ │ │ ├── object.rb │ │ │ └── string.rb │ │ ├── helper/ │ │ │ ├── logger.rb │ │ │ ├── parser.rb │ │ │ ├── serialize.rb │ │ │ ├── statistic.rb │ │ │ └── system.rb │ │ ├── helper.rb │ │ ├── java_bridge/ │ │ │ ├── base.rb │ │ │ ├── jruby.rb │ │ │ └── rjb.rb │ │ ├── java_bridge.rb │ │ ├── library.rb │ │ ├── logger.rb │ │ ├── mllib/ │ │ │ ├── classification/ │ │ │ │ ├── common.rb │ │ │ │ ├── logistic_regression.rb │ │ │ │ ├── naive_bayes.rb │ │ │ │ └── svm.rb │ │ │ ├── clustering/ │ │ │ │ ├── gaussian_mixture.rb │ │ │ │ └── kmeans.rb │ │ │ ├── matrix.rb │ │ │ ├── regression/ │ │ │ │ ├── common.rb │ │ │ │ ├── labeled_point.rb │ │ │ │ ├── lasso.rb │ │ │ │ ├── linear.rb │ │ │ │ └── ridge.rb │ │ │ ├── ruby_matrix/ │ │ │ │ ├── matrix_adapter.rb │ │ │ │ └── vector_adapter.rb │ │ │ ├── stat/ │ │ │ │ └── distribution.rb │ │ │ └── vector.rb │ │ ├── mllib.rb │ │ ├── rdd.rb │ │ ├── sampler.rb │ │ ├── serializer/ │ │ │ ├── auto_batched.rb │ │ │ ├── base.rb │ │ │ ├── batched.rb │ │ │ ├── cartesian.rb │ │ │ ├── compressed.rb │ │ │ ├── marshal.rb │ │ │ ├── message_pack.rb │ │ │ ├── oj.rb │ │ │ ├── pair.rb │ │ │ └── text.rb │ │ ├── serializer.rb │ │ ├── sort.rb │ │ ├── sql/ │ │ │ ├── column.rb │ │ │ ├── context.rb │ │ │ ├── data_frame.rb │ │ │ ├── data_frame_reader.rb │ │ │ ├── data_type.rb │ │ │ └── row.rb │ │ ├── sql.rb │ │ ├── stat_counter.rb │ │ ├── storage_level.rb │ │ ├── version.rb │ │ └── worker/ │ │ ├── master.rb │ │ ├── spark_files.rb │ │ └── worker.rb │ └── spark.rb ├── ruby-spark.gemspec └── spec/ ├── generator.rb ├── inputs/ │ ├── lorem_300.txt │ ├── numbers/ │ │ ├── 1.txt │ │ ├── 10.txt │ │ ├── 11.txt │ │ ├── 
12.txt │ │ ├── 13.txt │ │ ├── 14.txt │ │ ├── 15.txt │ │ ├── 16.txt │ │ ├── 17.txt │ │ ├── 18.txt │ │ ├── 19.txt │ │ ├── 2.txt │ │ ├── 20.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ ├── numbers_0_100.txt │ ├── numbers_1_100.txt │ └── people.json ├── lib/ │ ├── collect_spec.rb │ ├── command_spec.rb │ ├── config_spec.rb │ ├── context_spec.rb │ ├── ext_spec.rb │ ├── external_apps_spec.rb │ ├── filter_spec.rb │ ├── flat_map_spec.rb │ ├── group_spec.rb │ ├── helper_spec.rb │ ├── key_spec.rb │ ├── manipulation_spec.rb │ ├── map_partitions_spec.rb │ ├── map_spec.rb │ ├── mllib/ │ │ ├── classification_spec.rb │ │ ├── clustering_spec.rb │ │ ├── matrix_spec.rb │ │ ├── regression_spec.rb │ │ └── vector_spec.rb │ ├── reduce_by_key_spec.rb │ ├── reduce_spec.rb │ ├── sample_spec.rb │ ├── serializer_spec.rb │ ├── sort_spec.rb │ ├── sql/ │ │ ├── column_spec.rb │ │ └── data_frame_spec.rb │ ├── statistic_spec.rb │ └── whole_text_files_spec.rb └── spec_helper.rb ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ /.gemtags /.tags /java/spark.jar .jbundler target/* *.class *.jar pom.xml vendor/* *.gem *.rbc .bundle .config .yardoc Gemfile.lock InstalledFiles _yardoc coverage doc/ lib/bundler/man pkg rdoc spec/reports test/tmp test/version_tmp tmp *.bundle *.so *.o *.a mkmf.log ext/spark/target/* ext/spark/project/target/* ext/spark/project/project/target/* wiki /benchmark/performance/spark/* /benchmark/performance/rspark/* /_* ================================================ FILE: .travis.yml ================================================ language: ruby rvm: - 2.2.0 before_script: - bundle exec rake compile - bundle exec ruby bin/ruby-spark build cache: bundler: true directories: - $HOME/.m2 - $HOME/.ivy2 - $HOME/.sbt ================================================ FILE: CHANGELOG.md ================================================ ## Unreleased ## 1.3.0 - new method on RDD (lookup) - fix sbt url - Spark 1.5.0 ## 1.2.0 (15.06.2015) - target folder is now located at HOME - better serializators - error when java class does not exist - default setting at ~/.ruby-spark.conf - compatible with Spark 1.4.0 - added calling site to RDD ================================================ FILE: Gemfile ================================================ source 'https://rubygems.org' gemspec gem 'sourcify', '0.6.0.rc4' gem 'method_source' gem 'commander' gem 'pry' gem 'nio4r' gem 'distribution' platform :mri do gem 'rjb' gem 'msgpack' gem 'oj' gem 'narray' end platform :jruby do gem 'msgpack-jruby', require: 'msgpack' # NameError: no constructorfor arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime # gem 'mdarray' end group :stats do # gem 'nmatrix' # gem 'statsample' # gem 'statsample-glm' # gem 'statsample-timeseries' # gem 'statistics2' # gem 'statsample-optimization' # libgsl0-dev # gem 'narray' # gem 'gsl-nmatrix' end group :development do gem 'benchmark-ips' gem 'rspec' gem 'rake-compiler' gem 'guard' gem 'guard-rspec' gem 'listen' end group :test do gem 'simplecov', require: false end ================================================ FILE: Guardfile ================================================ guard :rspec, cmd: 'rspec' do 
watch(%r{^spec/.+_spec\.rb$}) watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" } watch('spec/spec_helper.rb') { "spec" } end ================================================ FILE: LICENSE.txt ================================================ Copyright (c) 2014 Ondřej Moravčík MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Ruby-Spark [![Build Status](https://travis-ci.org/ondra-m/ruby-spark.svg?branch=master)](https://travis-ci.org/ondra-m/ruby-spark) Apache Spark™ is a fast and general engine for large-scale data processing. This Gem allows the use Spark functionality on Ruby. > Word count in Spark's Ruby API ```ruby file = spark.text_file("hdfs://...") file.flat_map(:split) .map(lambda{|word| [word, 1]}) .reduce_by_key(lambda{|a, b| a+b}) ``` - [Apache Spark](http://spark.apache.org) - [Wiki](https://github.com/ondra-m/ruby-spark/wiki) - [Rubydoc](http://www.rubydoc.info/gems/ruby-spark) ## Installation ### Requirments - Java 7+ - Ruby 2+ - wget or curl - MRI or JRuby Add this line to your application's Gemfile: ```ruby gem 'ruby-spark' ``` And then execute: ``` $ bundle ``` Or install it yourself as: ``` $ gem install ruby-spark ``` Run `rake compile` if you are using gem from local filesystem. ### Build Apache Spark This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars will be stored at you HOME directory. ``` $ ruby-spark build ``` ## Usage You can use Ruby Spark via interactive shell (Pry is used) ``` $ ruby-spark shell ``` Or on existing project. If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details. ```ruby require 'ruby-spark' # Configuration Spark.config do set_app_name "RubySpark" set 'spark.ruby.serializer', 'oj' set 'spark.ruby.serializer.batch_size', 100 end # Start Apache Spark Spark.start # Context reference Spark.sc ``` Finally, to stop the cluster. On the shell is Spark stopped automatically when environment exit. ```ruby Spark.stop ``` After first use, global configuration is created at **~/.ruby-spark.conf**. There can be specified properties for Spark and RubySpark. 
## Creating an RDD (a new collection)

A single text file:

```ruby
rdd = sc.text_file(FILE, workers_num, serializer=nil)
```

All files in a directory:

```ruby
rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
```

Uploading structures directly from Ruby:

```ruby
rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
rdd = sc.parallelize(1..5, workers_num, serializer=nil)
```

There are two conditions:

1. the chosen serializer must be able to serialize the data
2. the data must be iterable

If you do not specify a serializer, the default one is used (defined by the `spark.ruby.serializer.*` options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want to create a custom serializer.

## Operations

All operations can be divided into 2 groups:

- **Transformations:** append a new operation to the current RDD and return a new RDD
- **Actions:** add an operation and start the calculation

More information:

- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)
- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)
- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)

You can also check the official Spark documentation. First make sure that the method is implemented here.

- [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)
- [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)

#### Transformations
rdd.map(function)
Return a new RDD by applying a function to all elements of this RDD.
rdd.flat_map(function)
Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.
rdd.map_partitions(function)
Return a new RDD by applying a function to each partition of this RDD.
rdd.filter(function)
Return a new RDD containing only the elements that satisfy a predicate.
rdd.cartesian(other)
Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.
rdd.intersection(other)
Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.
rdd.sample(with_replacement, fraction, seed)
Return a sampled subset of this RDD. Operations are based on Poisson and uniform distributions.
rdd.group_by_key(num_partitions)
Group the values for each key in the RDD into a single sequence.
...many more...
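
For illustration, here is a minimal sketch chaining a few of the transformations above. It assumes a started context (`Spark.sc`) and the default serializer; the exact grouping order in the result may differ.

```ruby
# Assumes Spark.start has already been called
rdd = Spark.sc.parallelize(1..10, 2)

# Keep even numbers, key them by remainder mod 4, then group by key
grouped = rdd.filter(lambda{|x| x.even?})
             .map(lambda{|x| [x % 4, x]})
             .group_by_key

grouped.collect
# => e.g. [[0, [4, 8]], [2, [2, 6, 10]]] (ordering may vary)
```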
#### Actions
rdd.take(count)
Take the first `count` elements of the RDD.
rdd.reduce(function)
Reduces the elements of this RDD using the specified lambda or method.
rdd.aggregate(zero_value, seq_op, comb_op)
Aggregate the elements of each partition, and then the results for all the partitions, using the given combine functions and a neutral "zero value".
rdd.histogram(buckets)
Compute a histogram using the provided buckets.
rdd.collect
Return an array that contains all of the elements in this RDD.
...many more...
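
A quick sketch of a few of the actions above (again assuming a running context; `take`, `reduce` and `histogram` are shown in more detail in the Examples section below):

```ruby
rdd = Spark.sc.parallelize(0..100, 2)

rdd.take(5)                      # => [0, 1, 2, 3, 4]
rdd.reduce(lambda{|a, b| a + b}) # => 5050
rdd.histogram(2)                 # => [[0.0, 50.0, 100], [50, 51]]
```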
## Examples ##### Basic methods ```ruby # Every batch will be serialized by Marshal and will have size 10 ser = Spark::Serializer.build('batched(marshal, 10)') # Range 0..100, 2 workers, custom serializer rdd = Spark.sc.parallelize(0..100, 2, ser) # Take first 5 items rdd.take(5) # => [0, 1, 2, 3, 4] # Numbers reducing rdd.reduce(lambda{|sum, x| sum+x}) rdd.reduce(:+) rdd.sum # => 5050 # Reducing with zero items seq = lambda{|x,y| x+y} com = lambda{|x,y| x*y} rdd.aggregate(1, seq, com) # 1. Every workers adds numbers # => [1226, 3826] # 2. Results are multiplied # => 4690676 # Statistic method rdd.stats # => StatCounter: (count, mean, max, min, variance, # sample_variance, stdev, sample_stdev) # Compute a histogram using the provided buckets. rdd.histogram(2) # => [[0.0, 50.0, 100], [50, 51]] # Mapping rdd.map(lambda {|x| x*2}).collect # => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...] rdd.map(:to_f).collect # => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...] # Mapping the whole collection rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect # => [1225, 3825] # Selecting rdd.filter(lambda{|x| x.even?}).collect # => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...] # Sampling rdd.sample(true, 10).collect # => [3, 36, 40, 54, 58, 82, 86, 95, 98] # Sampling X items rdd.take_sample(true, 10) # => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32] # Using external process rdd.pipe('cat', "awk '{print $1*10}'") # => ["0", "10", "20", "30", "40", "50", ...] ``` ##### Words count using methods ```ruby # Content: # "first line" # "second line" rdd = sc.text_file(PATH) # ["first", "line", "second", "line"] rdd = rdd.flat_map(lambda{|line| line.split}) # [["first", 1], ["line", 1], ["second", 1], ["line", 1]] rdd = rdd.map(lambda{|word| [word, 1]}) # [["first", 1], ["line", 2], ["second", 1]] rdd = rdd.reduce_by_key(lambda{|a, b| a+b}) # {"first"=>1, "line"=>2, "second"=>1} rdd.collect_as_hash ``` ##### Estimating PI with a custom serializer ```ruby slices = 3 n = 100000 * slices def map(_) x = rand * 2 - 1 y = rand * 2 - 1 if x**2 + y**2 < 1 return 1 else return 0 end end rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj') rdd = rdd.map(method(:map)) puts 'Pi is roughly %f' % (4.0 * rdd.sum / n) ``` ##### Estimating PI ```ruby rdd = sc.parallelize([10_000], 1) rdd = rdd.add_library('bigdecimal/math') rdd = rdd.map(lambda{|x| BigMath.PI(x)}) rdd.collect # => # ``` ### Mllib (Machine Learning Library) Mllib functions are using Spark's Machine Learning Library. Ruby objects are serialized and deserialized in Java so you cannot use custom classes. Supported are primitive types such as string or integers. 
All supported methods/models: - [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib) - [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib) ##### Linear regression ```ruby # Import Mllib classes into Object # Otherwise are accessible via Spark::Mllib::LinearRegressionWithSGD Spark::Mllib.import(Object) # Training data data = [ LabeledPoint.new(0.0, [0.0]), LabeledPoint.new(1.0, [1.0]), LabeledPoint.new(3.0, [2.0]), LabeledPoint.new(2.0, [3.0]) ] # Train a model lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0]) lrm.predict([0.0]) ``` ##### K-Mean ```ruby Spark::Mllib.import # Dense vectors data = [ DenseVector.new([0.0,0.0]), DenseVector.new([1.0,1.0]), DenseVector.new([9.0,8.0]), DenseVector.new([8.0,9.0]) ] model = KMeans.train(sc.parallelize(data), 2) model.predict([0.0, 0.0]) == model.predict([1.0, 1.0]) # => true model.predict([8.0, 9.0]) == model.predict([9.0, 8.0]) # => true ``` ## Benchmarks ================================================ FILE: Rakefile ================================================ #-*- mode: ruby -*- require "bundler/gem_tasks" require "rspec/core/rake_task" RSpec::Core::RakeTask.new task default: :spec task test: :spec def java? RUBY_PLATFORM =~ /java/ end if java? require "rake/javaextensiontask" Rake::JavaExtensionTask.new("ruby_java") do |ext| ext.name = "ruby_spark_ext" end else require "rake/extensiontask" Rake::ExtensionTask.new("ruby_c") do |ext| ext.name = "ruby_spark_ext" end end task :clean do Dir['lib/*.{jar,o,so}'].each do |path| puts "Deleting #{path} ..." File.delete(path) end FileUtils.rm_rf('./pkg') FileUtils.rm_rf('./tmp') end ================================================ FILE: TODO.md ================================================ - refactor JavaBridge - to_java, from_java - every type should have class - automatic registration - add Streaming - worker informations (time, memory, ...) - killing zombie workers - add_rb, add_inline_rb to Spark::{Context, RDD} - fix broadcast for cluster - dump to disk if there is memory limit - Add Partitioner to RDD - add NonExist serializer ================================================ FILE: benchmark/aggregate.rb ================================================ require 'benchmark' require 'benchmark/ips' data = 0..1_000_000 zero_value = rand(100_000) function = Proc.new{|sum, n| sum+n} Benchmark.ips do |r| r.report('each') do sum = zero_value data.each do |n| sum += n end end r.report('reduce') do data.reduce(zero_value){|sum, n| sum+n} end r.report('each with function') do sum = zero_value data.each do |n| sum = function.call(sum, n) end end r.report('reduce with function') do data.reduce(zero_value, &function) end r.compare! end ================================================ FILE: benchmark/bisect.rb ================================================ require "benchmark" def bisect_left1(a, x, opts={}) return nil if a.nil? return 0 if a.empty? 
lo = (opts[:lo] || opts[:low]).to_i hi = opts[:hi] || opts[:high] || a.length while lo < hi mid = (lo + hi) / 2 v = a[mid] if v < x lo = mid + 1 else hi = mid end end return lo end def bisect_left2(list, item) count = 0 list.each{|i| return count if i >= item count += 1 } nil end def bisect_left3(list, item, lo = 0, hi = list.size) while lo < hi i = (lo + hi - 1) >> 1 if 0 <= (list[i] <=> item) hi = i else lo = i + 1 end end return hi end array = Array.new(1000000) { rand(0..1000000) }; to_find = Array.new(500) { rand(0..10000) }; Benchmark.bm(20) do |x| x.report("bisect_left1") do to_find.each do |item| bisect_left1(array, item) end end x.report("bisect_left2") do to_find.each do |item| bisect_left2(array, item) end end x.report("bisect_left3") do to_find.each do |item| bisect_left3(array, item) end end end array = Array.new(100000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }; to_find = Array.new(500) { (97+rand(26)).chr }; Benchmark.bm(20) do |x| x.report("bisect_left1") do to_find.each do |item| bisect_left1(array, item) end end x.report("bisect_left2") do to_find.each do |item| bisect_left2(array, item) end end x.report("bisect_left3") do to_find.each do |item| bisect_left3(array, item) end end end ================================================ FILE: benchmark/comparison/prepare.sh ================================================ #!/usr/bin/env bash # Current dir cd "$(dirname "$0")" # Exit immediately if a pipeline returns a non-zero status. set -e # Spark wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz" -O spark.tgz tar xvzf spark.tgz mv spark-1.3.0-bin-hadoop2.4 spark rm spark.tgz # RSpark (only for 1.3.0) git clone git@github.com:amplab-extras/SparkR-pkg.git rspark cd rspark SPARK_VERSION=1.3.0 ./install-dev.sh ================================================ FILE: benchmark/comparison/python.py ================================================ import os import math from time import time from random import random from operator import add from pyspark import SparkContext sc = SparkContext(appName="Python", master="local[*]") log_file = open(os.environ.get('PYTHON_LOG'), 'w') def log(*values): values = map(lambda x: str(x), values) log_file.write(';'.join(values)) log_file.write('\n') workers = int(os.environ.get('WORKERS')) numbers_count = int(os.environ.get('NUMBERS_COUNT')) text_file = os.environ.get('TEXT_FILE') numbers = range(numbers_count) floats = [float(i) for i in numbers] with open(text_file) as t: strings = t.read().split("\n") # ============================================================================= # Serialization # ============================================================================= t = time() rdd_numbers = sc.parallelize(numbers, workers) t = time() - t log('NumbersSerialization', t) t = time() rdd_floats = sc.parallelize(floats, workers) t = time() - t log('FloatsSerialization', t) t = time() rdd_strings = sc.parallelize(strings, workers) t = time() - t log('StringsSerialization', t) # ============================================================================= # Computing # ============================================================================= # --- Is prime? 
--------------------------------------------------------------- def is_prime(x): if x < 2: return [x, False] elif x == 2: return [x, True] elif x % 2 == 0: return [x, False] else: upper = int(math.sqrt(float(x))) result = True i = 3 while i <= upper: if x % i == 0: result = False break i += 2 return [x, result] t = time() rdd_numbers.map(is_prime).collect() t = time() - t log('IsPrime', t) # --- Matrix multiplication --------------------------------------------------- matrix_size = int(os.environ.get('MATRIX_SIZE')) matrix = [] for row in range(matrix_size): matrix.append([]) for col in range(matrix_size): matrix[row].append(row+col) def multiplication_func(matrix): matrix = list(matrix) size = len(matrix) new_matrix = [] for row in range(size): new_matrix.append([]) for col in range(size): result = 0 for i in range(size): result += matrix[row][i] * matrix[col][i] new_matrix[row].append(result) return new_matrix t = time() rdd = sc.parallelize(matrix, 1) rdd.mapPartitions(multiplication_func).collect() t = time() - t log('MatrixMultiplication', t) # --- Pi digits --------------------------------------------------------------- # http://rosettacode.org/wiki/Pi#Python pi_digit = int(os.environ.get('PI_DIGIT')) def pi_func(size): size = size.next() result = '' q, r, t, k, n, l = 1, 0, 1, 1, 3, 3 while size > 0: if 4*q+r-t < n*t: result += str(n) size -= 1 nr = 10*(r-n*t) n = ((10*(3*q+r))//t)-10*n q *= 10 r = nr else: nr = (2*q+r)*l nn = (q*(7*k)+2+(r*l))//(t*l) q *= k t *= l l += 2 k += 1 n = nn r = nr return [result] t = time() rdd = sc.parallelize([pi_digit], 1) rdd.mapPartitions(pi_func).collect() t = time() - t log('PiDigit', t) log_file.close() ================================================ FILE: benchmark/comparison/r.r ================================================ library(SparkR) sc <- sparkR.init(master="local[*]") logFile <- file(Sys.getenv("R_LOG"), "w") logInfo <- function(...){ args <- list(...) 
line <- paste(args, collapse = ";") writeLines(line, logFile) } workers <- as.integer(Sys.getenv('WORKERS')) numbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT')) textFile <- Sys.getenv('TEXT_FILE') # ============================================================================= # Serialization # ============================================================================= time <- proc.time() rddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers) time <- as.double(proc.time()-time)[3] logInfo('NumbersSerialization', time) # ============================================================================= # Computing # ============================================================================= isPrime = function(x) { if(x < 2){ c(x, FALSE) } else if(x == 2){ c(x, TRUE) } else if(x %% 2 == 0){ c(x, FALSE) } else{ upper <- as.numeric(sqrt(as.double(x))) result <- TRUE i <- 3 while(i <= upper){ if(x %% i == 0){ result = FALSE break } i <- i+2 } c(x, result) } } time <- proc.time() rdd <- map(rddNumbers, isPrime) capture.output(collect(rdd), file='/dev/null') time <- as.double(proc.time()-time)[3] logInfo('IsPrime', time) close(logFile) sparkR.stop() ================================================ FILE: benchmark/comparison/ruby.rb ================================================ #!/usr/bin/env ruby lib = File.expand_path(File.dirname(__FILE__) + '/../../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark' require 'benchmark' Spark.start sc = Spark.context $log_file = File.open(ENV['RUBY_LOG'], 'w') def log(*values) $log_file.puts(values.join(';')) end workers = ENV['WORKERS'].to_i numbers_count = ENV['NUMBERS_COUNT'].to_i text_file = ENV['TEXT_FILE'] numbers = (0...numbers_count).to_a floats = numbers.map(&:to_f) strings = File.read(text_file).split("\n") # ============================================================================= # Serialization # ============================================================================= time = Benchmark.realtime do @rdd_numbers = sc.parallelize(numbers, workers) end log('NumbersSerialization', time) time = Benchmark.realtime do @rdd_floats = sc.parallelize(floats, workers) end log('FloatsSerialization', time) time = Benchmark.realtime do @rdd_strings = sc.parallelize(strings, workers) end log('StringsSerialization', time) # ============================================================================= # Computing # ============================================================================= # --- Is prime? 
--------------------------------------------------------------- is_prime = Proc.new do |x| case when x < 2 [x, false] when x == 2 [x, true] when x % 2 == 0 [x, false] else upper = Math.sqrt(x.to_f).to_i result = true i = 3 while i <= upper if x % i == 0 result = false break end i += 2 end [x, result] end end time = Benchmark.realtime do @rdd_numbers.map(is_prime).collect end log('IsPrime', time) # --- Matrix multiplication --------------------------------------------------- matrix_size = ENV['MATRIX_SIZE'].to_i matrix = Array.new(matrix_size) do |row| Array.new(matrix_size) do |col| row+col end end; multiplication_func = Proc.new do |matrix| size = matrix.size Array.new(size) do |row| Array.new(size) do |col| matrix[row] result = 0 size.times do |i| result += matrix[row][i] * matrix[col][i] end result end end end time = Benchmark.realtime do rdd = sc.parallelize(matrix, 1) rdd.map_partitions(multiplication_func).collect end log('MatrixMultiplication', time) # --- Pi digits --------------------------------------------------------------- # http://rosettacode.org/wiki/Pi#Ruby pi_digit = ENV['PI_DIGIT'].to_i pi_func = Proc.new do |size| size = size.first result = '' q, r, t, k, n, l = 1, 0, 1, 1, 3, 3 while size > 0 if 4*q+r-t < n*t result << n.to_s size -= 1 nr = 10*(r-n*t) n = ((10*(3*q+r)) / t) - 10*n q *= 10 r = nr else nr = (2*q+r) * l nn = (q*(7*k+2)+r*l) / (t*l) q *= k t *= l l += 2 k += 1 n = nn r = nr end end [result] end time = Benchmark.realtime do rdd = sc.parallelize([pi_digit], 1) rdd.map_partitions(pi_func).collect end log('PiDigit', time) $log_file.close ================================================ FILE: benchmark/comparison/run-all.sh ================================================ #!/usr/bin/env bash # Current dir cd "$(dirname "$0")" # Exit immediately if a pipeline returns a non-zero status. 
set -e # Settings export WORKERS=2 export MATRIX_SIZE=100 export NUMBERS_COUNT=1000000 export TEXT_FILE=$(mktemp) export PI_DIGIT=1000 export RUBY_BATCH_SIZE=2048 text_file_rows=10 text_file_per_line=10 text_file_duplicates=50 mx="4096m" ms="4096m" # Parse arguments while (( "$#" )); do case $1 in --workers) WORKERS="$2" shift ;; --matrix-size) MATRIX_SIZE="$2" shift ;; --numbers-count) NUMBERS_COUNT="$2" shift ;; --random-file-rows) text_file_rows="$2" shift ;; --text-file-per-line) text_file_per_line="$2" shift ;; --text-file-duplicates) text_file_duplicates="$2" shift ;; --pi-digit) PI_DIGIT="$2" shift ;; --ruby-batch-size) RUBY_BATCH_SIZE="$2" shift ;; --mx) mx="$2" shift ;; --ms) ms="$2" shift ;; *) break ;; esac shift done # Generating file=$(mktemp) for (( i=0; i<$text_file_rows; i++ )) do shuf -n $text_file_per_line /usr/share/dict/words | tr '\n' ' ' >> $file echo >> $file done for (( i=0; i<$text_file_duplicates; i++ )) do cat $file >> $TEXT_FILE done # Before run if [[ -z "$SPARK_HOME" ]]; then export SPARK_HOME=$(pwd)/spark fi if [[ -z "$RSPARK_HOME" ]]; then export RSPARK_HOME=$(pwd)/rspark fi export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE" SPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null) export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx" # Log files export RUBY_MARSHAL_LOG=$(mktemp) export RUBY_OJ_LOG=$(mktemp) export PYTHON_LOG=$(mktemp) export SCALA_LOG=$(mktemp) export R_LOG=$(mktemp) # Run: echo "Workers: $WORKERS" echo "Matrix size: $MATRIX_SIZE" echo "Numbers count: $NUMBERS_COUNT" echo "Pi digits: $PI_DIGIT" echo "File: rows = $(($text_file_rows * $text_file_duplicates))" echo " per line = $text_file_per_line" # --- Ruby export SPARK_RUBY_SERIALIZER='marshal' export RUBY_LOG="$RUBY_MARSHAL_LOG" /usr/bin/env ruby ruby.rb &>/dev/null export SPARK_RUBY_SERIALIZER='oj' export RUBY_LOG="$RUBY_OJ_LOG" /usr/bin/env ruby ruby.rb &>/dev/null # # --- Python "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null # # --- Scala /usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null # --- R # "$RSPARK_HOME"/sparkR r.r #&>/dev/null # Parse results echo "# Ruby (Marshal)" cat $RUBY_MARSHAL_LOG echo "" echo "# Ruby (Oj)" cat $RUBY_OJ_LOG echo "" echo "# Python" cat $PYTHON_LOG echo "" echo "# Scala" cat $SCALA_LOG echo "" echo "# R" cat $R_LOG ================================================ FILE: benchmark/comparison/scala.scala ================================================ import java.io._ import scala.math import scala.io.Source import org.apache.spark._ object Scala { val logFile = new PrintWriter(new File(System.getenv("SCALA_LOG"))) def log(args: Any*) { logFile.write(args.mkString(";")) logFile.write("\n") } def main(args: Array[String]) { val conf = new SparkConf().setAppName("Scala") val sc = new SparkContext(conf) val workers = System.getenv("WORKERS").toInt val numbersCount = System.getenv("NUMBERS_COUNT").toInt val textFile = System.getenv("TEXT_FILE") val numbers = 0 until numbersCount val floats = numbers.map(_.toDouble) val strings = Source.fromFile(textFile).mkString.split("\n") // ============================================================================= // Serialization // ============================================================================= var time: Long = 0 time = System.currentTimeMillis val rddNumbers = sc.parallelize(numbers, workers) time = System.currentTimeMillis - time log("NumbersSerialization", 
time/1000.0) time = System.currentTimeMillis val rddFloats = sc.parallelize(floats, workers) time = System.currentTimeMillis - time log("FloatsSerialization", time/1000.0) time = System.currentTimeMillis val rddStrings = sc.parallelize(strings, workers) time = System.currentTimeMillis - time log("StringsSerialization", time/1000.0) // ============================================================================= // Computing // ============================================================================= // --- Is prime? --------------------------------------------------------------- time = System.currentTimeMillis val primes = rddNumbers.map{ x => if(x < 2){ (x, false) } else if(x == 2){ (x, true) } else if(x % 2 == 0){ (x, false) } else{ val upper = math.sqrt(x.toDouble).toInt var result = true var i = 3 while(i <= upper && result == true){ if(x % i == 0){ result = false } else{ i += 2 } } (x, result) } } primes.collect() time = System.currentTimeMillis - time log("IsPrime", time/1000.0) // --- Matrix multiplication --------------------------------------------------- val matrixSize = System.getenv("MATRIX_SIZE").toInt val matrix = new Array[Array[Long]](matrixSize) for( row <- 0 until matrixSize ) { matrix(row) = new Array[Long](matrixSize) for( col <- 0 until matrixSize ) { matrix(row)(col) = row + col } } time = System.currentTimeMillis val rdd = sc.parallelize(matrix, 1) rdd.mapPartitions { it => val matrix = it.toArray val size = matrix.size val newMatrix = new Array[Array[Long]](size) for( row <- 0 until size ) { newMatrix(row) = new Array[Long](size) for( col <- 0 until size ) { var result: Long = 0 for( i <- 0 until size ) { result += matrix(row)(i) * matrix(col)(i) } newMatrix(row)(col) = result } } newMatrix.toIterator } time = System.currentTimeMillis - time log("MatrixMultiplication", time/1000.0) // --- Pi digits --------------------------------------------------------------- // http://rosettacode.org/wiki/Pi#Scala val piDigit = System.getenv("PI_DIGIT").toInt time = System.currentTimeMillis val piDigits = sc.parallelize(Array(piDigit), 1) piDigits.mapPartitions { it => var size = it.toArray.asInstanceOf[Array[Int]](0) var result = "" var r: BigInt = 0 var q, t, k: BigInt = 1 var n, l: BigInt = 3 var nr, nn: BigInt = 0 while(size > 0){ while((4*q+r-t) >= (n*t)){ nr = (2*q+r)*l nn = (q*(7*k)+2+(r*l))/(t*l) q = q * k t = t * l l = l + 2 k = k + 1 n = nn r = nr } result += n.toString size -= 1 nr = 10*(r-n*t) n = ((10*(3*q+r))/t)-(10*n) q = q * 10 r = nr } Iterator(result) } time = System.currentTimeMillis - time log("PiDigit", time/1000.0) sc.stop() logFile.close() } } ================================================ FILE: benchmark/custom_marshal.rb ================================================ require 'benchmark' require 'benchmark/ips' def pack_int(data) [data].pack('l>') end def pack_long(data) [data].pack('q>') end def pack_doubles(data) data.pack('G*') end module Standard class LabeledPoint def initialize(label, features) @label = label @features = Standard::Vector.new(features) end def marshal_dump [@label, @features] end def marshal_load(*) end end class Vector def initialize(array) @values = array end def marshal_dump [@values] end def marshal_load(*) end end end module Custom class LabeledPoint def initialize(label, features) @label = label @features = Custom::Vector.new(features) end def _dump(*) pack_long(@label) + @features._dump end def self._load(*) end end class Vector def initialize(array) @values = array end def _dump(*) result = 'v' result << 
pack_int(@values.size) result << pack_doubles(@values) result.encode(Encoding::ASCII_8BIT) end def self._load(*) end end end data_size = 10_000 vector_size = 1_000 values = Array.new(vector_size) { |x| rand(10_000..100_000) } @data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)} @data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)} Benchmark.ips do |r| r.report('standard') do Marshal.dump(@data1) end r.report('custom') do Marshal.dump(@data2) end r.compare! end ================================================ FILE: benchmark/digest.rb ================================================ lib = File.expand_path(File.dirname(__FILE__) + '/../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) def java? RUBY_PLATFORM =~ /java/ end unless java? require 'murmurhash3' end require 'digest' require 'benchmark' require 'ruby-spark' TEST = 5_000_000 WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"] puts "TEST COUNT = #{TEST*WORDS.size}" # ================================================================================================= # Pure ruby mumrumur # funny-falcon/murmurhash3-ruby MASK32 = 0xffffffff def murmur3_32_rotl(x, r) ((x << r) | (x >> (32 - r))) & MASK32 end def murmur3_32_fmix(h) h &= MASK32 h ^= h >> 16 h = (h * 0x85ebca6b) & MASK32 h ^= h >> 13 h = (h * 0xc2b2ae35) & MASK32 h ^ (h >> 16) end def murmur3_32__mmix(k1) k1 = (k1 * 0xcc9e2d51) & MASK32 k1 = murmur3_32_rotl(k1, 15) (k1 * 0x1b873593) & MASK32 end def murmur3_32_str_hash(str, seed=0) h1 = seed numbers = str.unpack('V*C*') tailn = str.bytesize % 4 tail = numbers.slice!(numbers.size - tailn, tailn) for k1 in numbers h1 ^= murmur3_32__mmix(k1) h1 = murmur3_32_rotl(h1, 13) h1 = (h1*5 + 0xe6546b64) & MASK32 end unless tail.empty? k1 = 0 tail.reverse_each do |c1| k1 = (k1 << 8) | c1 end h1 ^= murmur3_32__mmix(k1) end h1 ^= str.bytesize murmur3_32_fmix(h1) end # ================================================================================================= # Benchmark Benchmark.bm(18) do |x| x.report("ruby hash"){ TEST.times{ WORDS.each{ |word| word.hash } } } x.report("ext portable"){ TEST.times{ WORDS.each{ |word| Spark::Digest.portable_hash(word) } } } x.report("murmur3 32"){ TEST.times{ WORDS.each{ |word| # MurmurHash3::V128.str_hash(word) # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") # MurmurHash3::V128.str_hash(word) # a = MurmurHash3::V32.str_hash(word).to_s # a.slice!(0,8) MurmurHash3::V32.str_hash(word) } } } unless java? # Too slow # x.report("murmur3 32 (ruby)"){ # TEST.times{ # WORDS.each{ |word| # # MurmurHash3::V128.str_hash(word) # # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") # # MurmurHash3::V128.str_hash(word) # # a = murmur3_32_str_hash(word).to_s # # a.slice!(0,8) # murmur3_32_str_hash(word) # } # } # } x.report("murmur3 128"){ TEST.times{ WORDS.each{ |word| # MurmurHash3::V128.str_hash(word) # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>") # a = MurmurHash3::V128.str_hash(word).to_s # a.slice!(0,8) MurmurHash3::V128.str_hash(word) } } } unless java? 
# x.report("sha256"){ # TEST.times{ # WORDS.each{ |word| # a = Digest::SHA256.digest(word) # # a.slice!(0,8) # } # } # } # x.report("md5"){ # TEST.times{ # WORDS.each{ |word| # a = Digest::MD5.digest(word) # # a.slice!(0,8) # } # } # } end ================================================ FILE: benchmark/enumerator.rb ================================================ require "benchmark" class Enumerator def defer(&blk) self.class.new do |y| each do |*input| blk.call(y, *input) end end end end ARRAY_SIZE = 50_000_000 def type_yield return to_enum(__callee__) unless block_given? ARRAY_SIZE.times { |i| yield i } end def yield_map_x2(enum) return to_enum(__callee__, enum) unless block_given? enum.each do |item| yield item*2 end end def type_enumerator_new Enumerator.new do |e| ARRAY_SIZE.times { |i| e << i } end end def enumerator_new_map_x2(enum) Enumerator.new do |e| enum.each do |item| e << item*2 end end end def enumerator_defer_x2(enum) enum.defer do |out, inp| out << inp*2 end end Benchmark.bm(26) do |x| x.report("yield max") do type_yield.max end x.report("yield sum") do type_yield.reduce(:+) end x.report("yield map x*2 sum") do yield_map_x2(type_yield).reduce(:+) end x.report("yield defer map x*2 sum") do enumerator_defer_x2(type_yield).reduce(:+) end x.report("-----"){} x.report("Enum.new max") do type_enumerator_new.max end x.report("Enum.new sum") do type_enumerator_new.reduce(:+) end x.report("Enum.new map x*2 sum") do enumerator_new_map_x2(type_enumerator_new).reduce(:+) end x.report("Enum.new defer map x*2 sum") do enumerator_defer_x2(type_enumerator_new).reduce(:+) end end ================================================ FILE: benchmark/serializer.rb ================================================ require "benchmark" require "yaml" require "msgpack" require "oj" # require "thrift" puts "Simple" data = (0..100000).to_a Benchmark.bmbm do |x| x.report("YAML") do serialized = YAML.dump(data) deserialized = YAML.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("Marshal") do serialized = Marshal.dump(data) deserialized = Marshal.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("MessagePack") do serialized = MessagePack.dump(data) deserialized = MessagePack.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("Oj") do serialized = Oj.dump(data) deserialized = Oj.load(serialized) puts "Size: #{serialized.size}, Equal: #{deserialized == data}" end # x.report("Thrift") do # serializer = Thrift::Serializer.new # deserializer = Thrift::Deserializer.new # serialized = serializer.serialize(data) # end end puts "" puts "More complex" data = Array.new(10000000) { [rand(97..122).chr, rand(10000000)] } Benchmark.bm do |x| # Take too long # x.report("YAML") do # serialized = YAML.dump(data) # YAML.load(serialized) # end x.report("Marshal") do serialized = Marshal.dump(data) deserialized = Marshal.load(serialized) puts " Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("MessagePack") do serialized = MessagePack.dump(data) deserialized = MessagePack.load(serialized) puts " Size: #{serialized.size}, Equal: #{deserialized == data}" end x.report("Oj") do serialized = Oj.dump(data) deserialized = Oj.load(serialized) puts " Size: #{serialized.size}, Equal: #{deserialized == data}" end # x.report("Thrift") do # serializer = Thrift::Serializer.new # deserializer = Thrift::Deserializer.new # serialized = serializer.serialize(data) # end end 
================================================ FILE: benchmark/sort.rb ================================================ require "benchmark" array = [] 1000.times { array << {:bar => rand(1000)} } n = 500 Benchmark.bm(20) do |x| x.report("sort") { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } } x.report("sort reverse") { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } } x.report("sort_by -a[:bar]") { n.times { array.sort_by{ |a| -a[:bar] } } } x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } } x.report("sort_by.reverse!") { n.times { array.sort_by{ |a| a[:bar] }.reverse } } end array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join } Benchmark.bm(20) do |x| x.report("sort asc") { n.times { array.sort } } x.report("sort asc block") { n.times { array.sort{|a,b| a <=> b} } } x.report("sort desc") { n.times { array.sort{|a,b| b <=> a} } } x.report("sort asc reverse") { n.times { array.sort.reverse } } end key_value = Struct.new(:key, :value) do def <=>(other) key <=> other.key end end count = 10000 item_range = 1000000 array1 = Array.new(count) { [rand(item_range), rand(item_range)] } array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) } Benchmark.bm(20) do |x| x.report("sort_by") { n.times { array1.sort_by {|a| a[0]} } } x.report("sort struct") { n.times { array2.sort } } end ================================================ FILE: benchmark/sort2.rb ================================================ require "benchmark" require "algorithms" NUMBER_OF_SORTING = 1 NUMBER_OF_ARRAY = 10 WORDS_IN_ARRAY = 100000 MAX_WORD_SIZE = 10 EVAL_N_VALUES = 10 puts "NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}" puts "NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}" puts "WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}" puts "MAX_WORD_SIZE: #{MAX_WORD_SIZE}" puts "EVAL_N_VALUES: #{EVAL_N_VALUES}" def words Array.new(WORDS_IN_ARRAY) { word } end def word Array.new(rand(1..MAX_WORD_SIZE)){(97+rand(26)).chr}.join end @array = Array.new(NUMBER_OF_ARRAY) { words.sort } # ================================================================================================= # Sort1 # Vrátí nový (nevyhodnocený) enumerator def sort1(data) return to_enum(__callee__, data) unless block_given? heap = [] # Inicializuji heap s prvními položkami # připojím samotné enumeratory pro volání .next data.each do |a| heap << [a.next, a] end while data.any? begin # Seřadím pole podle hodnot heap.sort_by!{|(item,_)| item} # Uložím si hodnotu a enumerator item, enum = heap.shift # Hodnota půjde do výsledku yield item # Místo odstraněné položky nahradí další ze stejného seznamu heap << [enum.next, enum] rescue StopIteration # Enumerator je prázdný data.delete(enum) end end end # ================================================================================================= # Sort1_2 # Vrátí nový (nevyhodnocený) enumerator def sort1_2(data) return to_enum(__callee__, data) unless block_given? heap = [] enums = [] # Inicializuji heap s prvními položkami # připojím samotné enumeratory pro volání .next data.each do |a| EVAL_N_VALUES.times { begin heap << [a.next, a] rescue StopIteration end } end while data.any? || heap.any? # Seřadím pole podle hodnot heap.sort_by!{|(item,_)| item} # Minimálně můžu vzít EVAL_N_VALUES EVAL_N_VALUES.times { break if heap.empty? 
# Uložím si hodnotu a enumerator item, enum = heap.shift # Hodnota půjde do výsledku yield item enums << enum } while (enum = enums.shift) begin heap << [enum.next, enum] rescue StopIteration data.delete(enum) enums.delete(enum) end end end end # ================================================================================================= # Sort 2 def sort2(data) return to_enum(__callee__, data) unless block_given? heap = Containers::Heap.new data.each do |enum| item = enum.next heap.push(item, [item, enum]) end while data.any? begin item, enum = heap.pop yield item item = enum.next heap.push(item, [item, enum]) rescue StopIteration data.delete(enum) end end end # ================================================================================================= # Benchmark Benchmark.bm(10) do |x| x.report("sort") do NUMBER_OF_SORTING.times { @result = @array.flatten.sort } end x.report("sort 1") do NUMBER_OF_SORTING.times { raise "Bad sorting" if @result != sort1(@array.map(&:each)).to_a } end x.report("sort 1_2") do NUMBER_OF_SORTING.times { raise "Bad sorting" if @result != sort1_2(@array.map(&:each)).to_a } end # x.report("sort 2") do # NUMBER_OF_SORTING.times { # raise "Bad sorting" if @result != sort2(@array.map(&:each)).to_a # } # end end ================================================ FILE: benchmark/take.rb ================================================ require "benchmark" SIZE = 100_000_000 @array1 = (0..SIZE).to_a; @array2 = (0..SIZE).to_a; @array3 = (0..SIZE).to_a; TAKE = 100_000 Benchmark.bm(15) do |x| # Fastest x.report("take"){ a=@array1.take(TAKE) } # Slowest and take most memory x.report("reverse drop"){ @array2.reverse! @array2.drop(@array2.size - TAKE) @array2.reverse! } # Least memory x.report("splice"){ a=@array2.slice!(0, TAKE) } end ================================================ FILE: bin/ruby-spark ================================================ #!/usr/bin/env ruby lib = File.expand_path(File.dirname(__FILE__) + '/../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark' Spark::CLI.new.run ================================================ FILE: example/pi.rb ================================================ #!/usr/bin/env ruby lib = File.expand_path(File.dirname(__FILE__) + '/../lib') $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark' Spark.logger.disable Spark.start slices = 3 n = 100000 * slices def map(_) x = rand * 2 - 1 y = rand * 2 - 1 if x**2 + y**2 < 1 return 1 else return 0 end end rdd = Spark.context.parallelize(1..n, slices) rdd = rdd.map(method(:map)) puts 'Pi is roughly %f' % (4.0 * rdd.sum / n) ================================================ FILE: example/website_search.rb ================================================ #!/usr/bin/env ruby # Parse sitemap and search word on every page require 'optparse' require 'open-uri' require 'nokogiri' require 'ruby-spark' options = { sitemap: 'http://fit.cvut.cz/sitemap.xml', query: 'cvut', workers: 2 } opt_parser = OptionParser.new do |opts| opts.banner = 'Usage: website_search.rb [options]' opts.separator '' opts.separator 'Specific options:' opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap| options[:sitemap] = sitemap end opts.on('-q', '--query QUERY', 'Query to search') do |query| options[:query] = query end opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers| options[:workers] = workers end opts.on('--quite', 'Run quitely') do |v| Spark.logger.disabled end 
opts.on_tail('-h', '--help', 'Show this message') do puts opts exit end end opt_parser.parse! @links = [] def parse_sitemap(doc) doc.xpath('//sitemapindex/sitemap/loc').each do |loc| next_doc = Nokogiri::HTML(open(loc.text)) parse_sitemap(next_doc) end doc.xpath('//url/loc').each do |loc| @links << loc.text end end doc = Nokogiri::HTML(open(options[:sitemap])) parse_sitemap(doc) # Map function func = Proc.new do |url| begin open(url) {|f| [url, f.read.scan(query).count] } rescue [url, 0] end end Spark.start rdd = Spark.sc.parallelize(@links, options[:workers]) .add_library('open-uri') .bind(query: options[:query]) .map(func) .sort_by(lambda{|(_, value)| value}, false) rdd.collect.each do |(url, count)| puts "#{url} => #{count}" end ================================================ FILE: ext/ruby_c/extconf.rb ================================================ require 'mkmf' create_makefile("ruby_spark_ext") ================================================ FILE: ext/ruby_c/murmur.c ================================================ #include "murmur.h" #if defined(_MSC_VER) #define BIG_CONSTANT(x) (x) #else #define BIG_CONSTANT(x) (x##LLU) #endif /*----------------------------------------------------------------------------- // MurmurHash2, 64-bit versions, by Austin Appleby // // The same caveats as 32-bit MurmurHash2 apply here - beware of alignment // and endian-ness issues if used across multiple platforms. // // 64-bit hash for 64-bit platforms */ uint64_t MurmurHash64A(const void * key, int len, uint64_t seed) { const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); const int r = 47; uint64_t h = seed ^ (len * m); const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len/8); while(data != end) { uint64_t k = *data++; k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; } const unsigned char * data2 = (const unsigned char*)data; switch(len & 7) { case 7: h ^= ((uint64_t) data2[6]) << 48; case 6: h ^= ((uint64_t) data2[5]) << 40; case 5: h ^= ((uint64_t) data2[4]) << 32; case 4: h ^= ((uint64_t) data2[3]) << 24; case 3: h ^= ((uint64_t) data2[2]) << 16; case 2: h ^= ((uint64_t) data2[1]) << 8; case 1: h ^= ((uint64_t) data2[0]); h *= m; }; h ^= h >> r; h *= m; h ^= h >> r; return h; } /* 64-bit hash for 32-bit platforms */ uint64_t MurmurHash64B(const void * key, int len, uint64_t seed) { const uint32_t m = 0x5bd1e995; const int r = 24; uint32_t h1 = ((uint32_t) seed) ^ len; uint32_t h2 = ((uint32_t) (seed >> 32)); const uint32_t * data = (const uint32_t *)key; while(len >= 8) { uint32_t k1 = *data++; k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; uint32_t k2 = *data++; k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; } if(len >= 4) { uint32_t k1 = *data++; k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; } switch(len) { case 3: h2 ^= ((unsigned char*)data)[2] << 16; case 2: h2 ^= ((unsigned char*)data)[1] << 8; case 1: h2 ^= ((unsigned char*)data)[0]; h2 *= m; }; h1 ^= h2 >> 18; h1 *= m; h2 ^= h1 >> 22; h2 *= m; h1 ^= h2 >> 17; h1 *= m; h2 ^= h1 >> 19; h2 *= m; uint64_t h = h1; h = (h << 32) | h2; return h; } // ================================================================================================ // Ruby methods #define PORTABLE_HASH_SEED 16154832 VALUE murmur2_digest(VALUE rb_str, uint64_t seed) { StringValue(rb_str); void * key = RSTRING_PTR(rb_str); long len = RSTRING_LEN(rb_str); uint64_t result = MurmurHash64A(key, len, seed); return LONG2FIX(result); } // 
------------------------------------------------------------------------------------------------ // Spark::Digest::Murmur2.digest VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass) { if(argc == 0 || argc > 2){ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); } uint64_t seed = (argc == 1 ? 0 : NUM2UINT(argv[1])); return murmur2_digest(argv[0], seed); } // ------------------------------------------------------------------------------------------------ // Spark::Digest.portable_hash VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass) { if(argc != 1){ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc); } return murmur2_digest(argv[0], PORTABLE_HASH_SEED); } ================================================ FILE: ext/ruby_c/murmur.h ================================================ #ifndef MURMUR_INCLUDED #define MURMUR_INCLUDED #include "ruby.h" VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass); VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass); #endif ================================================ FILE: ext/ruby_c/ruby-spark.c ================================================ #include "ruby.h" #include "murmur.h" VALUE SparkModule; VALUE SparkDigestModule; VALUE SparkDigestMurmur2Class; void Init_ruby_spark_ext() { SparkModule = rb_define_module("Spark"); SparkDigestModule = rb_define_module_under(SparkModule, "Digest"); SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, "Murmur2", rb_cObject); rb_define_singleton_method(SparkDigestModule, "portable_hash", method_portable_hash, -1); rb_define_singleton_method(SparkDigestMurmur2Class, "digest", method_murmur2_digest, -1); } ================================================ FILE: ext/ruby_java/Digest.java ================================================ import org.jruby.Ruby; import org.jruby.RubyModule; import org.jruby.RubyObject; import org.jruby.RubyClass; import org.jruby.RubyString; import org.jruby.RubyFixnum; import org.jruby.anno.JRubyModule; import org.jruby.anno.JRubyMethod; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; @JRubyModule(name="Spark::Digest") public class Digest extends RubyObject{ // Have to be the same as in C extension final static long PORTABLE_HASH_SEED = 16154832; public Digest(final Ruby ruby, RubyClass rubyClass) { super(ruby, rubyClass); } @JRubyMethod(module=true) public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) { Ruby ruby = self.getRuntime(); RubyString keyString = (RubyString)arg; long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED); RubyFixnum result = new RubyFixnum(ruby, hash); return result; } } ================================================ FILE: ext/ruby_java/Murmur2.java ================================================ import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyObject; import org.jruby.RubyString; import org.jruby.RubyFixnum; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; /** Murmur hash 2.0. * * The murmur hash is a relative fast hash function from * http://murmurhash.googlepages.com/ for platforms with efficient * multiplication. 
* * http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/ * */ @JRubyClass(name="Spark::Digest::Murmur2") public class Murmur2 extends RubyObject { public Murmur2(final Ruby ruby, RubyClass rubyClass) { super(ruby, rubyClass); } @JRubyMethod(required=1, optional=1, module=true) public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) { Ruby ruby = context.getRuntime(); RubyString keyString = (RubyString)args[0]; long seed; if(args.length > 1){ RubyFixnum rb_seed = (RubyFixnum)args[1]; seed = rb_seed.getLongValue(); } else{ seed = 0; } long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed); RubyFixnum result = new RubyFixnum(ruby, hash); return result; } /** Generates 64 bit hash from byte array of the given length and seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 64 bit hash of the given array */ public static long hash64(final byte[] data, int length, long seed) { final long m = 0xc6a4a7935bd1e995L; final int r = 47; long h = (seed&0xffffffffl)^(length*m); int length8 = length/8; for (int i=0; i>> r; k *= m; h ^= k; h *= m; } switch (length%8) { case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48; case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40; case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32; case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24; case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16; case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8; case 1: h ^= (long)(data[length&~7]&0xff); h *= m; }; h ^= h >>> r; h *= m; h ^= h >>> r; return h; } } ================================================ FILE: ext/ruby_java/RubySparkExtService.java ================================================ import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyModule; import org.jruby.runtime.ObjectAllocator; import org.jruby.runtime.builtin.IRubyObject; import org.jruby.runtime.load.BasicLibraryService; public class RubySparkExtService implements BasicLibraryService { public boolean basicLoad(final Ruby ruby) throws java.io.IOException { RubyModule sparkModule = ruby.defineModule("Spark"); RubyModule sparkDigestModule = sparkModule.defineModuleUnder("Digest"); RubyClass sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder("Murmur2", ruby.getObject(), sparkDigestMurmur2Allocator); sparkDigestModule.defineAnnotatedMethods(Digest.class); sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class); return true; } public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() { public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) { return new Murmur2(ruby, rubyClass); } }; } ================================================ FILE: ext/ruby_java/extconf.rb ================================================ require 'mkmf' create_makefile("ruby_spark_ext") ================================================ FILE: ext/spark/build.sbt ================================================ import AssemblyKeys._ assemblySettings // Default values val defaultScalaVersion = "2.10.4" val defaultSparkVersion = "1.6.0" val defaultSparkCoreVersion = "2.10" val defaultTargetDir = "target" val defaultHadoopVersion = "1.0.4" // Values val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion) val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion) val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion) val _sparkCoreVersion = 
scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion) val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir) // Project settings name := "ruby-spark" version := "1.0.0" scalaVersion := _scalaVersion javacOptions ++= Seq("-source", "1.7", "-target", "1.7") // Jar target folder artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar") outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar") // Protocol buffer support seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*) // Additional libraries libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % _sparkVersion excludeAll(ExclusionRule(organization = "org.apache.hadoop")), "org.apache.spark" %% "spark-graphx" % _sparkVersion, "org.apache.spark" %% "spark-mllib" % _sparkVersion, "org.apache.spark" %% "spark-sql" % _sparkVersion, "org.apache.hadoop" % "hadoop-client" % _hadoopVersion, "com.github.fommil.netlib" % "all" % "1.1.2", "org.scalatest" % "scalatest_2.10" % "2.2.1" % "test" ) // Repositories resolvers ++= Seq( "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", "Spray Repository" at "http://repo.spray.io/", "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", "Akka Repository" at "http://repo.akka.io/releases/", "Twitter4J Repository" at "http://twitter4j.org/maven2/", "Apache HBase" at "https://repository.apache.org/content/repositories/releases", "Twitter Maven Repo" at "http://maven.twttr.com/", "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", Resolver.sonatypeRepo("public") ) // Merge strategy mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => { case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard case m if m.startsWith("META-INF") => MergeStrategy.discard case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first case PathList("org", "apache", xs @ _*) => MergeStrategy.first case PathList("org", "jboss", xs @ _*) => MergeStrategy.first case "about.html" => MergeStrategy.rename case "reference.conf" => MergeStrategy.concat case _ => MergeStrategy.first } } ================================================ FILE: ext/spark/project/plugins.sbt ================================================ resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" resolvers += "Spray Repository" at "http://repo.spray.io/" addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3") ================================================ FILE: ext/spark/sbt/sbt ================================================ #!/bin/bash # This script launches sbt for this project. If present it uses the system # version of sbt. If there is no system version of sbt it attempts to download # sbt locally. 
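# Illustrative invocation, mirroring how lib/spark/build.rb drives this script: it runs from
# Spark.spark_ext_dir (this ext/spark directory) and passes the versions through environment
# variables such as SCALA_VERSION, SPARK_VERSION and HADOOP_VERSION.
#
#   sbt/sbt package assemblyPackageDependency clean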
SBT_VERSION=0.13.9 URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar JAR=sbt/sbt-launch-${SBT_VERSION}.jar # Download sbt launch jar if it hasn't been downloaded yet if [ ! -f ${JAR} ]; then # Download printf "Attempting to fetch sbt\n" JAR_DL=${JAR}.part if hash wget 2>/dev/null; then (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} elif hash curl 2>/dev/null; then (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR} else printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" exit -1 fi fi if [ ! -f ${JAR} ]; then # We failed to download printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" exit -1 fi printf "Launching sbt from ${JAR}\n" java \ -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ -jar ${JAR} \ "$@" ================================================ FILE: ext/spark/src/main/scala/Exec.scala ================================================ package org.apache.spark.api.ruby import java.io.{File, FileOutputStream, InputStreamReader, BufferedReader} import scala.collection.JavaConversions._ import org.apache.spark.{SparkEnv, Logging} import org.apache.spark.util._ /* ================================================================================================= * class FileCommand * ================================================================================================= * * Save command to file and than execute him because from Scala you cannot simply run * something like "bash --norc -i -c 'source .zshrc; ruby master.rb'" */ class FileCommand(command: String) extends Logging { var pb: ProcessBuilder = null var file: File = null // Command is complete. def this(command: String, env: SparkEnv) = { this(command) create(env) } // Template must contains %s which will be replaced for command def this(template: String, command: String, env: SparkEnv, envVars: Map[String, String]) = { this(template.format(command), env) setEnvVars(envVars) } private def create(env: SparkEnv) { val dir = new File(env.sparkFilesDir) val ext = if(Utils.isWindows) ".cmd" else ".sh" val shell = if(Utils.isWindows) "cmd" else "bash" file = File.createTempFile("command", ext, dir) val out = new FileOutputStream(file) out.write(command.getBytes) out.close logInfo(s"New FileCommand at ${file.getAbsolutePath}") pb = new ProcessBuilder(shell, file.getAbsolutePath) } def setEnvVars(vars: Map[String, String]) { pb.environment().putAll(vars) } def run = { new ExecutedFileCommand(pb.start) } } /* ================================================================================================= * class ExecutedFileCommand * ================================================================================================= * * Represent process executed from file. 
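 *
 * A FileCommand.run returns an instance of this class; a typical exchange (a sketch of the
 * pattern used in RubyWorker.createMaster, with illustrative argument values):
 *
 *   val proc = new FileCommand(commandTemplate, "ruby-spark home", env, envVars).run
 *   val gemHome = proc.readLine   // first line written to stdout by the spawned process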
*/ class ExecutedFileCommand(process: Process) { var reader: BufferedReader = null def readLine = { openInput reader.readLine.toString.trim } def openInput { if(reader != null){ return } val input = new InputStreamReader(process.getInputStream) reader = new BufferedReader(input) } // Delegation def destroy = process.destroy def getInputStream = process.getInputStream def getErrorStream = process.getErrorStream } ================================================ FILE: ext/spark/src/main/scala/MLLibAPI.scala ================================================ package org.apache.spark.mllib.api.python // PythonMLLibAPI is private for python class MLLibAPI extends PythonMLLibAPI {} ================================================ FILE: ext/spark/src/main/scala/Marshal.scala ================================================ package org.apache.spark.api.ruby.marshal import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ /* ================================================================================================= * object Marshal * ================================================================================================= */ object Marshal { def load(bytes: Array[Byte]) = { val is = new DataInputStream(new ByteArrayInputStream(bytes)) val majorVersion = is.readUnsignedByte // 4 val minorVersion = is.readUnsignedByte // 8 (new MarshalLoad(is)).load } def dump(data: Any) = { val aos = new ByteArrayOutputStream val os = new DataOutputStream(aos) os.writeByte(4) os.writeByte(8) (new MarshalDump(os)).dump(data) aos.toByteArray } } /* ================================================================================================= * class IterableMarshaller * ================================================================================================= */ class IterableMarshaller(iter: Iterator[Any]) extends Iterator[Array[Byte]] { private val buffer = new ArrayBuffer[Any] override def hasNext: Boolean = iter.hasNext override def next(): Array[Byte] = { while (iter.hasNext) { buffer += iter.next() } Marshal.dump(buffer) } } ================================================ FILE: ext/spark/src/main/scala/MarshalDump.scala ================================================ package org.apache.spark.api.ruby.marshal import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ import scala.reflect.{ClassTag, classTag} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector} /* ================================================================================================= * class MarshalDump * ================================================================================================= */ class MarshalDump(os: DataOutputStream) { val NAN_BYTELIST = "nan".getBytes val NEGATIVE_INFINITY_BYTELIST = "-inf".getBytes val INFINITY_BYTELIST = "inf".getBytes def dump(data: Any) { data match { case null => os.writeByte('0') case item: Boolean => val char = if(item) 'T' else 'F' os.writeByte(char) case item: Int => os.writeByte('i') dumpInt(item) case item: Array[_] => os.writeByte('[') dumpArray(item) case item: Double => os.writeByte('f') dumpFloat(item) case item: ArrayBuffer[Any] => dump(item.toArray) } } def dumpInt(data: Int) { if(data == 0){ os.writeByte(0) } else if (0 < data && data < 
123) { os.writeByte(data + 5) } else if (-124 < data && data < 0) { os.writeByte((data - 5) & 0xff) } else { val buffer = new Array[Byte](4) var value = data var i = 0 while(i != 4 && value != 0 && value != -1){ buffer(i) = (value & 0xff).toByte value = value >> 8 i += 1 } val lenght = i + 1 if(value < 0){ os.writeByte(-lenght) } else{ os.writeByte(lenght) } os.write(buffer, 0, lenght) } } def dumpArray(array: Array[_]) { dumpInt(array.size) for(item <- array) { dump(item) } } def dumpFloat(value: Double) { if(value.isPosInfinity){ dumpString(NEGATIVE_INFINITY_BYTELIST) } else if(value.isNegInfinity){ dumpString(INFINITY_BYTELIST) } else if(value.isNaN){ dumpString(NAN_BYTELIST) } else{ // dumpString("%.17g".format(value)) dumpString(value.toString) } } def dumpString(data: String) { dumpString(data.getBytes) } def dumpString(data: Array[Byte]) { dumpInt(data.size) os.write(data) } } ================================================ FILE: ext/spark/src/main/scala/MarshalLoad.scala ================================================ package org.apache.spark.api.ruby.marshal import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ import scala.reflect.{ClassTag, classTag} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector} /* ================================================================================================= * class MarshalLoad * ================================================================================================= */ class MarshalLoad(is: DataInputStream) { case class WaitForObject() val registeredSymbols = ArrayBuffer[String]() val registeredLinks = ArrayBuffer[Any]() def load: Any = { load(is.readUnsignedByte.toChar) } def load(dataType: Char): Any = { dataType match { case '0' => null case 'T' => true case 'F' => false case 'i' => loadInt case 'f' => loadAndRegisterFloat case ':' => loadAndRegisterSymbol case '[' => loadAndRegisterArray case 'U' => loadAndRegisterUserObject case _ => throw new IllegalArgumentException(s"Format is not supported: $dataType.") } } // ---------------------------------------------------------------------------------------------- // Load by type def loadInt: Int = { var c = is.readByte.toInt if (c == 0) { return 0 } else if (4 < c && c < 128) { return c - 5 } else if (-129 < c && c < -4) { return c + 5 } var result: Long = 0 if (c > 0) { result = 0 for( i <- 0 until c ) { result |= (is.readUnsignedByte << (8 * i)).toLong } } else { c = -c result = -1 for( i <- 0 until c ) { result &= ~((0xff << (8 * i)).toLong) result |= (is.readUnsignedByte << (8 * i)).toLong } } result.toInt } def loadAndRegisterFloat: Double = { val result = loadFloat registeredLinks += result result } def loadFloat: Double = { val string = loadString string match { case "nan" => Double.NaN case "inf" => Double.PositiveInfinity case "-inf" => Double.NegativeInfinity case _ => string.toDouble } } def loadString: String = { new String(loadStringBytes) } def loadStringBytes: Array[Byte] = { val size = loadInt val buffer = new Array[Byte](size) var readSize = 0 while(readSize < size){ val read = is.read(buffer, readSize, size-readSize) if(read == -1){ throw new IllegalArgumentException("Marshal too short.") } readSize += read } buffer } def loadAndRegisterSymbol: String = { val result = loadString registeredSymbols += result result } def loadAndRegisterArray: Array[Any] = { val 
size = loadInt val array = new Array[Any](size) registeredLinks += array for( i <- 0 until size ) { array(i) = loadNextObject } array } def loadAndRegisterUserObject: Any = { val klass = loadNextObject.asInstanceOf[String] // Register future class before load the next object registeredLinks += WaitForObject() val index = registeredLinks.size - 1 val data = loadNextObject val result = klass match { case "Spark::Mllib::LabeledPoint" => createLabeledPoint(data) case "Spark::Mllib::DenseVector" => createDenseVector(data) case "Spark::Mllib::SparseVector" => createSparseVector(data) case other => throw new IllegalArgumentException(s"Object $other is not supported.") } registeredLinks(index) = result result } // ---------------------------------------------------------------------------------------------- // Other loads def loadNextObject: Any = { val dataType = is.readUnsignedByte.toChar if(isLinkType(dataType)){ readLink(dataType) } else{ load(dataType) } } // ---------------------------------------------------------------------------------------------- // To java objects def createLabeledPoint(data: Any): LabeledPoint = { val array = data.asInstanceOf[Array[_]] new LabeledPoint(array(0).asInstanceOf[Double], array(1).asInstanceOf[Vector]) } def createDenseVector(data: Any): DenseVector = { new DenseVector(data.asInstanceOf[Array[_]].map(toDouble(_))) } def createSparseVector(data: Any): SparseVector = { val array = data.asInstanceOf[Array[_]] val size = array(0).asInstanceOf[Int] val indices = array(1).asInstanceOf[Array[_]].map(_.asInstanceOf[Int]) val values = array(2).asInstanceOf[Array[_]].map(toDouble(_)) new SparseVector(size, indices, values) } // ---------------------------------------------------------------------------------------------- // Helpers def toDouble(data: Any): Double = data match { case x: Int => x.toDouble case x: Double => x case _ => 0.0 } // ---------------------------------------------------------------------------------------------- // Cache def readLink(dataType: Char): Any = { val index = loadInt dataType match { case '@' => registeredLinks(index) case ';' => registeredSymbols(index) } } def isLinkType(dataType: Char): Boolean = { dataType == ';' || dataType == '@' } } ================================================ FILE: ext/spark/src/main/scala/RubyAccumulatorParam.scala ================================================ package org.apache.spark.api.ruby import java.io._ import java.net._ import java.util.{List, ArrayList} import scala.collection.JavaConversions._ import scala.collection.immutable._ import org.apache.spark._ import org.apache.spark.util.Utils /** * Internal class that acts as an `AccumulatorParam` for Ruby accumulators. Inside, it * collects a list of pickled strings that we pass to Ruby through a socket. 
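 *
 * The driver-side addInPlace below writes each batch of updates to that socket as:
 *   int   - number of updates
 * and then, per update:
 *   int   - byte length of the marshaled payload
 *   bytes - the payload itself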
*/ private class RubyAccumulatorParam(serverHost: String, serverPort: Int) extends AccumulatorParam[List[Array[Byte]]] { // Utils.checkHost(serverHost, "Expected hostname") val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536) // Socket shoudl not be serialized // Otherwise: SparkException: Task not serializable @transient var socket: Socket = null @transient var socketOutputStream: DataOutputStream = null @transient var socketInputStream: DataInputStream = null def openSocket(){ synchronized { if (socket == null || socket.isClosed) { socket = new Socket(serverHost, serverPort) socketInputStream = new DataInputStream(new BufferedInputStream(socket.getInputStream, bufferSize)) socketOutputStream = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize)) } } } override def zero(value: List[Array[Byte]]): List[Array[Byte]] = new ArrayList override def addInPlace(val1: List[Array[Byte]], val2: List[Array[Byte]]) : List[Array[Byte]] = synchronized { if (serverHost == null) { // This happens on the worker node, where we just want to remember all the updates val1.addAll(val2) val1 } else { // This happens on the master, where we pass the updates to Ruby through a socket openSocket() socketOutputStream.writeInt(val2.size) for (array <- val2) { socketOutputStream.writeInt(array.length) socketOutputStream.write(array) } socketOutputStream.flush() // Wait for acknowledgement // http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock // // if(in.readInt() != RubyConstant.ACCUMULATOR_ACK){ // throw new SparkException("Accumulator was not acknowledged") // } new ArrayList } } } ================================================ FILE: ext/spark/src/main/scala/RubyBroadcast.scala ================================================ package org.apache.spark.api.ruby import org.apache.spark.api.python.PythonBroadcast /** * An Wrapper for Ruby Broadcast, which is written into disk by Ruby. It also will * write the data into disk after deserialization, then Ruby can read it from disks. 
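 *
 * Instances are typically obtained from the Ruby driver through
 * RubyRDD.readBroadcastFromFile(jcontext, path, id), which wraps the file that
 * lib/spark/broadcast.rb has already written to disk with Marshal.dump.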
* * Class use Python logic - only for semantic */ class RubyBroadcast(@transient var _path: String, @transient var id: java.lang.Long) extends PythonBroadcast(_path) { } ================================================ FILE: ext/spark/src/main/scala/RubyConstant.scala ================================================ package org.apache.spark.api.ruby object RubyConstant { val DATA_EOF = -2 val WORKER_ERROR = -1 val WORKER_DONE = 0 val CREATE_WORKER = 1 val KILL_WORKER = 2 val KILL_WORKER_AND_WAIT = 3 val SUCCESSFULLY_KILLED = 4 val UNSUCCESSFUL_KILLING = 5 val ACCUMULATOR_ACK = 6 } ================================================ FILE: ext/spark/src/main/scala/RubyMLLibAPI.scala ================================================ package org.apache.spark.mllib.api.ruby import java.util.ArrayList import scala.collection.JavaConverters._ import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.api.python.MLLibAPI class RubyMLLibAPI extends MLLibAPI { // trainLinearRegressionModelWithSGD // trainLassoModelWithSGD // trainRidgeModelWithSGD // trainLogisticRegressionModelWithSGD // trainLogisticRegressionModelWithLBFGS // trainSVMModelWithSGD // trainKMeansModel // trainGaussianMixtureModel // Rjb have a problem with theta: Array[Array[Double]] override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = { val model = NaiveBayes.train(data.rdd, lambda) List( Vectors.dense(model.labels), Vectors.dense(model.pi), model.theta.toSeq ).map(_.asInstanceOf[Object]).asJava } // On python is wt just Object def predictSoftGMM( data: JavaRDD[Vector], wt: ArrayList[Object], mu: ArrayList[Object], si: ArrayList[Object]): RDD[Array[Double]] = { // val weight = wt.asInstanceOf[Array[Double]] val weight = wt.toArray.map(_.asInstanceOf[Double]) val mean = mu.toArray.map(_.asInstanceOf[DenseVector]) val sigma = si.toArray.map(_.asInstanceOf[DenseMatrix]) val gaussians = Array.tabulate(weight.length){ i => new MultivariateGaussian(mean(i), sigma(i)) } val model = new GaussianMixtureModel(weight, gaussians) model.predictSoft(data) } } ================================================ FILE: ext/spark/src/main/scala/RubyMLLibUtilAPI.scala ================================================ package org.apache.spark.mllib.api.ruby import java.util.ArrayList import org.apache.spark.mllib.util.LinearDataGenerator import org.apache.spark.mllib.regression.LabeledPoint object RubyMLLibUtilAPI { // Ruby does have a problem with creating Array[Double] def generateLinearInput( intercept: Double, weights: ArrayList[String], nPoints: Int, seed: Int, eps: Double = 0.1): Seq[LabeledPoint] = { LinearDataGenerator.generateLinearInput(intercept, weights.toArray.map(_.toString.toDouble), nPoints, seed, eps) } } ================================================ FILE: ext/spark/src/main/scala/RubyPage.scala ================================================ package org.apache.spark.ui.ruby // import javax.servlet.http.HttpServletRequest // import scala.xml.Node // import org.apache.spark.ui.{WebUIPage, UIUtils} // import org.apache.spark.util.Utils // private[ui] class RubyPage(parent: RubyTab, rbConfig: Array[Tuple2[String, String]]) extends WebUIPage("") { // def render(request: HttpServletRequest): 
Seq[Node] = { // val content = UIUtils.listingTable(header, row, rbConfig) // UIUtils.headerSparkPage("Ruby Config", content, parent) // } // private def header = Seq( // "Number" // ) // private def row(keyValue: (String, String)): Seq[Node] = { // // scalastyle:off // keyValue match { // case (key, value) => // // {key} // {value} // // } // // scalastyle:on // } // } class RubyPage {} ================================================ FILE: ext/spark/src/main/scala/RubyRDD.scala ================================================ package org.apache.spark.api.ruby import java.io._ import java.net._ import java.util.{List, ArrayList, Collections} import scala.util.Try import scala.reflect.ClassTag import scala.collection.JavaConversions._ import org.apache.spark._ import org.apache.spark.{SparkEnv, Partition, SparkException, TaskContext} import org.apache.spark.api.ruby._ import org.apache.spark.api.ruby.marshal._ import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils import org.apache.spark.InterruptibleIterator /* ================================================================================================= * Class RubyRDD * ================================================================================================= */ class RubyRDD( @transient parent: RDD[_], command: Array[Byte], broadcastVars: ArrayList[Broadcast[RubyBroadcast]], accumulator: Accumulator[List[Array[Byte]]]) extends RDD[Array[Byte]](parent){ val bufferSize = conf.getInt("spark.buffer.size", 65536) val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) override def getPartitions: Array[Partition] = firstParent.partitions override val partitioner = None /* ------------------------------------------------------------------------------------------ */ override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { val env = SparkEnv.get // Get worker and id val (worker, workerId) = RubyWorker.create(env) // Start a thread to feed the process input from our parent's iterator val writerThread = new WriterThread(env, worker, split, context) context.addTaskCompletionListener { context => writerThread.shutdownOnTaskCompletion() writerThread.join() // Cleanup the worker socket. This will also cause the worker to exit. try { RubyWorker.remove(worker, workerId) worker.close() } catch { case e: Exception => logWarning("Failed to close worker socket", e) } } val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) // Send data writerThread.start() // For violent termination of worker new MonitorThread(workerId, worker, context).start() // Return an iterator that read lines from the process's stdout val stdoutIterator = new StreamReader(stream, writerThread, context) // An iterator that wraps around an existing iterator to provide task killing functionality. new InterruptibleIterator(context, stdoutIterator) } // end compute /* ------------------------------------------------------------------------------------------ */ class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext) extends Thread("stdout writer for worker") { @volatile private var _exception: Exception = null setDaemon(true) // Contains the exception thrown while writing the parent iterator to the process. 
def exception: Option[Exception] = Option(_exception) // Terminates the writer thread, ignoring any exceptions that may occur due to cleanup. def shutdownOnTaskCompletion() { assert(context.isCompleted) this.interrupt() } // ------------------------------------------------------------------------------------------- // Send the necessary data for worker // - split index // - command // - iterator override def run(): Unit = Utils.logUncaughtExceptions { try { SparkEnv.set(env) val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) val dataOut = new DataOutputStream(stream) // Partition index dataOut.writeInt(split.index) // Spark files PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut) // Broadcast variables dataOut.writeInt(broadcastVars.length) for (broadcast <- broadcastVars) { dataOut.writeLong(broadcast.value.id) PythonRDD.writeUTF(broadcast.value.path, dataOut) } // Serialized command dataOut.writeInt(command.length) dataOut.write(command) // Send it dataOut.flush() // Data PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut) dataOut.writeInt(RubyConstant.DATA_EOF) dataOut.flush() } catch { case e: Exception if context.isCompleted || context.isInterrupted => logDebug("Exception thrown after task completion (likely due to cleanup)", e) case e: Exception => // We must avoid throwing exceptions here, because the thread uncaught exception handler // will kill the whole executor (see org.apache.spark.executor.Executor). _exception = e } finally { Try(worker.shutdownOutput()) // kill worker process } } } // end WriterThread /* ------------------------------------------------------------------------------------------ */ class StreamReader(stream: DataInputStream, writerThread: WriterThread, context: TaskContext) extends Iterator[Array[Byte]] { def hasNext = _nextObj != null var _nextObj = read() // ------------------------------------------------------------------------------------------- def next(): Array[Byte] = { val obj = _nextObj if (hasNext) { _nextObj = read() } obj } // ------------------------------------------------------------------------------------------- private def read(): Array[Byte] = { if (writerThread.exception.isDefined) { throw writerThread.exception.get } try { stream.readInt() match { case length if length > 0 => val obj = new Array[Byte](length) stream.readFully(obj) obj case RubyConstant.WORKER_DONE => val numAccumulatorUpdates = stream.readInt() (1 to numAccumulatorUpdates).foreach { _ => val updateLen = stream.readInt() val update = new Array[Byte](updateLen) stream.readFully(update) accumulator += Collections.singletonList(update) } null case RubyConstant.WORKER_ERROR => // Exception from worker // message val length = stream.readInt() val obj = new Array[Byte](length) stream.readFully(obj) // stackTrace val stackTraceLen = stream.readInt() val stackTrace = new Array[String](stackTraceLen) (0 until stackTraceLen).foreach { i => val length = stream.readInt() val obj = new Array[Byte](length) stream.readFully(obj) stackTrace(i) = new String(obj, "utf-8") } // Worker will be killed stream.close // exception val exception = new RubyException(new String(obj, "utf-8"), writerThread.exception.getOrElse(null)) exception.appendToStackTrace(stackTrace) throw exception } } catch { case e: Exception if context.isInterrupted => logDebug("Exception thrown after task interruption", e) throw new TaskKilledException case e: Exception if writerThread.exception.isDefined => logError("Worker exited unexpectedly (crashed)", e) throw 
writerThread.exception.get case eof: EOFException => throw new SparkException("Worker exited unexpectedly (crashed)", eof) } } } // end StreamReader /* --------------------------------------------------------------------------------------------- * Monitor thread for controll worker. Kill worker if task is interrupted. */ class MonitorThread(workerId: Long, worker: Socket, context: TaskContext) extends Thread("Worker Monitor for worker") { setDaemon(true) override def run() { // Kill the worker if it is interrupted, checking until task completion. while (!context.isInterrupted && !context.isCompleted) { Thread.sleep(2000) } if (!context.isCompleted) { try { logWarning("Incomplete task interrupted: Attempting to kill Worker "+workerId.toString()) RubyWorker.kill(workerId) } catch { case e: Exception => logError("Exception when trying to kill worker "+workerId.toString(), e) } } } } // end MonitorThread } // end RubyRDD /* ================================================================================================= * Class PairwiseRDD * ================================================================================================= * * Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Ruby. * This is used by PySpark's shuffle operations. * Borrowed from Python Package -> need new deserializeLongValue -> * Marshal will add the same 4b header */ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) { override def getPartitions = prev.partitions override def compute(split: Partition, context: TaskContext) = prev.iterator(split, context).grouped(2).map { case Seq(a, b) => (Utils.deserializeLongValue(a.reverse), b) case x => throw new SparkException("PairwiseRDD: unexpected value: " + x) } val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this) } /* ================================================================================================= * Object RubyRDD * ================================================================================================= */ object RubyRDD extends Logging { def runJob( sc: SparkContext, rdd: JavaRDD[Array[Byte]], partitions: ArrayList[Int], allowLocal: Boolean, filename: String): String = { type ByteArray = Array[Byte] type UnrolledPartition = Array[ByteArray] val allPartitions: Array[UnrolledPartition] = sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal) val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*) writeRDDToFile(flattenedPartition.iterator, filename) } def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = { val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename))) val objs = new collection.mutable.ArrayBuffer[Array[Byte]] try { while (true) { val length = file.readInt() val obj = new Array[Byte](length) file.readFully(obj) objs.append(obj) } } catch { case eof: EOFException => {} } JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism)) } def writeRDDToFile[T](items: Iterator[T], filename: String): String = { val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename))) try { PythonRDD.writeIteratorToStream(items, file) } finally { file.close() } filename } def writeRDDToFile[T](rdd: RDD[T], filename: String): String = { writeRDDToFile(rdd.collect.iterator, filename) } def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = { sc.broadcast(new RubyBroadcast(path, id)) 
} /** * Convert an RDD of serialized Ruby objects to RDD of objects, that is usable in Java. */ def toJava(rbRDD: JavaRDD[Array[Byte]], batched: Boolean): JavaRDD[Any] = { rbRDD.rdd.mapPartitions { iter => iter.flatMap { item => val obj = Marshal.load(item) if(batched){ obj.asInstanceOf[Array[_]] } else{ Seq(item) } } }.toJavaRDD() } /** * Convert an RDD of Java objects to an RDD of serialized Ruby objects, that is usable by Ruby. */ def toRuby(jRDD: JavaRDD[_]): JavaRDD[Array[Byte]] = { jRDD.rdd.mapPartitions { iter => new IterableMarshaller(iter) } } } /* ================================================================================================= * Class RubyException * ================================================================================================= */ class RubyException(msg: String, cause: Exception) extends RuntimeException(msg, cause) { def appendToStackTrace(toAdded: Array[String]) { val newStactTrace = getStackTrace.toBuffer var regexpMatch = "(.*):([0-9]+):in `([a-z]+)'".r for(item <- toAdded) { item match { case regexpMatch(fileName, lineNumber, methodName) => newStactTrace += new StackTraceElement("RubyWorker", methodName, fileName, lineNumber.toInt) case _ => null } } setStackTrace(newStactTrace.toArray) } } ================================================ FILE: ext/spark/src/main/scala/RubySerializer.scala ================================================ package org.apache.spark.api.ruby import scala.collection.JavaConverters._ import scala.reflect.{ClassTag, classTag} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.ruby.marshal._ /* ================================================================================================= * object RubySerializer * ================================================================================================= */ object RubySerializer { } ================================================ FILE: ext/spark/src/main/scala/RubyTab.scala ================================================ package org.apache.spark.ui.ruby import scala.collection.mutable.HashMap import org.apache.spark.ui._ // class RubyTab(parent: SparkUI, rbConfig: HashMap[String, String]) extends SparkUITab(parent, "ruby"){ // attachPage(new RubyPage(this, rbConfig.toArray)) // } class RubyTab {} ================================================ FILE: ext/spark/src/main/scala/RubyUtils.scala ================================================ package org.apache.spark.api.ruby import org.apache.spark.util._ import org.apache.spark.{SparkConf, Logging} object RubyUtils extends Logging { def loadPropertiesFile(conf: SparkConf, path: String): String = { Utils.getPropertiesFromFile(path).foreach { case (key, value) => conf.set(key, value) } path } } ================================================ FILE: ext/spark/src/main/scala/RubyWorker.scala ================================================ package org.apache.spark.api.ruby import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream} import java.net.{InetAddress, ServerSocket, Socket, SocketException} import java.nio.file.Paths import scala.collection.mutable import scala.collection.JavaConversions._ import org.apache.spark._ import org.apache.spark.api.python.PythonRDD import org.apache.spark.util.Utils import org.apache.spark.util.RedirectThread /* ================================================================================================= * Object RubyWorker * ================================================================================================= * 
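 *
 * Control protocol between this object and the Ruby master process (constants from
 * RubyConstant, as used by createWorker, kill and killAndWait below):
 *   CREATE_WORKER        - master spawns a worker, which connects back and sends its id
 *   KILL_WORKER          - kill the worker with the given id (fire and forget)
 *   KILL_WORKER_AND_WAIT - kill and wait for SUCCESSFULLY_KILLED or UNSUCCESSFUL_KILLING
 *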
* Create and store server for creating workers. */ object RubyWorker extends Logging { val PROCESS_WAIT_TIMEOUT = 10000 private var serverSocket: ServerSocket = null private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1)) private var serverPort: Int = 0 private var master: ExecutedFileCommand = null private var masterSocket: Socket = null private var masterOutputStream: DataOutputStream = null private var masterInputStream: DataInputStream = null private var workers = new mutable.WeakHashMap[Socket, Long]() /* ---------------------------------------------------------------------------------------------- * Create new worker but first check if exist SocketServer and master process. * If not it will create them. Worker have 2 chance to create. */ def create(env: SparkEnv): (Socket, Long) = { synchronized { // Create the server if it hasn't been started createServer(env) // Attempt to connect, restart and retry once if it fails try { createWorker } catch { case exc: SocketException => logWarning("Worker unexpectedly quit, attempting to restart") createWorker } } } /* ---------------------------------------------------------------------------------------------- * Create a worker throught master process. Return new socket and id. * According spark.ruby.worker.type id will be: * process: PID * thread: thread object id */ def createWorker: (Socket, Long) = { synchronized { masterOutputStream.writeInt(RubyConstant.CREATE_WORKER) var socket = serverSocket.accept() var id = new DataInputStream(socket.getInputStream).readLong() workers.put(socket, id) (socket, id) } } /* ---------------------------------------------------------------------------------------------- * Create SocketServer and bind it to the localhost. Max numbers of connection on queue * is set to default. If server is created withou exception -> create master. */ private def createServer(env: SparkEnv){ synchronized { // Already running? if(serverSocket != null && masterSocket != null) { return } try { // Start Socket Server for comunication serverSocket = new ServerSocket(0, 0, serverHost) serverPort = serverSocket.getLocalPort // Create a master for worker creations createMaster(env) } catch { case e: Exception => throw new SparkException("There was a problem with creating a server", e) } } } /* ---------------------------------------------------------------------------------------------- * In this point SocketServer must be created. Master process create and kill workers. * Creating workers from Java can be an expensive operation because new process can * get copy of address space. 
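 *
 * The master command assembled below looks roughly like (paths and option values are
 * illustrative):
 *   ruby <spark.ruby.executor.options> -C <gem root>/lib/spark/worker master.rb <worker type> <server port>
 * and is substituted into the spark.ruby.executor.command template before being run as a file.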
*/ private def createMaster(env: SparkEnv){ synchronized { val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER val executorOptions = env.conf.get("spark.ruby.executor.options", "") val commandTemplate = env.conf.get("spark.ruby.executor.command") val workerType = env.conf.get("spark.ruby.worker.type") // Where is root of ruby-spark var executorLocation = "" if(isDriver){ // Use worker from current active gem location executorLocation = env.conf.get("spark.ruby.driver_home") } else{ // Use gem installed on the system try { val homeCommand = (new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))).run executorLocation = homeCommand.readLine } catch { case e: Exception => throw new SparkException("Ruby-spark gem is not installed.", e) } } // Master and worker are saved in GEM_ROOT/lib/spark/worker executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString // Create master command // -C: change worker dir before execution val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort" val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env)) // Start master master = masterCommand.run // Redirect master stdout and stderr redirectStreamsToStderr(master.getInputStream, master.getErrorStream) // Wait for it to connect to our socket serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT) try { // Use socket for comunication. Keep stdout and stdin for log masterSocket = serverSocket.accept() masterOutputStream = new DataOutputStream(masterSocket.getOutputStream) masterInputStream = new DataInputStream(masterSocket.getInputStream) PythonRDD.writeUTF(executorOptions, masterOutputStream) } catch { case e: Exception => throw new SparkException("Ruby master did not connect back in time", e) } } } /* ---------------------------------------------------------------------------------------------- * Gel all environment variables for executor */ def getEnvVars(env: SparkEnv): Map[String, String] = { val prefix = "spark.ruby.executor.env." 
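    // For example, a (hypothetical) config entry
    //   spark.ruby.executor.env.GEM_HOME = /opt/gems
    // is passed to the master and worker processes as the env var GEM_HOME=/opt/gems.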
env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)} .map{case (k, v) => (k.substring(prefix.length), v)} .toMap } /* ------------------------------------------------------------------------------------------- */ def kill(workerId: Long){ masterOutputStream.writeInt(RubyConstant.KILL_WORKER) masterOutputStream.writeLong(workerId) } /* ------------------------------------------------------------------------------------------- */ def killAndWait(workerId: Long){ masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT) masterOutputStream.writeLong(workerId) // Wait for answer masterInputStream.readInt() match { case RubyConstant.SUCCESSFULLY_KILLED => logInfo(s"Worker $workerId was successfully killed") case RubyConstant.UNSUCCESSFUL_KILLING => logInfo(s"Worker $workerId cannot be killed (maybe is already killed)") } } /* ---------------------------------------------------------------------------------------------- * workers HashMap is week but it avoid long list of workers which cannot be killed (killAndWait) */ def remove(worker: Socket, workerId: Long){ try { workers.remove(worker) } catch { case e: Exception => logWarning(s"Worker $workerId does not exist (maybe is already removed)") } } /* ------------------------------------------------------------------------------------------- */ def stopServer{ synchronized { // Kill workers workers.foreach { case (socket, id) => killAndWait(id) } // Kill master master.destroy // Stop SocketServer serverSocket.close() // Clean variables serverSocket = null serverPort = 0 master = null masterSocket = null masterOutputStream = null masterInputStream = null } } /* ------------------------------------------------------------------------------------------- */ private def redirectStreamsToStderr(streams: InputStream*) { try { for(stream <- streams) { new RedirectThread(stream, System.err, "stream reader").start() } } catch { case e: Exception => logError("Exception in redirecting streams", e) } } /* ------------------------------------------------------------------------------------------- */ } ================================================ FILE: ext/spark/src/test/scala/MarshalSpec.scala ================================================ package org.apache.spark.api.ruby.marshal import org.scalatest._ import org.apache.spark.api.ruby.marshal._ class MarshalSpec extends FunSpec with Matchers { // ==================================================================================== // Load describe("Marshal.load"){ describe("single value"){ it("int"){ val data = 1 val serialized = Array[Byte](4, 8, 105, 6) Marshal.load(serialized) should equal(data) } it("double"){ val data = 1.2 val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50) Marshal.load(serialized) should equal(data) } } describe("array"){ it("ints"){ val data = Array(1, 2, 3, 4, 5) val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10) Marshal.load(serialized) should equal(data) } it("doubles"){ val data = Array(1.1, 2.2, 3.3) val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51) Marshal.load(serialized) should equal(data) } } } // ==================================================================================== // Dump describe("Marshal.dump"){ describe("single value"){ it("int"){ val data = 1 val serialized = Array(4, 8, 105, 6) Marshal.dump(data) should equal(serialized) } it("double"){ val data = 1.2 val serialized = Array(4, 8, 102, 8, 49, 46, 50) Marshal.dump(data) should equal(serialized) } } 
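    // Byte layout in the fixtures above and below: the leading 4, 8 is the Marshal format
    // version header; 105 ('i') marks an integer, 102 ('f') a float dumped as a string,
    // and 91 ('[') an array followed by its encoded length.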
describe("array"){ it("ints"){ val data = Array(1, 2, 3, 4, 5) val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10) Marshal.dump(data) should equal(serialized) } it("doubles"){ val data = Array(1.1, 2.2, 3.3) val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51) Marshal.dump(data) should equal(serialized) } } } } ================================================ FILE: lib/ruby-spark.rb ================================================ require_relative 'spark' ================================================ FILE: lib/spark/accumulator.rb ================================================ module Spark ## # A shared variable that can be accumulated, i.e., has a commutative and associative "add" # operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=` # operator, but only the driver program is allowed to access its value, using value. # Updates from the workers get propagated automatically to the driver program. # # == Arguments: # value:: # Initial value for accumulator. This values is stored only on driver process # # accum_param:: # How merge 2 value on worker or driver process. # Symbol or Proc (or String) # # zero_value:: # Initial value for worker process # # # == Examples: # # accum1 = $sc.accumulator(1) # accum2 = $sc.accumulator(2, :*, 1) # accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max}) # # accum1 += 1 # # accum2.add(2) # accum2.add(2) # accum2.add(2) # # accum3.add(9) # accum3.add(6) # accum3.add(7) # # accum1.value # => 2 # accum2.value # => 16 # accum3.value # => 9 # # func = Proc.new do |_, index| # accum1.add(1) # accum2.add(2) # accum3.add(index * 10) # end # # rdd = $sc.parallelize(0..4, 4) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3) # rdd = rdd.map_partitions_with_index(func) # rdd.collect # # accum1.value # => 6 # accum2.value # => 256 # accum3.value # => 30 # class Accumulator attr_reader :id, :value, :accum_param, :zero_value @@instances = {} @@changed = [] SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**] # ========================================================================= # Creating and selecting Spark::Accumulator def initialize(value, accum_param=:+, zero_value=0) @id = object_id @value = value @accum_param = accum_param @zero_value = zero_value @driver = true valid_accum_param @@instances[@id] = self end def inspect result = %{#<#{self.class.name}:0x#{object_id}\n} result << %{ ID: #{@id}\n} result << %{ Zero: #{@zero_value.to_s[0, 10]}\n} result << %{Value: #{@value.to_s[0, 10]}>} result end def self.changed @@changed end def self.instances @@instances end def valid_accum_param if @accum_param.is_a?(Symbol) raise Spark::AccumulatorError, "Unsupported symbol #{@accum_param}" unless SUPPORTED_SYMBOLS.include?(@accum_param) @serialized_accum_param = @accum_param return end if @accum_param.is_a?(Proc) begin @serialized_accum_param = @accum_param.to_source return rescue raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.' end end if @accum_param.is_a?(String) @serialized_accum_param = @accum_param @accum_param = eval(@accum_param) unless @accum_param.is_a?(Proc) raise Spark::SerializeError, 'Yours param is not a Proc.' end return end raise Spark::AccumulatorError, 'Unsupported param. Use Symbol, Proc or String.' end # Driver process or worker def driver? @driver end # ========================================================================= # Operations def add(term) if !driver? 
&& !@@changed.include?(self) @@changed << self end if @accum_param.is_a?(Proc) @value = @accum_param.call(@value, term) else add_by_symbol(term) end end def +(term) add(term) self end def add_by_symbol(term) case @accum_param when :+ @value += term when :- @value -= term when :* @value *= term when :/ @value /= term when :** @value **= term end end # ========================================================================= # Dump and load def marshal_dump [@id, @zero_value, @serialized_accum_param] end def marshal_load(array) @id, @zero_value, @serialized_accum_param = array @value = @zero_value @driver = false load_accum_param end def load_accum_param if @serialized_accum_param.is_a?(String) @accum_param = eval(@serialized_accum_param) else @accum_param = @serialized_accum_param end end end end # ============================================================================= # Server for handeling Accumulator update # module Spark class Accumulator class Server attr_reader :server, :host, :port def self.start @instance ||= Spark::Accumulator::Server.new end def self.stop @instance && @instance.stop end def self.host start @instance.host end def self.port start @instance.port end def initialize @server = TCPServer.new(0) @host = @server.hostname @port = @server.port @threads = [] handle_accept end def stop @threads.each(&:kill) rescue nil end def handle_accept @threads << Thread.new do loop { handle_connection(@server.accept) } end end def handle_connection(socket) @threads << Thread.new do until socket.closed? count = socket.read_int count.times do data = socket.read_data accum = Spark::Accumulator.instances[data[0]] if accum accum.add(data[1]) else Spark.logger.warn("Accumulator with id #{data[0]} does not exist.") end end # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # socket.write_int(Spark::Constant::ACCUMULATOR_ACK) end end end end end end ================================================ FILE: lib/spark/broadcast.rb ================================================ module Spark ## # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast # object for reading it in distributed functions. The variable will # be sent to each cluster only once. 
# # == Example: # # broadcast1 = $sc.broadcast('a') # broadcast2 = $sc.broadcast('b') # broadcast3 = $sc.broadcast([1,2,3]) # # func = Proc.new do |part, index| # [ # broadcast1.value * index, # broadcast2.value * index, # broadcast3.value.reduce(:+) # ] # end # # rdd = $sc.parallelize(0..5, 4) # rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3) # rdd = rdd.map_partitions_with_index(func) # rdd.collect # # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6] # class Broadcast LOADED = 0 # id, value, path NOT_LOADED = 1 # id, path WITHOUT_PATH = 2 # id attr_reader :id, :state, :path, :jbroadcast @@registered = {} # ========================================================================= # Creating broadcast for SparkContext # Create new Broadcast and dump value to the disk # # b = $sc.broadcast('a') # # b.value # => 'a' # b.path # b.jbroadcast # def initialize(sc, value) @id = object_id @value = value @state = LOADED file = Tempfile.create('broadcast', sc.temp_dir) file.binmode file.write(Marshal.dump(value)) file.close @path = file.path @jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id)) ObjectSpace.define_finalizer(self, proc { File.unlink(@path) }) end def inspect result = %{#<#{self.class.name}:0x#{object_id}\n} result << %{ ID: #{@id}\n} result << %{Value: #{@value.to_s[0, 10]}>} result end def self.register(id, path) @@registered[id] = path end def value case state when LOADED @value when NOT_LOADED @value = Marshal.load(File.read(@path)) @state = LOADED @value when WITHOUT_PATH @path = @@registered[id] if @path @state = NOT_LOADED value else raise Spark::BroadcastError, "Broadcast #{@id} do not have registered path." end end end def marshal_dump @id end def marshal_load(id) @id = id @state = WITHOUT_PATH end end end ================================================ FILE: lib/spark/build.rb ================================================ module Spark module Build DEFAULT_SCALA_VERSION = '2.10.4' DEFAULT_CORE_VERSION = '2.10' DEFAULT_SPARK_VERSION = '1.6.0' DEFAULT_HADOOP_VERSION = '1.0.4' SBT = 'sbt/sbt' SBT_DEPS = 'assemblyPackageDependency' SBT_EXT = 'package' SBT_CLEAN = 'clean' def self.build(options={}) scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION target = options[:target] || Spark.target_dir only_ext = options[:only_ext] || false env = { 'SCALA_VERSION' => scala_version, 'SPARK_VERSION' => spark_version, 'SPARK_CORE_VERSION' => spark_core_version, 'HADOOP_VERSION' => hadoop_version, 'TARGET_DIR' => target } cmd = [SBT] cmd << SBT_EXT cmd << SBT_DEPS unless only_ext cmd << SBT_CLEAN unless $DEBUG Dir.chdir(Spark.spark_ext_dir) do unless Kernel.system(env, cmd.join(' ')) raise Spark::BuildError, 'Spark cannot be assembled.' 
end end end end end ================================================ FILE: lib/spark/cli.rb ================================================ require 'commander' module Commander module UI # Disable paging # for 'classic' help def self.enable_paging end end end module Spark class CLI include Commander::Methods # IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history') # IRB_HISTORY_SIZE = 100 def run program :name, 'RubySpark' program :version, Spark::VERSION program :description, 'Ruby wrapper for Spark' global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true } default_command :help # Build --------------------------------------------------------------- command :build do |c| c.syntax = 'build [options]' c.description = 'Build spark and gem extensions' c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark' c.option '--spark-core-version STRING', String, 'Version of Spark core' c.option '--spark-version STRING', String, 'Version of Spark' c.option '--scala-version STRING', String, 'Version of Scala' c.option '--target STRING', String, 'Directory where Spark will be stored' c.option '--only-ext', 'Build only extension for RubySpark' c.action do |args, options| Spark::Build.build(options.__hash__) puts puts 'Everything is OK' end end alias_command :install, :build # Shell ----------------------------------------------------------------- command :shell do |c| c.syntax = 'shell [options]' c.description = 'Start ruby shell for spark' c.option '--target STRING', String, 'Directory where Spark is stored' c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties' c.option '--[no-]start', 'Start Spark immediately' c.option '--[no-]logger', 'Enable/disable logger (default: enable)' c.option '--auto-reload', 'Autoreload changed files' c.action do |args, options| options.default start: true, logger: true Spark.load_lib(options.target) Spark.logger.disable unless options.logger Spark.config do set_app_name 'RubySpark' end Spark.config.from_file(options.properties_file) if options.auto_reload require 'listen' listener = Listen.to(File.join(Spark.root, 'lib')) do |modified, added, removed| (modified+added).each do |file| silence_warnings { load(file) } end end listener.start end if options.start # Load Java and Spark Spark.start $sc = Spark.context Spark.print_logo('Spark context is loaded as $sc') else Spark.print_logo('You can start Spark with Spark.start') end # Load Pry require 'pry' Pry.start end end # # IRB ------------------------------------------------------------------- # command :irb do |c| # c.syntax = 'irb [options]' # c.description = 'Start ruby shell for spark' # c.option '--spark-home STRING', String, 'Directory where Spark is stored' # c.option '--[no-]start', 'Start Spark immediately' # c.option '--[no-]logger', 'Enable/disable logger (default: enable)' # # c.action do |args, options| # options.default start: true, logger: true # # Spark.load_lib(options.spark_home) # Spark::Logger.disable unless options.logger # # Spark.config do # set_app_name 'Pry RubySpark' # end # # if options.start # # Load Java and Spark # Spark.start # $sc = Spark.context # # Spark.print_logo('Spark context is loaded as $sc') # else # Spark.print_logo('You can start Spark with Spark.start') # end # # # Load IRB # require 'irb' # require 'irb/completion' # require 'irb/ext/save-history' # # begin # file = File.expand_path(IRB_HISTORY_FILE) # if File.exists?(file) # lines = IO.readlines(file).collect { |line| 
line.chomp } # Readline::HISTORY.push(*lines) # end # Kernel.at_exit do # lines = Readline::HISTORY.to_a.reverse.uniq.reverse # lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE # File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") } # end # rescue # end # # ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it # ARGV.concat ['--readline', '--prompt-mode', 'simple'] # IRB.start # end # end # Home ------------------------------------------------------------------ command :home do |c| c.action do |args, options| puts Spark.home exit(0) end end # Ruby spark jar -------------------------------------------------------- command :ruby_spark_jar do |c| c.action do |args, options| puts Spark.ruby_spark_jar exit(0) end end run! end end end ================================================ FILE: lib/spark/command/base.rb ================================================ ## # Spark::Command::Base # # Parent for all commands (Map, FlatMap, Sort, ...) # class Spark::Command::Base DEFAULT_VARIABLE_OPTIONS = { type: Hash, function: true } def initialize(*args) settings.variables.each do |name, options| instance_variable_set("@#{name}", args.shift) end end def to_s self.class.name.split('::').last end def self.error(message) raise Spark::CommandError, message end def error(message) self.class.error(message) end def log(message=nil) $stdout.puts %{==> #{Time.now.strftime("%H:%M:%S")} [#{self.class.name}] #{message}} $stdout.flush end # =============================================================================================== # Methods called during class loading # This is not nicer way but these methods set/get classes variables for child # Settings for command (variables) def self.settings init_settings class_variable_get(:@@settings) end def settings self.class.settings end # Init empty settings def self.init_settings if !class_variable_defined?(:@@settings) struct = Struct.new(:variables) class_variable_set(:@@settings, struct.new) settings.variables = {} end end # New variable for command # # == Example: # # class Map < Spark::Command::Base # variable :map_function # end # # command = Map.new(1) # # command.instance_variables # # => [:@map_function] # command.instance_variable_get(:@map_function) # # => 1 # def self.variable(name, options={}) if settings.variables.has_key?(name) error "Function #{name} already exist." end settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options) end # =============================================================================================== # Executing methods # Execute command for data and split index def execute(iterator, split_index) # Implemented on Base but can be override before_run # Run has to be implemented on child if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run) return lazy_run(iterator, split_index) end iterator = iterator.to_a run(iterator, split_index) end def prepared? !!@prepared end # This is called before execution. Executing will be stopped if # some command contains error (e.g. badly serialized lambda). # # == What is doing? # * evaluate lambda # * evaluate method # * make new lambda # def prepare return if prepared? 
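    # Each serialized function arrives as a Hash describing how to rebuild it; illustrative
    # shapes matching the case branches below:
    #   { type: 'proc',   content: "lambda{|x| x+1}" }
    #   { type: 'symbol', content: :+ }
    #   { type: 'method', name: 'add', content: "def add(x); x+1; end" }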
to_function = settings.variables.select {|_, options| options[:function]} to_function.each do |name, options| name = "@#{name}" data = instance_variable_get(name) case data[:type] when 'proc' result = eval(data[:content]) when 'symbol' result = lambda(&data[:content]) when 'method' # Method must me added to instance not Class instance_eval(data[:content]) # Method will be available as Proc result = lambda(&method(data[:name])) end instance_variable_set(name, result) end @prepared = true end # This method is called before every execution. def before_run end # =============================================================================================== # Bound objects attr_accessor :__objects__ def method_missing(method, *args, &block) if __objects__ && __objects__.has_key?(method) return __objects__[method] end super end end ================================================ FILE: lib/spark/command/basic.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # Map class Spark::Command::Map < _Base variable :map_function def run(iterator, *) iterator.map! do |item| @map_function.call(item) end iterator end def lazy_run(iterator, *) iterator.map do |item| @map_function.call(item) end end end # ------------------------------------------------------------------------------------------------- # FlatMap class Spark::Command::FlatMap < Spark::Command::Map def run(iterator, *) iterator = super iterator.flatten!(1) iterator end def lazy_run(iterator, *) iterator.flat_map do |item| @map_function.call(item) end end end # ------------------------------------------------------------------------------------------------- # MapPartitionsWithIndex class Spark::Command::MapPartitionsWithIndex < _Base variable :partition_function def run(iterator, index) iterator = @partition_function.call(iterator, index) iterator end # User should controll if there is Enumerator or not # alias_method :lazy_run, :run end # ------------------------------------------------------------------------------------------------- # MapPartitions class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex def run(iterator, *) # Do not use `super` because `@partition_function` can be method with 1 argument iterator = @partition_function.call(iterator) iterator end # alias_method :lazy_run, :run end # ------------------------------------------------------------------------------------------------- # Filter class Spark::Command::Filter < _Base variable :filter_function def run(iterator, *) iterator.select! do |item| @filter_function.call(item) end iterator end def lazy_run(iterator, *) iterator.select do |item| @filter_function.call(item) end end end # ------------------------------------------------------------------------------------------------- # Compact class Spark::Command::Compact < _Base def run(iterator, *) iterator.compact! iterator end def lazy_run(iterator, *) iterator.select do |item| !item.nil? 
end end end # ------------------------------------------------------------------------------------------------- # Glom class Spark::Command::Glom < _Base def run(iterator, *) [iterator] end def lazy_run(iterator, *) run(iterator.to_a) end end # ------------------------------------------------------------------------------------------------- # Shuffle class Spark::Command::Shuffle < _Base variable :seed, function: false, type: Integer def run(iterator, *) iterator.shuffle!(random: rng) iterator end def rng Random.new(@seed) end end # ------------------------------------------------------------------------------------------------- # PartitionBy class Spark::Command::PartitionBy class Base < Spark::Command::Base include Spark::Helper::Serialize def prepare super # Default. Keep it after super because Sorting has own key_function. @key_function ||= lambda{|x| x[0]} end def run(iterator, *) iterator.map! do |item| make_partition_item(item) end iterator.flatten!(1) iterator end def lazy_run(iterator, *) iterator.flat_map do |item| make_partition_item(item) end end private def make_partition_item(item) [ pack_long(@partition_func.call(@key_function[item])), item ] end end class Basic < Base variable :partition_func end class Sorting < Base variable :key_function variable :bounds, function: false, type: Array variable :ascending, function: false, type: [TrueClass, FalseClass] variable :num_partitions, function: false, type: Numeric def prepare super # Index by bisect alghoritm @partition_func ||= Proc.new do |key| count = 0 @bounds.each{|i| break if i >= key count += 1 } if @ascending count else @num_partitions - 1 - count end end end end # Sorting end # PartitionBy # ------------------------------------------------------------------------------------------------- # Aggregate class Spark::Command::Aggregate < _Base variable :reduce_func variable :zero_value, function: false, type: Object def run(iterator, *) [iterator.reduce(@zero_value, &@reduce_func)] end def lazy_run(iterator, *) run(iterator) end end # ------------------------------------------------------------------------------------------------- # Reduce class Spark::Command::Reduce < Spark::Command::Aggregate def run(iterator, *) [iterator.reduce(&@reduce_func)] end end # ------------------------------------------------------------------------------------------------- # Foreach class Spark::Command::Foreach < _Base variable :each_function def run(iterator, *) iterator.each do |item| @each_function.call(item) end nil end end # ------------------------------------------------------------------------------------------------- # ForeachPartition class Spark::Command::ForeachPartition < _Base variable :partition_function def run(iterator, *) @partition_function.call(iterator) nil end end # ------------------------------------------------------------------------------------------------- # KeyBy class Spark::Command::KeyBy < _Base variable :key_function def run(iterator, *) iterator.map! 
do |item| [@key_function.call(item), item] end iterator end def lazy_run(iterator, *) iterator.map do |item| [@key_function.call(item), item] end end end # ------------------------------------------------------------------------------------------------- # Take class Spark::Command::Take < _Base variable :total, function: false, type: Numeric variable :last_part, function: false, type: Numeric def run(iterator, index) if index == @last_part && iterator.size > @total return iterator.slice!(0, @total) end iterator end end # ------------------------------------------------------------------------------------------------- # Pipe class Spark::Command::Pipe < _Base variable :cmds, function: false, type: Array def before_run require 'open3' @in, @out, @threads = Open3.pipeline_rw(*@cmds) end def run(iterator, *) create_writing_thread(iterator) new_iterator = [] # Read full input begin loop { new_iterator << @out.readline.rstrip } rescue EOFError end new_iterator end def lazy_run(iterator, *) create_writing_thread(iterator) Enumerator::Lazy.new([nil]) do |yielder, _| begin loop { yielder << @out.readline.rstrip } rescue EOFError end end end private def create_writing_thread(iterator) @writing_thread = Thread.new do # Send complete iterator to the pipe iterator.each do |item| @in.puts(item.to_s.rstrip) end # Input must be closed for EOFError @in.close end end end ================================================ FILE: lib/spark/command/pair.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # CombineByKey class Spark::Command::CombineByKey # --------------- class Base < Spark::Command::Base def run(iterator, *) _run(iterator).to_a end def lazy_run(iterator, *) _run(iterator).lazy end end # --------------- class Combine < Base variable :create_combiner variable :merge_value def _run(iterator) # Not use combiners[key] ||= .. # it tests nil and not has_key? combiners = {} iterator.each do |key, value| if combiners.has_key?(key) combiners[key] = @merge_value.call(combiners[key], value) else combiners[key] = @create_combiner.call(value) end end combiners end end # --------------- class Merge < Base variable :merge_combiners def _run(iterator, *) combiners = {} iterator.each do |key, value| if combiners.has_key?(key) combiners[key] = @merge_combiners.call(combiners[key], value) else combiners[key] = value end end combiners end end # --------------- class CombineWithZero < Base variable :zero_value, function: false, type: Object variable :merge_value def _run(iterator) # Not use combiners[key] ||= .. # it tests nil and not has_key? combiners = {} iterator.each do |key, value| unless combiners.has_key?(key) combiners[key] = @zero_value end combiners[key] = @merge_value.call(combiners[key], value) end combiners end end # --------------- end # ------------------------------------------------------------------------------------------------- # MapValues class Spark::Command::MapValues < _Base variable :map_function def run(iterator, *) iterator.map! do |item| item[1] = @map_function.call(item[1]) item end iterator end def lazy_run(iterator, *) iterator.map do |item| item[1] = @map_function.call(item[1]) item end end end # ------------------------------------------------------------------------------------------------- # FlatMapValues class Spark::Command::FlatMapValues < _Base variable :map_function def run(iterator, *) iterator.map! 
do |(key, values)| values = @map_function.call(values) values.flatten!(1) values.map! do |value| [key, value] end end iterator.flatten!(1) iterator end end ================================================ FILE: lib/spark/command/sort.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # Sort class Spark::Command::SortByKey < _Base variable :key_function variable :ascending, function: false, type: [TrueClass, FalseClass] variable :spilling, function: false, type: [TrueClass, FalseClass] variable :memory, function: false, type: [Numeric, NilClass] variable :serializer, function: false, type: Spark::Serializer::Base # Currently disabled def before_run @spilling = false end def run(iterator, _) if @spilling iterator = run_with_spilling(iterator.each) else run_without_spilling(iterator) end iterator end def run_with_enum(iterator, _) if @spilling iterator = run_with_spilling(iterator) else iterator = iterator.to_a run_without_spilling(iterator) end iterator end private def run_with_spilling(iterator) sorter = Spark::ExternalSorter.new(@memory, @serializer) sorter.sort_by(iterator, @ascending, @key_function) end def run_without_spilling(iterator) iterator.sort_by!(&@key_function) iterator.reverse! unless @ascending end end ================================================ FILE: lib/spark/command/statistic.rb ================================================ _Base = Spark::Command::Base # ------------------------------------------------------------------------------------------------- # Sample class Spark::Command::Sample < _Base variable :with_replacement, function: false, type: [TrueClass, FalseClass] variable :fraction, function: false, type: Numeric variable :seed, function: false, type: [NilClass, Numeric] def run(iterator, _) sampler.sample(iterator) end def lazy_run(iterator, _) sampler.lazy_sample(iterator) end def sampler @sampler ||= _sampler end def _sampler if @with_replacement sampler = Spark::Sampler::Poisson else sampler = Spark::Sampler::Uniform end sampler = sampler.new(@fraction, @seed) end end # ------------------------------------------------------------------------------------------------- # Stats class Spark::Command::Stats < _Base def run(iterator, *) [Spark::StatCounter.new(iterator)] end def lazy_run(iterator, *) run(iterator) end end # ------------------------------------------------------------------------------------------------- # Histogram class Spark::Command::Histogram < _Base include Spark::Helper::Statistic variable :even, function: false, type: [TrueClass, FalseClass] variable :buckets, function: false, type: Array def run(iterator, *) counters = Array.new(counter_size) { 0 } iterator.each do |item| if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min next end x = bucket_function.call(item) if x.nil? # next else counters[x] += 1 end end [counters] end def lazy_run(iterator, *) run(iterator) end private def min @buckets.first end def max @buckets.last end def counter_size @buckets.size-1 end def increment @buckets[1]-@buckets[0] end # Decide which bucket function to pass. We decide here rather than having # a general function so that the decission need only be made once. def bucket_function @bucket_function ||= _bucket_function end def _bucket_function if @even fast_bucket_function else basic_bucket_function end end # Determine the bucket function in constant time. 
# Requires that buckets are evenly spaced def fast_bucket_function Proc.new do |item| if item.is_a?(Float) && item.nan? nil else bucket_number = (item - min)/increment if bucket_number > counter_size || bucket_number < 0 nil else [bucket_number.to_i, counter_size-1].min end end end end # Basic bucket function. Same as right bisect. def basic_bucket_function Proc.new do |item| bucket_number = bisect_right(@buckets, item) - 1 # Counters is @buckets.size - 1 # [bucket_number, counter_size-1].min if bucket_number > counter_size-1 counter_size-1 else bucket_number end end end end ================================================ FILE: lib/spark/command.rb ================================================ module Spark ## # Container which includes all commands and other things for worker # Every RDD have own copy of Command # class Command attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects def initialize @serializer = nil @deserializer = nil @commands = [] @libraries = [] @bound_objects = {} end def execute(iterator, split_index) # Require necessary libraries libraries.each{|lib| require lib} # Prepare bound objects @commands.each do |command| command.__objects__ = bound_objects end # Prepare for running @commands.each(&:prepare) # Run all task @commands.each do |command| iterator = command.execute(iterator, split_index) end # Return changed iterator. This is not be necessary for some tasks # because of using inplace changing but some task can return # only one value (for example reduce). iterator end def last @commands.last end def bound_objects # Objects from users # Already initialized objects on worker return @bound_objects if @bound_objects if @serialized_bound_objects # Still serialized @bound_objects = Marshal.load(@serialized_bound_objects) else # Something else @bound_objects = {} end end # Bound objects can depend on library which is loaded during @execute # In that case worker raise "undefined class/module" def marshal_dump [@serializer, @deserializer, @commands, @libraries, serialized_bound_objects] end def marshal_load(array) @serializer = array.shift @deserializer = array.shift @commands = array.shift @libraries = array.shift @serialized_bound_objects = array.shift end private def serialized_bound_objects @serialized_bound_objects ||= Marshal.dump(@bound_objects) end end end require 'spark/command/base' require 'spark/command/basic' require 'spark/command/pair' require 'spark/command/statistic' require 'spark/command/sort' ================================================ FILE: lib/spark/command_builder.rb ================================================ require 'spark/command_validator' module Spark ## # Builder for building correct {Spark::Command} # class CommandBuilder extend Forwardable include Spark::Helper::Serialize include Spark::Helper::System include Spark::CommandValidator attr_reader :command def_delegators :@command, :serializer, :serializer=, :deserializer, :deserializer=, :commands, :commands=, :libraries, :libraries=, :bound_objects, :bound_objects= def initialize(serializer, deserializer=nil) create_command self.serializer = serializer self.deserializer = deserializer || serializer.dup end def create_command @command = Spark::Command.new end # Do not user Marshal.dump(Marshal.load(self)) because some variables # have marshal_dump prepared for worker. 
def deep_copy copy = self.dup copy.create_command copy.serializer = self.serializer.deep_copy copy.deserializer = self.deserializer.deep_copy copy.commands = self.commands.dup copy.libraries = self.libraries.dup copy.bound_objects = self.bound_objects.dup copy end # Serialize Command class for worker # Java use signed number def build unpack_chars(Marshal.dump(@command)) end def add_command(klass, *args) variables = klass.settings.variables validate_size(variables, args) built_args = [] variables.values.zip(args) do |var, arg| if var[:function] arg = serialize_function(arg) end validate(arg, var) built_args << arg end comm = klass.new(*built_args) @command.commands << comm self end def add_library(*libraries) @command.libraries += libraries end def bind(objects) objects.symbolize_keys! @command.bound_objects.merge!(objects) end private # Serialized can be Proc and Method # # === Func # * *string:* already serialized proc # * *proc:* proc # * *symbol:* name of method # * *method:* Method class # def serialize_function(func) case func when String serialize_function_from_string(func) when Symbol serialize_function_from_symbol(func) when Proc serialize_function_from_proc(func) when Method serialize_function_from_method(func) else raise Spark::CommandError, 'You must enter String, Symbol, Proc or Method.' end end def serialize_function_from_string(string) {type: 'proc', content: string} end def serialize_function_from_symbol(symbol) {type: 'symbol', content: symbol} end # Serialize Proc as String # # lambda{|x| x*x}.to_source # # => "proc { |x| (x * x) }" # def serialize_function_from_proc(proc) serialize_function_from_string(proc.to_source) rescue raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.' end # Serialize method as string # # def test(x) # x*x # end # serialize_function_from_method(method(:test)) # # # => "def test(x)\n x*x\nend\n" # def serialize_function_from_method(meth) if pry? meth = Pry::Method.new(meth) end {type: 'method', name: meth.name, content: meth.source} rescue raise Spark::SerializeError, 'Method can not be serialized. Use full path or Proc.' end end end ================================================ FILE: lib/spark/command_validator.rb ================================================ module Spark module CommandValidator def validate(value, options) validate_type(value, options[:type]) end def valid?(value, options) begin validate(value, options) return true rescue return false end end def validate_type(value, types) types = [types] if !types.is_a?(Array) types.each do |type| return if value.is_a?(type) end error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}." end def validate_size(array1, array2) if array1.size != array2.size error "Wrong number of arguments (#{array1.size} for #{array2.size})" end end end end ================================================ FILE: lib/spark/config.rb ================================================ # Necessary libraries Spark.load_lib module Spark # Common configuration for RubySpark and Spark class Config include Spark::Helper::System TYPES = { 'spark.shuffle.spill' => :boolean, 'spark.ruby.serializer.compress' => :boolean } # Initialize java SparkConf and load default configuration. 
def initialize @spark_conf = SparkConf.new(true) set_default from_file(Spark::DEFAULT_CONFIG_FILE) end def from_file(file) check_read_only if file && File.exist?(file) file = File.expand_path(file) RubyUtils.loadPropertiesFile(spark_conf, file) end end def [](key) get(key) end def []=(key, value) set(key, value) end def spark_conf if Spark.started? # Get latest configuration Spark.context.jcontext.conf else @spark_conf end end def valid! errors = [] if !contains?('spark.app.name') errors << 'An application name must be set in your configuration.' end if !contains?('spark.master') errors << 'A master URL must be set in your configuration.' end if Spark::Serializer.find(get('spark.ruby.serializer')).nil? errors << 'Unknow serializer.' end scanned = get('spark.ruby.executor.command').scan('%s') if scanned.size == 0 errors << "Executor command must contain '%s'." end if scanned.size > 1 errors << "Executor command can contain only one '%s'." end if errors.any? errors.map!{|error| "- #{error}"} raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}" end end def read_only? Spark.started? end # Rescue from NoSuchElementException def get(key) value = spark_conf.get(key.to_s) case TYPES[key] when :boolean parse_boolean(value) when :integer parse_integer(value) else value end rescue nil end def get_all Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}] end def contains?(key) spark_conf.contains(key.to_s) end def set(key, value) check_read_only spark_conf.set(key.to_s, value.to_s) end def set_app_name(name) set('spark.app.name', name) end def set_master(master) set('spark.master', master) end def parse_boolean(value) case value when 'true' true when 'false' false end end def parse_integer(value) value.to_i end # ============================================================================= # Defaults def set_default set_app_name('RubySpark') set_master('local[*]') set('spark.ruby.driver_home', Spark.home) set('spark.ruby.serializer', default_serializer) set('spark.ruby.serializer.compress', default_serializer_compress) set('spark.ruby.serializer.batch_size', default_serializer_batch_size) set('spark.ruby.executor.command', default_executor_command) set('spark.ruby.executor.options', default_executor_options) set('spark.ruby.worker.type', default_worker_type) load_executor_envs # set('spark.ruby.executor.install', default_executor_install) end def default_serializer ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME end def default_serializer_compress ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS end def default_serializer_batch_size ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE end # Command template which is applied when scala want create a ruby # process (e.g. master, home request). Command is represented by '%s'. # # == Example: # bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s" # def default_executor_command ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s' end # Options for every worker. # # == Example: # -J-Xmx512m # def default_executor_options ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || '' end # # Install command which is triggered before on start. # # This command using executor command template. # # # # == Example: # # gem install ruby-spark -v 1.2.0 # # # def default_executor_install # ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || '' # end # Type of worker. 
# # == Options: # process:: (default) # thread:: (experimental) # def default_worker_type ENV['SPARK_RUBY_WORKER_TYPE'] || 'process' end # Load environment variables for executor from ENV. # # == Examples: # SPARK_RUBY_EXECUTOR_ENV_KEY1="1" # SPARK_RUBY_EXECUTOR_ENV_KEY2="2" # def load_executor_envs prefix = 'SPARK_RUBY_EXECUTOR_ENV_' envs = ENV.select{|key, _| key.start_with?(prefix)} envs.each do |key, value| key = key.dup # ENV keys are frozen key.slice!(0, prefix.size) set("spark.ruby.executor.env.#{key}", value) end end # Aliases alias_method :getAll, :get_all alias_method :setAppName, :set_app_name alias_method :setMaster, :set_master private def check_read_only if read_only? raise Spark::ConfigurationError, 'Configuration is ready only' end end end end ================================================ FILE: lib/spark/constant.rb ================================================ module Spark # Commond constant for Ruby and Spark module Constant DATA_EOF = -2 WORKER_ERROR = -1 WORKER_DONE = 0 CREATE_WORKER = 1 KILL_WORKER = 2 KILL_WORKER_AND_WAIT = 3 SUCCESSFULLY_KILLED = 4 UNSUCCESSFUL_KILLING = 5 ACCUMULATOR_ACK = 6 end end ================================================ FILE: lib/spark/context.rb ================================================ # Necessary libraries Spark.load_lib module Spark ## # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster. # class Context include Spark::Helper::System include Spark::Helper::Parser include Spark::Helper::Logger attr_reader :jcontext, :jaccumulator, :temp_dir # Constructor for Ruby context. Configuration is automatically is taken # from Spark. Config will be automatically set to default if user start # context first. # def initialize Spark.config.valid! @jcontext = JavaSparkContext.new(Spark.config.spark_conf) @jcontext.addJar(Spark.ruby_spark_jar) # Does not work on 1.2 # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG))) spark_local_dir = JUtils.getLocalDir(sc.conf) @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath accum_server = Spark::Accumulator::Server accum_server.start @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port)) log_info("Ruby accumulator server is running on port #{accum_server.port}") set_call_site('Ruby') # description of stage end def inspect result = %{#<#{self.class.name}:0x#{object_id}\n} result << %{Tempdir: "#{temp_dir}">} result end def stop Spark::Accumulator::Server.stop log_info('Ruby accumulator server was stopped') @jcontext.stop end def sc @jcontext.sc end def ui sc.ui end # Default level of parallelism to use when not given by user (e.g. 
parallelize and makeRDD) # def default_parallelism sc.defaultParallelism end # Default serializer # # Batch -> Compress -> Basic # def default_serializer # Basic serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new # Compress if config('spark.ruby.serializer.compress') serializer = Spark::Serializer.compressed(serializer) end # Bactching batch_size = default_batch_size if batch_size == 'auto' serializer = Spark::Serializer.auto_batched(serializer) else serializer = Spark::Serializer.batched(serializer, batch_size) end # Finally, "container" contains serializers serializer end def default_batch_size size = config('spark.ruby.serializer.batch_size').to_i if size >= 1 size else 'auto' end end # Set a local property that affects jobs submitted from this thread, such as the # Spark fair scheduler pool. # def set_local_property(key, value) jcontext.setLocalProperty(key, value) end # Get a local property set in this thread, or null if it is missing # def get_local_property(key) jcontext.getLocalProperty(key) end # Support function for API backtraces. # def set_call_site(site) jcontext.setCallSite(site) end def clear_call_site jcontext.clearCallSite end # Return a copy of this SparkContext's configuration. The configuration *cannot* # be changed at runtime. # def config(key=nil) if key Spark.config.get(key) else Spark.config end end # Add a file to be downloaded with this Spark job on every node. # The path of file passed can be either a local file, a file in HDFS # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI. # # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the # filename to find its download location. # # == Example: # `echo 10 > test.txt` # # $sc.add_file('test.txt') # $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect # # => [0, 10, 20, 30, 40, 50] # def add_file(*files) files.each do |file| sc.addFile(file) end end # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast # object for reading it in distributed functions. The variable will # be sent to each cluster only once. # # == Example: # broadcast1 = $sc.broadcast('a') # broadcast2 = $sc.broadcast('b') # # rdd = $sc.parallelize(0..5, 4) # rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2) # rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] }) # rdd.collect # # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"] # def broadcast(value) Spark::Broadcast.new(self, value) end # Create an Accumulator with the given initial value, using a given # accum_param helper object to define how to add values of the # data type if provided. 
# # == Example: # accum = $sc.accumulator(7) # # rdd = $sc.parallelize(0..5, 4) # rdd = rdd.bind(accum: accum) # rdd = rdd.map_partitions(lambda{|_| accum.add(1) }) # rdd = rdd.collect # # accum.value # # => 11 # def accumulator(value, accum_param=:+, zero_value=0) Spark::Accumulator.new(value, accum_param, zero_value) end # Distribute a local Ruby collection to form an RDD # Direct method can be slow so be careful, this method update data inplace # # == Parameters: # data:: Range or Array # num_slices:: number of slice # serializer:: custom serializer (default: serializer based on configuration) # # == Examples: # $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect # #=> [1, 2, 3] # # $sc.parallelize(1..3).map(:to_s).collect # #=> ["1", "2", "3"] # def parallelize(data, num_slices=nil, serializer=nil) num_slices ||= default_parallelism serializer ||= default_serializer serializer.check_each(data) # Through file file = Tempfile.new('to_parallelize', temp_dir) serializer.dump_to_io(data, file) file.close # not unlink jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices) Spark::RDD.new(jrdd, self, serializer) ensure file && file.unlink end # Read a text file from HDFS, a local file system (available on all nodes), or any # Hadoop-supported file system URI, and return it as an RDD of Strings. # # == Example: # f = Tempfile.new("test") # f.puts("1") # f.puts("2") # f.close # # $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect # # => [1, 2] # def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil) min_partitions ||= default_parallelism serializer ||= default_serializer deserializer = Spark::Serializer.build { __text__(encoding) } Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer) end # Read a directory of text files from HDFS, a local file system (available on all nodes), or any # Hadoop-supported file system URI. Each file is read as a single record and returned in a # key-value pair, where the key is the path of each file, the value is the content of each file. # # == Example: # dir = Dir.mktmpdir # f1 = Tempfile.new("test1", dir) # f2 = Tempfile.new("test2", dir) # f1.puts("1"); f1.puts("2"); # f2.puts("3"); f2.puts("4"); # f1.close # f2.close # # $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect # # => ["1", "2", "3", "4"] # def whole_text_files(path, min_partitions=nil, serializer=nil) min_partitions ||= default_parallelism serializer ||= default_serializer deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) } Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer) end # Executes the given partition function f on the specified set of partitions, # returning the result as an array of elements. # # If partitions is not specified, this will run over all partitions. # # == Example: # rdd = $sc.parallelize(0..10, 5) # $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2]) # # => ["[0, 1]", "[4, 5]"] # def run_job(rdd, f, partitions=nil, allow_local=false) run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f) end # Execute the given command on specific set of partitions. # def run_job_with_command(rdd, partitions, allow_local, command, *args) if !partitions.nil? && !partitions.is_a?(Array) raise Spark::ContextError, 'Partitions must be nil or Array' end partitions_size = rdd.partitions_size # Execute all parts if partitions.nil? 
partitions = (0...partitions_size).to_a end # Can happend when you use coalesce partitions.delete_if {|part| part >= partitions_size} # Rjb represent Fixnum as Integer but Jruby as Long partitions = to_java_array_list(convert_to_java_int(partitions)) # File for result file = Tempfile.new('collect', temp_dir) mapped = rdd.new_rdd_from_command(command, *args) RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path) mapped.collect_from_file(file) end # Aliases alias_method :textFile, :text_file alias_method :wholeTextFiles, :whole_text_files alias_method :defaultParallelism, :default_parallelism alias_method :setLocalProperty, :set_local_property alias_method :getLocalProperty, :get_local_property alias_method :setCallSite, :set_call_site alias_method :clearCallSite, :clear_call_site alias_method :runJob, :run_job alias_method :runJobWithCommand, :run_job_with_command alias_method :addFile, :add_file end end ================================================ FILE: lib/spark/error.rb ================================================ module Spark # Extension cannot be built class BuildError < StandardError end # Proc.to_source # Java object cannot be converted class SerializeError < StandardError end # Serializer method # Non-existing serializer class NotImplemented < StandardError end # Missison app_name or master class ConfigurationError < StandardError end # Wrong parameters class RDDError < StandardError end # Validations class CommandError < StandardError end # Parser helper # SQL DataType class ParseError < StandardError end # Validation in context class ContextError < StandardError end # Broadcasts # Missing path class BroadcastError < StandardError end # Accumulators # Existing keys # Wrong ID class AccumulatorError < StandardError end # Wrong instances class MllibError < StandardError end # Wrong datatype class SQLError < StandardError end # Missing Java class class JavaBridgeError < StandardError end end ================================================ FILE: lib/spark/ext/hash.rb ================================================ module Spark module CoreExtension module Hash module ClassMethods end module InstanceMethods # Destructively convert all keys to strings. def stringify_keys_with_spark! transform_keys!{ |key| key.to_s } end # Destructively convert all keys to symbols, as long as they respond def symbolize_keys_with_spark! transform_keys!{ |key| key.to_sym rescue key } end # Destructively convert all keys using the block operations. # Same as transform_keys but modifies +self+. def transform_keys_with_spark! 
keys.each do |key| self[yield(key)] = delete(key) end self end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do patch_unless_exist :stringify_keys!, :spark patch_unless_exist :symbolize_keys!, :spark patch_unless_exist :transform_keys!, :spark end end end end end Hash.__send__(:include, Spark::CoreExtension::Hash) ================================================ FILE: lib/spark/ext/integer.rb ================================================ module Spark module CoreExtension module Integer module ClassMethods end module InstanceMethods end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do const_set :MAX_WITH_SPARK, 1 << (1.size * 8 - 2) - 1 const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1 path_const_unless_exist :MAX, :SPARK path_const_unless_exist :MIN, :SPARK end end end end end Integer.__send__(:include, Spark::CoreExtension::Integer) ================================================ FILE: lib/spark/ext/io.rb ================================================ module Spark module CoreExtension module IO module ClassMethods end module InstanceMethods # Reading def read_int unpack_int(read(4)) end def read_int_or_eof bytes = read(4) return Spark::Constant::DATA_EOF if bytes.nil? unpack_int(bytes) end def read_long unpack_long(read(8)) end def read_string read(read_int) end def read_data Marshal.load(read_string) end # Writing def write_int(data) write(pack_int(data)) end def write_long(data) write(pack_long(data)) end # Size and data can have different encoding # Marshal: both ASCII # Oj: ASCII and UTF-8 def write_string(data) write_int(data.bytesize) write(data) end def write_data(data) write_string(Marshal.dump(data)) end end def self.included(base) base.extend(ClassMethods) base.send(:include, Spark::Helper::Serialize) base.send(:include, InstanceMethods) end end end end IO.__send__(:include, Spark::CoreExtension::IO) StringIO.__send__(:include, Spark::CoreExtension::IO) ================================================ FILE: lib/spark/ext/ip_socket.rb ================================================ module Spark module CoreExtension module IPSocket module ClassMethods end module InstanceMethods def port addr[1] end def hostname addr(true)[2] end def numeric_address addr[3] end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) end end end end IPSocket.__send__(:include, Spark::CoreExtension::IPSocket) ================================================ FILE: lib/spark/ext/module.rb ================================================ module Spark module CoreExtension module Module # Patch method to class unless already exist # # == Example: # # class Hash # def a # 1 # end # end # # module HashExtension # module InstanceMethods # def a_with_spark # 2 # end # # def b_with_spark # 1 # end # end # # def self.included(base) # base.send(:include, InstanceMethods) # base.class_eval do # patch_unless_exist :a, :spark # patch_unless_exist :b, :spark # end # end # end # # Hash.include(HashExtension) # # Hash.new.a # => 1 # Hash.new.b # => 1 # def patch_unless_exist(target, suffix) unless method_defined?(target) aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1 alias_method target, "#{aliased_target}_with_#{suffix}#{punctuation}" end end def path_const_unless_exist(target, suffix) unless const_defined?(target) const_set(target, const_get("#{target}_WITH_#{suffix}")) end end end end end Module.__send__(:include, 
Spark::CoreExtension::Module) ================================================ FILE: lib/spark/ext/object.rb ================================================ module Spark module CoreExtension module Object module ClassMethods end module InstanceMethods def deep_copy_with_spark Marshal.load(Marshal.dump(self)) end def silence_warnings old_verbose, $VERBOSE = $VERBOSE, nil yield ensure $VERBOSE = old_verbose end def cattr_reader_with_spark(*syms) syms.each do |sym| raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/ class_eval(<<-EOS, __FILE__, __LINE__ + 1) @@#{sym} = nil unless defined? @@#{sym} def self.#{sym} @@#{sym} end EOS class_eval(<<-EOS, __FILE__, __LINE__ + 1) def #{sym} @@#{sym} end EOS end end def cattr_writer_with_spark(*syms) syms.each do |sym| raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/ class_eval(<<-EOS, __FILE__, __LINE__ + 1) @@#{sym} = nil unless defined? @@#{sym} def self.#{sym}=(obj) @@#{sym} = obj end EOS class_eval(<<-EOS, __FILE__, __LINE__ + 1) def #{sym}=(obj) @@#{sym} = obj end EOS end end def cattr_accessor_with_spark(*syms) cattr_reader_with_spark(*syms) cattr_writer_with_spark(*syms) end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do patch_unless_exist :deep_copy, :spark patch_unless_exist :silence_warnings, :spark patch_unless_exist :cattr_accessor, :spark end end end end end Object.__send__(:include, Spark::CoreExtension::Object) ================================================ FILE: lib/spark/ext/string.rb ================================================ module Spark module CoreExtension module String module ClassMethods end module InstanceMethods def camelize_with_spark self.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase } end end def self.included(base) base.extend(ClassMethods) base.send(:include, InstanceMethods) base.class_eval do patch_unless_exist :camelize, :spark end end end end end String.__send__(:include, Spark::CoreExtension::String) ================================================ FILE: lib/spark/helper/logger.rb ================================================ module Spark module Helper module Logger def self.included(base) base.send :extend, Methods base.send :include, Methods end module Methods def log_info(message) Spark.logger.info(message) end def log_debug(message) Spark.logger.debug(message) end def log_trace(message) Spark.logger.trace(message) end def log_warning(message) Spark.logger.warning(message) end def log_error(message) Spark.logger.error(message) end alias_method :logInfo, :log_info alias_method :logDebug, :log_debug alias_method :logTrace, :log_trace alias_method :logWarning, :log_warning alias_method :logError, :log_error end # Methods end # Logger end # Helper end # Spark ================================================ FILE: lib/spark/helper/parser.rb ================================================ module Spark module Helper module Parser def self.included(base) base.send :extend, Methods base.send :include, Methods end module Methods def to_java_hash(hash) hash_map = HashMap.new hash.each_pair do |key, value| begin # RJB raise Object is NULL (but new record is put correctly) hash_map.put(key, value) rescue RuntimeError end end hash_map end def convert_to_java_int(data) if data.is_a?(Array) data.map{|x| JInteger.new(x)} else JInteger.new(data) end end def to_java_array_list(array) array_list = ArrayList.new array.each do |item| array_list.add(item) end array_list 
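# == Example (illustrative; requires the imported JVM classes ArrayList and JInteger):
#   to_java_array_list([1, 2, 3])   # => java.util.ArrayList [1, 2, 3]
#   convert_to_java_int(5)          # => java.lang.Integer 5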
end # Parse and convert memory size. Shifting be better but Float doesn't support it. # # == Examples: # to_memory_size("512mb") # # => 524288 # # to_memory_size("512 MB") # # => 524288 # # to_memory_size("512mb", "GB") # # => 0.5 # def to_memory_size(memory, result_unit="KB") match = memory.match(/([\d]+)[\s]*([\w]*)/) if match.nil? raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'" end size = match[1].to_f unit = match[2] size *= memory_multiplier_based_kb(unit) size /= memory_multiplier_based_kb(result_unit) size.round(2) end # Based to KB def memory_multiplier_based_kb(type) case type.to_s.upcase when "G", "GB" 1048576 when "M", "MB" 1024 when "K", "KB" 1 else raise Spark::ParseError, "Unsupported type #{type}" end end end # Methods end # Parser end # Helper end # Spark ================================================ FILE: lib/spark/helper/serialize.rb ================================================ module Spark module Helper module Serialize DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>' DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*' DIRECTIVE_LONG_BIG_ENDIAN = 'q>' DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*' DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G' DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*' DIRECTIVE_UNSIGNED_CHARS = 'C*' DIRECTIVE_CHARS = 'c*' # Packing def pack_int(data) [data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN) end def pack_long(data) [data].pack(DIRECTIVE_LONG_BIG_ENDIAN) end def pack_double(data) [data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN) end def pack_unsigned_chars(data) data.pack(DIRECTIVE_UNSIGNED_CHARS) end def pack_ints(data) __check_array(data) data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN) end def pack_longs(data) __check_array(data) data.pack(DIRECTIVE_LONGS_BIG_ENDIAN) end def pack_doubles(data) __check_array(data) data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN) end # Unpacking def unpack_int(data) data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0] end def unpack_long(data) data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0] end def unpack_chars(data) data.unpack(DIRECTIVE_CHARS) end private def __check_array(data) unless data.is_a?(Array) raise ArgumentError, 'Data must be an Array.' end end end end end ================================================ FILE: lib/spark/helper/statistic.rb ================================================ module Spark module Helper module Statistic # Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time. # # == How the sampling rate is determined: # Let p = num / total, where num is the sample size and total is the total number of # datapoints in the RDD. We're trying to compute q > p such that # * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), # where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total), # i.e. the failure rate of not having a sufficiently large sample < 0.0001. # Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for # num > 12, but we need a slightly larger q (9 empirically determined). # * when sampling without replacement, we're drawing each datapoint with prob_i # ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success # rate, where success rate is defined the same as in sampling with replacement. 
# def compute_fraction(lower_bound, total, with_replacement) lower_bound = lower_bound.to_f if with_replacement upper_poisson_bound(lower_bound) / total else fraction = lower_bound / total upper_binomial_bound(0.00001, total, fraction) end end def upper_poisson_bound(bound) num_std = if bound < 6 12 elsif bound < 16 9 else 6 end.to_f [bound + num_std * Math.sqrt(bound), 1e-10].max end def upper_binomial_bound(delta, total, fraction) gamma = -Math.log(delta) / total [1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min end # Bisect right # # == Examples: # data = [1,5,6,8,96,120,133] # # bisect_right(data, 0) # => 0 # bisect_right(data, 1) # => 1 # bisect_right(data, 5) # => 2 # bisect_right(data, 9) # => 4 # bisect_right(data, 150) # => 7 # def bisect_right(data, value, low=0, high=data.size) if low < 0 raise ArgumentError, 'Low must be >= 0.' end while low < high mid = (low + high) / 2 if value < data[mid] high = mid else low = mid + 1 end end low end # Determine bound of partitioning # # == Example: # data = [0,1,2,3,4,5,6,7,8,9,10] # determine_bounds(data, 3) # # => [3, 7] # def determine_bounds(data, num_partitions) if num_partitions > data.size return data end bounds = [] count = data.size (0...(num_partitions-1)).each do |index| bounds << data[count * (index+1) / num_partitions] end bounds end end end end ================================================ FILE: lib/spark/helper/system.rb ================================================ module Spark module Helper module System def self.included(base) base.send :extend, Methods base.send :include, Methods end module Methods def windows? RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ end def mri? RbConfig::CONFIG['ruby_install_name'] == 'ruby' end def jruby? RbConfig::CONFIG['ruby_install_name'] == 'jruby' end def pry? !!Thread.current[:__pry__] end # Memory usage in kb def memory_usage if jruby? runtime = java.lang.Runtime.getRuntime (runtime.totalMemory - runtime.freeMemory) >> 10 elsif windows? 
# not yet else `ps -o rss= -p #{Process.pid}`.to_i end end end # Methods end # System end # Helper end # Spark ================================================ FILE: lib/spark/helper.rb ================================================ module Spark module Helper autoload :System, "spark/helper/system" autoload :Logger, "spark/helper/logger" autoload :Statistic, "spark/helper/statistic" autoload :Serialize, "spark/helper/serialize" autoload :Partition, "spark/helper/partition" autoload :Parser, "spark/helper/parser" end end ================================================ FILE: lib/spark/java_bridge/base.rb ================================================ ## # Spark::JavaBridge::Base # # Parent for all adapter (ruby - java) # module Spark module JavaBridge class Base include Spark::Helper::System JAVA_OBJECTS = [ 'java.util.ArrayList', 'scala.collection.mutable.HashMap', 'org.apache.spark.SparkConf', 'org.apache.spark.api.java.JavaSparkContext', 'org.apache.spark.api.ruby.RubyRDD', 'org.apache.spark.api.ruby.RubyUtils', 'org.apache.spark.api.ruby.RubyWorker', 'org.apache.spark.api.ruby.PairwiseRDD', 'org.apache.spark.api.ruby.RubyAccumulatorParam', 'org.apache.spark.api.ruby.RubySerializer', 'org.apache.spark.api.python.PythonRDD', 'org.apache.spark.api.python.PythonPartitioner', 'org.apache.spark.api.python.PythonUtils', 'org.apache.spark.ui.ruby.RubyTab', 'org.apache.spark.mllib.api.ruby.RubyMLLibAPI', :JInteger => 'java.lang.Integer', :JLong => 'java.lang.Long', :JLogger => 'org.apache.log4j.Logger', :JLevel => 'org.apache.log4j.Level', :JPriority => 'org.apache.log4j.Priority', :JUtils => 'org.apache.spark.util.Utils', :JDataType => 'org.apache.spark.sql.types.DataType', :JSQLContext => 'org.apache.spark.sql.SQLContext', :JDenseVector => 'org.apache.spark.mllib.linalg.DenseVector', :JDenseMatrix => 'org.apache.spark.mllib.linalg.DenseMatrix', :JStorageLevel => 'org.apache.spark.storage.StorageLevel', :JSQLFunctions => 'org.apache.spark.sql.functions' ] JAVA_TEST_OBJECTS = [ 'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI' ] RUBY_TO_JAVA_SKIP = [Fixnum, Integer] def initialize(target) @target = target end # Import all important classes into Objects def import_all return if @imported java_objects.each do |name, klass| import(name, klass) end @imported = true nil end # Import classes for testing def import_all_test return if @imported_test java_test_objects.each do |name, klass| import(name, klass) end @imported_test = true nil end # Call java object def call(klass, method, *args) # To java args.map!{|item| to_java(item)} # Call java result = klass.__send__(method, *args) # To ruby to_ruby(result) end def to_array_list(array) array_list = ArrayList.new array.each do |item| array_list.add(to_java(item)) end array_list end def to_seq(array) PythonUtils.toSeq(to_array_list(array)) end def to_long(number) return nil if number.nil? 
JLong.new(number) end def to_java(object) if RUBY_TO_JAVA_SKIP.include?(object.class) # Some object are convert automatically # This is for preventing errors # For example: jruby store integer as long so 1.to_java is Long object elsif object.respond_to?(:to_java) object.to_java elsif object.is_a?(Array) to_array_list(object) else object end end # Array problem: # Rjb: object.toArray -> Array # Jruby: object.toArray -> java.lang.Object # def to_ruby(object) # Java object if java_object?(object) class_name = object.getClass.getSimpleName case class_name when 'ArraySeq' result = [] iterator = object.iterator while iterator.hasNext result << to_ruby(iterator.next) end result when 'Map2', 'Map3', 'Map4', 'HashTrieMap' Hash[ object.toSeq.array.to_a.map!{|item| [item._1, item._2]} ] when 'SeqWrapper'; object.toArray.to_a.map!{|item| to_ruby(item)} when 'ofRef'; object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object) when 'DenseVector'; Spark::Mllib::DenseVector.from_java(object) when 'KMeansModel'; Spark::Mllib::KMeansModel.from_java(object) when 'DenseMatrix'; Spark::Mllib::DenseMatrix.from_java(object) when 'GenericRowWithSchema'; Spark::SQL::Row.from_java(object, true) else # Some RDD if class_name != 'JavaRDD' && class_name.end_with?('RDD') object = object.toJavaRDD class_name = 'JavaRDD' end # JavaRDD if class_name == 'JavaRDD' jrdd = RubyRDD.toRuby(object) serializer = Spark::Serializer.build { __batched__(__marshal__) } serializer = Spark::Serializer.build { __batched__(__marshal__, 2) } return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer) end # Unknow Spark.logger.warn("Java object '#{object.getClass.name}' was not converted.") object end # Array can be automatically transfered but content not elsif object.is_a?(Array) object.map! do |item| to_ruby(item) end object # Already transfered else object end end alias_method :java_to_ruby, :to_ruby alias_method :ruby_to_java, :to_java private def jars result = Dir.glob(File.join(@target, '*.jar')) result.flatten! result end def objects_with_names(objects) hash = {} objects.each do |object| if object.is_a?(Hash) hash.merge!(object) else key = object.split('.').last.to_sym hash[key] = object end end hash end def java_objects objects_with_names(JAVA_OBJECTS) end def java_test_objects objects_with_names(JAVA_TEST_OBJECTS) end def raise_missing_class(klass) raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark is assembled." 
end end end end ================================================ FILE: lib/spark/java_bridge/jruby.rb ================================================ require 'java' module Spark module JavaBridge class JRuby < Base def initialize(*args) super jars.each {|jar| require jar} end def import(name, klass) klass = "Java::#{klass}" Object.const_set(name, eval(klass)) rescue NameError raise_missing_class(klass) end def java_object?(object) object.is_a?(JavaProxy) end end end end ================================================ FILE: lib/spark/java_bridge/rjb.rb ================================================ if !ENV.has_key?('JAVA_HOME') raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set' end require 'rjb' module Spark module JavaBridge class RJB < Base def initialize(*args) super Rjb.load(jars) Rjb.primitive_conversion = true end def import(name, klass) Object.const_set(name, silence_warnings { Rjb.import(klass) }) rescue NoClassDefFoundError raise_missing_class(klass) end def java_object?(object) object.is_a?(Rjb::Rjb_JavaProxy) end private def jars separator = windows? ? ';' : ':' super.join(separator) end end end end ================================================ FILE: lib/spark/java_bridge.rb ================================================ module Spark module JavaBridge autoload :Base, 'spark/java_bridge/base' autoload :JRuby, 'spark/java_bridge/jruby' autoload :RJB, 'spark/java_bridge/rjb' include Spark::Helper::System def self.init(*args) if jruby? klass = JRuby else klass = RJB end klass.new(*args) end end end ================================================ FILE: lib/spark/library.rb ================================================ module Spark module Library def autoload(klass, location, import=true) if import @for_importing ||= [] @for_importing << klass end super(klass, location) end def autoload_without_import(klass, location) autoload(klass, location, false) end def import(to=Object) @for_importing.each do |klass| to.const_set(klass, const_get(klass)) end nil end end end ================================================ FILE: lib/spark/logger.rb ================================================ # Necessary libraries Spark.load_lib module Spark class Logger attr_reader :jlogger def initialize @jlogger = JLogger.getLogger('Ruby') end def level_off JLevel.toLevel('OFF') end # Disable all Spark log def disable jlogger.setLevel(level_off) JLogger.getLogger('org').setLevel(level_off) JLogger.getLogger('akka').setLevel(level_off) JLogger.getRootLogger.setLevel(level_off) end def enabled? !disabled? end def info(message) jlogger.info(message) if info? end def debug(message) jlogger.debug(message) if debug? end def trace(message) jlogger.trace(message) if trace? end def warning(message) jlogger.warn(message) if warning? end def error(message) jlogger.error(message) if error? end def info? level_enabled?('info') end def debug? level_enabled?('debug') end def trace? level_enabled?('trace') end def warning? level_enabled?('warn') end def error? 
level_enabled?('error') end def level_enabled?(type) jlogger.isEnabledFor(JPriority.toPriority(type.upcase)) end alias_method :warn, :warning end end ================================================ FILE: lib/spark/mllib/classification/common.rb ================================================ module Spark module Mllib class ClassificationModel attr_reader :weights, :intercept, :threshold def initialize(weights, intercept) @weights = Spark::Mllib::Vectors.to_vector(weights) @intercept = intercept.to_f @threshold = nil end def threshold=(value) @threshold = value.to_f end def clear_threshold @threshold = nil end end end end module Spark module Mllib class ClassificationMethodBase < RegressionMethodBase end end end ================================================ FILE: lib/spark/mllib/classification/logistic_regression.rb ================================================ module Spark module Mllib ## # LogisticRegressionModel # # A linear binary classification model derived from logistic regression. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0, 1.0]), # LabeledPoint.new(1.0, [1.0, 0.0]), # ] # lrm = LogisticRegressionWithSGD.train($sc.parallelize(data)) # # lrm.predict([1.0, 0.0]) # # => 1 # lrm.predict([0.0, 1.0]) # # => 0 # # lrm.clear_threshold # lrm.predict([0.0, 1.0]) # # => 0.123... # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})), # LabeledPoint.new(0.0, SparseVector.new(2, {0 => 1.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0})) # ] # lrm = LogisticRegressionWithSGD.train($sc.parallelize(data)) # # lrm.predict([0.0, 1.0]) # # => 1 # lrm.predict([1.0, 0.0]) # # => 0 # lrm.predict(SparseVector.new(2, {1 => 1.0})) # # => 1 # lrm.predict(SparseVector.new(2, {0 => 1.0})) # # => 0 # # # # LogisticRegressionWithLBFGS # data = [ # LabeledPoint.new(0.0, [0.0, 1.0]), # LabeledPoint.new(1.0, [1.0, 0.0]), # ] # lrm = LogisticRegressionWithLBFGS.train($sc.parallelize(data)) # # lrm.predict([1.0, 0.0]) # # => 1 # lrm.predict([0.0, 1.0]) # # => 0 # class LogisticRegressionModel < ClassificationModel def initialize(*args) super @threshold = 0.5 end # Predict values for a single data point or an RDD of points using # the model trained. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) margin = weights.dot(vector) + intercept score = 1.0 / (1.0 + Math.exp(-margin)) if threshold.nil? return score end if score > threshold 1 else 0 end end end end end module Spark module Mllib class LogisticRegressionWithSGD < ClassificationMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, mini_batch_fraction: 1.0, initial_weights: nil, reg_param: 0.01, reg_type: 'l2', intercept: false, validate: true, convergence_tol: 0.001 } # Train a logistic regression model on the given data. # # == Arguments: # rdd:: # The training data, an RDD of LabeledPoint. # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration. # # initial_weights:: # The initial weights (default: nil). # # reg_param:: # The regularizer parameter (default: 0.01). # # reg_type:: # The type of regularizer used for training our model (default: "l2"). 
# # Allowed values: # - "l1" for using L1 regularization # - "l2" for using L2 regularization # - nil for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:reg_param].to_f, options[:reg_type], options[:intercept], options[:validate], options[:convergence_tol]) LogisticRegressionModel.new(weights, intercept) end end end end module Spark module Mllib class LogisticRegressionWithLBFGS < ClassificationMethodBase DEFAULT_OPTIONS = { iterations: 100, initial_weights: nil, reg_param: 0.01, reg_type: 'l2', intercept: false, corrections: 10, tolerance: 0.0001 } # Train a logistic regression model on the given data. # # == Arguments: # rdd:: # The training data, an RDD of LabeledPoint. # # iterations:: # The number of iterations (default: 100). # # initial_weights:: # The initial weights (default: nil). # # reg_param:: # The regularizer parameter (default: 0.01). # # reg_type:: # The type of regularizer used for training our model (default: "l2"). # # Allowed values: # - "l1" for using L1 regularization # - "l2" for using L2 regularization # - nil for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # # corrections:: # The number of corrections used in the LBFGS update (default: 10). # # tolerance:: # The convergence tolerance of iterations for L-BFGS (default: 0.0001). # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithLBFGS', rdd, options[:iterations].to_i, options[:initial_weights], options[:reg_param].to_f, options[:reg_type], options[:intercept], options[:corrections].to_i, options[:tolerance].to_f) LogisticRegressionModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/classification/naive_bayes.rb ================================================ module Spark module Mllib ## # NaiveBayesModel # # Model for Naive Bayes classifiers. 
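#
# Prediction (see NaiveBayesModel#predict below) scores each class as
# theta.dot(x) + pi and returns the label with the highest score.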
# # Contains two parameters: # pi:: vector of logs of class priors (dimension C) # theta:: matrix of logs of class conditional probabilities (CxD) # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0, 0.0]), # LabeledPoint.new(0.0, [0.0, 1.0]), # LabeledPoint.new(1.0, [1.0, 0.0]) # ] # model = NaiveBayes.train($sc.parallelize(data)) # # model.predict([0.0, 1.0]) # # => 0.0 # model.predict([1.0, 0.0]) # # => 1.0 # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(2, {1 => 0.0})), # LabeledPoint.new(0.0, SparseVector.new(2, {1 => 1.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {0 => 1.0})) # ] # model = NaiveBayes.train($sc.parallelize(data)) # # model.predict(SparseVector.new(2, {1 => 1.0})) # # => 0.0 # model.predict(SparseVector.new(2, {0 => 1.0})) # # => 1.0 # class NaiveBayesModel attr_reader :labels, :pi, :theta def initialize(labels, pi, theta) @labels = labels @pi = pi @theta = theta end # Predict values for a single data point or an RDD of points using # the model trained. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) array = (vector.dot(theta) + pi).to_a index = array.index(array.max) labels[index] end end end end module Spark module Mllib class NaiveBayes # Trains a Naive Bayes model given an RDD of (label, features) pairs. # # This is the Multinomial NB (http://tinyurl.com/lsdw6p) which can handle all kinds of # discrete data. For example, by converting documents into TF-IDF vectors, it can be used for # document classification. By making every vector a 0-1 vector, it can also be used as # Bernoulli NB (http://tinyurl.com/p7c96j6). The input feature values must be nonnegative. # # == Arguments: # rdd:: RDD of LabeledPoint. # lambda:: The smoothing parameter. # def self.train(rdd, lambda=1.0) # Validation first = rdd.first unless first.is_a?(LabeledPoint) raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}" end labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda) theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta) NaiveBayesModel.new(labels, pi, theta) end end end end ================================================ FILE: lib/spark/mllib/classification/svm.rb ================================================ module Spark module Mllib ## # SVMModel # # A support vector machine. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(1.0, [2.0]), # LabeledPoint.new(1.0, [3.0]) # ] # svm = SVMWithSGD.train($sc.parallelize(data)) # # svm.predict([1.0]) # # => 1 # svm.clear_threshold # svm.predict([1.0]) # # => 1.25... # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(2, {0 => -1.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})), # LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0})) # ] # svm = SVMWithSGD.train($sc.parallelize(data)) # # svm.predict(SparseVector.new(2, {1 => 1.0})) # # => 1 # svm.predict(SparseVector.new(2, {0 => -1.0})) # # => 0 # class SVMModel < ClassificationModel def initialize(*args) super @threshold = 0.0 end # Predict values for a single data point or an RDD of points using # the model trained. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) margin = weights.dot(vector) + intercept if threshold.nil? 
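# No threshold is set (see #clear_threshold), so return the raw margin instead of a 0/1 class.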
return margin end if margin > threshold 1 else 0 end end end end end module Spark module Mllib class SVMWithSGD < ClassificationMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, reg_param: 0.01, mini_batch_fraction: 1.0, initial_weights: nil, reg_type: 'l2', intercept: false, validate: true, convergence_tol: 0.001 } # Train a support vector machine on the given data. # # rdd:: # The training data, an RDD of LabeledPoint. # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # reg_param:: # The regularizer parameter (default: 0.01). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration. # # initial_weights:: # The initial weights (default: nil). # # reg_type:: # The type of regularizer used for training our model (default: "l2"). # # Allowed values: # - "l1" for using L1 regularization # - "l2" for using L2 regularization # - nil for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # (default: false) # # validateData:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainSVMModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:reg_param].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:reg_type], options[:intercept], options[:validate], options[:convergence_tol]) SVMModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/clustering/gaussian_mixture.rb ================================================ module Spark module Mllib ## # GaussianMixtureModel # # A clustering model derived from the Gaussian Mixture Model method. # # == Examples: # # Spark::Mllib.import # # data = [ # DenseVector.new([-0.1, -0.05]), # DenseVector.new([-0.01, -0.1]), # DenseVector.new([0.9, 0.8]), # DenseVector.new([0.75, 0.935]), # DenseVector.new([-0.83, -0.68]), # DenseVector.new([-0.91, -0.76]) # ] # # model = GaussianMixture.train($sc.parallelize(data), 3, convergence_tol: 0.0001, max_iterations: 50, seed: 10) # # labels = model.predict($sc.parallelize(data)).collect # class GaussianMixtureModel attr_reader :weights, :gaussians, :k def initialize(weights, gaussians) @weights = weights @gaussians = gaussians @k = weights.size end # Find the cluster to which the points in 'x' has maximum membership # in this model. def predict(rdd) if rdd.is_a?(Spark::RDD) predict_soft(rdd).map('lambda{|x| x.index(x.max)}') else raise ArgumentError, 'Argument must be a RDD.' end end # Find the membership of each point in 'x' to all mixture components. def predict_soft(rdd) Spark.jb.call(RubyMLLibAPI.new, 'predictSoftGMM', rdd, weights, means, sigmas) end def means @means ||= @gaussians.map(&:mu) end def sigmas @sigmas ||= @gaussians.map(&:sigma) end end end end module Spark module Mllib class GaussianMixture def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil) weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd, k, convergence_tol, max_iterations, Spark.jb.to_long(seed)) means.map! 
{|mu| Spark.jb.java_to_ruby(mu)} sigmas.map!{|sigma| Spark.jb.java_to_ruby(sigma)} mvgs = Array.new(k) do |i| MultivariateGaussian.new(means[i], sigmas[i]) end GaussianMixtureModel.new(weights, mvgs) end end end end ================================================ FILE: lib/spark/mllib/clustering/kmeans.rb ================================================ module Spark module Mllib ## # KMeansModel # # A clustering model derived from the k-means method. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # DenseVector.new([0.0,0.0]), # DenseVector.new([1.0,1.0]), # DenseVector.new([9.0,8.0]), # DenseVector.new([8.0,9.0]) # ] # # model = KMeans.train($sc.parallelize(data), 2, max_iterations: 10, # runs: 30, initialization_mode: "random") # # model.predict([0.0, 0.0]) == model.predict([1.0, 1.0]) # # => true # model.predict([8.0, 9.0]) == model.predict([9.0, 8.0]) # # => true # # # # Sparse vectors # data = [ # SparseVector.new(3, {1 => 1.0}), # SparseVector.new(3, {1 => 1.1}), # SparseVector.new(3, {2 => 1.0}), # SparseVector.new(3, {2 => 1.1}) # ] # model = KMeans.train($sc.parallelize(data), 2, initialization_mode: "k-means||") # # model.predict([0.0, 1.0, 0.0]) == model.predict([0, 1.1, 0.0]) # # => true # model.predict([0.0, 0.0, 1.0]) == model.predict([0, 0, 1.1]) # # => true # model.predict(data[0]) == model.predict(data[1]) # # => true # model.predict(data[2]) == model.predict(data[3]) # # => true # class KMeansModel attr_reader :centers def initialize(centers) @centers = centers end # Find the cluster to which x belongs in this model. def predict(vector) vector = Spark::Mllib::Vectors.to_vector(vector) best = 0 best_distance = Float::INFINITY @centers.each_with_index do |center, index| distance = vector.squared_distance(center) if distance < best_distance best = index best_distance = distance end end best end def self.from_java(object) centers = object.clusterCenters centers.map! do |center| Spark.jb.java_to_ruby(center) end KMeansModel.new(centers) end end end end module Spark module Mllib class KMeans # Trains a k-means model using the given set of parameters. # # == Arguments: # rdd:: # The training data, an RDD of Vectors. # # k:: # Number of clusters. # # max_iterations:: # Max number of iterations. # # runs:: # Number of parallel runs, defaults to 1. The best model is returned. # # initialization_mode:: # Initialization model, either "random" or "k-means||" (default). # # seed:: # Random seed value for cluster initialization. # # epsilon:: # The distance threshold within which we've consider centers to have converged. 
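#
# == Example:
#   (a rough sketch; assumes a SparkContext in $sc, as in the examples above)
#
#   data = [DenseVector.new([0.0, 0.0]), DenseVector.new([9.0, 8.0])]
#   model = KMeans.train($sc.parallelize(data), 2, max_iterations: 10, seed: 1)
#   model.predict([0.2, 0.1]) == model.predict([0.0, 0.0])
#   # => true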
# def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil, initialization_steps: 5, epsilon: 0.0001) cluster_initial_model = [] # Call returns KMeansModel Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd, k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon, cluster_initial_model) end end end end ================================================ FILE: lib/spark/mllib/matrix.rb ================================================ module Spark module Mllib module Matrices def self.dense(*args) DenseMatrix.new(*args) end def self.sparse(*args) SparseMatrix.new(*args) end def self.to_matrix(data) if data.is_a?(SparseMatrix) || data.is_a?(DenseMatrix) data elsif data.is_a?(Array) DenseMatrix.new(data) end end end end end module Spark module Mllib # @abstract Parent for all type of matrices class MatrixBase < MatrixAdapter end end end module Spark module Mllib ## # DenseMatrix # # DenseMatrix.new(2, 3, [[1,2,3], [4,5,6]]).values # # => [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] # class DenseMatrix < MatrixBase def initialize(rows, cols, values) super(:dense, rows, cols, values.to_a) end def to_java JDenseMatrix.new(shape[0], shape[1], values.flatten) end def self.from_java(object) rows = object.numRows cols = object.numCols values = object.values DenseMatrix.new(rows, cols, values) end end end end module Spark module Mllib ## # SparseMatrix # # == Arguments: # rows:: # Number of rows. # # cols:: # Number of columns. # # col_pointers:: # The index corresponding to the start of a new column. # # row_indices:: # The row index of the entry. They must be in strictly # increasing order for each column. # # values:: # Nonzero matrix entries in column major. # # == Examples: # # SparseMatrix.new(3, 3, [0, 2, 3, 6], [0, 2, 1, 0, 1, 2], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).values # # # => [ # # [1.0, 0.0, 4.0], # # [0.0, 3.0, 5.0], # # [2.0, 0.0, 6.0] # # ] # class SparseMatrix < MatrixBase attr_reader :col_pointers, :row_indices def initialize(rows, cols, col_pointers, row_indices, values) super(:sparse, rows, cols) @col_pointers = col_pointers @row_indices = row_indices @values = values j = 0 while j < cols idx = col_pointers[j] idx_end = col_pointers[j+1] while idx < idx_end self[row_indices[idx], j] = values[idx] idx += 1 end j += 1 end end end end end ================================================ FILE: lib/spark/mllib/regression/common.rb ================================================ module Spark module Mllib ## # RegressionModel # # A linear model that has a vector of coefficients and an intercept. # class RegressionModel attr_reader :weights, :intercept def initialize(weights, intercept) @weights = Spark::Mllib::Vectors.to_vector(weights) @intercept = intercept.to_f end # Predict the value of the dependent variable given a vector data # containing values for the independent variables. # # == Examples: # lm = RegressionModel.new([1.0, 2.0], 0.1) # # lm.predict([-1.03, 7.777]) - 14.624 < 1e-6 # # => true # # lm.predict(SparseVector.new(2, {0 => -1.03, 1 => 7.777})) - 14.624 < 1e-6 # # => true # def predict(data) data = Spark::Mllib::Vectors.to_vector(data) @weights.dot(data) + @intercept end end end end module Spark module Mllib ## # RegressionMethodBase # # Parent for regression methods # class RegressionMethodBase def self.train(rdd, options) # String keys to symbols options.symbolize_keys! 
# Reverse merge self::DEFAULT_OPTIONS.each do |key, value| if options.has_key?(key) # value from user else options[key] = value end end # Validation first = rdd.first unless first.is_a?(LabeledPoint) raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}" end # Initial weights is optional for user (not for Spark) options[:initial_weights] = Vectors.to_vector(options[:initial_weights] || [0.0] * first.features.size) end end end end ================================================ FILE: lib/spark/mllib/regression/labeled_point.rb ================================================ module Spark module Mllib ## # LabeledPoint # # The features and labels of a data point. # # == Parameters: # label:: # Label for this data point. # # features:: # Vector of features for this point # class LabeledPoint attr_reader :label, :features def initialize(label, features) @label = label.to_f @features = Spark::Mllib::Vectors.to_vector(features) end def self.from_java(object) LabeledPoint.new( object.label, Spark.jb.java_to_ruby(object.features) ) end def marshal_dump [@label, @features] end def marshal_load(array) initialize(array[0], array[1]) end end end end ================================================ FILE: lib/spark/mllib/regression/lasso.rb ================================================ ## # LassoModel # # Train a regression model with L1-regularization using Stochastic Gradient Descent. # This solves the l1-regularized least squares regression formulation # f(weights) = 1/2n ||A weights-y||^2^ + regParam ||weights||_1 # Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with # its corresponding right hand side label y. # See also the documentation for the precise formulation. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(3.0, [2.0]), # LabeledPoint.new(2.0, [3.0]) # ] # lrm = LassoWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict([1.0]) - 1 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})), # LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})), # LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0})) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # class Spark::Mllib::LassoModel < Spark::Mllib::RegressionModel end module Spark module Mllib class LassoWithSGD < RegressionMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, reg_param: 0.01, mini_batch_fraction: 1.0, initial_weights: nil, intercept: false, validate: true, convergence_tol: 0.001 } # Train a Lasso regression model on the given data. # # == Parameters: # rdd:: # The training data (RDD instance). # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # reg_param:: # The regularizer parameter (default: 0.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration (default: 1.0). # # initial_weights:: # The initial weights (default: nil). # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. 
whether bias features # are activated or not). # (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLassoModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:reg_param].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:intercept], options[:validate], options[:convergence_tol]) LassoModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/regression/linear.rb ================================================ ## # LinearRegressionModel # # Train a linear regression model with no regularization using Stochastic Gradient Descent. # This solves the least squares regression formulation # f(weights) = 1/n ||A weights-y||^2^ # (which is the mean squared error). # Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with # its corresponding right hand side label y. # See also the documentation for the precise formulation. # # == Examples: # # Spark::Mllib.import # # # Dense vectors # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(3.0, [2.0]), # LabeledPoint.new(2.0, [3.0]) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.intercept # => 0.0 # lrm.weights # => [0.9285714285714286] # # lrm.predict([0.0]) < 0.5 # # => true # # lrm.predict([1.0]) - 1 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # # # Sparse vectors # data = [ # LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})), # LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})), # LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0})) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.intercept # => 0.0 # lrm.weights # => [0.9285714285714286] # # lrm.predict([0.0]) < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # class Spark::Mllib::LinearRegressionModel < Spark::Mllib::RegressionModel end module Spark module Mllib class LinearRegressionWithSGD < RegressionMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, mini_batch_fraction: 1.0, initial_weights: nil, reg_param: 0.0, reg_type: nil, intercept: false, validate: true, convergence_tol: 0.001 } # Train a linear regression model on the given data. # # == Parameters: # rdd:: # The training data (RDD instance). # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration (default: 1.0). # # initial_weights:: # The initial weights (default: nil). # # reg_param:: # The regularizer parameter (default: 0.0). # # reg_type:: # The type of regularizer used for training our model (default: nil). # # Allowed values: # - "l1" for using L1 regularization (lasso), # - "l2" for using L2 regularization (ridge), # - None for no regularization # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). 
# (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. # (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLinearRegressionModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:reg_param].to_f, options[:reg_type], options[:intercept], options[:validate], options[:convergence_tol]) LinearRegressionModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/regression/ridge.rb ================================================ ## # RidgeRegressionModel # # Train a regression model with L2-regularization using Stochastic Gradient Descent. # This solves the l1-regularized least squares regression formulation # f(weights) = 1/2n ||A weights-y||^2^ + regParam/2 ||weights||^2^ # Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with # its corresponding right hand side label y. # See also the documentation for the precise formulation. # # == Examples: # # Spark::Mllib.import # # data = [ # LabeledPoint.new(0.0, [0.0]), # LabeledPoint.new(1.0, [1.0]), # LabeledPoint.new(3.0, [2.0]), # LabeledPoint.new(2.0, [3.0]) # ] # lrm = RidgeRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict([1.0]) - 1 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # # data = [ # LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})), # LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})), # LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})), # LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0})) # ] # lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0]) # # lrm.predict([0.0]) - 0 < 0.5 # # => true # # lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5 # # => true # class Spark::Mllib::RidgeRegressionModel < Spark::Mllib::RegressionModel end module Spark module Mllib class RidgeRegressionWithSGD < RegressionMethodBase DEFAULT_OPTIONS = { iterations: 100, step: 1.0, reg_param: 0.01, mini_batch_fraction: 1.0, initial_weights: nil, intercept: false, validate: true, convergence_tol: 0.001 } # Train a ridge regression model on the given data. # # == Parameters: # rdd:: # The training data (RDD instance). # # iterations:: # The number of iterations (default: 100). # # step:: # The step parameter used in SGD (default: 1.0). # # reg_param:: # The regularizer parameter (default: 0.0). # # mini_batch_fraction:: # Fraction of data to be used for each SGD iteration (default: 1.0). # # initial_weights:: # The initial weights (default: nil). # # intercept:: # Boolean parameter which indicates the use # or not of the augmented representation for # training data (i.e. whether bias features # are activated or not). # (default: false) # # validate:: # Boolean parameter which indicates if the # algorithm should validate data before training. # (default: true) # # convergence_tol:: # A condition which decides iteration termination. 
# (default: 0.001) # def self.train(rdd, options={}) super weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainRidgeModelWithSGD', rdd, options[:iterations].to_i, options[:step].to_f, options[:reg_param].to_f, options[:mini_batch_fraction].to_f, options[:initial_weights], options[:intercept], options[:validate], options[:convergence_tol]) RidgeRegressionModel.new(weights, intercept) end end end end ================================================ FILE: lib/spark/mllib/ruby_matrix/matrix_adapter.rb ================================================ require 'matrix' module Spark module Mllib class MatrixAdapter < ::Matrix def self.new(*args) object = self.allocate if args.size == 2 # Matrix is initialized from Matrix # Arguments: rows, column count object.__send__(:original_initialize, *args) else object.__send__(:initialize, *args) end object end alias_method :original_initialize, :initialize def initialize(type, rows, cols, values=nil) case type when :dense values = values.dup if rows * cols == values.size # Values are on one row # 2x2 => [1,2,3,4] values = values.each_slice(cols).to_a else # 2x2 => [[1,2], [3,4]] end when :sparse values = Array.new(rows) { Array.new(cols) { 0.0 } } else raise Spark::MllibError, 'Unknow vector type.' end super(values, cols) end def shape [row_count, column_count] end def values @values || to_a end end end end ================================================ FILE: lib/spark/mllib/ruby_matrix/vector_adapter.rb ================================================ require 'matrix' # Based on ruby 2.1 class Vector def self.elements(array, copy=true) DenseVector.new(convert_to_array(array, copy)) end end module Spark module Mllib class VectorAdapter < ::Vector def self.new(*args) object = self.allocate object.__send__(:initialize, *args) object end def initialize(*args) case args.shift when :dense values = args.shift.dup when :sparse values = [0.0] * args.shift.to_i else raise Spark::MllibError, 'Unknow vector type.' end super(values) end def []=(index, value) @elements[index] = value end def dot(other) if other.is_a?(Spark::Mllib::MatrixBase) other * self else inner_product(other) end end def squared_distance(other) diff = self - other diff.dot(diff) end def values @values || to_a end end end end ================================================ FILE: lib/spark/mllib/stat/distribution.rb ================================================ ## # MultivariateGaussian # # This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In # the event that the covariance matrix is singular, the density will be computed in a # reduced dimensional subspace under which the distribution is supported. # # == Arguments: # mu:: The mean vector of the distribution # sigma:: The covariance matrix of the distribution # Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma) ================================================ FILE: lib/spark/mllib/vector.rb ================================================ module Spark module Mllib module Vectors def self.dense(*args) DenseVector.new(*args) end def self.sparse(*args) SparseVector.new(*args) end def self.parse(data) if data.start_with?('[') && data.end_with?(']') DenseVector.parse(data) elsif data.start_with?('(') && data.end_with?(')') SparseVector.parse(data) else raise ArgumentError, 'Unknow vector.' 
end end def self.to_vector(data) if data.is_a?(SparseVector) || data.is_a?(DenseVector) data elsif data.is_a?(Array) DenseVector.new(data) end end end end end module Spark module Mllib # @abstract Parent for all type of vectors class VectorBase < VectorAdapter end end end module Spark module Mllib ## # A dense vector represented by a value array. # # Dense vector is a vector in which most of the elements are non-zero. # # == Example: # DenseVector.new([1,2,3,4,5]).values # # => [1, 2, 3, 4, 5] # # DenseVector.new(1..5).values # # => [1, 2, 3, 4, 5] # class DenseVector < VectorBase def initialize(values) super(:dense, values.to_a) end # Covert string to vector # # DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]") # def self.parse(data) unless data =~ /\[[0-9., ]+\]/ raise ArgumentError, 'Unknow format for DenseVector.' end data.sub!('[', '') data.sub!(']', '') data = data.split(',') data.map!(&:to_f) DenseVector.new(data) end # Convert vector to string # # DenseVector.new([1,2,3,4,5]).to_s # # => "[1.0,2.0,3.0,4.0,5.0]" # def to_s "[#{values.join(',')}]" end def to_java JDenseVector.new(values) end def self.from_java(object) DenseVector.new(object.values) end def marshal_dump values end def marshal_load(array) initialize(array) end end end end module Spark module Mllib ## # A sparse vector represented by an index array and an value array. # # Sparse vector is a vector in which most of the elements are zero. # # == Example: # SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values # # => [0, 1.0, 0, 5.5] # # SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values # # => [0, 1.0, 0, 5.5] # # SparseVector.new(4, [1, 3], [1.0, 5.5]).values # # => [0, 1.0, 0, 5.5] # class SparseVector < VectorBase attr_reader :indices def initialize(arg1, arg2=nil, arg3=nil) super(:sparse, arg1) if arg2.is_a?(Hash) @indices = arg2.keys @values = arg2.values else @indices = arg2 @values = arg3 end @indices.zip(@values).each do |(index, value)| self[index] = value end end # Covert string to vector # # SparseVector.parse("(5,[1,4],[3.0,5.0])") # def self.parse(data) data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/) if data size = data[1].to_i indices = data[2].split(',') indices.map!(&:to_i) values = data[3].split(',') values.map!(&:to_f) SparseVector.new(size, indices, values) else raise ArgumentError, 'Unknow format for SparseVector.' end end # Convert vector to string # # SparseVector.new(5, {1 => 3, 4 => 5}).to_s # # => "(5,[1,4],[3.0,5.0])" # def to_s "(#{size},[#{indices.join(',')}],[#{values.join(',')}])" end def marshal_dump [size, indices, values] end def marshal_load(array) initialize(array[0], array[1], array[2]) end end end end ================================================ FILE: lib/spark/mllib.rb ================================================ module Spark # MLlib is Spark’s scalable machine learning library consisting of common learning algorithms and utilities, # including classification, regression, clustering, collaborative filtering, dimensionality reduction, # as well as underlying optimization primitives. 
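#
# A minimal usage sketch (assumes a SparkContext in $sc, as in the class examples below):
#
#   Spark::Mllib.import   # copies DenseVector, LabeledPoint, KMeans, ... into Object
#
#   data = [
#     LabeledPoint.new(0.0, [0.0, 1.0]),
#     LabeledPoint.new(1.0, [1.0, 0.0])
#   ]
#   model = LogisticRegressionWithSGD.train($sc.parallelize(data))
#   model.predict([1.0, 0.0])
#   # => 1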
module Mllib extend Spark::Library # Base classes autoload_without_import :VectorBase, 'spark/mllib/vector' autoload_without_import :MatrixBase, 'spark/mllib/matrix' autoload_without_import :RegressionMethodBase, 'spark/mllib/regression/common' autoload_without_import :ClassificationMethodBase, 'spark/mllib/classification/common' # Linear algebra autoload :Vectors, 'spark/mllib/vector' autoload :DenseVector, 'spark/mllib/vector' autoload :SparseVector, 'spark/mllib/vector' autoload :Matrices, 'spark/mllib/matrix' autoload :DenseMatrix, 'spark/mllib/matrix' autoload :SparseMatrix, 'spark/mllib/matrix' # Regression autoload :LabeledPoint, 'spark/mllib/regression/labeled_point' autoload :RegressionModel, 'spark/mllib/regression/common' autoload :LinearRegressionModel, 'spark/mllib/regression/linear' autoload :LinearRegressionWithSGD, 'spark/mllib/regression/linear' autoload :LassoModel, 'spark/mllib/regression/lasso' autoload :LassoWithSGD, 'spark/mllib/regression/lasso' autoload :RidgeRegressionModel, 'spark/mllib/regression/ridge' autoload :RidgeRegressionWithSGD, 'spark/mllib/regression/ridge' # Classification autoload :ClassificationModel, 'spark/mllib/classification/common' autoload :LogisticRegressionWithSGD, 'spark/mllib/classification/logistic_regression' autoload :LogisticRegressionWithLBFGS, 'spark/mllib/classification/logistic_regression' autoload :SVMModel, 'spark/mllib/classification/svm' autoload :SVMWithSGD, 'spark/mllib/classification/svm' autoload :NaiveBayesModel, 'spark/mllib/classification/naive_bayes' autoload :NaiveBayes, 'spark/mllib/classification/naive_bayes' # Clustering autoload :KMeans, 'spark/mllib/clustering/kmeans' autoload :KMeansModel, 'spark/mllib/clustering/kmeans' autoload :GaussianMixture, 'spark/mllib/clustering/gaussian_mixture' autoload :GaussianMixtureModel, 'spark/mllib/clustering/gaussian_mixture' # Stat autoload :MultivariateGaussian, 'spark/mllib/stat/distribution' def self.prepare return if @prepared # if narray? # require 'spark/mllib/narray/vector' # require 'spark/mllib/narray/matrix' # elsif mdarray? # require 'spark/mllib/mdarray/vector' # require 'spark/mllib/mdarray/matrix' # else # require 'spark/mllib/matrix/vector' # require 'spark/mllib/matrix/matrix' # end require 'spark/mllib/ruby_matrix/vector_adapter' require 'spark/mllib/ruby_matrix/matrix_adapter' @prepared = true nil end def self.narray? Gem::Specification::find_all_by_name('narray').any? end def self.mdarray? Gem::Specification::find_all_by_name('mdarray').any? end end end Spark::Mllib.prepare ================================================ FILE: lib/spark/rdd.rb ================================================ module Spark ## # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable, # partitioned collection of elements that can be operated on in parallel. This class contains the # basic operations available on all RDDs, such as `map`, `filter`, and `persist`. 
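#
# == Example:
#   (a rough sketch; assumes a SparkContext in $sc, as in the method examples below)
#
#   rdd = $sc.parallelize(0..5)
#   rdd.map(lambda{|x| x * 2}).filter(lambda{|x| x > 4}).collect
#   # => [6, 8, 10]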
# class RDD extend Forwardable attr_reader :jrdd, :context, :command include Spark::Helper::Logger include Spark::Helper::Parser include Spark::Helper::Statistic def_delegators :@command, :serializer, :deserializer, :libraries, :files # Initializing RDD, this method is root of all Pipelined RDD - its unique # If you call some operations on this class it will be computed in Java # # == Parameters: # jrdd:: org.apache.spark.api.java.JavaRDD # context:: {Spark::Context} # serializer:: {Spark::Serializer} # def initialize(jrdd, context, serializer, deserializer=nil) @jrdd = jrdd @context = context @cached = false @checkpointed = false @command = Spark::CommandBuilder.new(serializer, deserializer) end def inspect comms = @command.commands.join(' -> ') result = %{#<#{self.class.name}:0x#{object_id}} result << %{ (#{comms})} unless comms.empty? result << %{ (cached)} if cached? result << %{\n} result << %{ Serializer: "#{serializer}"\n} result << %{Deserializer: "#{deserializer}"} result << %{>} result end # ============================================================================= # Operators def +(other) self.union(other) end # ============================================================================= # Commad and serializer def add_command(klass, *args) @command.deep_copy.add_command(klass, *args) end # Add ruby library # Libraries will be included before computing # # == Example: # rdd.add_library('pry').add_library('nio4r', 'distribution') # def add_library(*libraries) @command.add_library(*libraries) self end # Bind object to RDD # # == Example: # text = "test" # # rdd = $sc.parallelize(0..5) # rdd = rdd.map(lambda{|x| x.to_s + " " + text}) # rdd = rdd.bind(text: text) # # rdd.collect # # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"] # def bind(objects) unless objects.is_a?(Hash) raise ArgumentError, 'Argument must be a Hash.' end @command.bind(objects) self end def new_rdd_from_command(klass, *args) comm = add_command(klass, *args) PipelinedRDD.new(self, comm) end # ============================================================================= # Variables and non-computing functions def config @context.config end def default_reduce_partitions config['spark.default.parallelism'] || partitions_size end # Count of ParallelCollectionPartition def partitions_size jrdd.rdd.partitions.size end # A unique ID for this RDD (within its SparkContext). def id jrdd.id end # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization. def cache persist('memory_only_ser') end # Set this RDD's storage level to persist its values across operations after the first time # it is computed. This can only be used to assign a new storage level if the RDD does not # have a storage level set yet. # # See StorageLevel for type of new_level # def persist(new_level) @cached = true jrdd.persist(Spark::StorageLevel.java_get(new_level)) self end # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk. # # == Parameters: # blocking:: whether to block until all blocks are deleted. # def unpersist(blocking=true) @cached = false jrdd.unpersist(blocking) self end def cached? @cached end def checkpointed? @checkpointed end # Return the name of this RDD. # def name _name = jrdd.name _name && _name.encode(Encoding::UTF_8) end # Assign a name to this RDD. # def set_name(value) jrdd.setName(value) value end def name=(value) set_name(value) end def to_java marshal = Spark::Serializer.marshal if deserializer.batched? 
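# Reuse the existing batching wrapper, but swap its inner serializer for Marshal
# before the RDD is handed over to the Java side.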
ser = deserializer.deep_copy ser.serializer = marshal else ser = Spark::Serializer.batched(marshal) end rdd = self.reserialize(ser) RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?) end # ============================================================================= # Actions which return value # Return an array that contains all of the elements in this RDD. # RJB raise an error if stage is killed. def collect(as_enum=false) file = Tempfile.new('collect', context.temp_dir) context.set_call_site(caller.first) RubyRDD.writeRDDToFile(jrdd.rdd, file.path) collect_from_file(file, as_enum) rescue => e raise Spark::RDDError, e.message ensure context.clear_call_site end def collect_from_file(file, as_enum=false) if self.is_a?(PipelinedRDD) klass = @command.serializer else klass = @command.deserializer end if as_enum result = klass.load_from_file(file) else result = klass.load_from_io(file).to_a file.close file.unlink end result end # Convert an Array to Hash # def collect_as_hash Hash[collect] end # Take the first num elements of the RDD. # # It works by first scanning one partition, and use the results from # that partition to estimate the number of additional partitions needed # to satisfy the limit. # # == Example: # rdd = $sc.parallelize(0..100, 20) # rdd.take(5) # # => [0, 1, 2, 3, 4] # def take(count) buffer = [] parts_count = self.partitions_size # No parts was scanned, yet last_scanned = -1 while buffer.empty? last_scanned += 1 buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1) end # Assumption. Depend on batch_size and how Spark divided data. items_per_part = buffer.size left = count - buffer.size while left > 0 && last_scanned < parts_count parts_to_take = (left.to_f/items_per_part).ceil parts_for_scanned = Array.new(parts_to_take) do last_scanned += 1 end # We cannot take exact number of items because workers are isolated from each other. # => once you take e.g. 50% from last part and left is still > 0 then its very # difficult merge new items items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned) buffer += items left = count - buffer.size # Average size of all parts items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2} end buffer.slice!(0, count) end # Return the first element in this RDD. # # == Example: # rdd = $sc.parallelize(0..100) # rdd.first # # => 0 # def first self.take(1)[0] end # Reduces the elements of this RDD using the specified lambda or method. # # == Example: # rdd = $sc.parallelize(0..10) # rdd.reduce(lambda{|sum, x| sum+x}) # # => 55 # def reduce(f) _reduce(Spark::Command::Reduce, f, f) end # Aggregate the elements of each partition, and then the results for all the partitions, using a # given associative function and a neutral "zero value". # # The function f(x, y) is allowed to modify x and return it as its result value to avoid # object allocation; however, it should not modify y. # # Be careful, zero_values is applied to all stages. See example. # # == Example: # rdd = $sc.parallelize(0..10, 2) # rdd.fold(1, lambda{|sum, x| sum+x}) # # => 58 # def fold(zero_value, f) self.aggregate(zero_value, f, f) end # Aggregate the elements of each partition, and then the results for all the partitions, using # given combine functions and a neutral "zero value". # # This function can return a different result type. We need one operation for merging. 
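# (seq_op folds the elements of each partition into the zero value; comb_op then
# merges the per-partition results into the final value.)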
# # Result must be an Array otherwise Serializer Array's zero value will be send # as multiple values and not just one. # # == Example: # # 1 2 3 4 5 => 15 + 1 = 16 # # 6 7 8 9 10 => 40 + 1 = 41 # # 16 * 41 = 656 # # seq = lambda{|x,y| x+y} # com = lambda{|x,y| x*y} # # rdd = $sc.parallelize(1..10, 2) # rdd.aggregate(1, seq, com) # # => 656 # def aggregate(zero_value, seq_op, comb_op) _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value) end # Return the max of this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.max # # => 10 # def max self.reduce('lambda{|memo, item| memo > item ? memo : item }') end # Return the min of this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.min # # => 0 # def min self.reduce('lambda{|memo, item| memo < item ? memo : item }') end # Return the sum of this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.sum # # => 55 # def sum self.reduce('lambda{|sum, item| sum + item}') end # Return the number of values in this RDD # # == Example: # rdd = $sc.parallelize(0..10) # rdd.count # # => 11 # def count # nil is for seq_op => it means the all result go directly to one worker for combine @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }') .aggregate(0, nil, 'lambda{|sum, item| sum + item }') end # Return a {Spark::StatCounter} object that captures the mean, variance # and count of the RDD's elements in one operation. def stats @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}') end # Compute the mean of this RDD's elements. # # == Example: # $sc.parallelize([1, 2, 3]).mean # # => 2.0 # def mean stats.mean end # Compute the variance of this RDD's elements. # # == Example: # $sc.parallelize([1, 2, 3]).variance # # => 0.666... # def variance stats.variance end # Compute the standard deviation of this RDD's elements. # # == Example: # $sc.parallelize([1, 2, 3]).stdev # # => 0.816... # def stdev stats.stdev end # Compute the sample standard deviation of this RDD's elements (which # corrects for bias in estimating the standard deviation by dividing by # N-1 instead of N). # # == Example: # $sc.parallelize([1, 2, 3]).sample_stdev # # => 1.0 # def sample_stdev stats.sample_stdev end # Compute the sample variance of this RDD's elements (which corrects # for bias in estimating the variance by dividing by N-1 instead of N). # # == Example: # $sc.parallelize([1, 2, 3]).sample_variance # # => 1.0 # def sample_variance stats.sample_variance end # Compute a histogram using the provided buckets. The buckets # are all open to the right except for the last which is closed. # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50], # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1 # and 50 we would have a histogram of 1,0,1. # # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), # this can be switched from an O(log n) inseration to O(1) per # element(where n = # buckets). # # Buckets must be sorted and not contain any duplicates, must be # at least two elements. # # == Examples: # rdd = $sc.parallelize(0..50) # # rdd.histogram(2) # # => [[0.0, 25.0, 50], [25, 26]] # # rdd.histogram([0, 5, 25, 50]) # # => [[0, 5, 25, 50], [5, 20, 26]] # # rdd.histogram([0, 15, 30, 45, 60]) # # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]] # def histogram(buckets) # ----------------------------------------------------------------------- # Integer # if buckets.is_a?(Integer) # Validation if buckets < 1 raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted." 
end # Filter invalid values # Nil and NaN func = 'lambda{|x| if x.nil? || (x.is_a?(Float) && x.nan?) false else true end }' filtered = self.filter(func) # Compute the minimum and the maximum func = 'lambda{|memo, item| [memo[0] < item[0] ? memo[0] : item[0], memo[1] > item[1] ? memo[1] : item[1]] }' min, max = filtered.map('lambda{|x| [x, x]}').reduce(func) # Min, max must be valid numbers if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?) raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN' end # Already finished if min == max || buckets == 1 return [min, max], [filtered.count] end # Custom range begin span = max - min # increment buckets = (0...buckets).map do |x| min + (x * span) / buckets.to_f end buckets << max rescue NoMethodError raise Spark::RDDError, 'Can not generate buckets with non-number in RDD' end even = true # ----------------------------------------------------------------------- # Array # elsif buckets.is_a?(Array) if buckets.size < 2 raise ArgumentError, 'Buckets should have more than one value.' end if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)} raise ArgumentError, 'Can not have nil or nan numbers in buckets.' end if buckets.detect{|x| buckets.count(x) > 1} raise ArgumentError, 'Buckets should not contain duplicated values.' end if buckets.sort != buckets raise ArgumentError, 'Buckets must be sorted.' end even = false # ----------------------------------------------------------------------- # Other # else raise Spark::RDDError, 'Buckets should be number or array.' end reduce_func = 'lambda{|memo, item| memo.size.times do |i| memo[i] += item[i] end memo }' return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func) end # Applies a function f to all elements of this RDD. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.foreach(lambda{|x| puts x}) # # => nil # def foreach(f, options={}) new_rdd_from_command(Spark::Command::Foreach, f).collect nil end # Applies a function f to each partition of this RDD. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.foreachPartition(lambda{|x| puts x.to_s}) # # => nil # def foreach_partition(f, options={}) new_rdd_from_command(Spark::Command::ForeachPartition, f).collect nil end # ============================================================================= # Transformations of RDD # Return a new RDD by applying a function to all elements of this RDD. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.map(lambda {|x| x*2}).collect # # => [0, 2, 4, 6, 8, 10] # def map(f) new_rdd_from_command(Spark::Command::Map, f) end # Return a new RDD by first applying a function to all elements of this # RDD, and then flattening the results. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.flat_map(lambda {|x| [x, 1]}).collect # # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1] # def flat_map(f) new_rdd_from_command(Spark::Command::FlatMap, f) end # Return a new RDD by applying a function to each partition of this RDD. # # == Example: # rdd = $sc.parallelize(0..10, 2) # rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect # # => [15, 40] # def map_partitions(f) new_rdd_from_command(Spark::Command::MapPartitions, f) end # Return a new RDD by applying a function to each partition of this RDD, while tracking the index # of the original partition. 
# # == Example: # rdd = $sc.parallelize(0...4, 4) # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect # # => [0, 1, 4, 9] # def map_partitions_with_index(f, options={}) new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f) end # Return a new RDD containing only the elements that satisfy a predicate. # # == Example: # rdd = $sc.parallelize(0..10) # rdd.filter(lambda{|x| x.even?}).collect # # => [0, 2, 4, 6, 8, 10] # def filter(f) new_rdd_from_command(Spark::Command::Filter, f) end # Return a new RDD containing non-nil elements. # # == Example: # rdd = $sc.parallelize([1, nil, 2, nil, 3]) # rdd.compact.collect # # => [1, 2, 3] # def compact new_rdd_from_command(Spark::Command::Compact) end # Return an RDD created by coalescing all elements within each partition into an array. # # == Example: # rdd = $sc.parallelize(0..10, 3) # rdd.glom.collect # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]] # def glom new_rdd_from_command(Spark::Command::Glom) end # Return a new RDD that is reduced into num_partitions partitions. # # == Example: # rdd = $sc.parallelize(0..10, 3) # rdd.coalesce(2).glom.collect # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]] # def coalesce(num_partitions) if self.is_a?(PipelinedRDD) deser = @command.serializer else deser = @command.deserializer end new_jrdd = jrdd.coalesce(num_partitions) RDD.new(new_jrdd, context, @command.serializer, deser) end # Return the Cartesian product of this RDD and another one, that is, the # RDD of all pairs of elements `(a, b)` where `a` is in `self` and # `b` is in `other`. # # == Example: # rdd1 = $sc.parallelize([1,2,3]) # rdd2 = $sc.parallelize([4,5,6]) # # rdd1.cartesian(rdd2).collect # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]] # def cartesian(other) _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer) new_jrdd = jrdd.cartesian(other.jrdd) RDD.new(new_jrdd, context, serializer, _deserializer) end # Return a new RDD containing the distinct elements in this RDD. # Ordering is not preserved because of reducing # # == Example: # rdd = $sc.parallelize([1,1,1,2,3]) # rdd.distinct.collect # # => [1, 2, 3] # def distinct self.map('lambda{|x| [x, nil]}') .reduce_by_key('lambda{|x,_| x}') .map('lambda{|x| x[0]}') end # Return a shuffled RDD. # # == Example: # rdd = $sc.parallelize(0..10) # rdd.shuffle.collect # # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5] # def shuffle(seed=nil) seed ||= Random.new_seed new_rdd_from_command(Spark::Command::Shuffle, seed) end # Return the union of this RDD and another one. Any identical elements will appear multiple # times (use .distinct to eliminate them). # # == Example: # rdd = $sc.parallelize([1, 2, 3]) # rdd.union(rdd).collect # # => [1, 2, 3, 1, 2, 3] # def union(other) if self.serializer != other.serializer other = other.reserialize(serializer) end new_jrdd = jrdd.union(other.jrdd) RDD.new(new_jrdd, context, serializer, deserializer) end # Return a new RDD with different serializer. This method is useful during union # and join operations. # # == Example: # rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal") # rdd = rdd.map(lambda{|x| x.to_s}) # rdd.reserialize("oj").collect # # => ["1", "2", "3"] # def reserialize(new_serializer) if serializer == new_serializer return self end new_command = @command.deep_copy new_command.serializer = new_serializer PipelinedRDD.new(self, new_command) end # Return the intersection of this RDD and another one. 
The output will not contain # any duplicate elements, even if the input RDDs did. # # == Example: # rdd1 = $sc.parallelize([1,2,3,4,5]) # rdd2 = $sc.parallelize([1,4,5,6,7]) # rdd1.intersection(rdd2).collect # # => [1, 4, 5] # def intersection(other) mapping_function = 'lambda{|item| [item, nil]}' filter_function = 'lambda{|(key, values)| values.size > 1}' self.map(mapping_function) .cogroup(other.map(mapping_function)) .filter(filter_function) .keys end # Return a copy of the RDD partitioned using the specified partitioner. # # == Example: # rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]}) # rdd.partitionBy(2).glom.collect # # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]] # def partition_by(num_partitions, partition_func=nil) num_partitions ||= default_reduce_partitions partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}' _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func) end # Return a sampled subset of this RDD. Operations are base on Poisson and Uniform # distributions. # TODO: Replace Unfirom for Bernoulli # # == Examples: # rdd = $sc.parallelize(0..100) # # rdd.sample(true, 10).collect # # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96] # # rdd.sample(false, 0.1).collect # # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98] # def sample(with_replacement, fraction, seed=nil) new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed) end # Return a fixed-size sampled subset of this RDD in an array # # == Examples: # rdd = $sc.parallelize(0..100) # # rdd.take_sample(true, 10) # # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54] # # rdd.take_sample(false, 10) # # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32] # def take_sample(with_replacement, num, seed=nil) if num < 0 raise Spark::RDDError, 'Size have to be greater than 0' elsif num == 0 return [] end # Taken from scala num_st_dev = 10.0 # Number of items initial_count = self.count return [] if initial_count == 0 # Create new generator seed ||= Random.new_seed rng = Random.new(seed) # Shuffle elements if requested num if greater than array size if !with_replacement && num >= initial_count return self.shuffle(seed).collect end # Max num max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i if num > max_sample_size raise Spark::RDDError, "Size can not be greate than #{max_sample_size}" end # Approximate fraction with tolerance fraction = compute_fraction(num, initial_count, with_replacement) # Compute first samled subset samples = self.sample(with_replacement, fraction, seed).collect # If the first sample didn't turn out large enough, keep trying to take samples; # this shouldn't happen often because we use a big multiplier for their initial size. index = 0 while samples.size < num log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}") samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect index += 1 end samples.shuffle!(random: rng) samples[0, num] end # Return an RDD created by piping elements to a forked external process. # # == Cmds: # cmd = [env,] command... [,options] # # env: hash # name => val : set the environment variable # name => nil : unset the environment variable # command...: # commandline : command line string which is passed to the standard shell # cmdname, arg1, ... : command name and one or more arguments (This form does # not use the shell. See below for caveats.) # [cmdname, argv0], arg1, ... 
: command name, argv[0] and zero or more arguments (no shell) # options: hash # # See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn # # == Examples: # $sc.parallelize(0..5).pipe('cat').collect # # => ["0", "1", "2", "3", "4", "5"] # # rdd = $sc.parallelize(0..5) # rdd = rdd.pipe('cat', "awk '{print $1*10}'") # rdd = rdd.map(lambda{|x| x.to_i + 1}) # rdd.collect # # => [1, 11, 21, 31, 41, 51] # def pipe(*cmds) new_rdd_from_command(Spark::Command::Pipe, cmds) end # ============================================================================= # Pair functions # Merge the values for each key using an associative reduce function. This will also perform # the merging locally on each mapper before sending results to a reducer, similarly to a # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/ # parallelism level. # # == Example: # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]}) # rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash # # => {"a"=>3, "b"=>2, "c"=>3} # def reduce_by_key(f, num_partitions=nil) combine_by_key('lambda {|x| x}', f, f, num_partitions) end # Generic function to combine the elements for each key using a custom set of aggregation # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a # "combined type" C * Note that V and C can be different -- for example, one might group an # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three # functions: # # == Parameters: # create_combiner:: which turns a V into a C (e.g., creates a one-element list) # merge_value:: to merge a V into a C (e.g., adds it to the end of a list) # merge_combiners:: to combine two C's into a single one. # # == Example: # def combiner(x) # x # end # # def merge(x,y) # x+y # end # # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]}) # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash # # => {"a"=>3, "b"=>2, "c"=>3} # def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil) _combine_by_key( [Spark::Command::CombineByKey::Combine, create_combiner, merge_value], [Spark::Command::CombineByKey::Merge, merge_combiners], num_partitions ) end # Return an RDD of grouped items. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.group_by(lambda{|x| x%2}).collect # # => [[0, [0, 2, 4]], [1, [1, 3, 5]]] # def group_by(f, num_partitions=nil) self.key_by(f).group_by_key(num_partitions) end # Group the values for each key in the RDD into a single sequence. Allows controlling the # partitioning of the resulting key-value pair RDD by passing a Partitioner. # # Note: If you are grouping in order to perform an aggregation (such as a sum or average) # over each key, using reduce_by_key or combine_by_key will provide much better performance. 
# # == Example: # rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]]) # rdd.group_by_key.collect # # => [["a", [1, 2]], ["b", [3]]] # def group_by_key(num_partitions=nil) create_combiner = 'lambda{|item| [item]}' merge_value = 'lambda{|combiner, item| combiner << item; combiner}' merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}' combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions) end # Merge the values for each key using an associative function f # and a neutral `zero_value` which may be added to the result an # arbitrary number of times, and must not change the result # (e.g., 0 for addition, or 1 for multiplication.). # # == Example: # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]]) # rdd.fold_by_key(1, lambda{|x,y| x+y}) # # => [["a", 9], ["c", 6], ["b", 3]] # def fold_by_key(zero_value, f, num_partitions=nil) self.aggregate_by_key(zero_value, f, f, num_partitions) end # Aggregate the values of each key, using given combine functions and a neutral zero value. # # == Example: # def combine(x,y) # x+y # end # # def merge(x,y) # x*y # end # # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2) # rdd.aggregate_by_key(1, method(:combine), method(:merge)) # # => [["b", 3], ["a", 16], ["c", 6]] # def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil) _combine_by_key( [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func], [Spark::Command::CombineByKey::Merge, comb_func], num_partitions ) end # The same functionality as cogroup but this can grouped only 2 rdd's and you # can change num_partitions. # # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]]) # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]]) # rdd1.group_with(rdd2).collect # # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]] # def group_with(other, num_partitions=nil) self.union(other).group_by_key(num_partitions) end # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the # list of values for that key in `this` as well as `other`. # # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]]) # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]]) # rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]]) # rdd1.cogroup(rdd2, rdd3).collect # # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]] # def cogroup(*others) unioned = self others.each do |other| unioned = unioned.union(other) end unioned.group_by_key end # Return each (key, value) pair in self RDD that has no pair with matching # key in other RDD. # # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]]) # rdd2 = $sc.parallelize([["b", 5], ["c", 6]]) # rdd1.subtract_by_key(rdd2).collect # # => [["a", 1], ["a", 2]] # def subtract_by_key(other, num_partitions=nil) create_combiner = 'lambda{|item| [[item]]}' merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}' merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}' self.union(other) .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions) .filter('lambda{|(key,values)| values.size == 1}') .flat_map_values('lambda{|item| item.first}') end # Return an RDD with the elements from self that are not in other. 
# # == Example: # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]]) # rdd2 = $sc.parallelize([["a", 2], ["c", 6]]) # rdd1.subtract(rdd2).collect # # => [["a", 1], ["b", 3], ["c", 4]] # def subtract(other, num_partitions=nil) mapping_function = 'lambda{|x| [x,nil]}' self.map(mapping_function) .subtract_by_key(other.map(mapping_function), num_partitions) .keys end # Sort the RDD by key # # == Example: # rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]]) # rdd.sort_by_key.collect # # => [["a", 3], ["b", 2], ["c", 1]] # def sort_by_key(ascending=true, num_partitions=nil) self.sort_by('lambda{|(key, _)| key}') end # Sort the RDD by value # # == Example: # rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]]) # rdd.sort_by_value.collect # # => [["b", 1], ["c", 2], ["a", 3]] # def sort_by_value(ascending=true, num_partitions=nil) self.sort_by('lambda{|(_, value)| value}') end # Sorts this RDD by the given key_function # # This is a different implementation than spark. Sort by doesn't use # key_by method first. It can be slower but take less memory and # you can always use map.sort_by_key # # == Example: # rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"]) # # rdd.sort_by.collect # # => ["aaaaaaa", "b", "cc", "ddd", "eeee"] # # rdd.sort_by(lambda{|x| x.size}).collect # # => ["b", "cc", "ddd", "eeee", "aaaaaaa"] # def sort_by(key_function=nil, ascending=true, num_partitions=nil) key_function ||= 'lambda{|x| x}' num_partitions ||= default_reduce_partitions command_klass = Spark::Command::SortByKey # Allow spill data to disk due to memory limit # spilling = config['spark.shuffle.spill'] || false spilling = false memory = '' # Set spilling to false if worker has unlimited memory if memory.empty? spilling = false memory = nil else memory = to_memory_size(memory) end # Sorting should do one worker if num_partitions == 1 rdd = self rdd = rdd.coalesce(1) if partitions_size > 1 return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer) end # Compute boundary of collection # Collection should be evenly distributed # 20.0 is from scala RangePartitioner (for roughly balanced output partitions) count = self.count sample_size = num_partitions * 20.0 fraction = [sample_size / [count, 1].max, 1.0].min samples = self.sample(false, fraction, 1).map(key_function).collect samples.sort! # Reverse is much faster than reverse sort_by samples.reverse! if !ascending # Determine part bounds bounds = determine_bounds(samples, num_partitions) shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions) shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer) end # Creates array of the elements in this RDD by applying function f. # # == Example: # rdd = $sc.parallelize(0..5) # rdd.key_by(lambda{|x| x%2}).collect # # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]] # def key_by(f) new_rdd_from_command(Spark::Command::KeyBy, f) end # Pass each value in the key-value pair RDD through a map function without changing # the keys. This also retains the original RDD's partitioning. 
#
# == Example:
#   rdd = $sc.parallelize(["ruby", "scala", "java"])
#   rdd = rdd.map(lambda{|x| [x, x]})
#   rdd = rdd.map_values(lambda{|x| x.upcase})
#   rdd.collect
#   # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
#
def map_values(f)
  new_rdd_from_command(Spark::Command::MapValues, f)
end

# Pass each value in the key-value pair RDD through a flat_map function
# without changing the keys; this also retains the original RDD's
# partitioning.
#
# == Example:
#   rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
#   rdd = rdd.flat_map_values(lambda{|x| x*2})
#   rdd.collect
#   # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
#
def flat_map_values(f)
  new_rdd_from_command(Spark::Command::FlatMapValues, f)
end

# Return an RDD with the first element (key) of each pair in this PairRDD.
#
# == Example:
#   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
#   rdd.keys.collect
#   # => [1, 3, 5]
#
def keys
  self.map('lambda{|(key, _)| key}')
end

# Return an RDD with the second element (value) of each pair in this PairRDD.
#
# == Example:
#   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
#   rdd.values.collect
#   # => [2, 4, 6]
#
def values
  self.map('lambda{|(_, value)| value}')
end

# Return the list of values in the RDD for key `key`.
# TODO: add Partitioner for efficient searching
#
# == Example:
#   rdd = $sc.parallelize(0..10)
#   rdd = rdd.group_by(lambda {|x| x%3})
#   rdd.lookup(2)
#   # => [[2, 5, 8]]
#
#   rdd = $sc.parallelize(0..10)
#   rdd = rdd.key_by(lambda{|x| x.even?})
#   rdd.lookup(true)
#   # => [0, 2, 4, 6, 8, 10]
#
def lookup(key)
  lookup_key = "lookup_key_#{object_id}"

  self.filter("lambda{|(key, _)| key == #{lookup_key}}")
      .bind(lookup_key => key)
      .values
      .collect
end

# Aliases
alias_method :partitionsSize, :partitions_size
alias_method :defaultReducePartitions, :default_reduce_partitions
alias_method :setName, :set_name
alias_method :addLibrary, :add_library
alias_method :require, :add_library
alias_method :flatMap, :flat_map
alias_method :mapPartitions, :map_partitions
alias_method :mapPartitionsWithIndex, :map_partitions_with_index
alias_method :reduceByKey, :reduce_by_key
alias_method :combineByKey, :combine_by_key
alias_method :groupByKey, :group_by_key
alias_method :groupWith, :group_with
alias_method :partitionBy, :partition_by
alias_method :defaultReducePartitions, :default_reduce_partitions
alias_method :foreachPartition, :foreach_partition
alias_method :mapValues, :map_values
alias_method :takeSample, :take_sample
alias_method :sortBy, :sort_by
alias_method :sortByKey, :sort_by_key
alias_method :keyBy, :key_by
alias_method :groupBy, :group_by
alias_method :foldByKey, :fold_by_key
alias_method :aggregateByKey, :aggregate_by_key
alias_method :subtractByKey, :subtract_by_key
alias_method :sampleStdev, :sample_stdev
alias_method :sampleVariance, :sample_variance

private

# This is the base method for reduce operations. It is used by reduce, fold and aggregate.
# The only difference is that fold has a zero value.
#
def _reduce(klass, seq_op, comb_op, zero_value=nil)
  if seq_op.nil?
    # Partitions are already reduced
    rdd = self
  else
    rdd = new_rdd_from_command(klass, seq_op, zero_value)
  end

  # Send all results to one worker and combine results
  rdd = rdd.coalesce(1).compact

  # Add the same function to new RDD
  comm = rdd.add_command(klass, comb_op, zero_value)
  comm.deserializer = @command.serializer

  # Value is returned in array
  PipelinedRDD.new(rdd, comm).collect[0]
end

def _partition_by(num_partitions, klass, *args)
  # The RDD is transformed from [key, value] to [hash, [key, value]]
  keyed = new_rdd_from_command(klass, *args)
  keyed.serializer.unbatch!
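  # Unbatching appears to be necessary here: the JVM-side partitioner below
  # hashes one serialized element at a time, so [hash, [key, value]] items
  # must be written to the stream individually rather than in batches.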
# PairwiseRDD and PythonPartitioner are borrowed from Python # but works great on ruby too pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD partitioner = PythonPartitioner.new(num_partitions, args.first.object_id) new_jrdd = pairwise_rdd.partitionBy(partitioner).values # Reset deserializer RDD.new(new_jrdd, context, @command.serializer, keyed.serializer) end # For using a different combine_by_key # # == Used for: # * combine_by_key # * fold_by_key (with zero value) # def _combine_by_key(combine, merge, num_partitions) num_partitions ||= default_reduce_partitions # Combine key combined = new_rdd_from_command(combine.shift, *combine) # Merge items shuffled = combined.partition_by(num_partitions) merge_comm = shuffled.add_command(merge.shift, *merge) PipelinedRDD.new(shuffled, merge_comm) end end # Pipelined Resilient Distributed Dataset, operations are pipelined and sended to worker # # RDD # `-- map # `-- map # `-- map # # Code is executed from top to bottom # class PipelinedRDD < RDD attr_reader :prev_jrdd, :command def initialize(prev, command) if prev.is_a?(PipelinedRDD) && prev.pipelinable? # Second, ... stages @prev_jrdd = prev.prev_jrdd else # First stage @prev_jrdd = prev.jrdd end @cached = false @checkpointed = false @context = prev.context @command = command end def pipelinable? !(cached? || checkpointed?) end # Serialization necessary things and sent it to RubyRDD (scala extension) def jrdd @jrdd ||= _jrdd end private def _jrdd command = @command.build broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast)) ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator) ruby_rdd.asJavaRDD end end end ================================================ FILE: lib/spark/sampler.rb ================================================ require 'distribution' # Random Generators module Spark module RandomGenerator class Poisson def initialize(mean, seed) generator = Random.new(seed) @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator) end def rand t = 0.0 number = 0 loop{ t += @exp_rng.call if t > 1 return number end number += 1 } end end end end # Samplers module Spark module Sampler class Base attr_reader :fraction, :seed def initialize(fraction, seed=nil) @fraction = fraction @seed = seed || Random.new_seed end end # Poisson Sampler # ------------------------------------------------------------------------- class Poisson < Base def sample(iterator) iterator.map! do |item| count = rng.rand Array.new(count) { item } end iterator.flatten! iterator.compact! iterator end def lazy_sample(iterator) Enumerator::Lazy.new(iterator) do |yielder, value| count = rng.rand count.times { yielder << value } end end def rng @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed) end end # Uniform Sampler # ------------------------------------------------------------------------- class Uniform < Base def sample(iterator) iterator.select!{|item| rng.rand <= fraction} iterator end def lazy_sample(iterator) iterator.select do |item| rng.rand <= fraction end end def rng @rng ||= Random.new(seed) end end end end ================================================ FILE: lib/spark/serializer/auto_batched.rb ================================================ module Spark module Serializer ## # AutoBatched serializator # # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer. 
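# The batch starts at two items and doubles after every chunk whose serialized
# size is still below `best_size`; once a chunk grows past best_size * MAX_RATIO,
# the batch is halved again. A rough usage sketch (assuming the default 64 kB
# target size):
#
#   serializer = Spark::Serializer.build { auto_batched(marshal) }
#   serializer.dump_to_io(data, io)   # writes self-sizing batches to io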
# class AutoBatched < Batched MAX_RATIO = 10 def initialize(serializer, best_size=65536) @serializer = serializer @best_size = best_size.to_i error('Batch size must be greater than 1') if @best_size < 2 end def batched? true end def unbatch! end def name "AutoBatched(#{@best_size})" end def dump_to_io(data, io) check_each(data) # Only Array have .slice data = data.to_a index = 0 batch = 2 max = @best_size * MAX_RATIO loop do chunk = data.slice(index, batch) if chunk.nil? || chunk.empty? break end serialized = @serializer.dump(chunk) io.write_string(serialized) index += batch size = serialized.bytesize if size < @best_size batch *= 2 elsif size > max && batch > 1 batch /= 2 end end io.flush end end end end Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched) ================================================ FILE: lib/spark/serializer/base.rb ================================================ module Spark module Serializer # @abstract Parent for all serializers class Base def load_from_io(io) return to_enum(__callee__, io) unless block_given? loop do size = io.read_int_or_eof break if size == Spark::Constant::DATA_EOF yield load(io.read(size)) end end def load_from_file(file, *args) return to_enum(__callee__, file, *args) unless block_given? load_from_io(file, *args).each do |item| yield item end file.close file.unlink end def ==(other) self.to_s == other.to_s end def batched? false end def unbatch! end def check_each(data) unless data.respond_to?(:each) error('Data must be iterable.') end end def error(message) raise Spark::SerializeError, message end def name self.class.name.split('::').last end def to_s name end def inspect %{#} end end end end ================================================ FILE: lib/spark/serializer/batched.rb ================================================ module Spark module Serializer class Batched < Base attr_writer :serializer def initialize(serializer, batch_size=nil) batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE @serializer = serializer @batch_size = batch_size.to_i error('Batch size must be greater than 0') if @batch_size < 1 end # Really batched def batched? @batch_size > 1 end def unbatch! @batch_size = 1 end def load(data) @serializer.load(data) end def dump(data) @serializer.dump(data) end def name "Batched(#{@batch_size})" end def to_s "#{name} -> #{@serializer}" end # === Dump ============================================================== def dump_to_io(data, io) check_each(data) if batched? data = data.each_slice(@batch_size) end data.each do |item| serialized = dump(item) io.write_string(serialized) end io.flush end # === Load ============================================================== def load_from_io(io) return to_enum(__callee__, io) unless block_given? loop do size = io.read_int_or_eof break if size == Spark::Constant::DATA_EOF data = io.read(size) data = load(data) if batched? 
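          # a batched chunk deserializes to an Array, so emit its items one by one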
data.each{|item| yield item } else yield data end end end end end end Spark::Serializer.register('batched', Spark::Serializer::Batched) ================================================ FILE: lib/spark/serializer/cartesian.rb ================================================ module Spark module Serializer class Cartesian < Pair def aggregate(item1, item2) item1.product(item2) end end end end Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian) ================================================ FILE: lib/spark/serializer/compressed.rb ================================================ module Spark module Serializer class Compressed < Base def initialize(serializer) @serializer = serializer end def dump(data) Zlib::Deflate.deflate(@serializer.dump(data)) end def load(data) @serializer.load(Zlib::Inflate.inflate(data)) end end end end begin # TODO: require only if it is necessary require 'zlib' Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed) rescue LoadError end ================================================ FILE: lib/spark/serializer/marshal.rb ================================================ module Spark module Serializer class Marshal < Base def dump(data) ::Marshal.dump(data) end def load(data) ::Marshal.load(data) end end end end Spark::Serializer.register('marshal', Spark::Serializer::Marshal) ================================================ FILE: lib/spark/serializer/message_pack.rb ================================================ module Spark module Serializer class MessagePack < Base def dump(data) ::MessagePack.dump(data) end def load(data) ::MessagePack.load(data) end end end end begin # TODO: require only if it is necessary require 'msgpack' Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack) rescue LoadError end ================================================ FILE: lib/spark/serializer/oj.rb ================================================ module Spark module Serializer class Oj < Base def dump(data) ::Oj.dump(data) end def load(data) ::Oj.load(data) end end end end begin # TODO: require only if it is necessary require 'oj' Spark::Serializer.register('oj', Spark::Serializer::Oj) rescue LoadError end ================================================ FILE: lib/spark/serializer/pair.rb ================================================ module Spark module Serializer class Pair < Base def initialize(serializer1, serializer2) @serializer1 = serializer1 @serializer2 = serializer2 end def to_s "#{name}(#{@serializer1}, #{@serializer2})" end def aggregate(item1, item2) item1.zip(item2) end def load_from_io(io) return to_enum(__callee__, io) unless block_given? loop do size = io.read_int_or_eof break if size == Spark::Constant::DATA_EOF item1 = @serializer1.load(io.read(size)) item2 = @serializer2.load(io.read_string) item1 = [item1] unless @serializer1.batched? item2 = [item2] unless @serializer2.batched? 
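        # both sides are Arrays at this point; #aggregate zips them into pairs
        # (the Cartesian subclass overrides #aggregate to build a product instead)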
aggregate(item1, item2).each do |item| yield item end end end end end end Spark::Serializer.register('pair', Spark::Serializer::Pair) ================================================ FILE: lib/spark/serializer/text.rb ================================================ module Spark module Serializer class Text < Base attr_reader :encoding def initialize(encoding=Encoding::UTF_8) error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding) @encoding = encoding end def load(data) data.to_s.force_encoding(@encoding) end def to_s "Text(#{@encoding})" end end end end Spark::Serializer.register('string', 'text', Spark::Serializer::Text) ================================================ FILE: lib/spark/serializer.rb ================================================ module Spark ## # Serializer # module Serializer DEFAULT_COMPRESS = false DEFAULT_BATCH_SIZE = 1024 DEFAULT_SERIALIZER_NAME = 'marshal' @@registered = {} # Register class and create method for quick access. # Class will be available also as __name__ for using # in build method (Proc binding problem). # # == Examples: # register('test1', 'test2', Class) # # Spark::Serializer.test1 # Spark::Serializer.test2 # # # Proc binding problem # build { marshal } # => Spark::Serializer::Marshal # # marshal = 1 # build { marshal } # => 1 # # build { __marshal__ } # => Spark::Serializer::Marshal # def self.register(*args) klass = args.pop args.each do |arg| @@registered[arg] = klass define_singleton_method(arg.to_sym){|*args| klass.new(*args) } define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) } end end def self.find(name) @@registered[name.to_s.downcase] end def self.find!(name) klass = find(name) if klass.nil? raise Spark::SerializeError, "Unknow serializer #{name}." end klass end def self.build(text=nil, &block) if block_given? class_eval(&block) else class_eval(text.to_s.downcase) end end end end # Parent require 'spark/serializer/base' # Basic require 'spark/serializer/oj' require 'spark/serializer/marshal' require 'spark/serializer/message_pack' require 'spark/serializer/text' # Others require 'spark/serializer/batched' require 'spark/serializer/auto_batched' require 'spark/serializer/compressed' require 'spark/serializer/pair' require 'spark/serializer/cartesian' ================================================ FILE: lib/spark/sort.rb ================================================ module Spark module InternalSorter class Base def initialize(key_function) @key_function = key_function end end class Ascending < Base def sort(data) data.sort_by!(&@key_function) end end class Descending < Ascending def sort(data) super data.reverse! end end def self.get(ascending, key_function) if ascending type = Ascending else type = Descending end type.new(key_function) end end end module Spark class ExternalSorter include Spark::Helper::System # Items from GC cannot be destroyed so #make_parts need some reserve MEMORY_RESERVE = 50 # % # How big will be chunk for adding new memory because GC not cleaning # immediately un-referenced variables MEMORY_FREE_CHUNK = 10 # % # How many items will be evaluate from iterator at start START_SLICE_SIZE = 10 # Maximum of slicing. Memory control can be avoided by large value. MAX_SLICE_SIZE = 10_000 # How many values will be taken from each enumerator. 
EVAL_N_VALUES = 10 # Default key function KEY_FUNCTION = lambda{|item| item} attr_reader :total_memory, :memory_limit, :memory_chunk, :serializer def initialize(total_memory, serializer) @total_memory = total_memory @memory_limit = total_memory * (100-MEMORY_RESERVE) / 100 @memory_chunk = total_memory * (100-MEMORY_FREE_CHUNK) / 100 @serializer = serializer end def add_memory! @memory_limit += memory_chunk end def sort_by(iterator, ascending=true, key_function=KEY_FUNCTION) return to_enum(__callee__, iterator, key_function) unless block_given? create_temp_folder internal_sorter = Spark::InternalSorter.get(ascending, key_function) # Make N sorted enumerators parts = make_parts(iterator, internal_sorter) return [] if parts.empty? # Need new key function because items have new structure # From: [1,2,3] to [[1, Enumerator],[2, Enumerator],[3, Enumerator]] key_function_with_enum = lambda{|(key, _)| key_function[key]} internal_sorter = Spark::InternalSorter.get(ascending, key_function_with_enum) heap = [] enums = [] # Load first items to heap parts.each do |part| EVAL_N_VALUES.times { begin heap << [part.next, part] rescue StopIteration break end } end # Parts can be empty but heap not while parts.any? || heap.any? internal_sorter.sort(heap) # Since parts are sorted and heap contains EVAL_N_VALUES method # can add EVAL_N_VALUES items to the result EVAL_N_VALUES.times { break if heap.empty? item, enum = heap.shift enums << enum yield item } # Add new element to heap from part of which was result item while (enum = enums.shift) begin heap << [enum.next, enum] rescue StopIteration parts.delete(enum) enums.delete(enum) end end end ensure destroy_temp_folder end private def create_temp_folder @dir = Dir.mktmpdir end def destroy_temp_folder FileUtils.remove_entry_secure(@dir) if @dir end # New part is created when current part exceeds memory limit (is variable) # Every new part have more memory because of ruby GC def make_parts(iterator, internal_sorter) slice = START_SLICE_SIZE parts = [] part = [] loop do begin # Enumerator does not have slice method slice.times { part << iterator.next } rescue StopIteration break end # Carefully memory_limit is variable if memory_usage > memory_limit # Sort current part with origin key_function internal_sorter.sort(part) # Tempfile for current part # will be destroyed on #destroy_temp_folder file = Tempfile.new("part", @dir) serializer.dump(part, file) # Peek is at the end of file file.seek(0) parts << serializer.load(file) # Some memory will be released but not immediately # need some new memory for start part.clear add_memory! else slice = [slice*2, MAX_SLICE_SIZE].min end end # Last part which is not in the file if part.any? internal_sorter.sort(part) parts << part.each end parts end end # ExternalSorter end # Spark ================================================ FILE: lib/spark/sql/column.rb ================================================ module Spark module SQL class Column # ============================================================================= # Creating def self.to_java(col) if col.is_a?(Column) col.jcolumn else from_name(col) end end def self.from_literal(literal) JSQLFunctions.lit(literal) end def self.from_name(name) JSQLFunctions.col(name) end # ============================================================================= # Functions for virtual columns # Evaluates a list of conditions and returns one of multiple possible result expressions. # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions. 
# # == Parameters: # condition:: a boolean {Column} expression # value:: a literal value, or a {Column} expression # # == Example: # df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() # # [Row(age=3), Row(age=4)] # # df.select(when(df.age == 2, df.age + 1).alias("age")).collect() # # [Row(age=3), Row(age=nil)] # def self.when(condition, value) Column.new(JSQLFunctions).when(condition, value) end # ============================================================================= # Initialized column attr_reader :jcolumn def initialize(jcolumn) @jcolumn = jcolumn end FUNC_OPERATORS = { '!' => 'not', '~' => 'negate', '-@' => 'negate' } BIN_OPERATORS = { '[]' => 'apply', '+' => 'plus', '-' => 'minus', '*' => 'multiply', '/' => 'divide', '%' => 'mod', '==' => 'equalTo', '!=' => 'notEqual', '<' => 'lt', '<=' => 'leq', '>' => 'gt', '>=' => 'geq', '&' => 'and', '|' => 'or', 'like' => 'like', 'starts_with' => 'startsWith', 'ends_with' => 'endsWith', 'bitwiseOR' => 'bitwiseOR', 'bitwiseAND' => 'bitwiseAND', 'bitwiseXOR' => 'bitwiseXOR', } UNARY_OPERATORS = { 'asc' => 'asc', 'desc' => 'desc', 'is_null' => 'isNull', 'is_not_null' => 'isNotNull' } FUNC_OPERATORS.each do |op, func| eval <<-METHOD def #{op} func_op('#{func}') end METHOD end BIN_OPERATORS.each do |op, func| eval <<-METHOD def #{op}(item) bin_op('#{func}', item) end METHOD end UNARY_OPERATORS.each do |op, func| eval <<-METHOD def #{op} unary_op('#{func}') end METHOD end # An expression that gets an item at position ordinal out of a list, # or gets an item by key out of a Hash. # # == Example: # df.select(df.l.get_item(0), df.d.get_item("key")).show # # +----+------+ # # |l[0]|d[key]| # # +----+------+ # # | 1| value| # # +----+------+ # # df.select(df.l[0], df.d["key"]).show # # +----+------+ # # |l[0]|d[key]| # # +----+------+ # # | 1| value| # # +----+------+ # def get_item(key) self[key] end # An expression that gets a field by name in a StructField. # # == Example: # df.select(df.r.get_field("b")).show # # +----+ # # |r[b]| # # +----+ # # | b| # # +----+ # # df.select(df.r.a).show # # +----+ # # |r[a]| # # +----+ # # | 1| # # +----+ # def get_field(name) self[name] end # Return a {Column} which is a substring of the column. # # == Parameters: # start:: start position (Integer or Column) # length:: length of the substring (Integer or Column) # # == Example: # df.select(df.name.substr(1, 3).alias("col")).collect # # => [#, #] # def substr(start, length) if start.is_a?(Integer) && length.is_a?(Integer) new_jcolumn = jcolumn.substr(start, length) elsif start.is_a?(Column) && length.is_a?(Column) new_jcolumn = jcolumn.substr(start.jcolumn, length.jcolumn) else raise ArgumentError, "Unsupported type: #{start.class} and #{length.class}." end Column.new(new_jcolumn) end # A boolean expression that is evaluated to true if the value of this # expression is contained by the evaluated values of the arguments. # # == Example: # df[df.name.isin("Bob", "Mike")].collect # # => [#] # # df[df.age.isin(1, 2, 3)].collect # # => [#] # def isin(*cols) if cols.size == 1 && cols.first.is_a?(Array) cols = cols.first end cols = cols.map do |col| Column.from_literal(col) end new_jcolumn = jcolumn.isin(Spark.jb.to_seq(cols)) Column.new(new_jcolumn) end # Returns this column aliased with a new name or names (in the case of expressions that # return more than one column, such as explode). 
# # == Example: # df.select(df.age.alias("age2")).collect # # => [#, #] # def alias(name) Column.new(jcolumn.as(name)) end # Convert the column into type data_type. # # == Example: # df.select(df.age.cast("string").alias('ages')).collect # # => [#, #] # # df.select(df.age.cast(StringType.new).alias('ages')).collect # # => [#, #] # def cast(data_type) case data_type when String new_jcolumn = jcolumn.cast(data_type) when DataType jdata_type = JDataType.fromJson(data_type.json) new_jcolumn = jcolumn.cast(jdata_type) else raise ArgumentError, "Unsupported type: #{data_type.class}" end Column.new(new_jcolumn) end # A boolean expression that is evaluated to true if the value of this # expression is between the given columns. # # == Example: # df.select(df.name, df.age.between(2, 4)).show # # +-----+--------------------------+ # # | name|((age >= 2) && (age <= 4))| # # +-----+--------------------------+ # # |Alice| true| # # | Bob| false| # # +-----+--------------------------+ # def between(lower, upper) (self >= lower) & (self <= upper) end # Evaluates a list of conditions and returns one of multiple possible result expressions. # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions. # # == Parameters: # condition:: a boolean {Column} expression. # value:: a literal value, or a {Column} expression. # # == Example: # df.select(df.name, Column.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show # # +-----+--------------------------------------------------------+ # # | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0| # # +-----+--------------------------------------------------------+ # # |Alice| -1| # # | Bob| 1| # # +-----+--------------------------------------------------------+ # def when(condition, value) unless condition.is_a?(Column) raise ArgumentError, "Condition must be a Column" end if value.is_a?(Column) value = value.jcolumn end new_jcolumn = jcolumn.when(condition.jcolumn, value) Column.new(new_jcolumn) end # Evaluates a list of conditions and returns one of multiple possible result expressions. # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions. 
# # == Example: # df.select(df.name, Column.when(df.age > 3, 1).otherwise(0)).show # # +-----+---------------------------------+ # # | name|CASE WHEN (age > 3) THEN 1 ELSE 0| # # +-----+---------------------------------+ # # |Alice| 0| # # | Bob| 1| # # +-----+---------------------------------+ # def otherwise(value) if value.is_a?(Column) value = value.jcolumn end new_jcolumn = jcolumn.otherwise(value) Column.new(new_jcolumn) end def over(*) raise Spark::NotImplemented end def method_missing(method, item) get_field(item) end def to_s "Column(\"#{jcolumn.toString}\")" end def inspect "#<#{to_s}>" end alias_method :as, :alias alias_method :slice, :substr alias_method :astype, :cast private def func_op(name) new_jcolumn = JSQLFunctions.__send__(name, jcolumn) Column.new(new_jcolumn) end def bin_op(name, item) if item.is_a?(Column) col = item.jcolumn else col = item end new_jcolumn = jcolumn.__send__(name, col) Column.new(new_jcolumn) end def unary_op(name) new_jcolumn = jcolumn.__send__(name) Column.new(new_jcolumn) end end end end ================================================ FILE: lib/spark/sql/context.rb ================================================ module Spark module SQL class Context attr_reader :spark_context, :jsql_context def initialize(spark_context) @spark_context = spark_context @jsql_context = JSQLContext.new(spark_context.sc) end def read DataFrameReader.new(self) end end end end ================================================ FILE: lib/spark/sql/data_frame.rb ================================================ module Spark module SQL ## # Spark::SQL::DataFrame # # All example are base on people.json # class DataFrame attr_reader :jdf, :sql_context def initialize(jdf, sql_context) @jdf = jdf @sql_context = sql_context end # Returns the column as a {Column}. # # == Examples: # df.select(df['age']).collect # # => [#2}>, #5}>] # # df[ ["name", "age"] ].collect # # => [#"Alice", "age"=>2}>, #"Bob", "age"=>5}>] # # df[ df.age > 3 ].collect # # => [#5, "name"=>"Bob"}>] # # df[df[0] > 3].collect # # => [#5, "name"=>"Bob"}>] # def [](item) case item when String jcolumn = jdf.apply(item) Column.new(jcolumn) when Array select(*item) when Numeric jcolumn = jdf.apply(columns[item]) Column.new(jcolumn) when Column where(item) else raise ArgumentError, "Unsupported type: #{item.class}" end end # Returns all column names as a Array. # # == Example: # df.columns # # => ['age', 'name'] # def columns schema.fields.map(&:name) end # Returns the schema of this {DataFrame} as a {StructType}. def schema return @schema if @schema begin @schema = DataType.parse(JSON.parse(jdf.schema.json)) rescue => e raise Spark::ParseError, 'Unable to parse datatype from schema' end end def show_string(n=20, truncate=true) jdf.showString(n, truncate) end # Prints the first n rows to the console. # # == Parameters: # n:: Number of rows to show. # truncate:: Whether truncate long strings and align cells right. # def show(n=20, truncate=true) puts show_string(n, truncate) end # Prints out the schema in the tree format. # # == Example: # df.print_schema # # root # # |-- age: integer (nullable = true) # # |-- name: string (nullable = true) # def print_schema puts jdf.schema.treeString end def explain(extended=false) if extended jdf.queryExecution.toString else jdf.queryExecution.executedPlan.toString end end # Prints the (logical and physical) plans to the console for debugging purpose. 
# # == Example: # df.print_explain # # Scan PhysicalRDD[age#0,name#1] # # df.print_explain(true) # # == Parsed Logical Plan == # # ... # # == Analyzed Logical Plan == # # ... # # == Optimized Logical Plan == # # ... # # == Physical Plan == # # ... # def print_explain(extended=false) puts explain(extended) end # Returns all column names and their data types as a list. # # == Example: # df.dtypes # # => [('age', 'int'), ('name', 'string')] # def dtypes schema.fields.map do |field| [field.name, field.data_type.simple_string] end end def inspect types = dtypes.map do |(name, type)| "#{name}: #{type}" end "#" end # Get column by name def method_missing(method, *args, &block) name = method.to_s if columns.include?(name) self[name] else super end end # ============================================================================= # Collect # Returns all the records as a list of {Row}. # # == Example: # df.collect # # => [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] # def collect Spark.jb.call(jdf, 'collect') end def collect_as_hash result = collect result.map!(&:to_h) result end def values result = collect result.map! do |item| item.to_h.values end result end # Returns the number of rows in this {DataFrame}. def count jdf.count.to_i end # Returns the first num rows as an Array of {Row}. def take(num) limit(num).collect end # Return first {Row}. def first take(1).first end # ============================================================================= # Queries # Projects a set of expressions and returns a new {DataFrame} # # == Parameters: # *cols:: # List of column names (string) or expressions {Column}. # If one of the column names is '*', that column is expanded to include all columns # in the current DataFrame. # # == Example: # df.select('*').collect # # => [#2, "name"=>"Alice"}>, #5, "name"=>"Bob"}>] # # df.select('name', 'age').collect # # => [#"Alice", "age"=>2}>, #"Bob", "age"=>5}>] # # df.select(df.name, (df.age + 10).alias('age')).collect # # => [#"Alice", "age"=>12}>, #"Bob", "age"=>15}>] # def select(*cols) jcols = cols.map do |col| Column.to_java(col) end new_jdf = jdf.select(jcols) DataFrame.new(new_jdf, sql_context) end # Filters rows using the given condition. # # == Examples: # df.filter(df.age > 3).collect # # => [#5, "name"=>"Bob"}>] # # df.where(df.age == 2).collect # # => [#2, "name"=>"Alice"}>] # # df.filter("age > 3").collect # # => [#5, "name"=>"Bob"}>] # # df.where("age = 2").collect # # => [#2, "name"=>"Alice"}>] # def filter(condition) case condition when String new_jdf = jdf.filter(condition) when Column new_jdf = jdf.filter(condition.jcolumn) else raise ArgumentError, 'Condition must be String or Column' end DataFrame.new(new_jdf, sql_context) end # Limits the result count to the number specified. def limit(num) new_jdf = jdf.limit(num) DataFrame.new(new_jdf, sql_context) end alias_method :where, :filter end end end ================================================ FILE: lib/spark/sql/data_frame_reader.rb ================================================ module Spark module SQL class DataFrameReader attr_reader :sql_context, :jreader def initialize(sql_context) @sql_context = sql_context @jreader = sql_context.jsql_context.read end def df(jdf) DataFrame.new(jdf, sql_context) end # Specifies the input data source format. # Parameter is name of the data source, e.g. 'json', 'parquet'. def format(source) jreader.format(source) self end # Adds an input option for the underlying data source. 
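# == Example (illustrative; available options depend on the data source):
#   sql.read.option('samplingRatio', '0.5').json('people.json')
#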
def option(key, value) jreader.option(key, value.to_s) self end # Adds input options for the underlying data source. def options(options) options.each do |key, value| jreader.option(key, value.to_s) end self end # Loads data from a data source and returns it as a :class`DataFrame`. # # == Parameters: # path:: Optional string for file-system backed data sources. # format:: Optional string for format of the data source. Default to 'parquet'. # schema:: Optional {StructType} for the input schema. # options:: All other string options. # def load(path=nil, new_format=nil, new_schema=nil, new_options=nil) new_format && format(new_format) new_schema && schema(new_schema) new_options && options(new_options) if path.nil? df(jreader.load) else df(jreader.load(path)) end end # Specifies the input schema. # # Some data sources (e.g. JSON) can infer the input schema automatically from data. # By specifying the schema here, the underlying data source can skip the schema # inference step, and thus speed up data loading. # # Parameter schema must be StructType object. # def schema(new_schema) unless new_schema.is_a?(StructType) raise ArgumentError, 'Schema must be a StructType.' end jschema = sql_context.jsql_context.parseDataType(new_schema.json) jreader.schema(jschema) self end # Loads a JSON file (one object per line) and returns the result as {DataFrame} # # If the schema parameter is not specified, this function goes # through the input once to determine the input schema. # # == Parameters: # path:: string, path to the JSON dataset # schema:: an optional {StructType} for the input schema. # # == Example: # df = sql.read.json('people.json') # df.dtypes # # => [('age', 'bigint'), ('name', 'string')] # def json(path, new_schema=nil) # ClassNotFoundException: Failed to load class for data source: json # df(jreader.json(path)) load(path, 'org.apache.spark.sql.execution.datasources.json', new_schema) end end end end ================================================ FILE: lib/spark/sql/data_type.rb ================================================ module Spark module SQL ## # Spark::SQL::DataType # class DataType cattr_accessor :atomic_types self.atomic_types = {} cattr_accessor :complex_types self.complex_types = {} def self.parse(data) if data.is_a?(Hash) type = data['type'] if complex_types.has_key?(type) complex_types[type].from_json(data) # elsif type == 'udt' # UserDefinedType.from_json(data) else raise Spark::SQLError, "Unsupported type: #{type}" end else if atomic_types.has_key?(data) atomic_types[data].new else raise Spark::SQLError, "Unsupported type: #{type}" end end end def self.class_name name.split('::').last end def self.type_name class_name.sub('Type', '').downcase end def self.complex complex_types[type_name] = self end def self.atomic atomic_types[type_name] = self end def ==(other) self.class == other.class && self.to_s == other.to_s end def type_name self.class.type_name end def simple_string type_name end def json_value type_name end def json json_value.to_json end def to_s self.class.class_name end def inspect "#<#{to_s}>" end end ## # Spark::SQL::StructType # # Struct type, consisting of a list of {StructField}. # This is the data type representing a {Row}. 
# # == Example: # struct1 = StructType.new([StructField.new('f1', StringType.new, true)]) # struct2 = StructType.new([StructField.new('f2', StringType.new, true)]) # struct1 == struct2 # # => true # class StructType < DataType complex attr_reader :fields def self.from_json(json) fields = json['fields'].map do |field| StructField.from_json(field) end StructType.new(fields) end def initialize(fields=[]) @fields = fields @names = fields.map(&:name) end def json_value { 'type' => type_name, 'fields' => fields.map(&:json_value) } end def to_s "StructType(#{fields.join(', ')})" end end ## # Spark::SQL::StructField # class StructField < DataType attr_reader :name, :data_type, :nullable, :metadata def self.from_json(json) StructField.new(json['name'], DataType.parse(json['type']), json['nullable'], json['metadata']) end # A field in {StructType}. # # == Parameters: # name:: string, name of the field. # data_type:: {DataType} of the field. # nullable:: boolean, whether the field can be null (nil) or not. # metadata:: a dict from string to simple type that can be to_internald to JSON automatically # # == Example: # f1 = StructField.new('f1', StringType.new, true) # f2 = StructField.new('f2', StringType.new, true) # f1 == f2 # # => true # def initialize(name, data_type, nullable=true, metadata={}) @name = name @data_type = data_type @nullable = nullable @metadata = metadata end def json_value { 'name' => name, 'type' => data_type.json_value, 'nullable' => nullable, 'metadata' => metadata, } end def to_s %{StructField(#{name}, #{data_type}, #{nullable})} end end ## # Spark::SQL::AtomicType # # An internal type used to represent everything that is not # null, UDTs, arrays, structs, and maps. # class AtomicType < DataType end ## # Spark::SQL::BooleanType # # Boolean data type. # class BooleanType < AtomicType atomic end ## # Spark::SQL::NumericType # # Numeric data types. # class NumericType < AtomicType end ## # Spark::SQL::IntegralType # # Integral data types. # class IntegralType < NumericType end ## # Spark::SQL::StringType # # String data type. # class StringType < AtomicType atomic end ## # Spark::SQL::LongType # # Long data type, i.e. a signed 64-bit integer. # # If the values are beyond the range of [-9223372036854775808, 9223372036854775807], # please use {DecimalType}. # class LongType < IntegralType atomic end end end ================================================ FILE: lib/spark/sql/row.rb ================================================ module Spark module SQL ## # Spark::SQL::Row # class Row attr_reader :data def self.from_java(object, with_schema=true) if with_schema fields = object.schema.fieldNames else # Create virtual schema (t0, t1, t2, ...) 
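          # (schema-less rows are not supported yet; generated names like t0, t1, ...
          #  would be needed before the row could be converted to a Hash)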
raise Spark::NotImplemented, 'Row must have a schema' end if object.anyNull data = {} object.size.times do |i| if object.isNullAt(i) value = nil else value = Spark.jb.to_ruby(object.get(i)) end data[ fields[i] ] = value end else data = fields.zip(Spark.jb.to_ruby(object.values)) end Row.new(data) end def initialize(data={}) @data = data.to_h end def [](item) @data[item] end def to_h @data end def inspect formated = data.map do |key, value| "#{key}: \"#{value}\"" end %{#} end end end end ================================================ FILE: lib/spark/sql.rb ================================================ module Spark module SQL extend Spark::Library autoload_without_import :Context, 'spark/sql/context' autoload_without_import :DataType, 'spark/sql/data_type' autoload_without_import :DataFrame, 'spark/sql/data_frame' autoload_without_import :DataFrameReader, 'spark/sql/data_frame_reader' autoload :Row, 'spark/sql/row' autoload :Column, 'spark/sql/column' # Types autoload :StructType, 'spark/sql/data_type' autoload :StructField, 'spark/sql/data_type' autoload :AtomicType, 'spark/sql/data_type' autoload :NumericType, 'spark/sql/data_type' autoload :IntegralType, 'spark/sql/data_type' autoload :StringType, 'spark/sql/data_type' autoload :LongType, 'spark/sql/data_type' end SQLContext = Spark::SQL::Context end ================================================ FILE: lib/spark/stat_counter.rb ================================================ module Spark class StatCounter attr_reader :n # count of our values attr_reader :mu # mean of our values attr_reader :m2 # variance numerator (sum of (x - mean)^2) attr_reader :max # max of our values attr_reader :min # min of our values def initialize(iterator) @n = 0 @mu = 0.0 @m2 = 0.0 @max = -Float::INFINITY @min = Float::INFINITY merge(iterator) end def merge(other) if other.is_a?(Spark::StatCounter) merge_stat_counter(other) elsif other.respond_to?(:each) merge_array(other) else merge_value(other) end self end def sum @n * @mu end # Return the variance of the values. def variance if @n == 0 Float::NAN else @m2 / @n end end # Return the sample variance, which corrects for bias in estimating the variance by dividing # by N-1 instead of N. def sample_variance if @n <= 1 Float::NAN else @m2 / (@n - 1) end end # Return the standard deviation of the values. def stdev Math.sqrt(variance) end # Return the sample standard deviation of the values, which corrects for bias in estimating the # variance by dividing by N-1 instead of N. 
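    # == Example (a small worked illustration):
    #   stat = Spark::StatCounter.new([1, 2, 3, 4])
    #   stat.sample_variance  # => 5.0 / 3
    #   stat.sample_stdev     # => 1.2909944487358056
    #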
def sample_stdev Math.sqrt(sample_variance) end def to_s "(count: #{count}, mean: #{mean}, stdev: #{stdev}, max: #{max}, min: #{min})" end alias_method :count, :n alias_method :mean, :mu alias_method :max_value, :max alias_method :min_value, :min alias_method :sampleStdev, :sample_stdev alias_method :sampleVariance, :sample_variance private def merge_stat_counter(other) if other == self other = self.deep_copy end if @n == 0 @n = other.n @mu = other.mu @m2 = other.m2 @max = other.max @min = other.min elsif other.n != 0 delta = other.mu - @mu if other.n * 10 < @n @mu = @mu + (delta * other.n) / (@n + other.n) elsif @n * 10 < other.n @mu = other.mu - (delta * @n) / (@n + other.n) else @mu = (@mu * @n + other.mu * other.n) / (@n + other.n) end @max = [@max, other.max].max @min = [@min, other.min].min @m2 += other.m2 + (delta * delta * @n * other.n) / (@n + other.n) @n += other.n end end def merge_array(array) array.each do |item| merge_value(item) end end def merge_value(value) delta = value - @mu @n += 1 @mu += delta / @n @m2 += delta * (value - @mu) @max = [@max, value].max @min = [@min, value].min end end end ================================================ FILE: lib/spark/storage_level.rb ================================================ # Necessary libraries Spark.load_lib module Spark class StorageLevel def self.reload return if @reloaded reload! @reloaded = true end def self.reload! self.const_set(:NONE, JStorageLevel.NONE) self.const_set(:DISK_ONLY, JStorageLevel.DISK_ONLY) self.const_set(:DISK_ONLY_2, JStorageLevel.DISK_ONLY_2) self.const_set(:MEMORY_ONLY, JStorageLevel.MEMORY_ONLY) self.const_set(:MEMORY_ONLY_SER, JStorageLevel.MEMORY_ONLY_SER) self.const_set(:MEMORY_ONLY_2, JStorageLevel.MEMORY_ONLY_2) self.const_set(:MEMORY_ONLY_SER_2, JStorageLevel.MEMORY_ONLY_SER_2) self.const_set(:MEMORY_AND_DISK, JStorageLevel.MEMORY_AND_DISK) self.const_set(:MEMORY_AND_DISK_2, JStorageLevel.MEMORY_AND_DISK_2) self.const_set(:MEMORY_AND_DISK_SER, JStorageLevel.MEMORY_AND_DISK_SER) self.const_set(:MEMORY_AND_DISK_SER_2, JStorageLevel.MEMORY_AND_DISK_SER_2) self.const_set(:OFF_HEAP, JStorageLevel.OFF_HEAP) end def self.java_get(arg) reload if arg.is_a?(String) const_get(arg.upcase) else arg end end end end ================================================ FILE: lib/spark/version.rb ================================================ module Spark VERSION = '1.2.1' end ================================================ FILE: lib/spark/worker/master.rb ================================================ #!/usr/bin/env ruby $PROGRAM_NAME = 'RubySparkMaster' require 'socket' require 'io/wait' require 'nio' require_relative 'worker' # New process group # Otherwise master can be killed from pry console Process.setsid # ================================================================================================= # Master # module Master def self.create case ARGV[0].to_s.strip when 'thread' Master::Thread.new else Master::Process.new end end class Base include Spark::Constant def initialize @port = ARGV[1].to_s.strip.to_i @socket = TCPSocket.open('localhost', @port) @worker_arguments = @socket.read_string end def run selector = NIO::Selector.new monitor = selector.register(@socket, :r) monitor.value = Proc.new { receive_message } loop { selector.select {|monitor| monitor.value.call} } end def receive_message command = @socket.read_int case command when CREATE_WORKER create_worker when KILL_WORKER kill_worker when KILL_WORKER_AND_WAIT kill_worker_and_wait end end def kill_worker_and_wait if kill_worker 
@socket.write_int(SUCCESSFULLY_KILLED) else @socket.write_int(UNSUCCESSFUL_KILLING) end end end # =============================================================================================== # Worker::Process # class Process < Base def create_worker if fork? pid = ::Process.fork do Worker::Process.new(@port).run end else pid = ::Process.spawn("ruby #{@worker_arguments} worker.rb #{@port}") end # Detach child from master to avoid zombie process ::Process.detach(pid) end def kill_worker worker_id = @socket.read_long ::Process.kill('TERM', worker_id) rescue nil end def fork? @can_fork ||= _fork? end def _fork? return false if !::Process.respond_to?(:fork) pid = ::Process.fork exit unless pid # exit the child immediately true rescue NotImplementedError false end end # =============================================================================================== # Worker::Thread # class Thread < Base def initialize ::Thread.abort_on_exception = true # For synchronous access to socket IO $mutex_for_command = Mutex.new $mutex_for_iterator = Mutex.new super end def create_worker ::Thread.new do Worker::Thread.new(@port).run end end def kill_worker worker_id = @socket.read_long thread = ObjectSpace._id2ref(worker_id) thread.kill rescue nil end end end # Create proper master by worker_type Master.create.run ================================================ FILE: lib/spark/worker/spark_files.rb ================================================ class SparkFiles class << self attr_accessor :root_directory end def self.get(file_name) File.join(root_directory, file_name) end def self.get_content(file_name) File.read(get(file_name)) end end ================================================ FILE: lib/spark/worker/worker.rb ================================================ #!/usr/bin/env ruby # Load root of the gem lib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__)) $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib) require 'ruby-spark.rb' require 'socket' require_relative 'spark_files' # ================================================================================================= # Worker # # Iterator is LAZY !!! 
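# Because the iterator is lazy, nothing is deserialized or executed until the
# result is written back to the socket; errors from the deserializer and from
# user code therefore both surface inside #compute and are reported via #send_error.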
# module Worker class Base include Spark::Helper::Serialize include Spark::Helper::System include Spark::Constant attr_accessor :socket def initialize(port) # Open socket to Spark @socket = TCPSocket.open('localhost', port) # Send back worker ID socket.write_long(id) end def run begin compute rescue => e send_error(e) else successful_finish end end private def before_start # Should be implemented in sub-classes end def before_end # Should be implemented in sub-classes end # These methods must be on one method because iterator is Lazy # which mean that exception can be raised at `serializer` or `compute` def compute before_start # Load split index @split_index = socket.read_int # Load files SparkFiles.root_directory = socket.read_string # Load broadcast count = socket.read_int count.times do Spark::Broadcast.register(socket.read_long, socket.read_string) end # Load command @command = socket.read_data # Load iterator @iterator = @command.deserializer.load_from_io(socket).lazy # Compute @iterator = @command.execute(@iterator, @split_index) # Result is not iterable @iterator = [@iterator] unless @iterator.respond_to?(:each) # Send result @command.serializer.dump_to_io(@iterator, socket) end def send_error(e) # Flag socket.write_int(WORKER_ERROR) # Message socket.write_string(e.message) # Backtrace socket.write_int(e.backtrace.size) e.backtrace.each do |item| socket.write_string(item) end socket.flush # Wait for spark # Socket is closed before throwing an exception # Singal that ruby exception was fully received until socket.closed? sleep(0.1) end # Depend on type of worker kill_worker end def successful_finish # Finish socket.write_int(WORKER_DONE) # Send changed accumulator changed = Spark::Accumulator.changed socket.write_int(changed.size) changed.each do |accumulator| socket.write_data([accumulator.id, accumulator.value]) end # Send it socket.flush before_end end def log(message=nil) return if !$DEBUG $stdout.puts %{==> #{Time.now.strftime('%H:%M:%S')} [#{id}] #{message}} $stdout.flush end end # =============================================================================================== # Worker::Process # class Process < Base def id ::Process.pid end private def before_start $PROGRAM_NAME = 'RubySparkWorker' end def kill_worker Process.exit(false) end end # =============================================================================================== # Worker::Thread # class Thread < Base def id ::Thread.current.object_id end private def load_command $mutex_for_command.synchronize { super } end # Threads changing for reading is very slow # Faster way is do it one by one def load_iterator # Wait for incoming connection for preventing deadlock if jruby? 
socket.io_wait else socket.wait_readable end $mutex_for_iterator.synchronize { super } end def kill_worker Thread.current.kill end end end # Worker is loaded as standalone if $PROGRAM_NAME == __FILE__ worker = Worker::Process.new(ARGV[0]) worker.run end ================================================ FILE: lib/spark.rb ================================================ # Gems and libraries require 'method_source' require 'securerandom' require 'forwardable' require 'sourcify' require 'socket' require 'tempfile' require 'tmpdir' require 'json' module Spark autoload :Context, 'spark/context' autoload :Config, 'spark/config' autoload :RDD, 'spark/rdd' autoload :CLI, 'spark/cli' autoload :Build, 'spark/build' autoload :Serializer, 'spark/serializer' autoload :Helper, 'spark/helper' autoload :StorageLevel, 'spark/storage_level' autoload :Command, 'spark/command' autoload :CommandBuilder, 'spark/command_builder' autoload :Sampler, 'spark/sampler' autoload :Logger, 'spark/logger' autoload :JavaBridge, 'spark/java_bridge' autoload :ExternalSorter, 'spark/sort' autoload :Constant, 'spark/constant' autoload :Broadcast, 'spark/broadcast' autoload :Accumulator, 'spark/accumulator' autoload :StatCounter, 'spark/stat_counter' autoload :Library, 'spark/library' # Mllib autoload :Mllib, 'spark/mllib' # SQL autoload :SQL, 'spark/sql' autoload :SQLContext, 'spark/sql' include Helper::System DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf') def self.print_logo(message=nil) puts <<-STRING Welcome to __ ____ __ ______ __/ / __ __ / __/__ ___ _____/ /__ / __/ // / _ \\/ // / _\\ \\/ _ \\/ _ `/ __/ '_/ /_/ \\_,_/_.__/\\_, / /___/ .__/\\_,_/_/ /_/\\_\\ version #{Spark::VERSION} /___/ /_/ #{message} STRING end # Returns current configuration. Configurations can be changed until # context is initialized. In this case config is locked only for reading. # # == Configuration can be changed: # # Spark.config.set('spark.app.name', 'RubySpark') # # Spark.config['spark.app.name'] = 'RubySpark' # # Spark.config do # set 'spark.app.name', 'RubySpark' # end # def self.config(&block) @config ||= Spark::Config.new if block_given? @config.instance_eval(&block) else @config end end # Destroy current configuration. This can be useful for restarting config # to set new. It has no effect if context is already started. def self.clear_config @config = nil end # Return a current active context or nil. def self.context @context end # Current active SQLContext or nil. def self.sql_context @sql_context end # Initialize spark context if not already. Config will be automatically # loaded on constructor. From that point `config` will use configuration # from running Spark and will be locked only for reading. def self.start @context ||= Spark::Context.new end def self.start_sql @sql_context ||= Spark::SQL::Context.new(start) end def self.stop @context.stop RubyWorker.stopServer logger.info('Workers were stopped') rescue nil ensure @context = nil @sql_context = nil clear_config end def self.started? 
!!@context end # =============================================================================== # Defaults # Load default configuration for Spark and RubySpark # By default are values stored at ~/.ruby-spark.conf # File is automatically created def self.load_defaults unless File.exists?(DEFAULT_CONFIG_FILE) save_defaults_to(DEFAULT_CONFIG_FILE) end load_defaults_from(DEFAULT_CONFIG_FILE) end # Clear prev setting and load new from file def self.load_defaults_from(file_path) # Parse values values = File.readlines(file_path) values.map!(&:strip) values.select!{|value| value.start_with?('gem.')} values.map!{|value| value.split(nil, 2)} values = Hash[values] # Clear prev values @target_dir = nil @ruby_spark_jar = nil @spark_home = nil # Load new @target_dir = values['gem.target'] end # Create target dir and new config file def self.save_defaults_to(file_path) dir = File.join(Dir.home, ".ruby-spark.#{SecureRandom.uuid}") if Dir.exist?(dir) save_defaults_to(file_path) else Dir.mkdir(dir, 0700) file = File.open(file_path, 'w') file.puts "# Directory where will be Spark saved" file.puts "gem.target #{dir}" file.puts "" file.puts "# You can also defined spark properties" file.puts "# spark.master spark://master:7077" file.puts "# spark.ruby.serializer marshal" file.puts "# spark.ruby.serializer.batch_size 2048" file.close end end # =============================================================================== # Global settings and variables def self.logger @logger ||= Spark::Logger.new end # Root of the gem def self.root @root ||= File.expand_path('..', File.dirname(__FILE__)) end # Default directory for java extensions def self.target_dir @target_dir ||= File.join(root, 'target') end # Directory where is worker.rb def self.worker_dir @worker_dir ||= File.join(root, 'lib', 'spark', 'worker') end def self.ruby_spark_jar @ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar') end def self.spark_ext_dir @spark_ext_dir ||= File.join(root, 'ext', 'spark') end # =============================================================================== # Load JVM and jars # Load dependent libraries, can be use once # Cannot load before CLI::install # # == Parameters: # target:: # path to directory where are located sparks .jar files or single Spark jar # def self.load_lib(target=nil) return if @java_bridge target ||= Spark.target_dir @java_bridge = JavaBridge.init(target) @java_bridge.import_all nil end def self.java_bridge @java_bridge end # Aliases class << self alias_method :sc, :context alias_method :jb, :java_bridge alias_method :home, :root end end # C/Java extensions require 'ruby_spark_ext' # Ruby core extensions require 'spark/ext/module' require 'spark/ext/object' require 'spark/ext/hash' require 'spark/ext/string' require 'spark/ext/integer' require 'spark/ext/ip_socket' require 'spark/ext/io' # Other requirments require 'spark/version' require 'spark/error' # Load default settings for gem and Spark Spark.load_defaults # Make sure that Spark be always stopped Kernel.at_exit do begin Spark.started? 
&& Spark.stop rescue end end ================================================ FILE: ruby-spark.gemspec ================================================ # coding: utf-8 lib = File.expand_path('../lib', __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'spark/version' Gem::Specification.new do |spec| spec.name = 'ruby-spark' spec.version = Spark::VERSION spec.authors = ['Ondřej Moravčík'] spec.email = ['moravcik.ondrej@gmail.com'] spec.summary = %q{Ruby wrapper for Apache Spark} spec.description = %q{} spec.homepage = '' spec.license = 'MIT' spec.files = `git ls-files -z`.split("\x0") spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) spec.require_paths = ['lib'] if RUBY_PLATFORM =~ /java/ spec.platform = 'java' extensions = ['ext/ruby_java/extconf.rb'] else extensions = ['ext/ruby_c/extconf.rb'] spec.add_dependency 'rjb' end spec.extensions = extensions spec.required_ruby_version = '>= 2.0' spec.requirements << 'java, scala' spec.add_dependency 'sourcify', '0.6.0.rc4' spec.add_dependency 'method_source' spec.add_dependency 'commander' spec.add_dependency 'pry' spec.add_dependency 'nio4r' spec.add_dependency 'distribution' spec.add_development_dependency 'bundler', '~> 1.6' spec.add_development_dependency 'rake' end ================================================ FILE: spec/generator.rb ================================================ class Generator def self.numbers(size=1000) Array.new(size){ rand(1..1000) } end def self.numbers_with_zero(size=1000) Array.new(size){ rand(0..1000) } end def self.words(size=1000) Array.new(size) { word } end def self.word(size=10) Array.new(rand(1..size)){(97+rand(26)).chr}.join end def self.lines(size=1000, letters=3) Array.new(size) do Array.new(rand(50..100)){ (97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0)) }.join end end def self.hash(size=1000) Array.new(size) do [word(2), rand(1..10)] end end def self.hash_with_values(size=1000, values_count=10) Array.new(size) do [word(2), Array.new(values_count) { rand(1..10) }] end end end ================================================ FILE: spec/inputs/lorem_300.txt ================================================ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ligula neque, ultricies et lorem vel, accumsan cursus felis. Maecenas ullamcorper, magna eu lobortis gravida, diam leo rutrum diam, eget elementum sapien felis non magna. Etiam scelerisque, mauris et cursus fermentum, ipsum nisl vulputate nisl, sit amet pulvinar libero sem at lectus. Vivamus nibh lectus, elementum eget dui non, fermentum volutpat orci. Nam imperdiet, dui id placerat pellentesque, purus sem semper augue, id dictum est ipsum et erat. Integer arcu tortor, ullamcorper ac libero a, iaculis sollicitudin orci. Sed dapibus hendrerit neque, ac aliquet arcu elementum sed. Phasellus ornare interdum erat, eget fringilla sapien ornare vitae. In condimentum, mi sed condimentum viverra, nisl sapien scelerisque mi, vel varius metus dolor eu lorem. Nulla pulvinar ac metus eu volutpat. Suspendisse potenti. Duis vitae mauris arcu. Proin et dignissim dolor, eget congue purus. Ut malesuada neque massa. Ut viverra faucibus turpis, in pharetra nulla iaculis quis. Morbi imperdiet risus eu eros varius facilisis. Aenean nec dapibus sapien. Fusce tempus, risus vitae volutpat faucibus, dolor diam cursus risus, sit amet faucibus mauris mauris quis orci. Aliquam massa ante, accumsan non sapien quis, ullamcorper fermentum elit. 
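
The compute/send_error pair in lib/spark/worker/worker.rb above fixes the order in which a worker talks to Spark over its socket: it first reports its id, then reads the split index, the SparkFiles root directory, the broadcast variables and the serialized command, and finally streams the result back (or, on failure, an error flag, message and backtrace). Below is a minimal, self-contained sketch of that read order; FakeSocket and all payload values are invented for illustration, while the real worker reads from a TCPSocket using the IO extensions in lib/spark/ext/io.rb.

  # Illustration only: FakeSocket stands in for the read_* helpers from
  # lib/spark/ext/io.rb by handing back pre-recorded frames.
  class FakeSocket
    def initialize(frames)
      @frames = frames
    end

    def read_int;    @frames.shift end
    def read_long;   @frames.shift end
    def read_string; @frames.shift end
    def read_data;   @frames.shift end
  end

  # Frames in the order Worker::Base#compute consumes them.
  socket = FakeSocket.new([
    0,                    # split index
    '/tmp/spark-files',   # SparkFiles.root_directory
    1,                    # number of broadcast variables
    42, '/tmp/broadcast', # one broadcast: id and payload string
    :serialized_command   # normally a marshaled Spark::Command
  ])

  split_index = socket.read_int
  files_root  = socket.read_string
  socket.read_int.times { [socket.read_long, socket.read_string] }
  command     = socket.read_data

  puts "split=#{split_index} files=#{files_root} command=#{command.inspect}"

After executing the command on the lazy iterator, the worker answers with either WORKER_DONE followed by any changed accumulators, or WORKER_ERROR with the message and backtrace, and then flushes the socket.
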
Pellentesque risus orci, rhoncus ac mi sed, volutpat vehicula sem. Mauris suscipit odio vel mi scelerisque, at cursus libero ullamcorper. Nulla aliquam metus arcu, in vestibulum sem ullamcorper eu. Pellentesque laoreet venenatis metus ut accumsan. Quisque ut enim interdum, fringilla lorem nec, dignissim orci. Fusce vel diam sed ante dictum scelerisque. Vestibulum lectus enim, gravida sit amet ullamcorper sit amet, rhoncus nec dui. Praesent eget molestie tellus, quis iaculis sapien. Sed ut rutrum velit. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Donec tortor quam, venenatis ac rhoncus et, gravida non orci. Ut lacus dolor, auctor id ante varius, pharetra placerat nulla. Nulla facilisi. Nam quis feugiat nibh, ut ultrices est. Nulla at mi nec metus porttitor tempor. Donec leo lorem, rhoncus ut arcu eu, venenatis eleifend risus. Phasellus non porttitor neque, sit amet accumsan nisl. Pellentesque non urna tempor, interdum orci non, gravida enim. Sed in urna et dolor cursus aliquet et vel magna. Quisque vestibulum tortor scelerisque orci mattis, eu aliquet sem condimentum. Proin ac ultricies erat. Integer sodales, turpis quis volutpat pretium, justo lacus lobortis mauris, nec commodo orci leo sit amet metus. Ut ornare ipsum vitae malesuada aliquam. Quisque lobortis semper elit id consectetur. Aenean facilisis sapien eu ipsum adipiscing mattis. Praesent malesuada aliquet venenatis. Ut aliquet vel sapien nec euismod. Morbi eros urna, rutrum ut iaculis sed, vulputate sit amet nunc. Nulla facilisi. Morbi sagittis nec magna sed scelerisque. Maecenas a euismod eros. Vestibulum suscipit pharetra velit porta fermentum. Phasellus euismod auctor metus ut interdum. Quisque lectus lorem, tristique ut libero vel, rhoncus tincidunt tellus. Sed malesuada vestibulum purus, at tincidunt massa imperdiet vitae. Ut mollis eleifend elit, et sodales nisl facilisis eu. Fusce ligula ligula, porta id est sed, tincidunt malesuada odio. Maecenas ultricies dignissim nunc, quis adipiscing urna auctor commodo. Phasellus tincidunt odio non nulla luctus sollicitudin. Mauris pharetra porttitor est iaculis sollicitudin. Curabitur quam sem, fringilla id tellus vitae, elementum convallis eros. Morbi sollicitudin eleifend leo, ut euismod ligula ornare sagittis. Nullam luctus, mi eget dapibus elementum, diam purus fringilla lectus, sit amet sodales neque turpis sed mi. Sed volutpat sem euismod posuere mollis. Integer viverra egestas lacinia. Quisque viverra metus massa, in condimentum sem tincidunt a. Proin ac ipsum non leo sollicitudin consectetur id a sem. Cras tempus venenatis nisl sit amet venenatis. Nulla facilisi. Morbi scelerisque mi est, vitae lobortis sem ultricies faucibus. In urna ante, faucibus ac eros et, dignissim mollis justo. Quisque aliquet tortor sem, ac mattis tortor faucibus sed. Donec tortor lacus, egestas in convallis at, vulputate eu nibh. Aenean ligula augue, imperdiet in tempor id, consequat vitae erat. Sed id eros a justo semper ultricies. Curabitur nunc nisi, placerat at leo sed, vehicula pulvinar velit. Nullam ut ipsum augue. Fusce condimentum quam commodo, venenatis massa eleifend, dignissim neque. Curabitur sit amet hendrerit tortor, a condimentum sem. Morbi lobortis porta porttitor. Maecenas mollis ipsum ac est venenatis auctor at vel lectus. Mauris luctus euismod dolor. Cras vitae nibh eget sem placerat adipiscing. Pellentesque ac molestie ligula. Vivamus sit amet lectus odio. Duis lacinia rutrum faucibus. 
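
lib/spark.rb above ties the lifecycle together: Spark.config is only writable until Spark.start creates the context, Spark.load_defaults seeds ~/.ruby-spark.conf on first use, and a Kernel.at_exit hook stops the context when the process ends. The short usage sketch below assumes the gem has already been built (ruby bin/ruby-spark build) so that ruby-spark.jar and the JVM bridge are available; the app name and the lambda are purely illustrative.

  require 'ruby-spark'

  # Configuration can only be changed before the context is started.
  Spark.config do
    set 'spark.app.name', 'RubySparkExample'
  end

  Spark.start
  sc = Spark.sc                 # alias for Spark.context

  rdd = sc.parallelize(1..10)
  p rdd.map(lambda{|x| x * 2}).collect

  Spark.stop                    # also triggered by the at_exit hook
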
Curabitur luctus ultricies enim, id imperdiet ipsum viverra vitae. Mauris et iaculis erat, vel faucibus purus. Fusce non nisl tristique, dignissim lacus id, fermentum velit. Sed facilisis sapien at interdum viverra. Aliquam erat volutpat. Maecenas suscipit diam vitae velit vulputate tincidunt. Nulla facilisi. Sed eget tortor et ante mollis cursus. Nullam vitae porttitor magna. Quisque iaculis massa dui, id rutrum purus blandit eu. Duis convallis ipsum id commodo iaculis. Praesent sagittis ut tortor ut varius. Curabitur consequat volutpat scelerisque. Cras pharetra lectus eget urna imperdiet ullamcorper. Sed lacinia ut eros non malesuada. Quisque hendrerit suscipit convallis. Vivamus posuere vestibulum massa, non accumsan diam tincidunt eu. Nulla bibendum dictum mi sit amet faucibus. Nullam egestas lorem nunc, vel malesuada elit imperdiet vitae. Sed luctus ligula at erat tempus tristique. Proin varius mi quis libero sollicitudin ullamcorper. In hac habitasse platea dictumst. Praesent auctor arcu vel luctus consequat. Curabitur consequat magna sit amet ante feugiat dictum. Morbi scelerisque faucibus urna, ac dapibus sem ultricies eu. Pellentesque rhoncus sapien nec eros facilisis consectetur. Duis eleifend vestibulum suscipit. Morbi orci metus, malesuada sit amet urna ac, laoreet vehicula lacus. Quisque gravida, nunc fringilla tincidunt vestibulum, lacus urna commodo nisl, quis sodales lectus ipsum et augue. Ut non erat sit amet neque fermentum ultricies. Vestibulum tincidunt est elit, ac dapibus velit faucibus id. Praesent in viverra libero. Proin eleifend, odio eget sodales dignissim, nunc arcu ullamcorper libero, sit amet sodales diam ipsum in tellus. Suspendisse enim nunc, accumsan non ligula et, vulputate viverra ante. Ut id elit eu dui dictum malesuada at id orci. Vivamus sed felis aliquam metus consequat euismod nec eu libero. Phasellus mattis malesuada ipsum eu posuere. Nullam at massa enim. Duis vitae urna blandit, ultricies nisi in, consequat elit. Quisque nec nibh ut tortor pulvinar euismod. Praesent molestie felis ac risus elementum sollicitudin. Donec eu leo in augue convallis mattis. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Integer ut dignissim lectus. Vivamus eros felis, gravida et auctor ut, volutpat vitae dui. Nunc adipiscing sapien et lectus rutrum vestibulum. Mauris fermentum, metus eu sollicitudin malesuada, lorem diam vestibulum metus, ut elementum metus nibh sed augue. Cras lectus risus, feugiat eget fringilla a, cursus et eros. Praesent aliquam justo vel condimentum lacinia. Sed condimentum dui nec leo blandit, vel elementum odio laoreet. Quisque suscipit molestie iaculis. Nullam dignissim, mauris sit amet condimentum aliquet, magna sapien scelerisque nisl, tincidunt auctor purus libero at lectus. Nulla facilisi. Sed egestas erat at dictum egestas. Cras non mauris ut dolor interdum condimentum. Fusce quis hendrerit purus, dictum cursus mi. Maecenas mattis, turpis sit amet mollis ultricies, mi turpis ornare velit, eget suscipit magna eros sit amet purus. Integer ut viverra elit. Praesent eu augue viverra nunc convallis porta. Etiam venenatis dignissim nisl et semper. Cras eu nisl vitae justo ornare porttitor vel nec augue. Pellentesque faucibus mollis neque, nec ullamcorper purus mollis sed. Suspendisse ut molestie lectus, faucibus aliquet libero. Aliquam tristique, neque ut lobortis ultricies, tellus elit ultrices risus, sodales dapibus sem mauris et magna. Sed et sem porttitor, fringilla mauris vestibulum, porttitor dui. 
Proin vitae viverra elit. Integer nec adipiscing velit. Nunc quis urna tristique, ultrices orci eget, aliquet lorem. Curabitur consequat adipiscing sodales. In elementum condimentum ante id placerat. Cras ac turpis tristique lacus vulputate dictum vel nec libero. Curabitur fringilla interdum tempus. Integer placerat dolor ut magna aliquet bibendum. Cras ac metus magna. Curabitur vehicula magna ut sapien viverra ornare. Donec risus nisi, imperdiet eu laoreet in, tempor lobortis urna. Etiam malesuada et lacus ac consectetur. Morbi facilisis sapien quis nisl laoreet semper. Suspendisse volutpat sapien vel quam blandit faucibus. Nam sagittis velit eros, vitae suscipit tortor elementum ac. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Donec nec nibh dictum, pretium nulla eu, pharetra mauris. Vestibulum leo mi, convallis et euismod ac, molestie in ligula. Vestibulum tempor tincidunt porttitor. Integer nisl orci, dignissim ac volutpat a, auctor eget augue. Suspendisse eget euismod nunc, eu elementum ipsum. Cras libero tortor, gravida quis vestibulum a, tincidunt aliquam mauris. Integer elementum pellentesque posuere. Donec accumsan feugiat pulvinar. Aliquam eros justo, dictum non elementum nec, tristique vel massa. Nulla a velit porttitor, aliquam turpis nec, ultricies ligula. Nam id dignissim dui. Ut placerat arcu nec accumsan varius. Sed quis accumsan nunc, in dapibus lorem. Morbi egestas sagittis pulvinar. Morbi id mauris ante. Sed magna nibh, venenatis quis lacinia in, congue quis metus. Nunc lacus lectus, adipiscing sed consequat id, luctus vel dui. Mauris eu nisi erat. Proin eleifend lectus sit amet ligula fringilla semper. Suspendisse tristique, quam ac pharetra dictum, libero risus rutrum ipsum, eget tristique arcu neque vel nisi. Ut auctor nulla vitae porta faucibus. Suspendisse ut tellus enim. Morbi commodo posuere quam. Proin consequat in quam pulvinar posuere. Nunc id ullamcorper est. Cras ac molestie massa. Cras leo tellus, tempus id nibh quis, porttitor laoreet elit. Mauris in ornare nisi. Duis vel velit felis. Suspendisse gravida felis nec nulla hendrerit pretium. Cras at orci neque. Phasellus vehicula, ipsum at tempus sodales, mauris est condimentum metus, a vehicula ante tellus sit amet diam. Suspendisse fermentum elit in volutpat viverra. Nullam gravida in augue sed mollis. Curabitur aliquam diam non quam aliquam ultrices. Quisque pretium semper diam eget malesuada. Suspendisse porttitor sagittis sem at malesuada. Donec euismod elementum nulla, sit amet eleifend enim adipiscing nec. Nullam porta, enim ac tincidunt molestie, turpis mi porta justo, ornare tristique sem orci quis turpis. Nullam leo dolor, pellentesque ac hendrerit et, tempus quis nisi. Fusce pretium mattis tortor sagittis suscipit. Vestibulum vitae suscipit libero. Mauris consequat sagittis mi, id tempus est condimentum et. In eget condimentum odio, a malesuada quam. Vivamus id turpis non nulla eleifend cursus ut sit amet tellus. Proin ultrices luctus nibh, eget condimentum ligula vestibulum in. Aliquam pharetra aliquet erat nec lacinia. Cras fringilla est fringilla ante tristique, vitae bibendum dolor malesuada. Praesent ut dui pulvinar, suscipit velit gravida, malesuada nunc. Cras tempus feugiat interdum. Vivamus lectus lorem, rutrum ut neque at, sollicitudin euismod nulla. Vestibulum ac ligula suscipit, ultricies felis eget, adipiscing lectus. Maecenas nec enim vel eros molestie lobortis faucibus sit amet urna. Sed ac consequat nulla. Nulla et libero nisi. 
Pellentesque euismod nunc quis ipsum tristique, suscipit elementum magna aliquam. Praesent sit amet tincidunt leo. Duis tempor arcu eget est posuere imperdiet. Quisque vel dui adipiscing, auctor nibh vel, vulputate sapien. Curabitur eu sodales lacus. Aliquam felis eros, mattis a diam eu, ullamcorper vestibulum turpis. Vivamus vitae vulputate lacus, sed convallis lorem. Vestibulum mattis sollicitudin vulputate. Mauris cursus erat eget nisi accumsan, nec commodo tellus blandit. Etiam gravida nulla et lorem molestie auctor. Mauris venenatis iaculis nulla vel mollis. Morbi pretium sed eros at commodo. Aliquam eu justo turpis. Pellentesque lobortis, nisl eget ultricies dictum, augue sem placerat elit, vitae pretium lectus massa eget tortor. Nulla accumsan, massa eu rutrum pharetra, mi sapien aliquam massa, viverra facilisis metus nisi in dolor. Duis felis velit, interdum a elit non, cursus pellentesque libero. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Nunc vel nisi quis augue accumsan aliquam. Suspendisse ante lectus, lobortis nec suscipit at, ullamcorper at diam. Aliquam hendrerit, eros ac egestas condimentum, enim metus lobortis nibh, sit amet convallis augue nulla nec lorem. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut ac ligula eget est blandit scelerisque at vitae nunc. Sed venenatis eros non quam auctor posuere. Curabitur convallis dapibus semper. Fusce et leo sed massa posuere porta. Morbi convallis lobortis eros. Quisque ac nisl dictum, sagittis eros et, pellentesque metus. Quisque mattis sodales lorem quis malesuada. Aenean neque sapien, rutrum vitae euismod quis, euismod eu mi. Etiam ante tellus, auctor vitae pulvinar a, mattis nec tellus. Morbi libero lectus, mattis sit amet convallis at, viverra et nisi. Proin a ante tristique, blandit urna at, lobortis leo. Praesent nec odio sit amet ligula adipiscing pretium at rhoncus felis. Ut ut velit turpis. Sed tempor lectus massa, vel gravida libero gravida a. Nunc mollis, lorem id dapibus hendrerit, mi orci gravida orci, at vehicula neque nisl quis nibh. Mauris feugiat, ligula sit amet interdum laoreet, lectus leo accumsan dolor, eu cursus tortor quam eget lectus. Sed commodo, est in bibendum condimentum, magna neque dictum sapien, at lacinia sem ipsum ut eros. In eget erat eu nulla hendrerit tincidunt id vulputate nibh. Nunc sed imperdiet urna, eu tempor orci. Phasellus pellentesque sapien eu risus tincidunt, ut iaculis risus fermentum. Suspendisse condimentum erat vitae porta malesuada. Ut a vulputate lorem. Nulla ullamcorper, neque in posuere vulputate, neque magna tempor erat, sit amet luctus nisi nibh quis ligula. Duis porta urna et fermentum interdum. Sed pellentesque odio euismod nisi auctor rutrum. Suspendisse mi nibh, dignissim eget porttitor quis, commodo a massa. Nunc vel eleifend turpis. Sed iaculis, massa quis egestas pellentesque, nibh ante feugiat ante, a euismod lacus nunc et felis. Nam in aliquet odio. Nulla eget enim aliquam, faucibus est at, fringilla tellus. Duis molestie massa ornare, sodales leo eget, lobortis nibh. Nam bibendum mi a facilisis mattis. Duis ultrices arcu tellus, vitae interdum tortor dictum et. Sed id luctus lectus, eu tempus quam. Duis mi nisl, iaculis vel tortor sit amet, vulputate sodales risus. Cras vitae lobortis nisi, eu adipiscing ante. Nam eget scelerisque libero. Nulla pulvinar, velit et posuere sagittis, odio risus venenatis sapien, at tristique enim augue quis sem. Integer rutrum blandit eros eu faucibus. 
Etiam eget iaculis felis, in fermentum ante. Nullam a placerat risus, id accumsan quam. Donec est orci, elementum eu sapien non, ultricies ullamcorper leo. Praesent tincidunt, mauris in viverra hendrerit, dolor nisi cursus orci, vel lacinia neque ante eu magna. Nam facilisis massa at nisi accumsan, non condimentum turpis facilisis. Cras quis ipsum at orci ornare venenatis vitae et ante. Morbi vitae luctus lacus. Nullam eu felis at mi hendrerit commodo a eu diam. Maecenas ultricies, urna sit amet egestas tempor, dolor ligula dictum nibh, vehicula commodo ipsum diam at nunc. Proin facilisis tincidunt elit, sed vulputate leo lobortis sed. In tincidunt risus lorem, venenatis pellentesque tellus accumsan vitae. Integer ullamcorper mi ut risus consectetur dictum in quis dui. Pellentesque sed diam sed purus egestas mollis id at sapien. Nunc cursus mi nec accumsan porta. Nullam pulvinar pharetra felis. Etiam porta massa et diam scelerisque, ut iaculis nisl luctus. Curabitur vel metus id lacus faucibus tempus. Nullam ornare neque orci, nec scelerisque erat mattis nec. Phasellus ultrices ultrices nisi quis venenatis. Sed ultrices iaculis diam a faucibus. Phasellus quis suscipit nulla. Nulla ultricies, turpis et dictum ullamcorper, urna metus porta tellus, quis congue dolor libero quis sem. Nam tempus metus risus, sed rutrum nibh cursus malesuada. Vivamus bibendum odio eget mi aliquet, sed tempor eros tincidunt. Suspendisse eu ultricies ligula, non commodo sem. Ut aliquet elit sed leo laoreet aliquam. Vivamus feugiat a justo non auctor. Sed rhoncus orci ut dictum dignissim. Duis eros libero, tempus non venenatis quis, suscipit eget turpis. Aliquam sed ullamcorper velit, in tincidunt tellus. Ut dapibus erat vel nunc feugiat elementum. Cras congue, erat sit amet lacinia venenatis, nisi magna rhoncus nulla, eu blandit eros neque ac eros. Donec vulputate placerat dapibus. Integer dignissim odio eget iaculis ultrices. Vestibulum ligula neque, tincidunt at pretium ac, tincidunt sit amet tellus. Sed fermentum egestas tortor, non volutpat sapien. Aliquam erat volutpat. Duis semper placerat sapien at placerat. Praesent facilisis pharetra dignissim. Morbi laoreet sed tortor eu rhoncus. Vivamus eleifend felis eu dui ornare ornare sed at urna. Nulla nulla justo, hendrerit id enim vitae, blandit consequat nibh. Aliquam mattis diam mattis fringilla tempor. Suspendisse suscipit est sed pulvinar commodo. Lorem ipsum dolor sit amet, consectetur adipiscing elit. In in scelerisque enim. Phasellus ornare nisl consequat volutpat bibendum. Vivamus et nunc viverra, ultrices lorem a, cursus purus. Curabitur nibh libero, hendrerit lobortis malesuada sit amet, fringilla et augue. Vestibulum est lacus, fringilla sit amet dictum pulvinar, lacinia at leo. Proin iaculis felis vitae metus viverra blandit. Mauris accumsan sagittis semper. Quisque non diam a quam volutpat faucibus. Pellentesque eros orci, commodo eget fringilla eu, euismod et turpis. Duis molestie et eros ac ullamcorper. Phasellus consequat risus eget elementum semper. Donec at mi a justo laoreet condimentum porttitor in purus. Nulla sit amet libero consectetur, iaculis neque nec, scelerisque turpis. Aliquam interdum nibh eget accumsan dictum. Ut lobortis, mi non eleifend lobortis, lorem mauris pretium urna, at fermentum tellus felis eu nunc. Aliquam in nibh tristique, tempus purus a, cursus massa. Suspendisse potenti. Maecenas porttitor et erat in sollicitudin. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. 
Vestibulum commodo placerat velit, vel pellentesque neque sagittis eget. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nullam eu massa placerat, iaculis eros eget, viverra orci. Aliquam ac lacus porttitor, eleifend elit id, vehicula mauris. Sed ac interdum libero. Sed laoreet suscipit mi, ac accumsan massa condimentum nec. Suspendisse sodales libero sollicitudin, malesuada quam ac, viverra enim. Sed sapien libero, egestas sit amet orci non, venenatis interdum augue. In hac habitasse platea dictumst. Fusce gravida orci at ligula fringilla adipiscing. Nunc quis ipsum quis nibh egestas porta. Proin et faucibus elit. Etiam in neque at nunc pharetra adipiscing nec vel magna. Donec at nunc scelerisque, tincidunt risus ut, bibendum nisi. Donec pulvinar fermentum purus, ac adipiscing urna iaculis at. Nulla ut nunc vitae lorem dapibus fringilla. Ut placerat dignissim nulla ornare mattis. Mauris rutrum tellus quis odio dictum, ac tempor velit scelerisque. Quisque ligula elit, convallis nec volutpat vitae, pulvinar id mauris. Vivamus vel accumsan tortor. Donec eu sollicitudin dolor. Pellentesque egestas congue tristique. Phasellus ut sollicitudin nisl. Praesent diam neque, malesuada id tincidunt id, malesuada in eros. Phasellus adipiscing ipsum vel justo molestie vulputate. Praesent ultricies dapibus lacus pulvinar gravida. Donec consequat, orci et mattis ultrices, nibh enim sagittis metus, vitae eleifend enim tellus vitae augue. Suspendisse placerat iaculis risus nec iaculis. Ut ullamcorper ultrices dui, sed blandit mauris hendrerit vitae. Nulla ac dolor lectus. Etiam pellentesque neque at odio bibendum, at venenatis tellus fermentum. Maecenas a condimentum metus. Phasellus semper scelerisque feugiat. Fusce varius varius tincidunt. Ut vel auctor magna. Cras dui turpis, euismod in enim a, scelerisque adipiscing lectus. Duis mollis pharetra risus, sed ultrices nulla blandit non. Integer ac pulvinar magna. Aenean fermentum auctor magna. Ut in viverra sapien. Proin ac bibendum magna, cursus gravida elit. Phasellus vehicula facilisis nibh, tempor sagittis mauris accumsan et. Vestibulum sed lacus luctus diam ornare venenatis non vel felis. Morbi posuere sit amet nisl quis pulvinar. Suspendisse blandit tempus risus quis pretium. Nullam gravida libero vel aliquam suscipit. Nunc vel nunc at leo pharetra tempor et ut mi. Aliquam erat volutpat. Nulla placerat odio tellus. Nam adipiscing massa nec varius posuere. Proin placerat tellus posuere lorem suscipit, sit amet sagittis sem condimentum. Ut pharetra odio quis tellus mattis facilisis. Quisque eget interdum est. Quisque mattis, felis eu semper feugiat, quam augue interdum mauris, eget sodales nisi neque quis erat. Curabitur semper, mi posuere luctus molestie, neque ante sagittis nulla, sit amet vehicula eros eros in justo. Integer aliquet vehicula arcu, quis iaculis justo. Sed tincidunt sem id est porta volutpat. Mauris varius felis ut est venenatis, ornare porttitor arcu adipiscing. Sed luctus rutrum ante, consectetur sollicitudin sapien accumsan vulputate. Vivamus id diam vehicula, fermentum nunc id, viverra justo. Quisque porttitor, odio in molestie hendrerit, libero eros vehicula odio, id vestibulum sapien neque quis nibh. Donec vel faucibus est. Ut nec sapien vitae nibh congue egestas vel euismod tellus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum quis lacus lorem. Integer egestas euismod ante, vitae condimentum neque eleifend non. Sed posuere bibendum ante, ut facilisis dui condimentum at. In ut varius augue. 
Vivamus bibendum eu odio vel convallis. Vivamus cursus sodales iaculis. Nullam convallis facilisis blandit. Phasellus iaculis porttitor elit, eget vestibulum ipsum convallis eu. Quisque volutpat justo ipsum, eleifend cursus urna facilisis a. Sed at diam nec sem semper scelerisque. Aliquam euismod erat quis nisi dictum, at sodales leo fermentum. Nam at nisl metus. Proin luctus porttitor ante in tincidunt. Maecenas laoreet vitae enim eget elementum. Nulla id sagittis enim, nec ultrices tortor. Nam rutrum ipsum sit amet erat auctor, eu venenatis libero ultricies. Ut condimentum neque non diam ullamcorper, ultrices feugiat neque egestas. Pellentesque at lobortis est, in blandit mi. Maecenas tincidunt eros id massa pulvinar, quis varius eros lobortis. Curabitur vitae sodales orci. Suspendisse potenti. Pellentesque eu fringilla nibh. Etiam sed pretium enim, lacinia consequat lectus. Quisque sed mi risus. Praesent posuere dolor sed mauris dapibus, id tristique mi mattis. Quisque nec urna rutrum, consectetur mauris ut, egestas libero. Fusce a justo orci. Etiam vitae aliquet ipsum. Curabitur consequat tempor eros, ut placerat lectus tempus et. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Sed ligula mi, laoreet sit amet nunc id, ullamcorper fermentum magna. Maecenas enim dui, viverra at nulla ut, lacinia pretium nunc. Donec at ultricies nulla, nec cursus odio. Donec ullamcorper nec turpis imperdiet hendrerit. Sed euismod aliquam vehicula. Nunc sed enim eleifend turpis venenatis sagittis. Sed laoreet velit erat. Proin nisl erat, vulputate et fermentum iaculis, mollis suscipit magna. Sed porta, augue ut accumsan fermentum, arcu tortor rutrum tellus, sit amet sollicitudin lectus turpis non felis. Vestibulum ante leo, interdum sed venenatis non, porttitor ut nibh. Sed sit amet luctus erat. Duis id rhoncus justo, non rutrum lorem. Mauris ut laoreet elit. Praesent sed diam porta, rhoncus massa a, tincidunt lorem. Mauris bibendum nunc nec est ullamcorper bibendum. Nullam venenatis libero sed ligula scelerisque euismod quis at dui. Donec ac velit luctus, molestie mi at, tempor leo. Pellentesque a ultricies risus. Maecenas malesuada faucibus nulla quis consectetur. Phasellus pretium interdum risus sit amet aliquet. Nullam eleifend sem id magna laoreet, ut lobortis mi tincidunt. Maecenas in justo tempor, viverra ipsum eu, tincidunt nulla. Sed sed molestie turpis. Pellentesque imperdiet, eros non vulputate fringilla, turpis odio luctus lectus, eu lacinia purus nisl vitae justo. Etiam non dapibus dolor. Fusce non urna scelerisque, interdum massa vitae, venenatis metus. Vestibulum scelerisque dolor ac lectus sollicitudin, eget fringilla sapien fringilla. Suspendisse non quam massa. Donec a sollicitudin eros, ut mollis turpis. Nullam gravida congue semper. Phasellus vitae tellus vitae nulla cursus tempor et non elit. Vestibulum pharetra in ligula a venenatis. Maecenas at erat sed nulla vulputate pulvinar et eu libero. Donec pulvinar arcu nisi, sed posuere turpis cursus a. In nec turpis interdum, condimentum velit in, consectetur lacus. Duis porta, felis a rhoncus ornare, ligula est elementum nunc, eu adipiscing massa lorem in nibh. In consequat gravida eros. Phasellus condimentum malesuada sapien ultrices tempor. Suspendisse sit amet diam in est pulvinar iaculis nec vitae nibh. Vivamus rhoncus enim lorem, elementum posuere est pretium ut. Duis lectus lorem, ultricies ac dignissim in, egestas et ipsum. Proin nec est ac dui sagittis dictum. 
Cras dictum augue ipsum, sit amet gravida ligula scelerisque nec. Ut congue blandit porta. Nunc porta vitae risus at sagittis. Donec viverra, ante id porta consectetur, felis turpis fringilla dui, ut vulputate nulla eros sit amet augue. Donec aliquet, felis ut tempor pretium, enim leo suscipit risus, eget mollis justo ipsum ut augue. Nullam at lacus eu orci dapibus laoreet nec convallis leo. Fusce rhoncus sed neque sit amet viverra. Donec arcu nisl, hendrerit non pulvinar eu, blandit ac neque. Curabitur porta velit metus, non ullamcorper nibh volutpat non. Proin tristique orci nec pretium lobortis. Curabitur quam neque, lacinia vitae massa id, molestie pellentesque risus. Praesent vitae lectus bibendum, tincidunt augue vel, volutpat magna. Curabitur quis feugiat magna. In libero risus, commodo eu mauris vitae, euismod ullamcorper libero. Cras elementum rutrum lacus eu euismod. Morbi purus metus, rutrum nec varius sed, dignissim eget nisi. Vivamus mauris nibh, hendrerit eu massa sed, ultrices suscipit est. Cras id odio dui. Nulla condimentum luctus ipsum, eu molestie turpis commodo sed. Aliquam erat volutpat. Ut sodales urna sit amet est dapibus pharetra. In nec vestibulum mi. Nullam mattis fringilla venenatis. Sed risus sem, tempor vitae suscipit a, viverra in quam. In malesuada odio nec laoreet accumsan. Donec justo diam, lacinia eu ante eget, pulvinar molestie mauris. Interdum et malesuada fames ac ante ipsum primis in faucibus. Sed vulputate ornare dolor a tempor. Maecenas egestas, augue et semper egestas, elit ipsum varius sem, a dapibus eros velit in sapien. Nulla sit amet eros ullamcorper, hendrerit nunc eu, aliquet ipsum. Sed sit amet lacus enim. Curabitur faucibus rutrum dui, a tempor velit vestibulum sed. Curabitur sed nunc id lorem semper malesuada. Maecenas semper eros eu pellentesque vulputate. Nulla accumsan dolor placerat eros euismod facilisis. Nam vitae velit tortor. Fusce tincidunt felis luctus, scelerisque dui in, rutrum nulla. Proin a pharetra tellus. Aenean varius dolor nec risus eleifend fringilla. Proin at tellus ligula. Cras imperdiet mollis nisi eget auctor. Etiam libero nunc, dictum at fermentum vitae, vehicula tincidunt justo. Proin tempor risus elit, vestibulum auctor erat tristique vel. Etiam varius dui ante, a fringilla erat ullamcorper vel. Quisque cursus quam imperdiet ornare dictum. Suspendisse turpis nunc, scelerisque a congue eget, faucibus ut mauris. Suspendisse venenatis nisi nec dolor pharetra, id euismod sem accumsan. Quisque et accumsan justo, elementum vulputate nulla. Etiam et sapien scelerisque, malesuada lacus non, pretium enim. Curabitur ultrices, ipsum hendrerit pulvinar volutpat, dui tortor mattis tortor, sed tincidunt magna lectus non eros. Ut hendrerit velit non metus pellentesque mattis. Nullam velit nisi, ornare sit amet ipsum id, commodo tincidunt nisi. Aliquam egestas, ante non placerat convallis, mi mauris posuere ligula, nec auctor lectus mi quis quam. In auctor facilisis ante id elementum. Donec interdum ipsum vitae lorem sollicitudin rutrum. Etiam congue pharetra lorem ac dictum. Donec feugiat interdum vulputate. Curabitur mollis suscipit nisi, vel tincidunt risus fringilla at. Phasellus tincidunt, nulla a tincidunt tempor, libero turpis imperdiet tortor, vel convallis orci neque vitae nisi. Nunc euismod massa quis mollis ultricies. Proin non ante elit. Pellentesque et convallis massa. Curabitur blandit mattis metus, non aliquam erat iaculis ut. Nam vestibulum ipsum vitae nulla varius, sit amet sodales ipsum congue. 
Nullam eget mauris ut est blandit rhoncus sit amet ac arcu. Nulla at purus consequat, lobortis massa sit amet, posuere ante. Nam bibendum laoreet tempus. Fusce ac nulla consequat, placerat sem vitae, condimentum enim. Vestibulum sed tellus nec elit varius venenatis. Donec et dapibus dui. Nullam est metus, ultrices nec lectus vel, fermentum elementum lacus. Curabitur imperdiet vestibulum enim. Aenean sollicitudin at leo quis ullamcorper. Suspendisse in posuere risus. In quis mattis sem, eu facilisis arcu. Vestibulum faucibus auctor accumsan. Morbi mattis sit amet augue ac sodales. Integer varius eget orci iaculis aliquet. Suspendisse a auctor turpis. Fusce vestibulum vestibulum ante sed mattis. Mauris ornare rhoncus enim ac egestas. Donec turpis eros, interdum non placerat nec, adipiscing eu urna. Integer feugiat mi quis eros fringilla vehicula. Proin suscipit magna ultricies laoreet dignissim. Donec vehicula ac lacus non vehicula. Sed euismod mattis facilisis. Etiam nec risus vitae risus iaculis lobortis. Duis eu dui sit amet turpis tincidunt vulputate. Nunc tortor diam, egestas in ante ac, scelerisque placerat ante. Nullam interdum ultricies nisl a vehicula. Integer id nunc elit. Sed rutrum sit amet neque quis tristique. ================================================ FILE: spec/inputs/numbers/1.txt ================================================ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 ================================================ FILE: spec/inputs/numbers/10.txt ================================================ 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 ================================================ FILE: spec/inputs/numbers/11.txt ================================================ 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 ================================================ FILE: spec/inputs/numbers/12.txt ================================================ 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 ================================================ FILE: spec/inputs/numbers/13.txt ================================================ 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 ================================================ FILE: spec/inputs/numbers/14.txt ================================================ 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 ================================================ FILE: spec/inputs/numbers/15.txt ================================================ 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 ================================================ FILE: 
spec/inputs/numbers/16.txt ================================================ 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 ================================================ FILE: spec/inputs/numbers/17.txt ================================================ 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 ================================================ FILE: spec/inputs/numbers/18.txt ================================================ 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 ================================================ FILE: spec/inputs/numbers/19.txt ================================================ 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 ================================================ FILE: spec/inputs/numbers/2.txt ================================================ 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 ================================================ FILE: spec/inputs/numbers/20.txt ================================================ 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 ================================================ FILE: spec/inputs/numbers/3.txt ================================================ 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 ================================================ FILE: spec/inputs/numbers/4.txt ================================================ 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 ================================================ FILE: spec/inputs/numbers/5.txt ================================================ 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 ================================================ FILE: spec/inputs/numbers/6.txt ================================================ 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 ================================================ FILE: spec/inputs/numbers/7.txt ================================================ 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 
================================================ FILE: spec/inputs/numbers/8.txt ================================================ 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 ================================================ FILE: spec/inputs/numbers/9.txt ================================================ 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 ================================================ FILE: spec/inputs/numbers_0_100.txt ================================================ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 ================================================ FILE: spec/inputs/numbers_1_100.txt ================================================ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 ================================================ FILE: spec/inputs/people.json ================================================ {"id":1,"name":"Matthew Fuller","age":49,"email":"mfuller0@blogger.com","active":false} {"id":2,"name":"Pamela Thomas","age":58,"email":"pthomas1@apache.org","address":"92 Beilfuss Lane","active":false,"ip_address":"41.52.54.168"} {"id":3,"name":"Joan Stevens","age":33,"email":"jstevens2@xrea.com","address":"1 Wayridge Circle","active":true,"ip_address":"159.204.170.10"} {"id":4,"name":"Laura Reynolds","email":"lreynolds3@admin.ch","address":"431 Spenser Court","active":true,"ip_address":"164.254.150.90"} {"id":5,"name":"Daniel Baker","email":"dbaker4@blinklist.com","active":true,"ip_address":"165.138.63.70"} {"id":6,"name":"Christina Lane","email":"clane5@cnbc.com","address":"7 Chinook Park","active":true,"ip_address":"46.240.67.103"} {"id":7,"name":"Carlos Washington","age":50,"email":"cwashington6@issuu.com","address":"6487 Memorial Trail","active":false,"ip_address":"152.45.154.18"} {"id":8,"name":"Harold Reid","age":53,"email":"hreid7@seesaa.net","active":true} {"id":9,"name":"Earl Harris","age":37,"email":"eharris8@homestead.com","active":false} {"id":10,"name":"Jack Hernandez","age":30,"email":"jhernandez9@adobe.com","address":"29407 Memorial Alley","active":false,"ip_address":"129.222.144.1"} {"id":11,"name":"Nicole Torres","age":25,"email":"ntorresa@amazon.de","address":"34804 Havey Point","active":false,"ip_address":"5.114.113.83"} {"id":12,"name":"Theresa Gordon","age":19,"email":"tgordonb@xinhuanet.com","active":false} {"id":13,"name":"Emily Schmidt","age":25,"email":"eschmidtc@arstechnica.com","address":"115 Bluestem Pass","active":true} {"id":14,"name":"Dennis Ford","age":50,"email":"dfordd@hc360.com","address":"4107 Kim Avenue","active":true,"ip_address":"44.170.237.89"} {"id":15,"name":"Deborah Williams","age":28,"email":"dwilliamse@cmu.edu","address":"7 Kipling Pass","active":false} {"id":16,"name":"Rachel 
Sullivan","age":31,"email":"rsullivanf@pagesperso-orange.fr","address":"8196 Harbort Park","active":true,"ip_address":"216.142.141.210"} {"id":17,"name":"Phillip Jordan","email":"pjordang@liveinternet.ru","active":false} {"id":18,"name":"Fred Mitchell","email":"fmitchellh@shinystat.com","address":"279 Gateway Parkway","active":false} {"id":19,"name":"Antonio Dunn","age":23,"email":"adunni@mediafire.com","address":"71 Maple Place","active":true,"ip_address":"39.50.250.70"} {"id":20,"name":"Alan Boyd","age":59,"email":"aboydj@sbwire.com","address":"4302 Warner Road","active":false,"ip_address":"106.253.236.0"} {"id":21,"name":"Louise Wright","age":19,"email":"lwrightk@so-net.ne.jp","address":"5 Maryland Hill","active":false,"ip_address":"51.0.99.116"} {"id":22,"name":"Diane Greene","age":39,"email":"dgreenel@jugem.jp","address":"38 Merrick Lane","active":false,"ip_address":"146.124.156.180"} {"id":23,"name":"Emily Richardson","age":23,"email":"erichardsonm@csmonitor.com","active":true} {"id":24,"name":"Joseph Henderson","age":36,"email":"jhendersonn@drupal.org","address":"55 Morningstar Lane","active":true,"ip_address":"54.187.254.99"} {"id":25,"name":"Chris Fowler","age":31,"email":"cfowlero@msu.edu","address":"4 Oakridge Center","active":false} {"id":26,"name":"Helen West","age":38,"email":"hwestp@time.com","address":"93 Blaine Parkway","active":true,"ip_address":"159.131.255.177"} {"id":27,"name":"Jimmy Black","age":46,"email":"jblackq@house.gov","address":"80157 Bay Drive","active":true,"ip_address":"163.137.84.52"} {"id":28,"name":"Melissa Allen","age":56,"email":"mallenr@upenn.edu","address":"381 Merrick Way","active":false} {"id":29,"name":"Scott Walker","age":48,"email":"swalkers@etsy.com","active":true} {"id":30,"name":"Jimmy Wood","email":"jwoodt@bloomberg.com","address":"1041 Claremont Lane","active":true} {"id":31,"name":"Betty Jacobs","email":"bjacobsu@ihg.com","address":"6520 Anderson Junction","active":false,"ip_address":"166.45.58.141"} {"id":32,"name":"Richard Stone","age":34,"email":"rstonev@rakuten.co.jp","address":"51 Bay Pass","active":true,"ip_address":"9.35.132.204"} {"id":33,"name":"Melissa Henderson","age":21,"email":"mhendersonw@washington.edu","address":"06 Delaware Avenue","active":false} {"id":34,"name":"David Stanley","age":57,"email":"dstanleyx@ucoz.com","address":"692 Lien Avenue","active":true,"ip_address":"194.251.38.0"} {"id":35,"name":"Cynthia Murphy","age":20,"email":"cmurphyy@xinhuanet.com","active":false} {"id":36,"name":"Todd Henry","age":38,"address":"589 Katie Center","active":true,"ip_address":"177.233.117.222"} {"id":37,"name":"Christina Stephens","age":40,"email":"cstephens10@illinois.edu","address":"51039 Hermina Point","active":true} {"id":38,"name":"Sharon Gomez","email":"sgomez11@parallels.com","address":"57089 Texas Way","active":true,"ip_address":"149.85.104.141"} {"id":39,"name":"Benjamin Fisher","age":30,"email":"bfisher12@gmpg.org","address":"3 Welch Plaza","active":false,"ip_address":"116.184.105.191"} {"id":40,"name":"Mark Stewart","age":38,"email":"mstewart13@uiuc.edu","active":false,"ip_address":"167.115.237.197"} {"id":41,"name":"Mark Black","age":45,"email":"mblack14@tuttocitta.it","address":"9 Rutledge Pass","active":false,"ip_address":"108.90.166.239"} {"id":42,"name":"Christina Lawrence","age":47,"email":"clawrence15@simplemachines.org","address":"239 Eggendart Junction","active":true,"ip_address":"8.118.127.22"} {"id":43,"name":"Howard Lynch","age":52,"email":"hlynch16@slideshare.net","active":true} {"id":44,"name":"Heather 
Perez","age":60,"email":"hperez17@techcrunch.com","address":"1 Almo Court","active":false,"ip_address":"110.184.153.36"} {"id":45,"name":"Michael Howell","age":57,"email":"mhowell18@wufoo.com","address":"341 Shelley Alley","active":false} {"id":46,"name":"Gregory Johnson","age":57,"email":"gjohnson19@japanpost.jp","address":"4 Basil Plaza","active":true,"ip_address":"249.29.102.40"} {"id":47,"name":"Christopher Miller","age":50,"email":"cmiller1a@google.es","address":"76 Granby Way","active":true} {"id":48,"name":"Beverly Hall","age":60,"email":"bhall1b@cam.ac.uk","address":"9 Novick Place","active":true} {"id":49,"name":"Todd Adams","age":58,"email":"tadams1c@yahoo.co.jp","active":false} {"id":50,"name":"Judith Watkins","age":30,"email":"jwatkins1d@comcast.net","address":"5874 Esker Parkway","active":true,"ip_address":"229.176.89.163"} {"id":51,"name":"Cheryl Howard","age":34,"email":"choward1e@cam.ac.uk","address":"492 Mandrake Lane","active":false,"ip_address":"255.117.98.35"} {"id":52,"name":"Mary West","email":"mwest1f@cnn.com","address":"4 Vera Avenue","active":false,"ip_address":"118.130.207.177"} {"id":53,"name":"Carol Welch","age":39,"email":"cwelch1g@sun.com","address":"794 Burrows Pass","active":true,"ip_address":"205.98.9.218"} {"id":54,"name":"Donald Reed","age":23,"email":"dreed1h@wsj.com","address":"0769 Dryden Trail","active":true,"ip_address":"35.72.239.99"} {"id":55,"name":"Michael Wells","age":29,"email":"mwells1i@deviantart.com","address":"9033 Crescent Oaks Way","active":false,"ip_address":"33.18.26.152"} {"id":56,"name":"Joyce Montgomery","age":34,"email":"jmontgomery1j@sciencedaily.com","address":"29093 Lyons Circle","active":true,"ip_address":"85.155.89.174"} {"id":57,"name":"Angela Garza","age":24,"email":"agarza1k@hc360.com","address":"388 Kenwood Street","active":false,"ip_address":"204.191.24.172"} {"id":58,"name":"Rose Green","age":26,"email":"rgreen1l@businessinsider.com","address":"3 Mesta Pass","active":true} {"id":59,"name":"Wanda Williamson","age":39,"email":"wwilliamson1m@cafepress.com","address":"18596 Westridge Crossing","active":true,"ip_address":"215.98.196.209"} {"id":60,"name":"Irene Washington","age":49,"email":"iwashington1n@ameblo.jp","address":"83 Monica Crossing","active":false,"ip_address":"141.46.156.186"} {"id":61,"name":"Anna Freeman","age":50,"email":"afreeman1o@blogs.com","address":"3 Gulseth Way","active":true} {"id":62,"name":"Kathleen Romero","age":23,"email":"kromero1p@craigslist.org","address":"419 Leroy Court","active":true} {"id":63,"name":"Matthew Alexander","age":58,"email":"malexander1q@gnu.org","active":false} {"id":64,"name":"Louis Moore","age":50,"email":"lmoore1r@salon.com","address":"671 Buhler Hill","active":true,"ip_address":"21.247.160.104"} {"id":65,"name":"Christina Brooks","age":27,"email":"cbrooks1s@google.cn","address":"80405 Jana Circle","active":true,"ip_address":"121.100.200.46"} {"id":66,"name":"Sarah Moreno","age":30,"address":"03 Cottonwood Way","active":true,"ip_address":"111.174.142.117"} {"id":67,"name":"Harold Rodriguez","age":24,"email":"hrodriguez1u@squidoo.com","address":"76 Green Circle","active":true} {"id":68,"name":"Louise Black","age":18,"email":"lblack1v@yale.edu","address":"951 Blackbird Junction","active":false,"ip_address":"212.47.220.126"} {"id":69,"name":"Adam Montgomery","email":"amontgomery1w@mlb.com","address":"1 Mesta Terrace","active":false} {"id":70,"name":"Jacqueline Pierce","age":58,"email":"jpierce1x@google.com.au","address":"0161 Village 
Plaza","active":false,"ip_address":"116.164.88.112"} {"id":71,"name":"Ann Stone","age":45,"email":"astone1y@yelp.com","address":"1011 Heath Terrace","active":false} {"id":72,"name":"Teresa Arnold","age":33,"email":"tarnold1z@mayoclinic.com","active":false,"ip_address":"81.165.73.142"} {"id":73,"name":"Arthur Shaw","age":27,"email":"ashaw20@latimes.com","address":"9956 Hooker Road","active":true} {"id":74,"name":"Wayne Garrett","age":41,"email":"wgarrett21@adobe.com","address":"34 Grasskamp Street","active":true,"ip_address":"29.26.28.17"} {"id":75,"name":"Russell Castillo","age":46,"email":"rcastillo22@printfriendly.com","address":"444 South Avenue","active":false} {"id":76,"name":"Shirley Burke","age":47,"email":"sburke23@lulu.com","address":"70 Florence Drive","active":false} {"id":77,"name":"Tammy Washington","age":46,"email":"twashington24@youtube.com","address":"559 Hollow Ridge Road","active":true,"ip_address":"230.169.245.123"} {"id":78,"name":"Diane Freeman","age":49,"email":"dfreeman25@github.com","address":"04 Transport Center","active":false,"ip_address":"138.200.234.169"} {"id":79,"name":"Anne Morrison","email":"amorrison26@telegraph.co.uk","address":"525 Shasta Junction","active":true} {"id":80,"name":"Paul Johnston","age":51,"email":"pjohnston27@youku.com","address":"16254 Ryan Center","active":false,"ip_address":"214.38.125.121"} {"id":81,"name":"Virginia Welch","age":58,"email":"vwelch28@china.com.cn","address":"2 Michigan Hill","active":true} {"id":82,"name":"Louis Hughes","age":44,"email":"lhughes29@mysql.com","address":"423 Meadow Valley Pass","active":false,"ip_address":"213.45.167.91"} {"id":83,"name":"Betty Reynolds","age":57,"email":"breynolds2a@furl.net","address":"4486 Kedzie Road","active":true} {"id":84,"name":"Norma Olson","age":18,"email":"nolson2b@goo.gl","active":true} {"id":85,"name":"David Ward","age":28,"email":"dward2c@ibm.com","address":"3 Kings Place","active":true} {"id":86,"name":"Phyllis Williamson","age":26,"email":"pwilliamson2d@nationalgeographic.com","address":"7 Northview Street","active":false,"ip_address":"234.86.8.89"} {"id":87,"name":"Kathleen Holmes","age":46,"email":"kholmes2e@zdnet.com","address":"4814 Colorado Place","active":false} {"id":88,"name":"George King","age":23,"email":"gking2f@ask.com","address":"966 Morrow Junction","active":false,"ip_address":"89.94.24.41"} {"id":89,"name":"Raymond Garcia","age":47,"email":"rgarcia2g@quantcast.com","active":true,"ip_address":"135.10.187.167"} {"id":90,"name":"Rose Meyer","age":38,"active":true,"ip_address":"228.216.201.80"} {"id":91,"name":"Jennifer Gray","age":50,"email":"jgray2i@princeton.edu","address":"58241 Calypso Court","active":true,"ip_address":"158.144.236.158"} {"id":92,"name":"Bonnie Franklin","age":24,"email":"bfranklin2j@slideshare.net","address":"629 Prairieview Center","active":false} {"id":93,"name":"Sarah Martin","age":52,"email":"smartin2k@cnn.com","address":"997 Kensington Lane","active":false} {"id":94,"name":"Shirley Hamilton","age":39,"email":"shamilton2l@nih.gov","address":"934 Clarendon Lane","active":false} {"id":95,"name":"Gregory Kim","age":37,"email":"gkim2m@tinyurl.com","active":true,"ip_address":"216.24.238.78"} {"id":96,"name":"Betty Sanchez","age":46,"email":"bsanchez2n@washington.edu","active":true} {"id":97,"name":"Ann Cooper","age":41,"email":"acooper2o@issuu.com","active":false} {"id":98,"name":"Christopher Cole","active":true} {"id":99,"name":"Debra Lopez","age":36,"address":"4 Grim Drive","active":false,"ip_address":"1.217.64.60"} 
{"id":100,"name":"Shawn Moore","age":35,"email":"smoore2r@mayoclinic.com","active":true} ================================================ FILE: spec/lib/collect_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::RDD do let(:mapping) { lambda{|x| [x, 1]} } let(:numbers) { Generator.numbers } it '.collect_as_hash' do rdd = $sc.parallelize(numbers) rdd = rdd.map(mapping) expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)]) end context '.take' do let(:size) { 1000 } let(:numbers) { Generator.numbers(size) } let(:rdd) { $sc.parallelize(numbers) } it 'nothing' do expect(rdd.take(0)).to eql([]) end it 'first' do expect(rdd.first).to eql(numbers.first) end it 'less than limit' do _size = size / 2 expect(rdd.take(_size)).to eql(numbers.take(_size)) end it 'all' do expect(rdd.take(size)).to eql(numbers) end it 'more than limit' do expect(rdd.take(size*2)).to eql(numbers) end end end ================================================ FILE: spec/lib/command_spec.rb ================================================ require 'spec_helper' def to_s_method(x) x.to_s end RSpec::describe Spark::CommandBuilder do let(:numbers) { Generator.numbers } let(:rdd) { $sc.parallelize(numbers, 1) } context '.serialize_function' do let(:result) { numbers.map(&:to_s) } it 'string' do expect(rdd.map('lambda{|x| x.to_s}').collect).to eql(result) end it 'symbol' do expect(rdd.map(:to_s).collect).to eql(result) end it 'lambda' do expect(rdd.map(lambda{|x| x.to_s}).collect).to eql(result) end it 'method' do expect(rdd.map(method(:to_s_method)).collect).to eql(result) end end context '.bind' do it 'number' do number = rand(0..10000000) rdd2 = rdd.map(lambda{|x| x * number}).bind(number: number) expect(rdd2.collect).to eq(numbers.map{|x| x * number}) end it 'open struct' do require 'ostruct' struct = OpenStruct.new struct.number = 3 struct.string = '3' struct.array = [1, 2, 3] func = lambda{|item| item * struct.number + struct.string.to_i + struct.array[0] } rdd2 = rdd.add_library('ostruct') rdd2 = rdd2.map(func) rdd2 = rdd2.bind(struct: struct) expect(rdd2.collect).to eq(numbers.map(&func)) end it 'different naming' do array = [1, 2, 3] rdd2 = rdd.map(lambda{|_| my_array.size}) rdd2 = rdd2.bind(my_array: array) expect(rdd2.sum).to eq(numbers.size * array.size) end end end ================================================ FILE: spec/lib/config_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::Config do before(:context) do Spark.stop end after(:context) do spark_start end it 'should be stopped' do expect(Spark.started?).to be_falsy end context 'new config' do let(:configuration) do { 'test.test1' => 'test1', 'test.test2' => 'test2', 'test.test3' => 'test3' } end before(:each) do Spark.clear_config end it 'throught methods' do configuration.each do |key, value| Spark.config.set(key, value) end configuration.each do |key, value| expect(Spark.config.get(key)).to eql(value) end end it 'throught hash style' do configuration.each do |key, value| Spark.config[key] = value end configuration.each do |key, value| expect(Spark.config[key]).to eql(value) end end it 'throught dsl' do configuration.each do |key, value| Spark.config { set key, value } end configuration.each do |key, value| expect(Spark.config[key]).to eql(value) end end end end ================================================ FILE: spec/lib/context_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::Context do it 
'.run_job' do workers = 5 numbers = (0...100).to_a func = lambda{|part| part.size} ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd = $sc.parallelize(numbers, workers, ser) rdd_result = $sc.run_job(rdd, func) result = numbers.each_slice(numbers.size/workers).map(&func) expect(rdd_result).to eql(result) parts = [0, 2] func = lambda{|part| part.to_s} rdd_result = $sc.run_job(rdd, func, parts) result = [] sliced_numbers = numbers.each_slice(numbers.size/workers).to_a parts.each do |part| result << func.call(sliced_numbers[part]) end expect(rdd_result).to eql(result) end it '.broadcast' do workers = rand(1..5) values1 = [1,2,3] values2 = [4,5,6] broadcast1 = $sc.broadcast(values1) broadcast2 = $sc.broadcast(values2) rdd = $sc.parallelize(0..5, workers) rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2) rdd = rdd.map_partitions(lambda{|_| broadcast1.value + broadcast2.value }) expect(rdd.sum).to eql( (values1 + values2).reduce(:+) * workers ) end # context '.accumulator' do # it 'test' do # accum1 = $sc.accumulator(0,) # accum2 = $sc.accumulator(1, :*, 1) # accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max}) # accum1 += 1 # accum2.add(2) # accum2.add(2) # accum2.add(2) # accum3.add(9) # accum3.add(6) # accum3.add(7) # expect(accum1.value).to eql(1) # expect(accum2.value).to eql(8) # expect(accum3.value).to eql(9) # func = Proc.new do |_, index| # accum1.add(1) # accum2.add(2) # accum3.add(index * 10) # end # rdd = $sc.parallelize(0..4, 4) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3) # rdd = rdd.map_partitions_with_index(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum1.value).to eql(5) # expect(accum2.value).to eql(128) # expect(accum3.value).to eql(30) # end # context 'accum param' do # it 'symbol' do # accum1 = $sc.accumulator(1, :+, 0) # accum2 = $sc.accumulator(5, :-, 3) # accum3 = $sc.accumulator(1, :*, 1) # accum4 = $sc.accumulator(1.0, :/, 1.0) # accum5 = $sc.accumulator(2, :**, 2) # func = Proc.new do |_| # accum1.add(1) # accum2.add(1) # accum3.add(2) # accum4.add(2) # accum5.add(2) # end # rdd = $sc.parallelize(0..4, 2) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5) # rdd = rdd.map_partitions(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum1.value).to eq(3) # expect(accum2.value).to eq(1) # expect(accum3.value).to eq(4) # expect(accum4.value).to eq(4) # expect(accum5.value).to eq(65536) # end # it 'proc' do # accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0) # accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '') # accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, []) # func = Proc.new do |_| # accum1.add(1) # accum2.add('a') # accum3.add(1) # end # rdd = $sc.parallelize(0..4, 2) # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3) # rdd = rdd.map_partitions(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum1.value).to eq(3) # expect(accum2.value).to eq('aaa') # expect(accum3.value).to eq([[1], [1]]) # end # it 'string' do # expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError) # accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0) # func = Proc.new do |_| # accum.add(1) # end # rdd = $sc.parallelize(0..4, 2) # rdd = rdd.bind(accum: accum) # rdd = 
rdd.map_partitions(func) # rdd.collect # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock # sleep(1) # expect(accum.value).to eq(3) # end # end # end end ================================================ FILE: spec/lib/ext_spec.rb ================================================ require 'spec_helper' RSpec.describe Array do it '.deep_copy' do data = ['a', 'b', 'c'] new_data = data.dup data[0] << 'a' expect(data).to eql(new_data) new_data = data.deep_copy data[1] << 'b' expect(data).to_not eql(new_data) end end RSpec.describe Hash do it '.stringify_keys!' do data = { a: 'a', b: 'b', c: 'c' } data.stringify_keys! expect(data).to eql({ 'a' => 'a', 'b' => 'b', 'c' => 'c' }) end end RSpec.describe String do it '.camelize' do data = 'aaa_bbb_ccc'.camelize expect(data).to eql('AaaBbbCcc') end end RSpec.describe IO do it 'serialize' do file = Tempfile.new('serialize') file.binmode file.write_int(1) file.write_long(2) file.write_string('3') file.write_data([4]) file.rewind expect(file.read_int).to eq(1) expect(file.read_long).to eq(2) expect(file.read_string).to eq('3') expect(file.read_data).to eq([4]) file.unlink end end ================================================ FILE: spec/lib/external_apps_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::RDD do context '.pipe' do let(:words) { Generator.words } let(:numbers) { Generator.numbers } it 'single program' do skip if windows? rdd = $sc.parallelize(words, 1) rdd = rdd.pipe('tr a b') result = words.dup result.map! do |x| x.gsub('a', 'b') end expect(rdd.collect).to eql(result) end it 'multiple program' do skip if windows? rdd = $sc.parallelize(numbers, 1) rdd = rdd.pipe("tr 1 5", "awk '{print $1*10}'") rdd = rdd.map(lambda{|x| x.to_i * 100}) result = numbers.dup result.map! do |x| x.to_s.gsub('1', '5') end result.map! do |x| x.to_i * 10 end result.map! 
do |x| x * 100 end expect(rdd.collect).to eql(result) end end end ================================================ FILE: spec/lib/filter_spec.rb ================================================ require 'spec_helper' def func4(item) item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106 end RSpec.shared_examples 'a filtering' do |workers| context "with #{workers || 'default'} worker" do it 'when numbers' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.filter(func1) result = numbers.select(&func1) expect(rdd2.collect).to eql(result) rdd3 = rdd_numbers(workers) rdd3 = rdd3.filter(func1) rdd3 = rdd3.filter(func2) expect(rdd3.collect).to eql([]) end it 'when words' do rdd2 = rdd_words(workers) rdd2 = rdd2.filter(func3) result = words.select{|x| func3.call(x)} expect(rdd2.collect).to eql(result) rdd3 = rdd_words(workers) rdd3 = rdd3.filter(method(:func4)) result = words.select{|x| func4(x)} expect(rdd3.collect).to eql(result) end end end RSpec.describe 'Spark::RDD.filter' do let(:func1) { lambda{|x| x.to_i.even?} } let(:func2) { lambda{|x| x.to_i.odd?} } let(:func3) { lambda{|x| x.to_s.start_with?('b')} } context 'throught parallelize' do let(:numbers) { Generator.numbers_with_zero } let(:words) { Generator.words } def rdd_numbers(workers) $sc.parallelize(numbers, workers) end def rdd_words(workers) $sc.parallelize(words, workers) end it_behaves_like 'a filtering', 2 # it_behaves_like 'a filtering', nil # it_behaves_like 'a filtering', rand(2..10) end context 'throught text_file' do let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:file_words) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:numbers) { File.readlines(file_numbers).map(&:strip) } let(:words) { File.readlines(file_words).map(&:strip) } def rdd_numbers(workers) $sc.text_file(file_numbers, workers) end def rdd_words(workers) $sc.text_file(file_words, workers) end it_behaves_like 'a filtering', 2 # it_behaves_like 'a filtering', nil # it_behaves_like 'a filtering', rand(2..10) end end ================================================ FILE: spec/lib/flat_map_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a flat mapping' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map(func1) result = numbers.flat_map(&func1) expect(rdd2.collect).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.flat_map(func1) rdd3 = rdd3.flat_map(func2) rdd3 = rdd3.flat_map(func3) result = numbers.flat_map(&func1).flat_map(&func2).flat_map(&func3) expect(rdd3.collect).to eql(result) rdd4 = rdd(workers) rdd4 = rdd4.flat_map(func1) rdd4 = rdd4.flat_map(func2) rdd4 = rdd4.flat_map(func3) expect(rdd4.collect).to eql(rdd3.collect) end end RSpec.shared_examples 'a flat mapping values' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).flat_map_values(func1) result = [] hash_with_values.each do |(key, values)| values = func1.call(values).flatten values.each do |value| result << [key, value] end end expect(rdd2.collect).to eql(result) rdd2 = rdd(workers).flat_map_values(func2) result = [] hash_with_values.each do |(key, values)| values = func2.call(values).flatten values.each do |value| result << [key, value] end end expect(rdd2.collect).to eql(result) end end RSpec.describe 'Spark::RDD' do let(:func1) { lambda{|x| x*2} } let(:func2) { lambda{|x| [x*3, 1, 1]} } let(:func3) { lambda{|x| [x*4, 2, 2]} } context 'throught parallelize' do context '.flat_map' do let(:numbers) { Generator.numbers_with_zero } def rdd(workers) 
$sc.parallelize(numbers, workers) end it_behaves_like 'a flat mapping', 1 it_behaves_like 'a flat mapping', 2 # it_behaves_like 'a flat mapping', nil # it_behaves_like 'a flat mapping', rand(2..10) end context '.flat_map_values' do let(:func1) { lambda{|x| x*2} } let(:func2) { lambda{|x| [x.first]} } let(:hash_with_values) { Generator.hash_with_values } def rdd(workers) $sc.parallelize(hash_with_values, workers) end it_behaves_like 'a flat mapping values', 1 it_behaves_like 'a flat mapping values', 2 # it_behaves_like 'a flat mapping values', nil # it_behaves_like 'a flat mapping values', rand(2..10) end end context 'throught text_file' do context '.flat_map' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:numbers) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a flat mapping', 1 it_behaves_like 'a flat mapping', 2 # it_behaves_like 'a flat mapping', nil # it_behaves_like 'a flat mapping', rand(2..10) end end end ================================================ FILE: spec/lib/group_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a groupping by key' do |workers| it "with #{workers || 'default'} worker" do expect(rdd_result(workers)).to eql(result) end end RSpec.shared_examples 'a cogroupping by key' do |workers| context "with #{workers || 'default'} worker" do it '.group_with' do rdd = rdd_1(workers).group_with(rdd_2(workers)) expect(rdd.collect_as_hash).to eql(result_12) end it '.cogroup' do rdd = rdd_1(workers).cogroup(rdd_2(workers), rdd_3(workers)) expect(rdd.collect_as_hash).to eql(result_123) end end end RSpec.shared_examples 'a groupping by' do |workers| it "with #{workers || 'default'} worker" do rdd = rdd_numbers(workers) rdd = rdd.group_by(key_function1) expect(rdd.collect_as_hash).to eql(numbers.group_by(&key_function1)) rdd = rdd_words(workers) rdd = rdd.group_by(key_function2) expect(rdd.collect_as_hash).to eql(words.group_by(&key_function2)) end end RSpec.describe 'Spark::RDD' do def make_result(*hashes) _result = {} hashes.each do |data| data.each do |key, value| _result[key] ||= [] _result[key] << value end end _result end context '.group_by_key' do let(:hash) { Generator.hash } let(:result) { make_result(hash) } def rdd_result(workers) rdd = $sc.parallelize(hash) rdd.group_by_key.collect_as_hash end it_behaves_like 'a groupping by key', 1 it_behaves_like 'a groupping by key', 2 # it_behaves_like 'a groupping by key', nil # it_behaves_like 'a groupping by key', rand(2..10) end context 'cogroup' do let(:hash1) { Generator.hash } let(:hash2) { Generator.hash } let(:hash3) { Generator.hash } let(:result_12) { make_result(hash1, hash2) } let(:result_123) { make_result(hash1, hash2, hash3) } def rdd_1(workers) $sc.parallelize(hash1) end def rdd_2(workers) $sc.parallelize(hash2) end def rdd_3(workers) $sc.parallelize(hash3) end it_behaves_like 'a cogroupping by key', 1 it_behaves_like 'a cogroupping by key', 2 # it_behaves_like 'a cogroupping by key', nil # it_behaves_like 'a cogroupping by key', rand(2..10) end context 'group_by' do let(:key_function1) { lambda{|x| x%2} } let(:key_function2) { lambda{|x| x.size} } let(:numbers) { Generator.numbers } let(:words) { Generator.words } def rdd_numbers(workers) $sc.parallelize(numbers) end def rdd_words(workers) $sc.parallelize(words) end it_behaves_like 'a groupping by', 1 it_behaves_like 'a groupping by', 2 # it_behaves_like 'a groupping by', nil # it_behaves_like 'a groupping by', rand(2..10) end 
end ================================================ FILE: spec/lib/helper_spec.rb ================================================ require 'spec_helper' RSpec.configure do |c| c.include Spark::Helper::Parser c.include Spark::Helper::Statistic end RSpec.describe Spark::Helper do it 'memory size' do expect(to_memory_size('512mb')).to eql(524288.0) expect(to_memory_size('1586 mb')).to eql(1624064.0) expect(to_memory_size('3 MB')).to eql(3072.0) expect(to_memory_size('9gb')).to eql(9437184.0) expect(to_memory_size('9gb', 'mb')).to eql(9216.0) expect(to_memory_size('9mb', 'gb')).to eql(0.01) expect(to_memory_size('6652548796kb', 'mb')).to eql(6496629.68) end context 'statistic' do it 'compute_fraction' do expect(compute_fraction(1, 1000, true)).to be_within(0.001).of(0.013) expect(compute_fraction(2, 1000, true)).to be_within(0.001).of(0.018) expect(compute_fraction(3, 1000, true)).to be_within(0.001).of(0.023) expect(compute_fraction(4, 1000, true)).to be_within(0.001).of(0.028) expect(compute_fraction(5, 1000, true)).to be_within(0.001).of(0.031) expect(compute_fraction(1, 1000, false)).to be_within(0.001).of(0.0249) expect(compute_fraction(2, 1000, false)).to be_within(0.001).of(0.0268) expect(compute_fraction(3, 1000, false)).to be_within(0.001).of(0.0287) expect(compute_fraction(4, 1000, false)).to be_within(0.001).of(0.0305) expect(compute_fraction(5, 1000, false)).to be_within(0.001).of(0.0322) end it 'bisect_right' do data = [10, 20, 30, 40, 50, 60, 70, 80, 90] expect(bisect_right(data, 0)).to eq(0) expect(bisect_right(data, 1)).to eq(0) expect(bisect_right(data, 1, 2)).to eq(2) expect(bisect_right(data, 1, 3)).to eq(3) expect(bisect_right(data, 1, 4)).to eq(4) expect(bisect_right(data, 9)).to eq(0) expect(bisect_right(data, 10)).to eq(1) expect(bisect_right(data, 40)).to eq(4) expect(bisect_right(data, 42)).to eq(4) expect(bisect_right(data, 72)).to eq(7) expect(bisect_right(data, 80, 4)).to eq(8) expect(bisect_right(data, 80, 5)).to eq(8) expect(bisect_right(data, 80, 8)).to eq(8) expect(bisect_right(data, 80, 9)).to eq(9) expect(bisect_right(data, 200)).to eq(9) end it 'determine_bounds' do data = [10, 20, 30, 40, 50, 60, 70, 80, 90] expect(determine_bounds(data, 0)).to eq([]) expect(determine_bounds(data, 1)).to eq([]) expect(determine_bounds(data, 2)).to eq([50]) expect(determine_bounds(data, 3)).to eq([40, 70]) expect(determine_bounds(data, 4)).to eq([30, 50, 70]) expect(determine_bounds(data, 20)).to eq(data) end end end ================================================ FILE: spec/lib/key_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a keying by' do |workers| it "with #{workers || 'default'} worker" do rdd = rdd_numbers(workers) rdd = rdd.key_by(key_function1) result = numbers.map{|item| [key_function1.call(item), item]} expect(rdd.collect).to eql(result) rdd = rdd_words(workers) rdd = rdd.key_by(key_function2) result = words.map{|item| [key_function2.call(item), item]} expect(rdd.collect).to eql(result) end end RSpec.describe 'Spark::RDD' do context 'key_by' do let(:key_function1) { lambda{|x| x.even?} } let(:key_function2) { lambda{|x| x.include?('a')} } let(:numbers) { Generator.numbers } let(:words) { Generator.words } def rdd_numbers(workers) $sc.parallelize(numbers) end def rdd_words(workers) $sc.parallelize(words) end it_behaves_like 'a keying by', 1 it_behaves_like 'a keying by', 2 # it_behaves_like 'a keying by', nil # it_behaves_like 'a keying by', rand(2..10) end it 'lookup' do numbers = Generator.numbers 
rdd_numbers = $sc.parallelize(numbers, 2) rdd = rdd_numbers.group_by(lambda {|x| x%3}) rdd.lookup(2) expect(rdd.lookup(2).first).to eq( numbers.group_by{|x| x%3}[2] ) rdd = rdd_numbers.key_by(lambda{|x| x.even?}) expect(rdd.lookup(true)).to eq( numbers.select(&:even?) ) end end ================================================ FILE: spec/lib/manipulation_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::RDD' do let(:numbers) { 1..100 } let(:rand_numbers) { Generator.numbers } it '.glom' do rdd = $sc.parallelize(numbers, 1).glom expect(rdd.collect).to eql([numbers.to_a]) ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd = $sc.parallelize(numbers, 5, ser).glom expect(rdd.collect).to eql(numbers.each_slice(20).to_a) end it '.coalesce' do rdd = $sc.parallelize(numbers, 5) rdd2 = rdd.glom expect(rdd2.collect.size).to eql(5) rdd3 = rdd.coalesce(4).glom expect(rdd3.collect.size).to eql(4) end it '.distinct' do rdd = $sc.parallelize(rand_numbers, 5) rdd = rdd.distinct expect(rdd.collect.sort).to eql(rand_numbers.uniq.sort) rdd = $sc.parallelize(numbers, 5) rdd = rdd.map(lambda{|x| 1}) rdd = rdd.distinct expect(rdd.collect).to eql([1]) end context '.union' do it 'classic method' do rdd = $sc.parallelize(numbers, 5) rdd = rdd.union(rdd).collect expect(rdd.collect.sort).to eql((numbers.to_a+numbers.to_a).sort) end it 'with a different serializer' do rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) }) rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) }) expect { rdd1.union(rdd2).collect }.to_not raise_error end it 'as operator' do rdd1 = $sc.parallelize(numbers) rdd2 = $sc.parallelize(rand_numbers) expect((rdd1+rdd2).sum).to eql((numbers.to_a+rand_numbers).reduce(:+)) end end it '.compact' do data = [nil, nil , 0, 0, 1, 2, nil, 6] result = data.compact ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd = $sc.parallelize(data, 1).compact expect(rdd.collect).to eql(result) rdd = $sc.parallelize(data, 5, ser).compact expect(rdd.collect).to eql(result) rdd = $sc.parallelize(data, 1, ser).compact expect(rdd.collect).to eql(result) end it '.intersection' do data1 = [0,1,2,3,4,5,6,7,8,9,10] data2 = [5,6,7,8,9,10,11,12,13,14,15] rdd1 = $sc.parallelize(data1) rdd2 = $sc.parallelize(data2) expect(rdd1.intersection(rdd2).collect.sort).to eql(data1 & data2) end it '.shuffle' do data = Generator.numbers rdd = $sc.parallelize(data) expect(rdd.shuffle.collect).to_not eql(data) end context '.cartesian' do let(:data1) { Generator.numbers(100) } let(:data2) { Generator.numbers(100) } let(:result) { data1.product(data2).map(&:to_s).sort } it 'unbatched' do ser = Spark::Serializer.build { __batched__(__marshal__, 1) } rdd1 = $sc.parallelize(data1, 2, ser) rdd2 = $sc.parallelize(data2, 2, ser) rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s}) expect(rdd.collect.sort).to eql(result) end it 'batched' do ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) } ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) } rdd1 = $sc.parallelize(data1, 2, ser1) rdd2 = $sc.parallelize(data2, 2, ser2) rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s}) expect(rdd.collect.sort).to eql(result) end end end ================================================ FILE: spec/lib/map_partitions_spec.rb ================================================ require 'spec_helper' def func3(x) x.map(&:to_i).reduce(:+) end def func4_with_index(data, index) [{ index => 
data.map(&:to_i).reduce(:*) }] end RSpec.shared_examples 'a map partitions' do |workers| context "with #{workers || 'default'} worker" do it 'without index' do rdd2 = rdd(workers).map_partitions(func1) result = func1.call(numbers) expect(func1.call(rdd2.collect)).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.map_partitions(func1) rdd3 = rdd3.map_partitions(func2) rdd3 = rdd3.map_partitions(method(:func3)) result = func3(func2.call(func1.call(numbers))) # Not same number of workers expect(rdd3.collect.size).to be >= 1 rdd4 = rdd(workers) rdd4 = rdd4.map_partitions(func1) rdd4 = rdd4.map_partitions(func2) rdd4 = rdd4.map_partitions(method(:func3)) expect(rdd4.collect).to eql(rdd3.collect) end it 'with index' do rdd2 = rdd(workers).map_partitions_with_index(method(:func4_with_index)) result = rdd2.collect expect(result).to be_a(Array) result.each do |x| expect(x).to be_a(Hash) end # Multiply by 0 # Some values are 0 because of batched serialization expect(result.map(&:values).flatten.compact.uniq.first).to eql(0) end end end RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do let(:func1) { lambda{|x| x.map(&:to_i)} } let(:func2) { lambda{|x| x.map{|y| y*2} } } context 'throught parallelize' do let(:numbers) { 0..1000 } def rdd(workers) $sc.parallelize(numbers, workers) end it_behaves_like 'a map partitions', 1 it_behaves_like 'a map partitions', 2 # it_behaves_like 'a map partitions', nil # it_behaves_like 'a map partitions', rand(2..10) end context 'throught text_file' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:numbers) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a map partitions', 1 it_behaves_like 'a map partitions', 2 # it_behaves_like 'a map partitions', nil # it_behaves_like 'a map partitions', rand(2..10) end end ================================================ FILE: spec/lib/map_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a mapping' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map(func1) result = numbers.map(&func1) expect(rdd2.collect).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.map(func1) rdd3 = rdd3.map(func2) rdd3 = rdd3.map(func3) result = numbers.map(&func1).map(&func2).map(&func3) expect(rdd3.collect).to eql(result) rdd4 = rdd(workers) rdd4 = rdd4.map(func3) rdd4 = rdd4.map(func2) rdd4 = rdd4.map(func1) expect(rdd4.collect).to eql(rdd3.collect) end end RSpec.shared_examples 'a mapping values' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map_values(func1) result = hash.map{|key, value| [key, func1.call(value)]} expect(rdd2.collect).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.map_values(func1) rdd3 = rdd3.map_values(func2) rdd3 = rdd3.map_values(func3) result = hash.map{|key, value| [key, func1.call(value)]} .map{|key, value| [key, func2.call(value)]} .map{|key, value| [key, func3.call(value)]} expect(rdd3.collect).to eql(result) end end RSpec.describe 'Spark::RDD' do let(:func1) { lambda{|x| x*2} } let(:func2) { lambda{|x| x*3} } let(:func3) { lambda{|x| x*4} } context 'throught parallelize' do context '.map' do let(:numbers) { Generator.numbers } def rdd(workers) $sc.parallelize(numbers, workers) end it_behaves_like 'a mapping', 1 it_behaves_like 'a mapping', 2 # it_behaves_like 'a mapping', nil # it_behaves_like 'a mapping', rand(2..10) end context '.map_values' do let!(:hash) { Generator.hash } def rdd(workers) $sc.parallelize(hash, workers) end 
it_behaves_like 'a mapping values', 1 it_behaves_like 'a mapping values', 2 # it_behaves_like 'a mapping values', nil # it_behaves_like 'a mapping values', rand(2..10) end end context 'throught text_file' do context '.map' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:numbers) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a mapping', 1 it_behaves_like 'a mapping', 2 # it_behaves_like 'a mapping', nil # it_behaves_like 'a mapping', rand(2..10) end end end ================================================ FILE: spec/lib/mllib/classification_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib classification' do let(:data1) do [ LabeledPoint.new(0.0, [1, 0, 0]), LabeledPoint.new(1.0, [0, 1, 1]), LabeledPoint.new(0.0, [2, 0, 0]), LabeledPoint.new(1.0, [0, 2, 1]) ] end let(:values1) do data1.map do |lp| lp.features.values end end let(:rdd1) { $sc.parallelize(data1) } context 'logistic regression' do it 'test' do lrm = LogisticRegressionWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end context 'svm' do it 'test' do lrm = SVMWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end context 'naive bayes' do it 'test' do lrm = NaiveBayes.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end end ================================================ FILE: spec/lib/mllib/clustering_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib clustering' do context 'kmeans' do it 'test' do data = [ DenseVector.new([0, 1.1]), DenseVector.new([0, 1.2]), DenseVector.new([1.1, 0]), DenseVector.new([1.2, 0]) ] model = KMeans.train($sc.parallelize(data), 2, initialization_mode: 'k-means||') expect(model.predict(data[0])).to eq(model.predict(data[1])) expect(model.predict(data[2])).to eq(model.predict(data[3])) end it 'deterministic' do data = Array.new(10) do |i| i *= 10 DenseVector.new([i, i]) end clusters1 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42) clusters2 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42) centers1 = clusters1.centers.to_a centers2 = clusters2.centers.to_a centers1.zip(centers2).each do |c1, c2| expect(c1).to eq(c2) end end end end ================================================ FILE: spec/lib/mllib/matrix_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib::Matrix' do context 'dense' do it 'construct' do values = [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]] matrix = DenseMatrix.new(3, 3, [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]) expect(matrix.shape).to eq([3, 3]) expect(matrix.values).to eq([[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]) end end context 'sparse' do it 'construct' do values = [1.0, 2.0, 4.0, 5.0] column_pointers = [0, 2, 2, 4, 4] row_indices = [1, 2, 1, 2] matrix = SparseMatrix.new(3, 4, column_pointers, row_indices, values) expect(matrix.shape).to eq([3, 4]) expect(matrix.to_a).to eq( [ [0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 4.0, 
0.0], [2.0, 0.0, 5.0, 0.0] ] ) end end end ================================================ FILE: spec/lib/mllib/regression_spec.rb ================================================ require 'spec_helper' # Mllib functions are tested on Spark # This just test if ruby call proper methods RSpec.describe 'Spark::Mllib regression' do let(:data1) do [ LabeledPoint.new(-1.0, [0, -1]), LabeledPoint.new(1.0, [0, 1]), LabeledPoint.new(-1.0, [0, -2]), LabeledPoint.new(1.0, [0, 2]) ] end let(:values1) do data1.map do |lp| lp.features.values end end let(:rdd1) { $sc.parallelize(data1) } context 'labeled point' do let(:lp) { LabeledPoint.new(1, [1,2,3]) } it 'from array' do expect(lp.label).to eql(1.0) expect(lp.features).to be_a(DenseVector) end it 'serialize' do lp2 = Marshal.load(Marshal.dump(lp)) expect(lp2.label).to eql(lp.label) expect(lp2.features.values).to eql(lp.features.values) end end context 'linear regression' do context 'test' do let(:lrm) { LinearRegressionWithSGD.train(rdd1) } it 'test' do expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end it 'test via rdd' do rdd = $sc.parallelize(values1, 1) rdd = rdd.map(lambda{|value| model.predict(value)}) rdd = rdd.bind(model: lrm) result = rdd.collect expect(result[0]).to be <= 0 expect(result[1]).to be > 0 expect(result[2]).to be <= 0 expect(result[3]).to be > 0 end end # Y = 3 + 10*X1 + 10*X2 it 'linear regression' do data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 3.0, ['10.0', '10.0'], 100, 42, 0.1) rdd = $sc.parallelize(data) lrm = LinearRegressionWithSGD.train(rdd, iterations: 1000, intercept: true, step: 1.0) expect(lrm.intercept).to be_between(2.5, 3.5) expect(lrm.weights.size).to eq(2) expect(lrm.weights[0]).to be_between(9.0, 11.0) expect(lrm.weights[1]).to be_between(9.0, 11.0) end end context 'lasso' do it 'test' do lrm = LassoWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end it 'local random SGD with initial weights' do data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 2.0, ['-1.5', '0.01'], 1000, 42, 0.1) data.map! 
do |lp| LabeledPoint.new(lp.label, [1.0] + lp.features.values) end rdd = $sc.parallelize(data); lrm = LassoWithSGD.train(rdd, step: 1.0, reg_param: 0.01, iterations: 40, initial_weights: [-1.0, -1.0, -1.0]) expect(lrm.weights[0]).to be_between(1.9, 2.1) expect(lrm.weights[1]).to be_between(-1.60, -1.40) expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2) end end context 'ridge' do it 'test' do lrm = RidgeRegressionWithSGD.train(rdd1) expect(lrm.predict(values1[0])).to be <= 0 expect(lrm.predict(values1[1])).to be > 0 expect(lrm.predict(values1[2])).to be <= 0 expect(lrm.predict(values1[3])).to be > 0 end end end ================================================ FILE: spec/lib/mllib/vector_spec.rb ================================================ require 'spec_helper' RSpec.describe 'Spark::Mllib::Vector' do context 'parsing' do it 'dense vector' do dv = DenseVector.new([1.0, 2.0, 3.0, 4.0, 5.0]) dv2 = DenseVector.parse(dv.to_s) dv3 = Vectors.parse(dv.to_s) expect(dv.to_s).to eq("[1.0,2.0,3.0,4.0,5.0]") expect(dv2.values).to eq(dv.values) expect(dv3.values).to eq(dv.values) end it 'sparse vector' do sv = SparseVector.new(5, {1 => 3, 4 => 5}) sv2 = SparseVector.parse(sv.to_s) sv3 = Vectors.parse(sv.to_s) expect(sv.to_s).to eq("(5,[1,4],[3,5])") expect(sv2.size).to eq(sv.size) expect(sv2.indices).to eq(sv.indices) expect(sv2.values).to eq(sv.values) expect(sv3.size).to eq(sv.size) expect(sv3.indices).to eq(sv.indices) expect(sv3.values).to eq(sv.values) end end it 'dot' do sv = SparseVector.new(4, {1 => 1, 3 => 2}) dv = DenseVector.new([1.0, 2.0, 3.0, 4.0]) lst = DenseVector.new([1, 2, 3, 4]) expect(sv.dot(dv)).to eq(10.0) expect(dv.dot(dv)).to eq(30.0) expect(lst.dot(dv)).to eq(30.0) end it 'squared distance' do sv = SparseVector.new(4, {1 => 1, 3 => 2}) dv = DenseVector.new([1.0, 2.0, 3.0, 4.0]) lst = DenseVector.new([4, 3, 2, 1]) expect(sv.squared_distance(dv)).to eq(15) expect(sv.squared_distance(lst)).to eq(25) expect(dv.squared_distance(lst)).to eq(20) expect(dv.squared_distance(sv)).to eq(15) expect(lst.squared_distance(sv)).to eq(25) expect(lst.squared_distance(dv)).to eq(20) expect(sv.squared_distance(sv)).to eq(0) expect(dv.squared_distance(dv)).to eq(0) expect(lst.squared_distance(lst)).to eq(0) end it 'sparse vector indexing' do sv1 = SparseVector.new(4, {1 => 1, 3 => 2}) sv2 = SparseVector.new(4, [1, 3], [1, 2]) expect(sv1[0]).to eq(0) expect(sv1[3]).to eq(2) expect(sv1[1]).to eq(1) expect(sv1[2]).to eq(0) expect(sv1[-1]).to eq(2) expect(sv1[-2]).to eq(0) expect(sv1[-4]).to eq(0) expect(sv2[0]).to eq(0) expect(sv2[3]).to eq(2) expect(sv2[1]).to eq(1) expect(sv2[2]).to eq(0) expect(sv2[-1]).to eq(2) expect(sv2[-2]).to eq(0) expect(sv2[-4]).to eq(0) end end ================================================ FILE: spec/lib/reduce_by_key_spec.rb ================================================ require 'spec_helper' def flat_map(line) line.split end def map(item) [item, 1] end def reduce(x,y) x+y end RSpec.shared_examples 'a words counting' do |workers| context "with #{workers || 'default'} worker" do let(:result) do keyyed = lines.flat_map{|x| x.split}.map{|x| [x,1]} result = keyyed.reduce({}){|memo, item| key = item[0] value = item[1] memo[key] ||= 0 memo[key] += value memo } result end it 'when lambda' do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(lambda{|line| line.split}) rdd2 = rdd2.map(lambda{|word| [word, 1]}) rdd2 = rdd2.reduce_by_key(lambda{|x,y| x+y}) expect(rdd2.collect_as_hash).to eql(result) end it 'when method' do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(method(:flat_map)) 
rdd2 = rdd2.map(method(:map)) rdd2 = rdd2.reduce_by_key(method(:reduce)) expect(rdd2.collect_as_hash).to eql(result) end it 'keys, values' do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(method(:flat_map)) rdd2 = rdd2.map(method(:map)) rdd2 = rdd2.reduce_by_key(method(:reduce)) expect(rdd2.keys.collect.sort).to eql(result.keys.sort) expect { rdd2.values.collect.reduce(:+) }.to_not raise_error end end end RSpec.describe 'Spark::RDD' do context '.reduce_by_key' do context 'through parallelize' do let(:lines) { Generator.lines } def rdd(workers) $sc.parallelize(lines, workers) end it_behaves_like 'a words counting', 2 # it_behaves_like 'a words counting', nil # it_behaves_like 'a words counting', rand(2..10) end context 'through text_file' do let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:lines) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a words counting', 2 # it_behaves_like 'a words counting', nil # it_behaves_like 'a words counting', rand(2..10) end end context '.fold_by_key' do let(:numbers) { Generator.numbers } let(:zero_value) { 0 } let(:rdd) { $sc.parallelize(numbers) } let(:map) { lambda{|x| [x, 1]} } let(:add) { lambda{|x,y| x+y} } let(:result) do _result = {} numbers.map(&map).each do |key, value| _result[key] ||= zero_value _result[key] = add.call(_result[key], value) end _result end def fold_by_key(num_partitions=nil) rdd.map(map).fold_by_key(zero_value, add, num_partitions).collect_as_hash end it 'default num_partitions' do expect(fold_by_key).to eq(result) end it 'custom num_partitions' do expect( fold_by_key rand(1..10) ).to eq(result) end end end ================================================ FILE: spec/lib/reduce_spec.rb ================================================ require 'spec_helper' def longest_words(memo, word) memo.length > word.length ? 
memo : word end RSpec.shared_examples 'a reducing' do |workers| context "with #{workers || 'default'} worker" do it '.reduce' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) rdd2 = rdd2.reduce(func1) result = numbers.map(&:to_i).reduce(&func1) expect(rdd2).to eql(result) rdd3 = rdd_numbers(workers) rdd3 = rdd3.map(to_i) rdd3 = rdd3.reduce(func2) result = numbers.map(&:to_i).reduce(&func2) expect(rdd3).to eql(result) rdd4 = rdd_lines(workers) rdd4 = rdd4.flat_map(split) rdd4 = rdd4.reduce(method(:longest_words)) result = lines.flat_map(&split).reduce(&lambda(&method(:longest_words))) expect(rdd4).to eql(result) end it '.fold' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) rdd_result = rdd2.fold(1, func1) # all workers add 1 + last reducing phase result = numbers.map(&:to_i).reduce(&func1) + rdd2.partitions_size + 1 expect(rdd_result).to eql(result) end it '.aggregate' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) # Sum of items + their count seq = lambda{|x,y| [x[0] + y, x[1] + 1]} com = lambda{|x,y| [x[0] + y[0], x[1] + y[1]]} rdd_result = rdd2.aggregate([0,0], seq, com) result = [numbers.reduce(:+), numbers.size] expect(rdd_result).to eql(result) end it '.max' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.max).to eql(numbers.map(&:to_i).max) end it '.min' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.min).to eql(numbers.map(&:to_i).min) end it '.sum' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.sum).to eql(numbers.map(&:to_i).reduce(:+)) end it '.count' do rdd2 = rdd_numbers(workers) rdd2 = rdd2.map(to_i) expect(rdd2.count).to eql(numbers.size) end end end RSpec.describe 'Spark::RDD' do let(:func1) { lambda{|sum, x| sum+x} } let(:func2) { lambda{|product, x| product*x} } let(:to_i) { lambda{|item| item.to_i} } let(:split) { lambda{|item| item.split} } context 'throught parallelize' do let(:numbers) { Generator.numbers } let(:lines) { Generator.lines } def rdd_numbers(workers) $sc.parallelize(numbers, workers) end def rdd_lines(workers) $sc.parallelize(lines, workers) end it_behaves_like 'a reducing', 1 it_behaves_like 'a reducing', 2 # it_behaves_like 'a reducing', nil # it_behaves_like 'a reducing', rand(2..10) end context 'throught text_file' do let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') } let(:file_lines) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:numbers) { File.readlines(file).map(&:strip).map(&:to_i) } let(:lines) { File.readlines(file_lines).map(&:strip) } def rdd_numbers(workers) $sc.text_file(file, workers) end def rdd_lines(workers) $sc.text_file(file_lines, workers) end it_behaves_like 'a reducing', 1 it_behaves_like 'a reducing', 2 # it_behaves_like 'a reducing', nil # it_behaves_like 'a reducing', rand(2..10) end end ================================================ FILE: spec/lib/sample_spec.rb ================================================ require 'spec_helper' # Sample method can not be tested because of random generator # Just test it for raising error RSpec.shared_examples 'a sampler' do |workers| context "with #{workers || 'default'} worker" do context '.sample' do it 'with replacement' do rdd2 = rdd(workers).sample(true, rand) expect { rdd2.collect }.to_not raise_error end it 'without replacement' do rdd2 = rdd(workers).sample(false, rand) expect { rdd2.collect }.to_not raise_error end end context '.take_sample' do it 'with replacement' do size = rand(10..999) expect(rdd(workers).take_sample(true, size).size).to eql(size) end it 'without replacement' do size = 
rand(10..999) expect(rdd(workers).take_sample(false, size).size).to eql(size) end end end end RSpec.describe 'Spark::RDD' do let(:numbers) { Generator.numbers(1000) } def rdd(workers) $sc.parallelize(numbers, workers) end it_behaves_like 'a sampler', 1 it_behaves_like 'a sampler', 2 # it_behaves_like 'a sampler', nil # it_behaves_like 'a sampler', rand(2..10) end ================================================ FILE: spec/lib/serializer_spec.rb ================================================ require 'spec_helper' require 'zlib' RSpec.describe Spark::Serializer do let(:data) { [1, 'test', 2.0, [3], {key: 'value'}, :test, String] } it 'find' do expect(described_class.find('not_existed_class')).to eql(nil) expect(described_class.find('Marshal')).to eq(described_class::Marshal) expect(described_class.find('marshal')).to eq(described_class::Marshal) expect(described_class.find(:marshal)).to eq(described_class::Marshal) expect(described_class.find('batched')).to eq(described_class::Batched) end it 'find!' do expect { expect(described_class.find!('not_existed_class')) }.to raise_error(Spark::SerializeError) expect { expect(described_class.find!('marshal')) }.to_not raise_error expect { expect(described_class.find!('batched')) }.to_not raise_error end it 'register' do NewSerializer = Class.new expect(described_class.find('new_serializer_1')).to eql(nil) expect(described_class.find('new_serializer_2')).to eql(nil) expect(described_class.find('new_serializer_3')).to eql(nil) described_class.register('new_serializer_1', 'new_serializer_2', 'new_serializer_3', NewSerializer) expect(described_class.find('new_serializer_1')).to eql(NewSerializer) expect(described_class.find('new_serializer_2')).to eql(NewSerializer) expect(described_class.find('new_serializer_3')).to eql(NewSerializer) end it '==' do # One class marshal1 = described_class::Marshal.new marshal2 = described_class::Marshal.new expect(marshal1).to eq(marshal1) expect(marshal1).to eq(marshal2) # Two classes compressed1 = described_class::Compressed.new(marshal1) compressed2 = described_class::Compressed.new(marshal2) expect(compressed1).to eq(compressed1) expect(compressed1).to eq(compressed2) # Three classes batched1 = described_class::Batched.new(compressed1, 1) batched2 = described_class::Batched.new(compressed2, 1) batched3 = described_class::Batched.new(compressed1, 2) expect(batched1).to eq(batched2) expect(batched1).to_not eq(batched3) end context 'build' do let(:marshal1) { described_class::Marshal.new } let(:compressed1) { described_class::Compressed.new(marshal1) } let(:batched1) { described_class::Batched.new(compressed1, 1) } it 'block' do expect(described_class.build{ marshal }).to eq(marshal1) expect(described_class.build{ marshal }).to eq(described_class.build{ __marshal__ }) expect(described_class.build{ compressed(marshal) }).to eq(compressed1) expect(described_class.build{ batched(compressed(marshal), 1) }).to eq(batched1) end it 'text' do expect(described_class.build('marshal')).to eq(marshal1) expect(described_class.build('compressed(marshal)')).to eq(compressed1) expect(described_class.build('batched(compressed(marshal), 1)')).to eq(batched1) end end it 'serialization' do marshal1 = described_class.build{ marshal } compressed1 = described_class.build{ compressed(marshal) } expect(marshal1.dump(data)).to eq(Marshal.dump(data)) expect(compressed1.dump(data)).to eq( Zlib::Deflate.deflate(Marshal.dump(data)) ) end context 'Auto batched' do let(:klass) { Spark::Serializer::AutoBatched } let(:marshal) { 
Spark::Serializer::Marshal.new } let(:numbers) { Generator.numbers } it 'initialize' do expect { klass.new }.to raise_error(ArgumentError) expect { klass.new(marshal) }.to_not raise_error expect { klass.new(marshal, 1) }.to raise_error(Spark::SerializeError) end it 'serialization' do serializer1 = klass.new(marshal) serializer2 = klass.new(marshal, 2) rdd1 = Spark.sc.parallelize(numbers, 2, serializer1) rdd2 = Spark.sc.parallelize(numbers, 2, serializer2).map(:to_i) result = rdd1.collect expect(rdd1.serializer).to eq(serializer1) expect(result).to eq(numbers) expect(result).to eq(rdd2.collect) end end end ================================================ FILE: spec/lib/sort_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a sorting' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers) rdd2 = rdd2.flat_map(split) result = lines.flat_map(&split) # Sort by self rdd3 = rdd2.map(map).sort_by_key result2 = result.map(&map).sort_by{|(key, _)| key} expect(rdd3.collect).to eql(result2) # Sort by len rdd3 = rdd2.map(len_map).sort_by_key result2 = result.map(&len_map).sort_by{|(key, _)| key} expect(rdd3.collect).to eql(result2) end end RSpec.describe 'Spark::RDD' do let(:split) { lambda{|x| x.split} } let(:map) { lambda{|x| [x.to_s, 1]} } let(:len_map) { lambda{|x| [x.size, x]} } context 'throught parallelize' do context '.map' do let(:lines) { Generator.lines } def rdd(workers) $sc.parallelize(lines, workers) end it_behaves_like 'a sorting', 1 it_behaves_like 'a sorting', 2 # it_behaves_like 'a sorting', nil # it_behaves_like 'a sorting', rand(2..10) end end context 'throught text_file' do context '.map' do let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') } let(:lines) { File.readlines(file).map(&:strip) } def rdd(workers) $sc.text_file(file, workers) end it_behaves_like 'a sorting', 1 it_behaves_like 'a sorting', 2 # it_behaves_like 'a sorting', nil # it_behaves_like 'a sorting', rand(2..10) end end end ================================================ FILE: spec/lib/sql/column_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'binary comparison' do |op| it "#{op}" do to_test = 20 result = df.select('age').where( df.age.__send__(op, to_test) ).values.flatten result.each do |item| if op == '!=' expect(item).to_not eq(to_test) else expect(item).to be.__send__(op, to_test) end end end end RSpec.describe Spark::SQL::Column do let(:file) { File.join('spec', 'inputs', 'people.json') } let(:df) { $sql.read.json(file) } let(:data) do # Data are line delimited result = [] File.readlines(file).each do |line| result << JSON.parse(line) end result end context 'operators' do it 'func' do result = df.select( df.id, df.active, ~df.id, !df.active ).collect_as_hash.map(&:values) result.each do |item| expect(item[0]).to eq(-item[2]) expect(item[1]).to eq(!item[3]) end end context 'binary' do it 'arithmetic' do result = df.select( df.id, df.id+1, df.id-1, df.id*2, df.id/2, df.id%2 ).collect_as_hash.map(&:values) result.each do |item| expect(item[1]).to eq(item[0]+1) expect(item[2]).to eq(item[0]-1) expect(item[3]).to eq(item[0]*2) expect(item[4]).to eq(item[0]/2.0) expect(item[5]).to eq(item[0]%2) end end # comparison it_behaves_like 'binary comparison', '==' it_behaves_like 'binary comparison', '!=' it_behaves_like 'binary comparison', '<' it_behaves_like 'binary comparison', '<=' it_behaves_like 'binary comparison', '>' it_behaves_like 'binary comparison', '>=' it 'logical' do 
result = df.select('id').where( (df.id >= 20) & (df.id <= 30) ).values.flatten expect(result).to all( be_between(20, 30) ) result = df.select('id').where( (df.id == 1) | (df.id == 2) ).values.flatten expect(result).to eq([1, 2]) end it 'like' do result = df.select('email').where( df.email.like('%com%') ).values.flatten expect(result).to all( include('com') ) end it 'null' do result1 = df.select('address').where( df.address.is_null ).values.flatten result2 = df.select('address').where( df.address.is_not_null ).values.flatten expect(result1).to all( be_nil ) expect(result2).to all( be_an(String) ) end end end it 'substr' do result = df.select( df.name.substr(1, 3) ).values.flatten result.each do |item| expect(item.size).to eq(3) end end it 'isin' do result = df.select('age').where( df.age.isin(20, 21, 22) ).values.flatten expect(result).to all( eq(20).or eq(21).or eq(22) ) end it 'alias' do result = df.select( df.id.as('id2') ).collect_as_hash.map(&:keys).flatten expect(result).to all( eq('id2') ) end it 'cast' do result = df.select( df.id, df.id.cast('string').alias('age2') ).values result.each do |item| expect(item[0]).to be_an(Integer) expect(item[0].to_s).to eq(item[1]) end end it 'when, otherwise' do result = df.select(df.id, Spark::SQL::Column.when(df.id <= 20, 1).when(df.id >= 30, 3).otherwise(2)).values result.each do |item| id = item[0] value = item[1] if id <= 20 expect(value).to eq(1) elsif id >= 30 expect(value).to eq(3) else expect(value).to eq(2) end end end end ================================================ FILE: spec/lib/sql/data_frame_spec.rb ================================================ require 'spec_helper' RSpec.describe Spark::SQL::DataFrame do let(:file) { File.join('spec', 'inputs', 'people.json') } let(:df) { $sql.read.json(file) } context '[]' do it 'String' do value = df['age'] expect(value).to be_a(Spark::SQL::Column) expect(value.to_s).to eq('Column("age")') end it 'Array' do value = df[ ['name', 'age'] ] expect(value).to be_a(Spark::SQL::DataFrame) expect(value.columns).to eq(['name', 'age']) end it 'Numeric' do value = df[0] expect(value).to be_a(Spark::SQL::Column) expect(value.to_s).to eq('Column("active")') end it 'Column' do value = df[ df[0] == true ] expect(value).to be_a(Spark::SQL::DataFrame) end end it 'columns' do expect(df.columns).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name']) end it 'schema' do schema = df.schema expect(schema).to be_a(Spark::SQL::StructType) expect(schema.type_name).to eq('struct') end it 'show_string' do expect(df.show_string).to start_with('+--') end it 'dtypes' do expect(df.dtypes).to eq([['active', 'boolean'], ['address', 'string'], ['age', 'long'], ['email', 'string'], ['id', 'long'], ['ip_address', 'string'], ['name', 'string']]) end it 'take' do expect(df.take(10).size).to eq(10) end it 'count' do expect(df.count).to eq(100) end context 'select' do it '*' do row = df.select('*').first expect(row.data.keys).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name']) end it 'with string' do row = df.select('name', 'age').first expect(row.data.keys).to eq(['name', 'age']) end it 'with column' do row = df.select(df.name, df.age).first expect(row.data.keys).to eq(['name', 'age']) end end context 'where' do it 'with string' do eq_20 = df.filter('age = 20').collect expect(eq_20.map{|c| c['age']}).to all(be == 20) end it 'with column' do nil_values = df.where(df.age.is_null).collect greater_or_eq_20 = df.where(df.age >= 20).collect lesser_than_20 = df.where(df.age < 20).collect 
expect(nil_values.size + greater_or_eq_20.size + lesser_than_20.size).to eq(df.count) expect(nil_values.map{|c| c['age']}).to all(be_nil) expect(greater_or_eq_20.map{|c| c['age']}).to all(be >= 20) expect(lesser_than_20.map{|c| c['age']}).to all(be < 20) end end end ================================================ FILE: spec/lib/statistic_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a stats' do |workers| let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] } context "with #{workers || 'default'} worker" do it 'stats class' do stats = $sc.parallelize(numbers, workers).stats expect(stats.sum).to be_within(0.1).of(20) expect(stats.mean).to be_within(0.1).of(20/6.0) expect(stats.max).to be_within(0.1).of(8.0) expect(stats.min).to be_within(0.1).of(1.0) expect(stats.variance).to be_within(0.1).of(6.22222) expect(stats.sample_variance).to be_within(0.1).of(7.46667) expect(stats.stdev).to be_within(0.1).of(2.49444) expect(stats.sample_stdev).to be_within(0.1).of(2.73252) end it 'rdd methods' do rdd = $sc.parallelize([1, 2, 3], workers) expect(rdd.mean).to be_within(0.1).of(2.0) expect(rdd.variance).to be_within(0.1).of(0.666) expect(rdd.stdev).to be_within(0.1).of(0.816) expect(rdd.sample_stdev).to be_within(0.1).of(1.0) expect(rdd.sample_variance).to be_within(0.1).of(1.0) end end end RSpec.shared_examples 'a histogram' do |workers| context "with #{workers || 'default'} worker" do it 'empty' do rdd = $sc.parallelize([], workers, ser) expect( rdd.histogram([0, 10])[1] ).to eq([0]) expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0]) end it 'validation' do rdd = $sc.parallelize([], workers, ser) expect { rdd.histogram(0) }.to raise_error(ArgumentError) end it 'double' do rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser) buckets, counts = rdd.histogram(2) expect(buckets).to eq([1.0, 2.5, 4.0]) expect(counts).to eq([2, 2]) end it 'out of range' do rdd = $sc.parallelize([10.01, -0.01], workers, ser) expect( rdd.histogram([0, 10])[1] ).to eq([0]) expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0]) end it 'in range with one bucket' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram([0, 10])[1] ).to eq([4]) expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1]) end it 'in range with one bucket exact match' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram([1, 4])[1] ).to eq([4]) end it 'out of range with two buckets' do rdd = $sc.parallelize([10.01, -0.01], workers, ser) expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0]) end it 'out of range with two uneven buckets' do rdd = $sc.parallelize([10.01, -0.01], workers, ser) expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0]) end it 'in range with two buckets' do rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser) expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2]) end it 'in range with two bucket and nil' do rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser) expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2]) end it 'in range with two uneven buckets' do rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser) expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2]) end it 'mixed range with two uneven buckets' do rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser) expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3]) end it 'mixed range with four uneven buckets' do rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser) expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 
200.0])[1] ).to eq([4, 2, 1, 3]) end it 'mixed range with uneven buckets and NaN' do rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser) expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3]) end it 'out of range with infinite buckets' do rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser) expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1]) end it 'without buckets' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram(1) ).to eq([[1, 4], [4]]) end it 'without buckets single element' do rdd = $sc.parallelize([1], workers, ser) expect( rdd.histogram(1) ).to eq([[1, 1], [1]]) end it 'without bucket no range' do rdd = $sc.parallelize([1, 1, 1, 1], workers, ser) expect( rdd.histogram(1) ).to eq([[1, 1], [4]]) end it 'without buckets basic two' do rdd = $sc.parallelize([1, 2, 3, 4], workers, ser) expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]]) end it 'without buckets with more requested than elements' do rdd = $sc.parallelize([1, 2], workers, ser) buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0] hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1] expect( rdd.histogram(10) ).to eq([buckets, hist]) end it 'string' do rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser) expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2]) expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]]) expect { rdd.histogram(2) }.to raise_error(Spark::RDDError) end end end RSpec.describe Spark::RDD do let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } } context '.stats' do it_behaves_like 'a stats', 1 it_behaves_like 'a stats', 2 # it_behaves_like 'a stats', rand(2..5) end context '.histogram' do it_behaves_like 'a histogram', 1 it_behaves_like 'a histogram', 2 # it_behaves_like 'a histogram', rand(2..5) end end ================================================ FILE: spec/lib/whole_text_files_spec.rb ================================================ require 'spec_helper' RSpec.shared_examples 'a whole_text_files' do |workers| it "with #{workers || 'default'} worker" do rdd2 = rdd(workers).map(get_numbers) result = files.size expect(rdd2.collect.size).to eql(result) rdd3 = rdd(workers) rdd3 = rdd3.flat_map(get_numbers) result = 0 files.each{|f| result += File.read(f).split.map(&:to_i).reduce(:+)} expect(rdd3.sum).to eql(result) end end RSpec.describe 'Spark::Context' do let(:get_numbers) { lambda{|file, content| content.split.map(&:to_i)} } let(:dir) { File.join('spec', 'inputs', 'numbers') } let(:files) { Dir.glob(File.join(dir, '*')) } def rdd(workers) $sc.whole_text_files(dir, workers) end it_behaves_like 'a whole_text_files', 1 it_behaves_like 'a whole_text_files', 2 # it_behaves_like 'a whole_text_files', nil # it_behaves_like 'a whole_text_files', rand(2..10) end ================================================ FILE: spec/spec_helper.rb ================================================ require 'simplecov' SimpleCov.start $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib' require 'ruby-spark' require 'generator' # Loading Spark.load_lib Spark.jb.import_all_test Spark::Mllib.import # Keep it on method because its called from config test def spark_start Spark.logger.disable Spark.config do set 'spark.ruby.serializer.batch_size', 100 end $sc = Spark.start $sql = Spark.start_sql end def windows? 
RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ end RSpec.configure do |config| config.default_formatter = 'doc' config.color = true config.tty = true config.before(:suite) do spark_start end config.after(:suite) do Spark.stop end end