Repository: ondra-m/ruby-spark
Branch: master
Commit: d1b9787642fe
Files: 191
Total size: 440.0 KB
Directory structure:
gitextract_h83fh3m2/
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── Gemfile
├── Guardfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── TODO.md
├── benchmark/
│ ├── aggregate.rb
│ ├── bisect.rb
│ ├── comparison/
│ │ ├── prepare.sh
│ │ ├── python.py
│ │ ├── r.r
│ │ ├── ruby.rb
│ │ ├── run-all.sh
│ │ └── scala.scala
│ ├── custom_marshal.rb
│ ├── digest.rb
│ ├── enumerator.rb
│ ├── serializer.rb
│ ├── sort.rb
│ ├── sort2.rb
│ └── take.rb
├── bin/
│ └── ruby-spark
├── example/
│ ├── pi.rb
│ └── website_search.rb
├── ext/
│ ├── ruby_c/
│ │ ├── extconf.rb
│ │ ├── murmur.c
│ │ ├── murmur.h
│ │ └── ruby-spark.c
│ ├── ruby_java/
│ │ ├── Digest.java
│ │ ├── Murmur2.java
│ │ ├── RubySparkExtService.java
│ │ └── extconf.rb
│ └── spark/
│ ├── build.sbt
│ ├── project/
│ │ └── plugins.sbt
│ ├── sbt/
│ │ └── sbt
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ ├── Exec.scala
│ │ ├── MLLibAPI.scala
│ │ ├── Marshal.scala
│ │ ├── MarshalDump.scala
│ │ ├── MarshalLoad.scala
│ │ ├── RubyAccumulatorParam.scala
│ │ ├── RubyBroadcast.scala
│ │ ├── RubyConstant.scala
│ │ ├── RubyMLLibAPI.scala
│ │ ├── RubyMLLibUtilAPI.scala
│ │ ├── RubyPage.scala
│ │ ├── RubyRDD.scala
│ │ ├── RubySerializer.scala
│ │ ├── RubyTab.scala
│ │ ├── RubyUtils.scala
│ │ └── RubyWorker.scala
│ └── test/
│ └── scala/
│ └── MarshalSpec.scala
├── lib/
│ ├── ruby-spark.rb
│ ├── spark/
│ │ ├── accumulator.rb
│ │ ├── broadcast.rb
│ │ ├── build.rb
│ │ ├── cli.rb
│ │ ├── command/
│ │ │ ├── base.rb
│ │ │ ├── basic.rb
│ │ │ ├── pair.rb
│ │ │ ├── sort.rb
│ │ │ └── statistic.rb
│ │ ├── command.rb
│ │ ├── command_builder.rb
│ │ ├── command_validator.rb
│ │ ├── config.rb
│ │ ├── constant.rb
│ │ ├── context.rb
│ │ ├── error.rb
│ │ ├── ext/
│ │ │ ├── hash.rb
│ │ │ ├── integer.rb
│ │ │ ├── io.rb
│ │ │ ├── ip_socket.rb
│ │ │ ├── module.rb
│ │ │ ├── object.rb
│ │ │ └── string.rb
│ │ ├── helper/
│ │ │ ├── logger.rb
│ │ │ ├── parser.rb
│ │ │ ├── serialize.rb
│ │ │ ├── statistic.rb
│ │ │ └── system.rb
│ │ ├── helper.rb
│ │ ├── java_bridge/
│ │ │ ├── base.rb
│ │ │ ├── jruby.rb
│ │ │ └── rjb.rb
│ │ ├── java_bridge.rb
│ │ ├── library.rb
│ │ ├── logger.rb
│ │ ├── mllib/
│ │ │ ├── classification/
│ │ │ │ ├── common.rb
│ │ │ │ ├── logistic_regression.rb
│ │ │ │ ├── naive_bayes.rb
│ │ │ │ └── svm.rb
│ │ │ ├── clustering/
│ │ │ │ ├── gaussian_mixture.rb
│ │ │ │ └── kmeans.rb
│ │ │ ├── matrix.rb
│ │ │ ├── regression/
│ │ │ │ ├── common.rb
│ │ │ │ ├── labeled_point.rb
│ │ │ │ ├── lasso.rb
│ │ │ │ ├── linear.rb
│ │ │ │ └── ridge.rb
│ │ │ ├── ruby_matrix/
│ │ │ │ ├── matrix_adapter.rb
│ │ │ │ └── vector_adapter.rb
│ │ │ ├── stat/
│ │ │ │ └── distribution.rb
│ │ │ └── vector.rb
│ │ ├── mllib.rb
│ │ ├── rdd.rb
│ │ ├── sampler.rb
│ │ ├── serializer/
│ │ │ ├── auto_batched.rb
│ │ │ ├── base.rb
│ │ │ ├── batched.rb
│ │ │ ├── cartesian.rb
│ │ │ ├── compressed.rb
│ │ │ ├── marshal.rb
│ │ │ ├── message_pack.rb
│ │ │ ├── oj.rb
│ │ │ ├── pair.rb
│ │ │ └── text.rb
│ │ ├── serializer.rb
│ │ ├── sort.rb
│ │ ├── sql/
│ │ │ ├── column.rb
│ │ │ ├── context.rb
│ │ │ ├── data_frame.rb
│ │ │ ├── data_frame_reader.rb
│ │ │ ├── data_type.rb
│ │ │ └── row.rb
│ │ ├── sql.rb
│ │ ├── stat_counter.rb
│ │ ├── storage_level.rb
│ │ ├── version.rb
│ │ └── worker/
│ │ ├── master.rb
│ │ ├── spark_files.rb
│ │ └── worker.rb
│ └── spark.rb
├── ruby-spark.gemspec
└── spec/
├── generator.rb
├── inputs/
│ ├── lorem_300.txt
│ ├── numbers/
│ │ ├── 1.txt
│ │ ├── 10.txt
│ │ ├── 11.txt
│ │ ├── 12.txt
│ │ ├── 13.txt
│ │ ├── 14.txt
│ │ ├── 15.txt
│ │ ├── 16.txt
│ │ ├── 17.txt
│ │ ├── 18.txt
│ │ ├── 19.txt
│ │ ├── 2.txt
│ │ ├── 20.txt
│ │ ├── 3.txt
│ │ ├── 4.txt
│ │ ├── 5.txt
│ │ ├── 6.txt
│ │ ├── 7.txt
│ │ ├── 8.txt
│ │ └── 9.txt
│ ├── numbers_0_100.txt
│ ├── numbers_1_100.txt
│ └── people.json
├── lib/
│ ├── collect_spec.rb
│ ├── command_spec.rb
│ ├── config_spec.rb
│ ├── context_spec.rb
│ ├── ext_spec.rb
│ ├── external_apps_spec.rb
│ ├── filter_spec.rb
│ ├── flat_map_spec.rb
│ ├── group_spec.rb
│ ├── helper_spec.rb
│ ├── key_spec.rb
│ ├── manipulation_spec.rb
│ ├── map_partitions_spec.rb
│ ├── map_spec.rb
│ ├── mllib/
│ │ ├── classification_spec.rb
│ │ ├── clustering_spec.rb
│ │ ├── matrix_spec.rb
│ │ ├── regression_spec.rb
│ │ └── vector_spec.rb
│ ├── reduce_by_key_spec.rb
│ ├── reduce_spec.rb
│ ├── sample_spec.rb
│ ├── serializer_spec.rb
│ ├── sort_spec.rb
│ ├── sql/
│ │ ├── column_spec.rb
│ │ └── data_frame_spec.rb
│ ├── statistic_spec.rb
│ └── whole_text_files_spec.rb
└── spec_helper.rb
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
/.gemtags
/.tags
/java/spark.jar
.jbundler
target/*
*.class
*.jar
pom.xml
vendor/*
*.gem
*.rbc
.bundle
.config
.yardoc
Gemfile.lock
InstalledFiles
_yardoc
coverage
doc/
lib/bundler/man
pkg
rdoc
spec/reports
test/tmp
test/version_tmp
tmp
*.bundle
*.so
*.o
*.a
mkmf.log
ext/spark/target/*
ext/spark/project/target/*
ext/spark/project/project/target/*
wiki
/benchmark/performance/spark/*
/benchmark/performance/rspark/*
/_*
================================================
FILE: .travis.yml
================================================
language: ruby
rvm:
- 2.2.0
before_script:
- bundle exec rake compile
- bundle exec ruby bin/ruby-spark build
cache:
bundler: true
directories:
- $HOME/.m2
- $HOME/.ivy2
- $HOME/.sbt
================================================
FILE: CHANGELOG.md
================================================
## Unreleased
## 1.3.0
- new method on RDD (lookup)
- fix sbt url
- Spark 1.5.0
## 1.2.0 (15.06.2015)
- target folder is now located at HOME
- better serializers
- error when java class does not exist
- default setting at ~/.ruby-spark.conf
- compatible with Spark 1.4.0
- added call site to RDD
================================================
FILE: Gemfile
================================================
source 'https://rubygems.org'
gemspec
gem 'sourcify', '0.6.0.rc4'
gem 'method_source'
gem 'commander'
gem 'pry'
gem 'nio4r'
gem 'distribution'
platform :mri do
gem 'rjb'
gem 'msgpack'
gem 'oj'
gem 'narray'
end
platform :jruby do
gem 'msgpack-jruby', require: 'msgpack'
# NameError: no constructorfor arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime
# gem 'mdarray'
end
group :stats do
# gem 'nmatrix'
# gem 'statsample'
# gem 'statsample-glm'
# gem 'statsample-timeseries'
# gem 'statistics2'
# gem 'statsample-optimization' # libgsl0-dev
# gem 'narray'
# gem 'gsl-nmatrix'
end
group :development do
gem 'benchmark-ips'
gem 'rspec'
gem 'rake-compiler'
gem 'guard'
gem 'guard-rspec'
gem 'listen'
end
group :test do
gem 'simplecov', require: false
end
================================================
FILE: Guardfile
================================================
guard :rspec, cmd: 'rspec' do
watch(%r{^spec/.+_spec\.rb$})
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
watch('spec/spec_helper.rb') { "spec" }
end
================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2014 Ondřej Moravčík
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
# Ruby-Spark [](https://travis-ci.org/ondra-m/ruby-spark)
Apache Spark™ is a fast and general engine for large-scale data processing.
This gem allows you to use Spark functionality from Ruby.
> Word count in Spark's Ruby API
```ruby
file = spark.text_file("hdfs://...")
file.flat_map(:split)
.map(lambda{|word| [word, 1]})
.reduce_by_key(lambda{|a, b| a+b})
```
- [Apache Spark](http://spark.apache.org)
- [Wiki](https://github.com/ondra-m/ruby-spark/wiki)
- [Rubydoc](http://www.rubydoc.info/gems/ruby-spark)
## Installation
### Requirements
- Java 7+
- Ruby 2+
- wget or curl
- MRI or JRuby
Add this line to your application's Gemfile:
```ruby
gem 'ruby-spark'
```
And then execute:
```
$ bundle
```
Or install it yourself as:
```
$ gem install ruby-spark
```
Run `rake compile` if you are using the gem from a local filesystem.
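A minimal sketch of working from a local checkout (repository URL as above; assuming Bundler is installed):
```
$ git clone https://github.com/ondra-m/ruby-spark.git
$ cd ruby-spark
$ bundle install
$ bundle exec rake compile
```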
### Build Apache Spark
This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information, check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars will be stored in your HOME directory.
```
$ ruby-spark build
```
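The SBT build reads the Scala, Spark and Hadoop versions from environment variables (see [build.sbt](ext/spark/build.sbt)). Assuming `ruby-spark build` invokes SBT in the same shell environment, a specific version could be pinned like this (the values shown are the build's defaults):
```
$ SPARK_VERSION=1.6.0 SCALA_VERSION=2.10.4 ruby-spark build
```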
## Usage
You can use Ruby-Spark via an interactive shell (Pry is used):
```
$ ruby-spark shell
```
Or in an existing project.
If you want to configure Spark first, see [configuration](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
```ruby
require 'ruby-spark'
# Configuration
Spark.config do
set_app_name "RubySpark"
set 'spark.ruby.serializer', 'oj'
set 'spark.ruby.serializer.batch_size', 100
end
# Start Apache Spark
Spark.start
# Context reference
Spark.sc
```
Finally, stop the cluster. In the shell, Spark is stopped automatically when the environment exits.
```ruby
Spark.stop
```
After the first use, a global configuration file is created at **~/.ruby-spark.conf**. Properties for Spark and RubySpark can be specified there.
## Creating RDD (a new collection)
Single text file:
```ruby
rdd = sc.text_file(FILE, workers_num, serializer=nil)
```
All files in a directory:
```ruby
rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
```
Uploading data structures directly from Ruby:
```ruby
rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
rdd = sc.parallelize(1..5, workers_num, serializer=nil)
```
There are two conditions:
1. the chosen serializer must be able to serialize the data
2. the data must be iterable
If you do not specify a serializer, the default one is used (defined by the spark.ruby.serializer.* options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want to create a custom serializer.
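For example, a batched Marshal serializer can be built and passed explicitly (the same `Spark::Serializer.build` call is used in the Examples section below):
```ruby
# Marshal serializer, batches of 1024 items
ser = Spark::Serializer.build('batched(marshal, 1024)')

# 2 workers, explicit serializer
rdd = sc.parallelize(1..1000, 2, ser)
```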
## Operations
All operations can be divided into 2 groups:
- **Transformations:** append a new operation to the current RDD and return a new RDD
- **Actions:** add an operation and start the computation
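A small illustration using methods from the Examples below: the transformation only builds up the pipeline, the action triggers the computation.
```ruby
rdd = sc.parallelize(0..100, 2)

# Transformation: returns a new RDD, nothing is computed yet
evens = rdd.filter(lambda{|x| x.even?})

# Action: starts the computation and returns the result
evens.collect
# => [0, 2, 4, 6, ...]
```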
More information:
- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)
- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)
- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)
You can also check the official Spark documentation. First make sure that the method is implemented here.
- [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)
- [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)
#### Transformations
<dl>
<dt><code>rdd.map(function)</code></dt>
<dd>Return a new RDD by applying a function to all elements of this RDD.</dd>
<dt><code>rdd.flat_map(function)</code></dt>
<dd>Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.</dd>
<dt><code>rdd.map_partitions(function)</code></dt>
<dd>Return a new RDD by applying a function to each partition of this RDD.</dd>
<dt><code>rdd.filter(function)</code></dt>
<dd>Return a new RDD containing only the elements that satisfy a predicate.</dd>
<dt><code>rdd.cartesian(other)</code></dt>
<dd>Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.</dd>
<dt><code>rdd.intersection(other)</code></dt>
<dd>Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.</dd>
<dt><code>rdd.sample(with_replacement, fraction, seed)</code></dt>
<dd>Return a sampled subset of this RDD. Operations are based on Poisson and uniform distributions.</dd>
<dt><code>rdd.group_by_key(num_partitions)</code></dt>
<dd>Group the values for each key in the RDD into a single sequence.</dd>
<dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
<dd></dd>
</dl>
#### Actions
<dl>
<dt><code>rdd.take(count)</code></dt>
<dd>Take the first <code>count</code> elements of the RDD.</dd>
<dt><code>rdd.reduce(function)</code></dt>
<dd>Reduces the elements of this RDD using the specified lambda or method.</dd>
<dt><code>rdd.aggregate(zero_value, seq_op, comb_op)</code></dt>
<dd>Aggregate the elements of each partition, and then the results for all the partitions, using given combine functions and a neutral “zero value”.</dd>
<dt><code>rdd.histogram(buckets)</code></dt>
<dd>Compute a histogram using the provided buckets.</dd>
<dt><code>rdd.collect</code></dt>
<dd>Return an array that contains all of the elements in this RDD.</dd>
<dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
<dd></dd>
</dl>
## Examples
##### Basic methods
```ruby
# Every batch will be serialized by Marshal and will have size 10
ser = Spark::Serializer.build('batched(marshal, 10)')
# Range 0..100, 2 workers, custom serializer
rdd = Spark.sc.parallelize(0..100, 2, ser)
# Take first 5 items
rdd.take(5)
# => [0, 1, 2, 3, 4]
# Numbers reducing
rdd.reduce(lambda{|sum, x| sum+x})
rdd.reduce(:+)
rdd.sum
# => 5050
# Aggregating with a zero value
seq = lambda{|x,y| x+y}
com = lambda{|x,y| x*y}
rdd.aggregate(1, seq, com)
# 1. Every worker adds up its numbers
# => [1226, 3826]
# 2. Results are multiplied
# => 4690676
# Statistic method
rdd.stats
# => StatCounter: (count, mean, max, min, variance,
# sample_variance, stdev, sample_stdev)
# Compute a histogram using the provided buckets.
rdd.histogram(2)
# => [[0.0, 50.0, 100], [50, 51]]
# Mapping
rdd.map(lambda {|x| x*2}).collect
# => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...]
rdd.map(:to_f).collect
# => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...]
# Mapping over whole partitions
rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
# => [1225, 3825]
# Selecting
rdd.filter(lambda{|x| x.even?}).collect
# => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...]
# Sampling
rdd.sample(true, 10).collect
# => [3, 36, 40, 54, 58, 82, 86, 95, 98]
# Sampling X items
rdd.take_sample(true, 10)
# => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32]
# Using external process
rdd.pipe('cat', "awk '{print $1*10}'")
# => ["0", "10", "20", "30", "40", "50", ...]
```
##### Word count using methods
```ruby
# Content:
# "first line"
# "second line"
rdd = sc.text_file(PATH)
# ["first", "line", "second", "line"]
rdd = rdd.flat_map(lambda{|line| line.split})
# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
rdd = rdd.map(lambda{|word| [word, 1]})
# [["first", 1], ["line", 2], ["second", 1]]
rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
# {"first"=>1, "line"=>2, "second"=>1}
rdd.collect_as_hash
```
##### Estimating PI with a custom serializer
```ruby
slices = 3
n = 100000 * slices
def map(_)
x = rand * 2 - 1
y = rand * 2 - 1
if x**2 + y**2 < 1
return 1
else
return 0
end
end
rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
rdd = rdd.map(method(:map))
puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
```
##### Estimating PI
```ruby
rdd = sc.parallelize([10_000], 1)
rdd = rdd.add_library('bigdecimal/math')
rdd = rdd.map(lambda{|x| BigMath.PI(x)})
rdd.collect # => #<BigDecimal, '0.31415926...'>
```
### Mllib (Machine Learning Library)
Mllib functions use Spark's Machine Learning Library (MLlib). Ruby objects are serialized and deserialized in Java, so you cannot use custom classes. Only primitive types such as strings or integers are supported.
All supported methods/models:
- [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib)
- [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib)
##### Linear regression
```ruby
# Import Mllib classes into Object
# Otherwise they are accessible via Spark::Mllib::LinearRegressionWithSGD
Spark::Mllib.import(Object)
# Training data
data = [
LabeledPoint.new(0.0, [0.0]),
LabeledPoint.new(1.0, [1.0]),
LabeledPoint.new(3.0, [2.0]),
LabeledPoint.new(2.0, [3.0])
]
# Train a model
lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
lrm.predict([0.0])
```
##### K-Means
```ruby
Spark::Mllib.import
# Dense vectors
data = [
DenseVector.new([0.0,0.0]),
DenseVector.new([1.0,1.0]),
DenseVector.new([9.0,8.0]),
DenseVector.new([8.0,9.0])
]
model = KMeans.train(sc.parallelize(data), 2)
model.predict([0.0, 0.0]) == model.predict([1.0, 1.0])
# => true
model.predict([8.0, 9.0]) == model.predict([9.0, 8.0])
# => true
```
## Benchmarks
================================================
FILE: Rakefile
================================================
#-*- mode: ruby -*-
require "bundler/gem_tasks"
require "rspec/core/rake_task"
RSpec::Core::RakeTask.new
task default: :spec
task test: :spec
def java?
RUBY_PLATFORM =~ /java/
end
if java?
require "rake/javaextensiontask"
Rake::JavaExtensionTask.new("ruby_java") do |ext|
ext.name = "ruby_spark_ext"
end
else
require "rake/extensiontask"
Rake::ExtensionTask.new("ruby_c") do |ext|
ext.name = "ruby_spark_ext"
end
end
task :clean do
Dir['lib/*.{jar,o,so}'].each do |path|
puts "Deleting #{path} ..."
File.delete(path)
end
FileUtils.rm_rf('./pkg')
FileUtils.rm_rf('./tmp')
end
================================================
FILE: TODO.md
================================================
- refactor JavaBridge
- to_java, from_java
- every type should have class
- automatic registration
- add Streaming
- worker information (time, memory, ...)
- killing zombie workers
- add_rb, add_inline_rb to Spark::{Context, RDD}
- fix broadcast for cluster
- dump to disk if there is memory limit
- Add Partitioner to RDD
- add NonExist serializer
================================================
FILE: benchmark/aggregate.rb
================================================
require 'benchmark'
require 'benchmark/ips'
data = 0..1_000_000
zero_value = rand(100_000)
function = Proc.new{|sum, n| sum+n}
Benchmark.ips do |r|
r.report('each') do
sum = zero_value
data.each do |n|
sum += n
end
end
r.report('reduce') do
data.reduce(zero_value){|sum, n| sum+n}
end
r.report('each with function') do
sum = zero_value
data.each do |n|
sum = function.call(sum, n)
end
end
r.report('reduce with function') do
data.reduce(zero_value, &function)
end
r.compare!
end
================================================
FILE: benchmark/bisect.rb
================================================
require "benchmark"
def bisect_left1(a, x, opts={})
return nil if a.nil?
return 0 if a.empty?
lo = (opts[:lo] || opts[:low]).to_i
hi = opts[:hi] || opts[:high] || a.length
while lo < hi
mid = (lo + hi) / 2
v = a[mid]
if v < x
lo = mid + 1
else
hi = mid
end
end
return lo
end
def bisect_left2(list, item)
count = 0
list.each{|i|
return count if i >= item
count += 1
}
nil
end
def bisect_left3(list, item, lo = 0, hi = list.size)
while lo < hi
i = (lo + hi - 1) >> 1
if 0 <= (list[i] <=> item)
hi = i
else
lo = i + 1
end
end
return hi
end
array = Array.new(1000000) { rand(0..1000000) };
to_find = Array.new(500) { rand(0..10000) };
Benchmark.bm(20) do |x|
x.report("bisect_left1") do
to_find.each do |item|
bisect_left1(array, item)
end
end
x.report("bisect_left2") do
to_find.each do |item|
bisect_left2(array, item)
end
end
x.report("bisect_left3") do
to_find.each do |item|
bisect_left3(array, item)
end
end
end
array = Array.new(100000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join };
to_find = Array.new(500) { (97+rand(26)).chr };
Benchmark.bm(20) do |x|
x.report("bisect_left1") do
to_find.each do |item|
bisect_left1(array, item)
end
end
x.report("bisect_left2") do
to_find.each do |item|
bisect_left2(array, item)
end
end
x.report("bisect_left3") do
to_find.each do |item|
bisect_left3(array, item)
end
end
end
================================================
FILE: benchmark/comparison/prepare.sh
================================================
#!/usr/bin/env bash
# Current dir
cd "$(dirname "$0")"
# Exit immediately if a pipeline returns a non-zero status.
set -e
# Spark
wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz" -O spark.tgz
tar xvzf spark.tgz
mv spark-1.3.0-bin-hadoop2.4 spark
rm spark.tgz
# RSpark (only for 1.3.0)
git clone git@github.com:amplab-extras/SparkR-pkg.git rspark
cd rspark
SPARK_VERSION=1.3.0 ./install-dev.sh
================================================
FILE: benchmark/comparison/python.py
================================================
import os
import math
from time import time
from random import random
from operator import add
from pyspark import SparkContext
sc = SparkContext(appName="Python", master="local[*]")
log_file = open(os.environ.get('PYTHON_LOG'), 'w')
def log(*values):
values = map(lambda x: str(x), values)
log_file.write(';'.join(values))
log_file.write('\n')
workers = int(os.environ.get('WORKERS'))
numbers_count = int(os.environ.get('NUMBERS_COUNT'))
text_file = os.environ.get('TEXT_FILE')
numbers = range(numbers_count)
floats = [float(i) for i in numbers]
with open(text_file) as t:
strings = t.read().split("\n")
# =============================================================================
# Serialization
# =============================================================================
t = time()
rdd_numbers = sc.parallelize(numbers, workers)
t = time() - t
log('NumbersSerialization', t)
t = time()
rdd_floats = sc.parallelize(floats, workers)
t = time() - t
log('FloatsSerialization', t)
t = time()
rdd_strings = sc.parallelize(strings, workers)
t = time() - t
log('StringsSerialization', t)
# =============================================================================
# Computing
# =============================================================================
# --- Is prime? ---------------------------------------------------------------
def is_prime(x):
if x < 2:
return [x, False]
elif x == 2:
return [x, True]
elif x % 2 == 0:
return [x, False]
else:
upper = int(math.sqrt(float(x)))
result = True
i = 3
while i <= upper:
if x % i == 0:
result = False
break
i += 2
return [x, result]
t = time()
rdd_numbers.map(is_prime).collect()
t = time() - t
log('IsPrime', t)
# --- Matrix multiplication ---------------------------------------------------
matrix_size = int(os.environ.get('MATRIX_SIZE'))
matrix = []
for row in range(matrix_size):
matrix.append([])
for col in range(matrix_size):
matrix[row].append(row+col)
def multiplication_func(matrix):
matrix = list(matrix)
size = len(matrix)
new_matrix = []
for row in range(size):
new_matrix.append([])
for col in range(size):
result = 0
for i in range(size):
result += matrix[row][i] * matrix[col][i]
new_matrix[row].append(result)
return new_matrix
t = time()
rdd = sc.parallelize(matrix, 1)
rdd.mapPartitions(multiplication_func).collect()
t = time() - t
log('MatrixMultiplication', t)
# --- Pi digits ---------------------------------------------------------------
# http://rosettacode.org/wiki/Pi#Python
pi_digit = int(os.environ.get('PI_DIGIT'))
def pi_func(size):
size = size.next()
result = ''
q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
while size > 0:
if 4*q+r-t < n*t:
result += str(n)
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r))//t)-10*n
q *= 10
r = nr
else:
nr = (2*q+r)*l
nn = (q*(7*k)+2+(r*l))//(t*l)
q *= k
t *= l
l += 2
k += 1
n = nn
r = nr
return [result]
t = time()
rdd = sc.parallelize([pi_digit], 1)
rdd.mapPartitions(pi_func).collect()
t = time() - t
log('PiDigit', t)
log_file.close()
================================================
FILE: benchmark/comparison/r.r
================================================
library(SparkR)
sc <- sparkR.init(master="local[*]")
logFile <- file(Sys.getenv("R_LOG"), "w")
logInfo <- function(...){
args <- list(...)
line <- paste(args, collapse = ";")
writeLines(line, logFile)
}
workers <- as.integer(Sys.getenv('WORKERS'))
numbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT'))
textFile <- Sys.getenv('TEXT_FILE')
# =============================================================================
# Serialization
# =============================================================================
time <- proc.time()
rddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers)
time <- as.double(proc.time()-time)[3]
logInfo('NumbersSerialization', time)
# =============================================================================
# Computing
# =============================================================================
isPrime = function(x) {
if(x < 2){
c(x, FALSE)
}
else if(x == 2){
c(x, TRUE)
}
else if(x %% 2 == 0){
c(x, FALSE)
}
else{
upper <- as.numeric(sqrt(as.double(x)))
result <- TRUE
i <- 3
while(i <= upper){
if(x %% i == 0){
result = FALSE
break
}
i <- i+2
}
c(x, result)
}
}
time <- proc.time()
rdd <- map(rddNumbers, isPrime)
capture.output(collect(rdd), file='/dev/null')
time <- as.double(proc.time()-time)[3]
logInfo('IsPrime', time)
close(logFile)
sparkR.stop()
================================================
FILE: benchmark/comparison/ruby.rb
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
require 'benchmark'
Spark.start
sc = Spark.context
$log_file = File.open(ENV['RUBY_LOG'], 'w')
def log(*values)
$log_file.puts(values.join(';'))
end
workers = ENV['WORKERS'].to_i
numbers_count = ENV['NUMBERS_COUNT'].to_i
text_file = ENV['TEXT_FILE']
numbers = (0...numbers_count).to_a
floats = numbers.map(&:to_f)
strings = File.read(text_file).split("\n")
# =============================================================================
# Serialization
# =============================================================================
time = Benchmark.realtime do
@rdd_numbers = sc.parallelize(numbers, workers)
end
log('NumbersSerialization', time)
time = Benchmark.realtime do
@rdd_floats = sc.parallelize(floats, workers)
end
log('FloatsSerialization', time)
time = Benchmark.realtime do
@rdd_strings = sc.parallelize(strings, workers)
end
log('StringsSerialization', time)
# =============================================================================
# Computing
# =============================================================================
# --- Is prime? ---------------------------------------------------------------
is_prime = Proc.new do |x|
case
when x < 2
[x, false]
when x == 2
[x, true]
when x % 2 == 0
[x, false]
else
upper = Math.sqrt(x.to_f).to_i
result = true
i = 3
while i <= upper
if x % i == 0
result = false
break
end
i += 2
end
[x, result]
end
end
time = Benchmark.realtime do
@rdd_numbers.map(is_prime).collect
end
log('IsPrime', time)
# --- Matrix multiplication ---------------------------------------------------
matrix_size = ENV['MATRIX_SIZE'].to_i
matrix = Array.new(matrix_size) do |row|
Array.new(matrix_size) do |col|
row+col
end
end;
multiplication_func = Proc.new do |matrix|
size = matrix.size
Array.new(size) do |row|
Array.new(size) do |col|
matrix[row]
result = 0
size.times do |i|
result += matrix[row][i] * matrix[col][i]
end
result
end
end
end
time = Benchmark.realtime do
rdd = sc.parallelize(matrix, 1)
rdd.map_partitions(multiplication_func).collect
end
log('MatrixMultiplication', time)
# --- Pi digits ---------------------------------------------------------------
# http://rosettacode.org/wiki/Pi#Ruby
pi_digit = ENV['PI_DIGIT'].to_i
pi_func = Proc.new do |size|
size = size.first
result = ''
q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
while size > 0
if 4*q+r-t < n*t
result << n.to_s
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r)) / t) - 10*n
q *= 10
r = nr
else
nr = (2*q+r) * l
nn = (q*(7*k+2)+r*l) / (t*l)
q *= k
t *= l
l += 2
k += 1
n = nn
r = nr
end
end
[result]
end
time = Benchmark.realtime do
rdd = sc.parallelize([pi_digit], 1)
rdd.map_partitions(pi_func).collect
end
log('PiDigit', time)
$log_file.close
================================================
FILE: benchmark/comparison/run-all.sh
================================================
#!/usr/bin/env bash
# Current dir
cd "$(dirname "$0")"
# Exit immediately if a pipeline returns a non-zero status.
set -e
# Settings
export WORKERS=2
export MATRIX_SIZE=100
export NUMBERS_COUNT=1000000
export TEXT_FILE=$(mktemp)
export PI_DIGIT=1000
export RUBY_BATCH_SIZE=2048
text_file_rows=10
text_file_per_line=10
text_file_duplicates=50
mx="4096m"
ms="4096m"
# Parse arguments
while (( "$#" )); do
case $1 in
--workers)
WORKERS="$2"
shift
;;
--matrix-size)
MATRIX_SIZE="$2"
shift
;;
--numbers-count)
NUMBERS_COUNT="$2"
shift
;;
--random-file-rows)
text_file_rows="$2"
shift
;;
--text-file-per-line)
text_file_per_line="$2"
shift
;;
--text-file-duplicates)
text_file_duplicates="$2"
shift
;;
--pi-digit)
PI_DIGIT="$2"
shift
;;
--ruby-batch-size)
RUBY_BATCH_SIZE="$2"
shift
;;
--mx)
mx="$2"
shift
;;
--ms)
ms="$2"
shift
;;
*)
break
;;
esac
shift
done
# Generating
file=$(mktemp)
for (( i=0; i<$text_file_rows; i++ ))
do
shuf -n $text_file_per_line /usr/share/dict/words | tr '\n' ' ' >> $file
echo >> $file
done
for (( i=0; i<$text_file_duplicates; i++ ))
do
cat $file >> $TEXT_FILE
done
# Before run
if [[ -z "$SPARK_HOME" ]]; then
export SPARK_HOME=$(pwd)/spark
fi
if [[ -z "$RSPARK_HOME" ]]; then
export RSPARK_HOME=$(pwd)/rspark
fi
export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE"
SPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null)
export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx"
# Log files
export RUBY_MARSHAL_LOG=$(mktemp)
export RUBY_OJ_LOG=$(mktemp)
export PYTHON_LOG=$(mktemp)
export SCALA_LOG=$(mktemp)
export R_LOG=$(mktemp)
# Run:
echo "Workers: $WORKERS"
echo "Matrix size: $MATRIX_SIZE"
echo "Numbers count: $NUMBERS_COUNT"
echo "Pi digits: $PI_DIGIT"
echo "File: rows = $(($text_file_rows * $text_file_duplicates))"
echo " per line = $text_file_per_line"
# --- Ruby
export SPARK_RUBY_SERIALIZER='marshal'
export RUBY_LOG="$RUBY_MARSHAL_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null
export SPARK_RUBY_SERIALIZER='oj'
export RUBY_LOG="$RUBY_OJ_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null
# # --- Python
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null
# # --- Scala
/usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null
# --- R
# "$RSPARK_HOME"/sparkR r.r #&>/dev/null
# Parse results
echo "# Ruby (Marshal)"
cat $RUBY_MARSHAL_LOG
echo ""
echo "# Ruby (Oj)"
cat $RUBY_OJ_LOG
echo ""
echo "# Python"
cat $PYTHON_LOG
echo ""
echo "# Scala"
cat $SCALA_LOG
echo ""
echo "# R"
cat $R_LOG
================================================
FILE: benchmark/comparison/scala.scala
================================================
import java.io._
import scala.math
import scala.io.Source
import org.apache.spark._
object Scala {
val logFile = new PrintWriter(new File(System.getenv("SCALA_LOG")))
def log(args: Any*) {
logFile.write(args.mkString(";"))
logFile.write("\n")
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Scala")
val sc = new SparkContext(conf)
val workers = System.getenv("WORKERS").toInt
val numbersCount = System.getenv("NUMBERS_COUNT").toInt
val textFile = System.getenv("TEXT_FILE")
val numbers = 0 until numbersCount
val floats = numbers.map(_.toDouble)
val strings = Source.fromFile(textFile).mkString.split("\n")
// =============================================================================
// Serialization
// =============================================================================
var time: Long = 0
time = System.currentTimeMillis
val rddNumbers = sc.parallelize(numbers, workers)
time = System.currentTimeMillis - time
log("NumbersSerialization", time/1000.0)
time = System.currentTimeMillis
val rddFloats = sc.parallelize(floats, workers)
time = System.currentTimeMillis - time
log("FloatsSerialization", time/1000.0)
time = System.currentTimeMillis
val rddStrings = sc.parallelize(strings, workers)
time = System.currentTimeMillis - time
log("StringsSerialization", time/1000.0)
// =============================================================================
// Computing
// =============================================================================
// --- Is prime? ---------------------------------------------------------------
time = System.currentTimeMillis
val primes = rddNumbers.map{ x =>
if(x < 2){
(x, false)
}
else if(x == 2){
(x, true)
}
else if(x % 2 == 0){
(x, false)
}
else{
val upper = math.sqrt(x.toDouble).toInt
var result = true
var i = 3
while(i <= upper && result == true){
if(x % i == 0){
result = false
}
else{
i += 2
}
}
(x, result)
}
}
primes.collect()
time = System.currentTimeMillis - time
log("IsPrime", time/1000.0)
// --- Matrix multiplication ---------------------------------------------------
val matrixSize = System.getenv("MATRIX_SIZE").toInt
val matrix = new Array[Array[Long]](matrixSize)
for( row <- 0 until matrixSize ) {
matrix(row) = new Array[Long](matrixSize)
for( col <- 0 until matrixSize ) {
matrix(row)(col) = row + col
}
}
time = System.currentTimeMillis
val rdd = sc.parallelize(matrix, 1)
rdd.mapPartitions { it =>
val matrix = it.toArray
val size = matrix.size
val newMatrix = new Array[Array[Long]](size)
for( row <- 0 until size ) {
newMatrix(row) = new Array[Long](size)
for( col <- 0 until size ) {
var result: Long = 0
for( i <- 0 until size ) {
result += matrix(row)(i) * matrix(col)(i)
}
newMatrix(row)(col) = result
}
}
newMatrix.toIterator
}
time = System.currentTimeMillis - time
log("MatrixMultiplication", time/1000.0)
// --- Pi digits ---------------------------------------------------------------
// http://rosettacode.org/wiki/Pi#Scala
val piDigit = System.getenv("PI_DIGIT").toInt
time = System.currentTimeMillis
val piDigits = sc.parallelize(Array(piDigit), 1)
piDigits.mapPartitions { it =>
var size = it.toArray.asInstanceOf[Array[Int]](0)
var result = ""
var r: BigInt = 0
var q, t, k: BigInt = 1
var n, l: BigInt = 3
var nr, nn: BigInt = 0
while(size > 0){
while((4*q+r-t) >= (n*t)){
nr = (2*q+r)*l
nn = (q*(7*k)+2+(r*l))/(t*l)
q = q * k
t = t * l
l = l + 2
k = k + 1
n = nn
r = nr
}
result += n.toString
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r))/t)-(10*n)
q = q * 10
r = nr
}
Iterator(result)
}
time = System.currentTimeMillis - time
log("PiDigit", time/1000.0)
sc.stop()
logFile.close()
}
}
================================================
FILE: benchmark/custom_marshal.rb
================================================
require 'benchmark'
require 'benchmark/ips'
def pack_int(data)
[data].pack('l>')
end
def pack_long(data)
[data].pack('q>')
end
def pack_doubles(data)
data.pack('G*')
end
module Standard
class LabeledPoint
def initialize(label, features)
@label = label
@features = Standard::Vector.new(features)
end
def marshal_dump
[@label, @features]
end
def marshal_load(*)
end
end
class Vector
def initialize(array)
@values = array
end
def marshal_dump
[@values]
end
def marshal_load(*)
end
end
end
module Custom
class LabeledPoint
def initialize(label, features)
@label = label
@features = Custom::Vector.new(features)
end
def _dump(*)
pack_long(@label) + @features._dump
end
def self._load(*)
end
end
class Vector
def initialize(array)
@values = array
end
def _dump(*)
result = 'v'
result << pack_int(@values.size)
result << pack_doubles(@values)
result.encode(Encoding::ASCII_8BIT)
end
def self._load(*)
end
end
end
data_size = 10_000
vector_size = 1_000
values = Array.new(vector_size) { |x| rand(10_000..100_000) }
@data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)}
@data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)}
Benchmark.ips do |r|
r.report('standard') do
Marshal.dump(@data1)
end
r.report('custom') do
Marshal.dump(@data2)
end
r.compare!
end
================================================
FILE: benchmark/digest.rb
================================================
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
def java?
RUBY_PLATFORM =~ /java/
end
unless java?
require 'murmurhash3'
end
require 'digest'
require 'benchmark'
require 'ruby-spark'
TEST = 5_000_000
WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"]
puts "TEST COUNT = #{TEST*WORDS.size}"
# =================================================================================================
# Pure ruby murmur
# funny-falcon/murmurhash3-ruby
MASK32 = 0xffffffff
def murmur3_32_rotl(x, r)
((x << r) | (x >> (32 - r))) & MASK32
end
def murmur3_32_fmix(h)
h &= MASK32
h ^= h >> 16
h = (h * 0x85ebca6b) & MASK32
h ^= h >> 13
h = (h * 0xc2b2ae35) & MASK32
h ^ (h >> 16)
end
def murmur3_32__mmix(k1)
k1 = (k1 * 0xcc9e2d51) & MASK32
k1 = murmur3_32_rotl(k1, 15)
(k1 * 0x1b873593) & MASK32
end
def murmur3_32_str_hash(str, seed=0)
h1 = seed
numbers = str.unpack('V*C*')
tailn = str.bytesize % 4
tail = numbers.slice!(numbers.size - tailn, tailn)
for k1 in numbers
h1 ^= murmur3_32__mmix(k1)
h1 = murmur3_32_rotl(h1, 13)
h1 = (h1*5 + 0xe6546b64) & MASK32
end
unless tail.empty?
k1 = 0
tail.reverse_each do |c1|
k1 = (k1 << 8) | c1
end
h1 ^= murmur3_32__mmix(k1)
end
h1 ^= str.bytesize
murmur3_32_fmix(h1)
end
# =================================================================================================
# Benchmark
Benchmark.bm(18) do |x|
x.report("ruby hash"){
TEST.times{
WORDS.each{ |word|
word.hash
}
}
}
x.report("ext portable"){
TEST.times{
WORDS.each{ |word|
Spark::Digest.portable_hash(word)
}
}
}
x.report("murmur3 32"){
TEST.times{
WORDS.each{ |word|
# MurmurHash3::V128.str_hash(word)
# [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# MurmurHash3::V128.str_hash(word)
# a = MurmurHash3::V32.str_hash(word).to_s
# a.slice!(0,8)
MurmurHash3::V32.str_hash(word)
}
}
} unless java?
# Too slow
# x.report("murmur3 32 (ruby)"){
# TEST.times{
# WORDS.each{ |word|
# # MurmurHash3::V128.str_hash(word)
# # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# # MurmurHash3::V128.str_hash(word)
# # a = murmur3_32_str_hash(word).to_s
# # a.slice!(0,8)
# murmur3_32_str_hash(word)
# }
# }
# }
x.report("murmur3 128"){
TEST.times{
WORDS.each{ |word|
# MurmurHash3::V128.str_hash(word)
# [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# a = MurmurHash3::V128.str_hash(word).to_s
# a.slice!(0,8)
MurmurHash3::V128.str_hash(word)
}
}
} unless java?
# x.report("sha256"){
# TEST.times{
# WORDS.each{ |word|
# a = Digest::SHA256.digest(word)
# # a.slice!(0,8)
# }
# }
# }
# x.report("md5"){
# TEST.times{
# WORDS.each{ |word|
# a = Digest::MD5.digest(word)
# # a.slice!(0,8)
# }
# }
# }
end
================================================
FILE: benchmark/enumerator.rb
================================================
require "benchmark"
class Enumerator
def defer(&blk)
self.class.new do |y|
each do |*input|
blk.call(y, *input)
end
end
end
end
ARRAY_SIZE = 50_000_000
def type_yield
return to_enum(__callee__) unless block_given?
ARRAY_SIZE.times { |i|
yield i
}
end
def yield_map_x2(enum)
return to_enum(__callee__, enum) unless block_given?
enum.each do |item|
yield item*2
end
end
def type_enumerator_new
Enumerator.new do |e|
ARRAY_SIZE.times { |i|
e << i
}
end
end
def enumerator_new_map_x2(enum)
Enumerator.new do |e|
enum.each do |item|
e << item*2
end
end
end
def enumerator_defer_x2(enum)
enum.defer do |out, inp|
out << inp*2
end
end
Benchmark.bm(26) do |x|
x.report("yield max") do
type_yield.max
end
x.report("yield sum") do
type_yield.reduce(:+)
end
x.report("yield map x*2 sum") do
yield_map_x2(type_yield).reduce(:+)
end
x.report("yield defer map x*2 sum") do
enumerator_defer_x2(type_yield).reduce(:+)
end
x.report("-----"){}
x.report("Enum.new max") do
type_enumerator_new.max
end
x.report("Enum.new sum") do
type_enumerator_new.reduce(:+)
end
x.report("Enum.new map x*2 sum") do
enumerator_new_map_x2(type_enumerator_new).reduce(:+)
end
x.report("Enum.new defer map x*2 sum") do
enumerator_defer_x2(type_enumerator_new).reduce(:+)
end
end
================================================
FILE: benchmark/serializer.rb
================================================
require "benchmark"
require "yaml"
require "msgpack"
require "oj"
# require "thrift"
puts "Simple"
data = (0..100000).to_a
Benchmark.bmbm do |x|
x.report("YAML") do
serialized = YAML.dump(data)
deserialized = YAML.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Marshal") do
serialized = Marshal.dump(data)
deserialized = Marshal.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("MessagePack") do
serialized = MessagePack.dump(data)
deserialized = MessagePack.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Oj") do
serialized = Oj.dump(data)
deserialized = Oj.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
# x.report("Thrift") do
# serializer = Thrift::Serializer.new
# deserializer = Thrift::Deserializer.new
# serialized = serializer.serialize(data)
# end
end
puts ""
puts "More complex"
data = Array.new(10000000) {
[rand(97..122).chr, rand(10000000)]
}
Benchmark.bm do |x|
# Take too long
# x.report("YAML") do
# serialized = YAML.dump(data)
# YAML.load(serialized)
# end
x.report("Marshal") do
serialized = Marshal.dump(data)
deserialized = Marshal.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("MessagePack") do
serialized = MessagePack.dump(data)
deserialized = MessagePack.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Oj") do
serialized = Oj.dump(data)
deserialized = Oj.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
# x.report("Thrift") do
# serializer = Thrift::Serializer.new
# deserializer = Thrift::Deserializer.new
# serialized = serializer.serialize(data)
# end
end
================================================
FILE: benchmark/sort.rb
================================================
require "benchmark"
array = []
1000.times {
array << {:bar => rand(1000)}
}
n = 500
Benchmark.bm(20) do |x|
x.report("sort") { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } }
x.report("sort reverse") { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } }
x.report("sort_by -a[:bar]") { n.times { array.sort_by{ |a| -a[:bar] } } }
x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } }
x.report("sort_by.reverse!") { n.times { array.sort_by{ |a| a[:bar] }.reverse } }
end
array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }
Benchmark.bm(20) do |x|
x.report("sort asc") { n.times { array.sort } }
x.report("sort asc block") { n.times { array.sort{|a,b| a <=> b} } }
x.report("sort desc") { n.times { array.sort{|a,b| b <=> a} } }
x.report("sort asc reverse") { n.times { array.sort.reverse } }
end
key_value = Struct.new(:key, :value) do
def <=>(other)
key <=> other.key
end
end
count = 10000
item_range = 1000000
array1 = Array.new(count) { [rand(item_range), rand(item_range)] }
array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) }
Benchmark.bm(20) do |x|
x.report("sort_by") { n.times { array1.sort_by {|a| a[0]} } }
x.report("sort struct") { n.times { array2.sort } }
end
================================================
FILE: benchmark/sort2.rb
================================================
require "benchmark"
require "algorithms"
NUMBER_OF_SORTING = 1
NUMBER_OF_ARRAY = 10
WORDS_IN_ARRAY = 100000
MAX_WORD_SIZE = 10
EVAL_N_VALUES = 10
puts "NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}"
puts "NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}"
puts "WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}"
puts "MAX_WORD_SIZE: #{MAX_WORD_SIZE}"
puts "EVAL_N_VALUES: #{EVAL_N_VALUES}"
def words
Array.new(WORDS_IN_ARRAY) { word }
end
def word
Array.new(rand(1..MAX_WORD_SIZE)){(97+rand(26)).chr}.join
end
@array = Array.new(NUMBER_OF_ARRAY) { words.sort }
# =================================================================================================
# Sort1
# Returns a new (unevaluated) enumerator
def sort1(data)
return to_enum(__callee__, data) unless block_given?
heap = []
# Initialize the heap with the first items
# and attach the enumerators themselves so .next can be called
data.each do |a|
heap << [a.next, a]
end
while data.any?
begin
# Sort the array by value
heap.sort_by!{|(item,_)| item}
# Take the value and its enumerator
item, enum = heap.shift
# The value goes into the result
yield item
# The removed item is replaced by the next one from the same list
heap << [enum.next, enum]
rescue StopIteration
# The enumerator is empty
data.delete(enum)
end
end
end
# =================================================================================================
# Sort1_2
# Returns a new (unevaluated) enumerator
def sort1_2(data)
return to_enum(__callee__, data) unless block_given?
heap = []
enums = []
# Initialize the heap with the first items
# and attach the enumerators themselves so .next can be called
data.each do |a|
EVAL_N_VALUES.times {
begin
heap << [a.next, a]
rescue StopIteration
end
}
end
while data.any? || heap.any?
# Sort the array by value
heap.sort_by!{|(item,_)| item}
# At least EVAL_N_VALUES items can be taken
EVAL_N_VALUES.times {
break if heap.empty?
# Take the value and its enumerator
item, enum = heap.shift
# The value goes into the result
yield item
enums << enum
}
while (enum = enums.shift)
begin
heap << [enum.next, enum]
rescue StopIteration
data.delete(enum)
enums.delete(enum)
end
end
end
end
# =================================================================================================
# Sort 2
def sort2(data)
return to_enum(__callee__, data) unless block_given?
heap = Containers::Heap.new
data.each do |enum|
item = enum.next
heap.push(item, [item, enum])
end
while data.any?
begin
item, enum = heap.pop
yield item
item = enum.next
heap.push(item, [item, enum])
rescue StopIteration
data.delete(enum)
end
end
end
# =================================================================================================
# Benchmark
Benchmark.bm(10) do |x|
x.report("sort") do
NUMBER_OF_SORTING.times {
@result = @array.flatten.sort
}
end
x.report("sort 1") do
NUMBER_OF_SORTING.times {
raise "Bad sorting" if @result != sort1(@array.map(&:each)).to_a
}
end
x.report("sort 1_2") do
NUMBER_OF_SORTING.times {
raise "Bad sorting" if @result != sort1_2(@array.map(&:each)).to_a
}
end
# x.report("sort 2") do
# NUMBER_OF_SORTING.times {
# raise "Bad sorting" if @result != sort2(@array.map(&:each)).to_a
# }
# end
end
================================================
FILE: benchmark/take.rb
================================================
require "benchmark"
SIZE = 100_000_000
@array1 = (0..SIZE).to_a;
@array2 = (0..SIZE).to_a;
@array3 = (0..SIZE).to_a;
TAKE = 100_000
Benchmark.bm(15) do |x|
# Fastest
x.report("take"){
a=@array1.take(TAKE)
}
# Slowest and takes the most memory
x.report("reverse drop"){
@array2.reverse!
@array2.drop(@array2.size - TAKE)
@array2.reverse!
}
# Least memory
x.report("splice"){
a=@array3.slice!(0, TAKE)
}
end
================================================
FILE: bin/ruby-spark
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
Spark::CLI.new.run
================================================
FILE: example/pi.rb
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
Spark.logger.disable
Spark.start
slices = 3
n = 100000 * slices
def map(_)
x = rand * 2 - 1
y = rand * 2 - 1
if x**2 + y**2 < 1
return 1
else
return 0
end
end
rdd = Spark.context.parallelize(1..n, slices)
rdd = rdd.map(method(:map))
puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
================================================
FILE: example/website_search.rb
================================================
#!/usr/bin/env ruby
# Parse sitemap and search word on every page
require 'optparse'
require 'open-uri'
require 'nokogiri'
require 'ruby-spark'
options = {
sitemap: 'http://fit.cvut.cz/sitemap.xml',
query: 'cvut',
workers: 2
}
opt_parser = OptionParser.new do |opts|
opts.banner = 'Usage: website_search.rb [options]'
opts.separator ''
opts.separator 'Specific options:'
opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
options[:sitemap] = sitemap
end
opts.on('-q', '--query QUERY', 'Query to search') do |query|
options[:query] = query
end
opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
options[:workers] = workers
end
opts.on('--quiet', 'Run quietly') do |v|
Spark.logger.disable
end
opts.on_tail('-h', '--help', 'Show this message') do
puts opts
exit
end
end
opt_parser.parse!
@links = []
def parse_sitemap(doc)
doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
next_doc = Nokogiri::HTML(open(loc.text))
parse_sitemap(next_doc)
end
doc.xpath('//url/loc').each do |loc|
@links << loc.text
end
end
doc = Nokogiri::HTML(open(options[:sitemap]))
parse_sitemap(doc)
# Map function
func = Proc.new do |url|
begin
open(url) {|f|
[url, f.read.scan(query).count]
}
rescue
[url, 0]
end
end
Spark.start
rdd = Spark.sc.parallelize(@links, options[:workers])
.add_library('open-uri')
.bind(query: options[:query])
.map(func)
.sort_by(lambda{|(_, value)| value}, false)
rdd.collect.each do |(url, count)|
puts "#{url} => #{count}"
end
================================================
FILE: ext/ruby_c/extconf.rb
================================================
require 'mkmf'
create_makefile("ruby_spark_ext")
================================================
FILE: ext/ruby_c/murmur.c
================================================
#include "murmur.h"
#if defined(_MSC_VER)
#define BIG_CONSTANT(x) (x)
#else
#define BIG_CONSTANT(x) (x##LLU)
#endif
/*-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
//
// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
//
// 64-bit hash for 64-bit platforms
*/
uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
{
const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
const int r = 47;
uint64_t h = seed ^ (len * m);
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
while(data != end)
{
uint64_t k = *data++;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
const unsigned char * data2 = (const unsigned char*)data;
switch(len & 7)
{
case 7: h ^= ((uint64_t) data2[6]) << 48;
case 6: h ^= ((uint64_t) data2[5]) << 40;
case 5: h ^= ((uint64_t) data2[4]) << 32;
case 4: h ^= ((uint64_t) data2[3]) << 24;
case 3: h ^= ((uint64_t) data2[2]) << 16;
case 2: h ^= ((uint64_t) data2[1]) << 8;
case 1: h ^= ((uint64_t) data2[0]);
h *= m;
};
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
/* 64-bit hash for 32-bit platforms */
uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
{
const uint32_t m = 0x5bd1e995;
const int r = 24;
uint32_t h1 = ((uint32_t) seed) ^ len;
uint32_t h2 = ((uint32_t) (seed >> 32));
const uint32_t * data = (const uint32_t *)key;
while(len >= 8)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
uint32_t k2 = *data++;
k2 *= m; k2 ^= k2 >> r; k2 *= m;
h2 *= m; h2 ^= k2;
len -= 4;
}
if(len >= 4)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
}
switch(len)
{
case 3: h2 ^= ((unsigned char*)data)[2] << 16;
case 2: h2 ^= ((unsigned char*)data)[1] << 8;
case 1: h2 ^= ((unsigned char*)data)[0];
h2 *= m;
};
h1 ^= h2 >> 18; h1 *= m;
h2 ^= h1 >> 22; h2 *= m;
h1 ^= h2 >> 17; h1 *= m;
h2 ^= h1 >> 19; h2 *= m;
uint64_t h = h1;
h = (h << 32) | h2;
return h;
}
// ================================================================================================
// Ruby methods
#define PORTABLE_HASH_SEED 16154832
VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
{
StringValue(rb_str);
void * key = RSTRING_PTR(rb_str);
long len = RSTRING_LEN(rb_str);
uint64_t result = MurmurHash64A(key, len, seed);
return LONG2FIX(result);
}
// ------------------------------------------------------------------------------------------------
// Spark::Digest::Murmur2.digest
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
{
if(argc == 0 || argc > 2){
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
}
uint64_t seed = (argc == 1 ? 0 : NUM2UINT(argv[1]));
return murmur2_digest(argv[0], seed);
}
// ------------------------------------------------------------------------------------------------
// Spark::Digest.portable_hash
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
{
if(argc != 1){
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
}
return murmur2_digest(argv[0], PORTABLE_HASH_SEED);
}
================================================
FILE: ext/ruby_c/murmur.h
================================================
#ifndef MURMUR_INCLUDED
#define MURMUR_INCLUDED
#include "ruby.h"
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass);
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass);
#endif
================================================
FILE: ext/ruby_c/ruby-spark.c
================================================
#include "ruby.h"
#include "murmur.h"
VALUE SparkModule;
VALUE SparkDigestModule;
VALUE SparkDigestMurmur2Class;
void Init_ruby_spark_ext()
{
SparkModule = rb_define_module("Spark");
SparkDigestModule = rb_define_module_under(SparkModule, "Digest");
SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, "Murmur2", rb_cObject);
rb_define_singleton_method(SparkDigestModule, "portable_hash", method_portable_hash, -1);
rb_define_singleton_method(SparkDigestMurmur2Class, "digest", method_murmur2_digest, -1);
}
================================================
FILE: ext/ruby_java/Digest.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyModule;
import org.jruby.RubyObject;
import org.jruby.RubyClass;
import org.jruby.RubyString;
import org.jruby.RubyFixnum;
import org.jruby.anno.JRubyModule;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
@JRubyModule(name="Spark::Digest")
public class Digest extends RubyObject{
// Has to be the same as in the C extension
final static long PORTABLE_HASH_SEED = 16154832;
public Digest(final Ruby ruby, RubyClass rubyClass) {
super(ruby, rubyClass);
}
@JRubyMethod(module=true)
public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) {
Ruby ruby = self.getRuntime();
RubyString keyString = (RubyString)arg;
long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED);
RubyFixnum result = new RubyFixnum(ruby, hash);
return result;
}
}
================================================
FILE: ext/ruby_java/Murmur2.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
import org.jruby.RubyString;
import org.jruby.RubyFixnum;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
/** Murmur hash 2.0.
*
* The murmur hash is a relatively fast hash function from
* http://murmurhash.googlepages.com/ for platforms with efficient
* multiplication.
*
* http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/
*
*/
@JRubyClass(name="Spark::Digest::Murmur2")
public class Murmur2 extends RubyObject {
public Murmur2(final Ruby ruby, RubyClass rubyClass) {
super(ruby, rubyClass);
}
@JRubyMethod(required=1, optional=1, module=true)
public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) {
Ruby ruby = context.getRuntime();
RubyString keyString = (RubyString)args[0];
long seed;
if(args.length > 1){
RubyFixnum rb_seed = (RubyFixnum)args[1];
seed = rb_seed.getLongValue();
}
else{
seed = 0;
}
long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed);
RubyFixnum result = new RubyFixnum(ruby, hash);
return result;
}
/** Generates 64 bit hash from byte array of the given length and seed.
*
* @param data byte array to hash
* @param length length of the array to hash
* @param seed initial seed value
* @return 64 bit hash of the given array
*/
public static long hash64(final byte[] data, int length, long seed) {
final long m = 0xc6a4a7935bd1e995L;
final int r = 47;
long h = (seed&0xffffffffl)^(length*m);
int length8 = length/8;
for (int i=0; i<length8; i++) {
final int i8 = i*8;
long k = ((long)data[i8+0]&0xff) +(((long)data[i8+1]&0xff)<<8)
+(((long)data[i8+2]&0xff)<<16) +(((long)data[i8+3]&0xff)<<24)
+(((long)data[i8+4]&0xff)<<32) +(((long)data[i8+5]&0xff)<<40)
+(((long)data[i8+6]&0xff)<<48) +(((long)data[i8+7]&0xff)<<56);
k *= m;
k ^= k >>> r;
k *= m;
h ^= k;
h *= m;
}
switch (length%8) {
case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48;
case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40;
case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32;
case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24;
case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16;
case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8;
case 1: h ^= (long)(data[length&~7]&0xff);
h *= m;
};
h ^= h >>> r;
h *= m;
h ^= h >>> r;
return h;
}
}
================================================
FILE: ext/ruby_java/RubySparkExtService.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyModule;
import org.jruby.runtime.ObjectAllocator;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.runtime.load.BasicLibraryService;
public class RubySparkExtService implements BasicLibraryService
{
public boolean basicLoad(final Ruby ruby) throws java.io.IOException {
RubyModule sparkModule = ruby.defineModule("Spark");
RubyModule sparkDigestModule = sparkModule.defineModuleUnder("Digest");
RubyClass sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder("Murmur2", ruby.getObject(), sparkDigestMurmur2Allocator);
sparkDigestModule.defineAnnotatedMethods(Digest.class);
sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class);
return true;
}
public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() {
public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) {
return new Murmur2(ruby, rubyClass);
}
};
}
================================================
FILE: ext/ruby_java/extconf.rb
================================================
require 'mkmf'
create_makefile("ruby_spark_ext")
================================================
FILE: ext/spark/build.sbt
================================================
import AssemblyKeys._
assemblySettings
// Default values
val defaultScalaVersion = "2.10.4"
val defaultSparkVersion = "1.6.0"
val defaultSparkCoreVersion = "2.10"
val defaultTargetDir = "target"
val defaultHadoopVersion = "1.0.4"
// Values
val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion)
val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion)
val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion)
val _sparkCoreVersion = scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion)
val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir)
// Project settings
name := "ruby-spark"
version := "1.0.0"
scalaVersion := _scalaVersion
javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
// Jar target folder
artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar")
outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar")
// Protocol buffer support
seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*)
// Additional libraries
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % _sparkVersion excludeAll(ExclusionRule(organization = "org.apache.hadoop")),
"org.apache.spark" %% "spark-graphx" % _sparkVersion,
"org.apache.spark" %% "spark-mllib" % _sparkVersion,
"org.apache.spark" %% "spark-sql" % _sparkVersion,
"org.apache.hadoop" % "hadoop-client" % _hadoopVersion,
"com.github.fommil.netlib" % "all" % "1.1.2",
"org.scalatest" % "scalatest_2.10" % "2.2.1" % "test"
)
// Repositories
resolvers ++= Seq(
"JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
"Spray Repository" at "http://repo.spray.io/",
"Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
"Akka Repository" at "http://repo.akka.io/releases/",
"Twitter4J Repository" at "http://twitter4j.org/maven2/",
"Apache HBase" at "https://repository.apache.org/content/repositories/releases",
"Twitter Maven Repo" at "http://maven.twttr.com/",
"scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools",
"Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/",
"Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/",
"Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven",
Resolver.sonatypeRepo("public")
)
// Merge strategy
mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
{
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.startsWith("META-INF") => MergeStrategy.discard
case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first
case PathList("org", "apache", xs @ _*) => MergeStrategy.first
case PathList("org", "jboss", xs @ _*) => MergeStrategy.first
case "about.html" => MergeStrategy.rename
case "reference.conf" => MergeStrategy.concat
case _ => MergeStrategy.first
}
}
================================================
FILE: ext/spark/project/plugins.sbt
================================================
resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
resolvers += "Spray Repository" at "http://repo.spray.io/"
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3")
================================================
FILE: ext/spark/sbt/sbt
================================================
#!/bin/bash
# This script launches sbt for this project. If present it uses the system
# version of sbt. If there is no system version of sbt it attempts to download
# sbt locally.
SBT_VERSION=0.13.9
URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
JAR=sbt/sbt-launch-${SBT_VERSION}.jar
# Download sbt launch jar if it hasn't been downloaded yet
if [ ! -f ${JAR} ]; then
# Download
printf "Attempting to fetch sbt\n"
JAR_DL=${JAR}.part
if hash wget 2>/dev/null; then
(wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
elif hash curl 2>/dev/null; then
(curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
else
printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
exit -1
fi
fi
if [ ! -f ${JAR} ]; then
# We failed to download
printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
exit -1
fi
printf "Launching sbt from ${JAR}\n"
java \
-Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
-jar ${JAR} \
"$@"
================================================
FILE: ext/spark/src/main/scala/Exec.scala
================================================
package org.apache.spark.api.ruby
import java.io.{File, FileOutputStream, InputStreamReader, BufferedReader}
import scala.collection.JavaConversions._
import org.apache.spark.{SparkEnv, Logging}
import org.apache.spark.util._
/* =================================================================================================
* class FileCommand
* =================================================================================================
*
* Save the command to a file and then execute it, because from Scala you cannot simply run
* something like "bash --norc -i -c 'source .zshrc; ruby master.rb'"
*/
class FileCommand(command: String) extends Logging {
var pb: ProcessBuilder = null
var file: File = null
// Command is complete.
def this(command: String, env: SparkEnv) = {
this(command)
create(env)
}
// Template must contain %s, which will be replaced with the command
def this(template: String, command: String, env: SparkEnv, envVars: Map[String, String]) = {
this(template.format(command), env)
setEnvVars(envVars)
}
private def create(env: SparkEnv) {
val dir = new File(env.sparkFilesDir)
val ext = if(Utils.isWindows) ".cmd" else ".sh"
val shell = if(Utils.isWindows) "cmd" else "bash"
file = File.createTempFile("command", ext, dir)
val out = new FileOutputStream(file)
out.write(command.getBytes)
out.close
logInfo(s"New FileCommand at ${file.getAbsolutePath}")
pb = new ProcessBuilder(shell, file.getAbsolutePath)
}
def setEnvVars(vars: Map[String, String]) {
pb.environment().putAll(vars)
}
def run = {
new ExecutedFileCommand(pb.start)
}
}
/* =================================================================================================
* class ExecutedFileCommand
* =================================================================================================
*
* Represents a process executed from a file.
*/
class ExecutedFileCommand(process: Process) {
var reader: BufferedReader = null
def readLine = {
openInput
reader.readLine.toString.trim
}
def openInput {
if(reader != null){
return
}
val input = new InputStreamReader(process.getInputStream)
reader = new BufferedReader(input)
}
// Delegation
def destroy = process.destroy
def getInputStream = process.getInputStream
def getErrorStream = process.getErrorStream
}
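The same temp-file trick, sketched here in Ruby purely for illustration (the file name and command are made up): write the whole shell command into a script and let bash run the file, rather than trying to quote it inline.

require 'tempfile'

# Save the command into a temporary script, as FileCommand#create does
script = Tempfile.create(['command', '.sh'])
script.write("source ~/.zshrc; ruby master.rb\n")
script.close

# Execute the saved script through a shell (the ProcessBuilder equivalent)
system('bash', script.path)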
================================================
FILE: ext/spark/src/main/scala/MLLibAPI.scala
================================================
package org.apache.spark.mllib.api.python
// PythonMLLibAPI is private to the Python package
class MLLibAPI extends PythonMLLibAPI {}
================================================
FILE: ext/spark/src/main/scala/Marshal.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
/* =================================================================================================
* object Marshal
* =================================================================================================
*/
object Marshal {
def load(bytes: Array[Byte]) = {
val is = new DataInputStream(new ByteArrayInputStream(bytes))
val majorVersion = is.readUnsignedByte // 4
val minorVersion = is.readUnsignedByte // 8
(new MarshalLoad(is)).load
}
def dump(data: Any) = {
val aos = new ByteArrayOutputStream
val os = new DataOutputStream(aos)
os.writeByte(4)
os.writeByte(8)
(new MarshalDump(os)).dump(data)
aos.toByteArray
}
}
/* =================================================================================================
* class IterableMarshaller
* =================================================================================================
*/
class IterableMarshaller(iter: Iterator[Any]) extends Iterator[Array[Byte]] {
private val buffer = new ArrayBuffer[Any]
override def hasNext: Boolean = iter.hasNext
override def next(): Array[Byte] = {
while (iter.hasNext) {
buffer += iter.next()
}
Marshal.dump(buffer)
}
}
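For reference, the two bytes read and written above are Ruby's Marshal format version (major 4, minor 8). The payloads used in MarshalSpec below can be reproduced directly from Ruby:

Marshal.dump(1).bytes                    # => [4, 8, 105, 6]              version header, 'i', 1
Marshal.dump(1.2).bytes                  # => [4, 8, 102, 8, 49, 46, 50]  version header, 'f', "1.2"
Marshal.load([4, 8, 105, 6].pack('C*'))  # => 1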
================================================
FILE: ext/spark/src/main/scala/MarshalDump.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}
/* =================================================================================================
* class MarshalDump
* =================================================================================================
*/
class MarshalDump(os: DataOutputStream) {
val NAN_BYTELIST = "nan".getBytes
val NEGATIVE_INFINITY_BYTELIST = "-inf".getBytes
val INFINITY_BYTELIST = "inf".getBytes
def dump(data: Any) {
data match {
case null =>
os.writeByte('0')
case item: Boolean =>
val char = if(item) 'T' else 'F'
os.writeByte(char)
case item: Int =>
os.writeByte('i')
dumpInt(item)
case item: Array[_] =>
os.writeByte('[')
dumpArray(item)
case item: Double =>
os.writeByte('f')
dumpFloat(item)
case item: ArrayBuffer[Any] => dump(item.toArray)
}
}
def dumpInt(data: Int) {
if(data == 0){
os.writeByte(0)
}
else if (0 < data && data < 123) {
os.writeByte(data + 5)
}
else if (-124 < data && data < 0) {
os.writeByte((data - 5) & 0xff)
}
else {
val buffer = new Array[Byte](4)
var value = data
var i = 0
while(i != 4 && value != 0 && value != -1){
buffer(i) = (value & 0xff).toByte
value = value >> 8
i += 1
}
// Number of significant little-endian bytes written above
val length = i
if(value < 0){
os.writeByte(-length)
}
else{
os.writeByte(length)
}
os.write(buffer, 0, length)
}
}
def dumpArray(array: Array[_]) {
dumpInt(array.size)
for(item <- array) {
dump(item)
}
}
def dumpFloat(value: Double) {
if(value.isPosInfinity){
dumpString(INFINITY_BYTELIST)
}
else if(value.isNegInfinity){
dumpString(NEGATIVE_INFINITY_BYTELIST)
}
else if(value.isNaN){
dumpString(NAN_BYTELIST)
}
else{
// dumpString("%.17g".format(value))
dumpString(value.toString)
}
}
def dumpString(data: String) {
dumpString(data.getBytes)
}
def dumpString(data: Array[Byte]) {
dumpInt(data.size)
os.write(data)
}
}
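dumpInt mirrors Ruby's Marshal integer packing: zero is a single 0 byte, small magnitudes are stored offset by +/-5, and anything larger gets a signed length byte followed by little-endian value bytes. Comparing against Ruby itself:

Marshal.dump(0).bytes     # => [4, 8, 105, 0]
Marshal.dump(1).bytes     # => [4, 8, 105, 6]           1 + 5
Marshal.dump(-1).bytes    # => [4, 8, 105, 250]         (-1 - 5) & 0xff
Marshal.dump(300).bytes   # => [4, 8, 105, 2, 44, 1]    length 2, then 0x012C little-endian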
================================================
FILE: ext/spark/src/main/scala/MarshalLoad.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}
/* =================================================================================================
* class MarshalLoad
* =================================================================================================
*/
class MarshalLoad(is: DataInputStream) {
case class WaitForObject()
val registeredSymbols = ArrayBuffer[String]()
val registeredLinks = ArrayBuffer[Any]()
def load: Any = {
load(is.readUnsignedByte.toChar)
}
def load(dataType: Char): Any = {
dataType match {
case '0' => null
case 'T' => true
case 'F' => false
case 'i' => loadInt
case 'f' => loadAndRegisterFloat
case ':' => loadAndRegisterSymbol
case '[' => loadAndRegisterArray
case 'U' => loadAndRegisterUserObject
case _ =>
throw new IllegalArgumentException(s"Format is not supported: $dataType.")
}
}
// ----------------------------------------------------------------------------------------------
// Load by type
def loadInt: Int = {
var c = is.readByte.toInt
if (c == 0) {
return 0
} else if (4 < c && c < 128) {
return c - 5
} else if (-129 < c && c < -4) {
return c + 5
}
var result: Long = 0
if (c > 0) {
result = 0
for( i <- 0 until c ) {
result |= (is.readUnsignedByte << (8 * i)).toLong
}
} else {
c = -c
result = -1
for( i <- 0 until c ) {
result &= ~((0xff << (8 * i)).toLong)
result |= (is.readUnsignedByte << (8 * i)).toLong
}
}
result.toInt
}
def loadAndRegisterFloat: Double = {
val result = loadFloat
registeredLinks += result
result
}
def loadFloat: Double = {
val string = loadString
string match {
case "nan" => Double.NaN
case "inf" => Double.PositiveInfinity
case "-inf" => Double.NegativeInfinity
case _ => string.toDouble
}
}
def loadString: String = {
new String(loadStringBytes)
}
def loadStringBytes: Array[Byte] = {
val size = loadInt
val buffer = new Array[Byte](size)
var readSize = 0
while(readSize < size){
val read = is.read(buffer, readSize, size-readSize)
if(read == -1){
throw new IllegalArgumentException("Marshal too short.")
}
readSize += read
}
buffer
}
def loadAndRegisterSymbol: String = {
val result = loadString
registeredSymbols += result
result
}
def loadAndRegisterArray: Array[Any] = {
val size = loadInt
val array = new Array[Any](size)
registeredLinks += array
for( i <- 0 until size ) {
array(i) = loadNextObject
}
array
}
def loadAndRegisterUserObject: Any = {
val klass = loadNextObject.asInstanceOf[String]
// Register a placeholder before loading the next object
registeredLinks += WaitForObject()
val index = registeredLinks.size - 1
val data = loadNextObject
val result = klass match {
case "Spark::Mllib::LabeledPoint" => createLabeledPoint(data)
case "Spark::Mllib::DenseVector" => createDenseVector(data)
case "Spark::Mllib::SparseVector" => createSparseVector(data)
case other =>
throw new IllegalArgumentException(s"Object $other is not supported.")
}
registeredLinks(index) = result
result
}
// ----------------------------------------------------------------------------------------------
// Other loads
def loadNextObject: Any = {
val dataType = is.readUnsignedByte.toChar
if(isLinkType(dataType)){
readLink(dataType)
}
else{
load(dataType)
}
}
// ----------------------------------------------------------------------------------------------
// To java objects
def createLabeledPoint(data: Any): LabeledPoint = {
val array = data.asInstanceOf[Array[_]]
new LabeledPoint(array(0).asInstanceOf[Double], array(1).asInstanceOf[Vector])
}
def createDenseVector(data: Any): DenseVector = {
new DenseVector(data.asInstanceOf[Array[_]].map(toDouble(_)))
}
def createSparseVector(data: Any): SparseVector = {
val array = data.asInstanceOf[Array[_]]
val size = array(0).asInstanceOf[Int]
val indices = array(1).asInstanceOf[Array[_]].map(_.asInstanceOf[Int])
val values = array(2).asInstanceOf[Array[_]].map(toDouble(_))
new SparseVector(size, indices, values)
}
// ----------------------------------------------------------------------------------------------
// Helpers
def toDouble(data: Any): Double = data match {
case x: Int => x.toDouble
case x: Double => x
case _ => 0.0
}
// ----------------------------------------------------------------------------------------------
// Cache
def readLink(dataType: Char): Any = {
val index = loadInt
dataType match {
case '@' => registeredLinks(index)
case ';' => registeredSymbols(index)
}
}
def isLinkType(dataType: Char): Boolean = {
dataType == ';' || dataType == '@'
}
}
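The 'U' branch corresponds to Ruby objects serialized through marshal_dump: a symbol with the class name comes first, followed by whatever marshal_dump returned (an array for the MLlib wrappers above). A small Ruby illustration using a hypothetical Point class:

class Point
  def initialize(x)
    @x = x
  end

  def marshal_dump
    [@x]
  end

  def marshal_load(array)
    @x = array.first
  end
end

Marshal.dump(Point.new(1)).bytes
# => [4, 8, 85, 58, 10, 80, 111, 105, 110, 116, 91, 6, 105, 6]
#    version header, 'U', symbol "Point", then the dumped array [1]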
================================================
FILE: ext/spark/src/main/scala/RubyAccumulatorParam.scala
================================================
package org.apache.spark.api.ruby
import java.io._
import java.net._
import java.util.{List, ArrayList}
import scala.collection.JavaConversions._
import scala.collection.immutable._
import org.apache.spark._
import org.apache.spark.util.Utils
/**
* Internal class that acts as an `AccumulatorParam` for Ruby accumulators. Inside, it
* collects a list of serialized byte arrays that we pass to Ruby through a socket.
*/
private class RubyAccumulatorParam(serverHost: String, serverPort: Int)
extends AccumulatorParam[List[Array[Byte]]] {
// Utils.checkHost(serverHost, "Expected hostname")
val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536)
// Socket should not be serialized
// Otherwise: SparkException: Task not serializable
@transient var socket: Socket = null
@transient var socketOutputStream: DataOutputStream = null
@transient var socketInputStream: DataInputStream = null
def openSocket(){
synchronized {
if (socket == null || socket.isClosed) {
socket = new Socket(serverHost, serverPort)
socketInputStream = new DataInputStream(new BufferedInputStream(socket.getInputStream, bufferSize))
socketOutputStream = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize))
}
}
}
override def zero(value: List[Array[Byte]]): List[Array[Byte]] = new ArrayList
override def addInPlace(val1: List[Array[Byte]], val2: List[Array[Byte]]) : List[Array[Byte]] = synchronized {
if (serverHost == null) {
// This happens on the worker node, where we just want to remember all the updates
val1.addAll(val2)
val1
} else {
// This happens on the master, where we pass the updates to Ruby through a socket
openSocket()
socketOutputStream.writeInt(val2.size)
for (array <- val2) {
socketOutputStream.writeInt(array.length)
socketOutputStream.write(array)
}
socketOutputStream.flush()
// Wait for acknowledgement
// http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
//
// if(in.readInt() != RubyConstant.ACCUMULATOR_ACK){
// throw new SparkException("Accumulator was not acknowledged")
// }
new ArrayList
}
}
}
================================================
FILE: ext/spark/src/main/scala/RubyBroadcast.scala
================================================
package org.apache.spark.api.ruby
import org.apache.spark.api.python.PythonBroadcast
/**
* A wrapper for a Ruby Broadcast, which is written to disk by Ruby. It will also
* write the data to disk after deserialization, so Ruby can read it back from disk.
*
* The class reuses the Python logic - kept only for its semantics.
*/
class RubyBroadcast(@transient var _path: String, @transient var id: java.lang.Long) extends PythonBroadcast(_path) {
}
================================================
FILE: ext/spark/src/main/scala/RubyConstant.scala
================================================
package org.apache.spark.api.ruby
object RubyConstant {
val DATA_EOF = -2
val WORKER_ERROR = -1
val WORKER_DONE = 0
val CREATE_WORKER = 1
val KILL_WORKER = 2
val KILL_WORKER_AND_WAIT = 3
val SUCCESSFULLY_KILLED = 4
val UNSUCCESSFUL_KILLING = 5
val ACCUMULATOR_ACK = 6
}
================================================
FILE: ext/spark/src/main/scala/RubyMLLibAPI.scala
================================================
package org.apache.spark.mllib.api.ruby
import java.util.ArrayList
import scala.collection.JavaConverters._
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.api.python.MLLibAPI
class RubyMLLibAPI extends MLLibAPI {
// trainLinearRegressionModelWithSGD
// trainLassoModelWithSGD
// trainRidgeModelWithSGD
// trainLogisticRegressionModelWithSGD
// trainLogisticRegressionModelWithLBFGS
// trainSVMModelWithSGD
// trainKMeansModel
// trainGaussianMixtureModel
// Rjb has a problem with theta: Array[Array[Double]]
override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {
val model = NaiveBayes.train(data.rdd, lambda)
List(
Vectors.dense(model.labels),
Vectors.dense(model.pi),
model.theta.toSeq
).map(_.asInstanceOf[Object]).asJava
}
// In Python, wt is just an Object
def predictSoftGMM(
data: JavaRDD[Vector],
wt: ArrayList[Object],
mu: ArrayList[Object],
si: ArrayList[Object]): RDD[Array[Double]] = {
// val weight = wt.asInstanceOf[Array[Double]]
val weight = wt.toArray.map(_.asInstanceOf[Double])
val mean = mu.toArray.map(_.asInstanceOf[DenseVector])
val sigma = si.toArray.map(_.asInstanceOf[DenseMatrix])
val gaussians = Array.tabulate(weight.length){
i => new MultivariateGaussian(mean(i), sigma(i))
}
val model = new GaussianMixtureModel(weight, gaussians)
model.predictSoft(data)
}
}
================================================
FILE: ext/spark/src/main/scala/RubyMLLibUtilAPI.scala
================================================
package org.apache.spark.mllib.api.ruby
import java.util.ArrayList
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
object RubyMLLibUtilAPI {
// Ruby does have a problem with creating Array[Double]
def generateLinearInput(
intercept: Double,
weights: ArrayList[String],
nPoints: Int,
seed: Int,
eps: Double = 0.1): Seq[LabeledPoint] = {
LinearDataGenerator.generateLinearInput(intercept, weights.toArray.map(_.toString.toDouble), nPoints, seed, eps)
}
}
================================================
FILE: ext/spark/src/main/scala/RubyPage.scala
================================================
package org.apache.spark.ui.ruby
// import javax.servlet.http.HttpServletRequest
// import scala.xml.Node
// import org.apache.spark.ui.{WebUIPage, UIUtils}
// import org.apache.spark.util.Utils
// private[ui] class RubyPage(parent: RubyTab, rbConfig: Array[Tuple2[String, String]]) extends WebUIPage("") {
// def render(request: HttpServletRequest): Seq[Node] = {
// val content = UIUtils.listingTable(header, row, rbConfig)
// UIUtils.headerSparkPage("Ruby Config", content, parent)
// }
// private def header = Seq(
// "Number"
// )
// private def row(keyValue: (String, String)): Seq[Node] = {
// // scalastyle:off
// keyValue match {
// case (key, value) =>
// <tr>
// <td>{key}</td>
// <td>{value}</td>
// </tr>
// }
// // scalastyle:on
// }
// }
class RubyPage {}
================================================
FILE: ext/spark/src/main/scala/RubyRDD.scala
================================================
package org.apache.spark.api.ruby
import java.io._
import java.net._
import java.util.{List, ArrayList, Collections}
import scala.util.Try
import scala.reflect.ClassTag
import scala.collection.JavaConversions._
import org.apache.spark._
import org.apache.spark.{SparkEnv, Partition, SparkException, TaskContext}
import org.apache.spark.api.ruby._
import org.apache.spark.api.ruby.marshal._
import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
import org.apache.spark.InterruptibleIterator
/* =================================================================================================
* Class RubyRDD
* =================================================================================================
*/
class RubyRDD(
@transient parent: RDD[_],
command: Array[Byte],
broadcastVars: ArrayList[Broadcast[RubyBroadcast]],
accumulator: Accumulator[List[Array[Byte]]])
extends RDD[Array[Byte]](parent){
val bufferSize = conf.getInt("spark.buffer.size", 65536)
val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this)
override def getPartitions: Array[Partition] = firstParent.partitions
override val partitioner = None
/* ------------------------------------------------------------------------------------------ */
override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
val env = SparkEnv.get
// Get worker and id
val (worker, workerId) = RubyWorker.create(env)
// Start a thread to feed the process input from our parent's iterator
val writerThread = new WriterThread(env, worker, split, context)
context.addTaskCompletionListener { context =>
writerThread.shutdownOnTaskCompletion()
writerThread.join()
// Cleanup the worker socket. This will also cause the worker to exit.
try {
RubyWorker.remove(worker, workerId)
worker.close()
} catch {
case e: Exception => logWarning("Failed to close worker socket", e)
}
}
val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize))
// Send data
writerThread.start()
// For violent termination of worker
new MonitorThread(workerId, worker, context).start()
// Return an iterator that reads lines from the process's stdout
val stdoutIterator = new StreamReader(stream, writerThread, context)
// An iterator that wraps around an existing iterator to provide task killing functionality.
new InterruptibleIterator(context, stdoutIterator)
} // end compute
/* ------------------------------------------------------------------------------------------ */
class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext)
extends Thread("stdout writer for worker") {
@volatile private var _exception: Exception = null
setDaemon(true)
// Contains the exception thrown while writing the parent iterator to the process.
def exception: Option[Exception] = Option(_exception)
// Terminates the writer thread, ignoring any exceptions that may occur due to cleanup.
def shutdownOnTaskCompletion() {
assert(context.isCompleted)
this.interrupt()
}
// -------------------------------------------------------------------------------------------
// Send the necessary data for worker
// - split index
// - command
// - iterator
override def run(): Unit = Utils.logUncaughtExceptions {
try {
SparkEnv.set(env)
val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize)
val dataOut = new DataOutputStream(stream)
// Partition index
dataOut.writeInt(split.index)
// Spark files
PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut)
// Broadcast variables
dataOut.writeInt(broadcastVars.length)
for (broadcast <- broadcastVars) {
dataOut.writeLong(broadcast.value.id)
PythonRDD.writeUTF(broadcast.value.path, dataOut)
}
// Serialized command
dataOut.writeInt(command.length)
dataOut.write(command)
// Send it
dataOut.flush()
// Data
PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut)
dataOut.writeInt(RubyConstant.DATA_EOF)
dataOut.flush()
} catch {
case e: Exception if context.isCompleted || context.isInterrupted =>
logDebug("Exception thrown after task completion (likely due to cleanup)", e)
case e: Exception =>
// We must avoid throwing exceptions here, because the thread uncaught exception handler
// will kill the whole executor (see org.apache.spark.executor.Executor).
_exception = e
} finally {
Try(worker.shutdownOutput()) // kill worker process
}
}
} // end WriterThread
/* ------------------------------------------------------------------------------------------ */
class StreamReader(stream: DataInputStream, writerThread: WriterThread, context: TaskContext) extends Iterator[Array[Byte]] {
def hasNext = _nextObj != null
var _nextObj = read()
// -------------------------------------------------------------------------------------------
def next(): Array[Byte] = {
val obj = _nextObj
if (hasNext) {
_nextObj = read()
}
obj
}
// -------------------------------------------------------------------------------------------
private def read(): Array[Byte] = {
if (writerThread.exception.isDefined) {
throw writerThread.exception.get
}
try {
stream.readInt() match {
case length if length > 0 =>
val obj = new Array[Byte](length)
stream.readFully(obj)
obj
case RubyConstant.WORKER_DONE =>
val numAccumulatorUpdates = stream.readInt()
(1 to numAccumulatorUpdates).foreach { _ =>
val updateLen = stream.readInt()
val update = new Array[Byte](updateLen)
stream.readFully(update)
accumulator += Collections.singletonList(update)
}
null
case RubyConstant.WORKER_ERROR =>
// Exception from worker
// message
val length = stream.readInt()
val obj = new Array[Byte](length)
stream.readFully(obj)
// stackTrace
val stackTraceLen = stream.readInt()
val stackTrace = new Array[String](stackTraceLen)
(0 until stackTraceLen).foreach { i =>
val length = stream.readInt()
val obj = new Array[Byte](length)
stream.readFully(obj)
stackTrace(i) = new String(obj, "utf-8")
}
// Worker will be killed
stream.close
// exception
val exception = new RubyException(new String(obj, "utf-8"), writerThread.exception.getOrElse(null))
exception.appendToStackTrace(stackTrace)
throw exception
}
} catch {
case e: Exception if context.isInterrupted =>
logDebug("Exception thrown after task interruption", e)
throw new TaskKilledException
case e: Exception if writerThread.exception.isDefined =>
logError("Worker exited unexpectedly (crashed)", e)
throw writerThread.exception.get
case eof: EOFException =>
throw new SparkException("Worker exited unexpectedly (crashed)", eof)
}
}
} // end StreamReader
/* ---------------------------------------------------------------------------------------------
* Monitor thread for controlling the worker. Kills the worker if the task is interrupted.
*/
class MonitorThread(workerId: Long, worker: Socket, context: TaskContext)
extends Thread("Worker Monitor for worker") {
setDaemon(true)
override def run() {
// Kill the worker if it is interrupted, checking until task completion.
while (!context.isInterrupted && !context.isCompleted) {
Thread.sleep(2000)
}
if (!context.isCompleted) {
try {
logWarning("Incomplete task interrupted: Attempting to kill Worker "+workerId.toString())
RubyWorker.kill(workerId)
} catch {
case e: Exception =>
logError("Exception when trying to kill worker "+workerId.toString(), e)
}
}
}
} // end MonitorThread
} // end RubyRDD
/* =================================================================================================
* Class PairwiseRDD
* =================================================================================================
*
* Form an RDD[(Long, Array[Byte])] from key-value pairs returned from Ruby.
* This is used by RubySpark's shuffle operations.
* Borrowed from the Python package -> needs its own deserializeLongValue handling ->
* Marshal adds the same 4-byte header
*/
class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) {
override def getPartitions = prev.partitions
override def compute(split: Partition, context: TaskContext) =
prev.iterator(split, context).grouped(2).map {
case Seq(a, b) => (Utils.deserializeLongValue(a.reverse), b)
case x => throw new SparkException("PairwiseRDD: unexpected value: " + x)
}
val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this)
}
/* =================================================================================================
* Object RubyRDD
* =================================================================================================
*/
object RubyRDD extends Logging {
def runJob(
sc: SparkContext,
rdd: JavaRDD[Array[Byte]],
partitions: ArrayList[Int],
allowLocal: Boolean,
filename: String): String = {
type ByteArray = Array[Byte]
type UnrolledPartition = Array[ByteArray]
val allPartitions: Array[UnrolledPartition] =
sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
writeRDDToFile(flattenedPartition.iterator, filename)
}
def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
try {
while (true) {
val length = file.readInt()
val obj = new Array[Byte](length)
file.readFully(obj)
objs.append(obj)
}
} catch {
case eof: EOFException => {}
}
JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
}
def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
try {
PythonRDD.writeIteratorToStream(items, file)
} finally {
file.close()
}
filename
}
def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
writeRDDToFile(rdd.collect.iterator, filename)
}
def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
sc.broadcast(new RubyBroadcast(path, id))
}
/**
* Convert an RDD of serialized Ruby objects into an RDD of objects that are usable from Java.
*/
def toJava(rbRDD: JavaRDD[Array[Byte]], batched: Boolean): JavaRDD[Any] = {
rbRDD.rdd.mapPartitions { iter =>
iter.flatMap { item =>
val obj = Marshal.load(item)
if(batched){
obj.asInstanceOf[Array[_]]
}
else{
Seq(obj)
}
}
}.toJavaRDD()
}
/**
* Convert an RDD of Java objects into an RDD of serialized Ruby objects that is usable from Ruby.
*/
def toRuby(jRDD: JavaRDD[_]): JavaRDD[Array[Byte]] = {
jRDD.rdd.mapPartitions { iter => new IterableMarshaller(iter) }
}
}
/* =================================================================================================
* Class RubyException
* =================================================================================================
*/
class RubyException(msg: String, cause: Exception) extends RuntimeException(msg, cause) {
def appendToStackTrace(toAdd: Array[String]) {
val newStackTrace = getStackTrace.toBuffer
val regexpMatch = "(.*):([0-9]+):in `([a-z]+)'".r
for(item <- toAdd) {
item match {
case regexpMatch(fileName, lineNumber, methodName) =>
newStackTrace += new StackTraceElement("RubyWorker", methodName, fileName, lineNumber.toInt)
case _ =>
}
}
setStackTrace(newStackTrace.toArray)
}
}
================================================
FILE: ext/spark/src/main/scala/RubySerializer.scala
================================================
package org.apache.spark.api.ruby
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.ruby.marshal._
/* =================================================================================================
* object RubySerializer
* =================================================================================================
*/
object RubySerializer { }
================================================
FILE: ext/spark/src/main/scala/RubyTab.scala
================================================
package org.apache.spark.ui.ruby
import scala.collection.mutable.HashMap
import org.apache.spark.ui._
// class RubyTab(parent: SparkUI, rbConfig: HashMap[String, String]) extends SparkUITab(parent, "ruby"){
// attachPage(new RubyPage(this, rbConfig.toArray))
// }
class RubyTab {}
================================================
FILE: ext/spark/src/main/scala/RubyUtils.scala
================================================
package org.apache.spark.api.ruby
import org.apache.spark.util._
import org.apache.spark.{SparkConf, Logging}
object RubyUtils extends Logging {
def loadPropertiesFile(conf: SparkConf, path: String): String = {
Utils.getPropertiesFromFile(path).foreach {
case (key, value) => conf.set(key, value)
}
path
}
}
================================================
FILE: ext/spark/src/main/scala/RubyWorker.scala
================================================
package org.apache.spark.api.ruby
import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}
import java.net.{InetAddress, ServerSocket, Socket, SocketException}
import java.nio.file.Paths
import scala.collection.mutable
import scala.collection.JavaConversions._
import org.apache.spark._
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.util.Utils
import org.apache.spark.util.RedirectThread
/* =================================================================================================
* Object RubyWorker
* =================================================================================================
*
* Create and store server for creating workers.
*/
object RubyWorker extends Logging {
val PROCESS_WAIT_TIMEOUT = 10000
private var serverSocket: ServerSocket = null
private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
private var serverPort: Int = 0
private var master: ExecutedFileCommand = null
private var masterSocket: Socket = null
private var masterOutputStream: DataOutputStream = null
private var masterInputStream: DataInputStream = null
private var workers = new mutable.WeakHashMap[Socket, Long]()
/* ----------------------------------------------------------------------------------------------
* Create a new worker, but first check whether the SocketServer and master process exist.
* If not, they will be created. Worker creation gets 2 chances.
*/
def create(env: SparkEnv): (Socket, Long) = {
synchronized {
// Create the server if it hasn't been started
createServer(env)
// Attempt to connect, restart and retry once if it fails
try {
createWorker
} catch {
case exc: SocketException =>
logWarning("Worker unexpectedly quit, attempting to restart")
createWorker
}
}
}
/* ----------------------------------------------------------------------------------------------
* Create a worker through the master process. Returns the new socket and id.
* According to spark.ruby.worker.type the id will be:
* process: PID
* thread: thread object id
*/
def createWorker: (Socket, Long) = {
synchronized {
masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)
var socket = serverSocket.accept()
var id = new DataInputStream(socket.getInputStream).readLong()
workers.put(socket, id)
(socket, id)
}
}
/* ----------------------------------------------------------------------------------------------
* Create the SocketServer and bind it to localhost. The max number of queued connections
* is left at the default. If the server is created without an exception -> create the master.
*/
private def createServer(env: SparkEnv){
synchronized {
// Already running?
if(serverSocket != null && masterSocket != null) {
return
}
try {
// Start the SocketServer for communication
serverSocket = new ServerSocket(0, 0, serverHost)
serverPort = serverSocket.getLocalPort
// Create a master for worker creations
createMaster(env)
} catch {
case e: Exception =>
throw new SparkException("There was a problem with creating a server", e)
}
}
}
/* ----------------------------------------------------------------------------------------------
* At this point the SocketServer must already be created. The master process creates and kills workers.
* Creating workers from Java can be an expensive operation because a new process can
* get a copy of the address space.
*/
private def createMaster(env: SparkEnv){
synchronized {
val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER
val executorOptions = env.conf.get("spark.ruby.executor.options", "")
val commandTemplate = env.conf.get("spark.ruby.executor.command")
val workerType = env.conf.get("spark.ruby.worker.type")
// Where the root of ruby-spark is
var executorLocation = ""
if(isDriver){
// Use worker from current active gem location
executorLocation = env.conf.get("spark.ruby.driver_home")
}
else{
// Use gem installed on the system
try {
val homeCommand = (new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))).run
executorLocation = homeCommand.readLine
} catch {
case e: Exception =>
throw new SparkException("Ruby-spark gem is not installed.", e)
}
}
// Master and worker are saved in GEM_ROOT/lib/spark/worker
executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString
// Create master command
// -C: change worker dir before execution
val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort"
val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))
// Start master
master = masterCommand.run
// Redirect master stdout and stderr
redirectStreamsToStderr(master.getInputStream, master.getErrorStream)
// Wait for it to connect to our socket
serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)
try {
// Use the socket for communication. Keep stdout and stderr for logs
masterSocket = serverSocket.accept()
masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)
masterInputStream = new DataInputStream(masterSocket.getInputStream)
PythonRDD.writeUTF(executorOptions, masterOutputStream)
} catch {
case e: Exception =>
throw new SparkException("Ruby master did not connect back in time", e)
}
}
}
/* ----------------------------------------------------------------------------------------------
* Get all environment variables for the executor
*/
def getEnvVars(env: SparkEnv): Map[String, String] = {
val prefix = "spark.ruby.executor.env."
env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}
.map{case (k, v) => (k.substring(prefix.length), v)}
.toMap
}
/* ------------------------------------------------------------------------------------------- */
def kill(workerId: Long){
masterOutputStream.writeInt(RubyConstant.KILL_WORKER)
masterOutputStream.writeLong(workerId)
}
/* ------------------------------------------------------------------------------------------- */
def killAndWait(workerId: Long){
masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)
masterOutputStream.writeLong(workerId)
// Wait for answer
masterInputStream.readInt() match {
case RubyConstant.SUCCESSFULLY_KILLED =>
logInfo(s"Worker $workerId was successfully killed")
case RubyConstant.UNSUCCESSFUL_KILLING =>
logInfo(s"Worker $workerId cannot be killed (maybe is already killed)")
}
}
/* ----------------------------------------------------------------------------------------------
* The workers HashMap is weak, but it avoids a long list of workers which cannot be killed (killAndWait)
*/
def remove(worker: Socket, workerId: Long){
try {
workers.remove(worker)
} catch {
case e: Exception => logWarning(s"Worker $workerId does not exist (maybe it was already removed)")
}
}
/* ------------------------------------------------------------------------------------------- */
def stopServer{
synchronized {
// Kill workers
workers.foreach { case (socket, id) => killAndWait(id) }
// Kill master
master.destroy
// Stop SocketServer
serverSocket.close()
// Clean variables
serverSocket = null
serverPort = 0
master = null
masterSocket = null
masterOutputStream = null
masterInputStream = null
}
}
/* ------------------------------------------------------------------------------------------- */
private def redirectStreamsToStderr(streams: InputStream*) {
try {
for(stream <- streams) {
new RedirectThread(stream, System.err, "stream reader").start()
}
} catch {
case e: Exception =>
logError("Exception in redirecting streams", e)
}
}
/* ------------------------------------------------------------------------------------------- */
}
================================================
FILE: ext/spark/src/test/scala/MarshalSpec.scala
================================================
package org.apache.spark.api.ruby.marshal
import org.scalatest._
import org.apache.spark.api.ruby.marshal._
class MarshalSpec extends FunSpec with Matchers {
// ====================================================================================
// Load
describe("Marshal.load"){
describe("single value"){
it("int"){
val data = 1
val serialized = Array[Byte](4, 8, 105, 6)
Marshal.load(serialized) should equal(data)
}
it("double"){
val data = 1.2
val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)
Marshal.load(serialized) should equal(data)
}
}
describe("array"){
it("ints"){
val data = Array(1, 2, 3, 4, 5)
val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
Marshal.load(serialized) should equal(data)
}
it("doubles"){
val data = Array(1.1, 2.2, 3.3)
val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
Marshal.load(serialized) should equal(data)
}
}
}
// ====================================================================================
// Dump
describe("Marshal.dump"){
describe("single value"){
it("int"){
val data = 1
val serialized = Array(4, 8, 105, 6)
Marshal.dump(data) should equal(serialized)
}
it("double"){
val data = 1.2
val serialized = Array(4, 8, 102, 8, 49, 46, 50)
Marshal.dump(data) should equal(serialized)
}
}
describe("array"){
it("ints"){
val data = Array(1, 2, 3, 4, 5)
val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
Marshal.dump(data) should equal(serialized)
}
it("doubles"){
val data = Array(1.1, 2.2, 3.3)
val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
Marshal.dump(data) should equal(serialized)
}
}
}
}
================================================
FILE: lib/ruby-spark.rb
================================================
require_relative 'spark'
================================================
FILE: lib/spark/accumulator.rb
================================================
module Spark
##
# A shared variable that can be accumulated, i.e., has a commutative and associative "add"
# operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=`
# operator, but only the driver program is allowed to access its value, using value.
# Updates from the workers get propagated automatically to the driver program.
#
# == Arguments:
# value::
# Initial value for the accumulator. This value is stored only on the driver process
#
# accum_param::
# How to merge 2 values on the worker or driver process.
# Symbol or Proc (or String)
#
# zero_value::
# Initial value for worker process
#
#
# == Examples:
#
# accum1 = $sc.accumulator(1)
# accum2 = $sc.accumulator(2, :*, 1)
# accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max})
#
# accum1 += 1
#
# accum2.add(2)
# accum2.add(2)
# accum2.add(2)
#
# accum3.add(9)
# accum3.add(6)
# accum3.add(7)
#
# accum1.value # => 2
# accum2.value # => 16
# accum3.value # => 9
#
# func = Proc.new do |_, index|
# accum1.add(1)
# accum2.add(2)
# accum3.add(index * 10)
# end
#
# rdd = $sc.parallelize(0..4, 4)
# rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
# rdd = rdd.map_partitions_with_index(func)
# rdd.collect
#
# accum1.value # => 6
# accum2.value # => 256
# accum3.value # => 30
#
class Accumulator
attr_reader :id, :value, :accum_param, :zero_value
@@instances = {}
@@changed = []
SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**]
# =========================================================================
# Creating and selecting Spark::Accumulator
def initialize(value, accum_param=:+, zero_value=0)
@id = object_id
@value = value
@accum_param = accum_param
@zero_value = zero_value
@driver = true
valid_accum_param
@@instances[@id] = self
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{ ID: #{@id}\n}
result << %{ Zero: #{@zero_value.to_s[0, 10]}\n}
result << %{Value: #{@value.to_s[0, 10]}>}
result
end
def self.changed
@@changed
end
def self.instances
@@instances
end
def valid_accum_param
if @accum_param.is_a?(Symbol)
raise Spark::AccumulatorError, "Unsupported symbol #{@accum_param}" unless SUPPORTED_SYMBOLS.include?(@accum_param)
@serialized_accum_param = @accum_param
return
end
if @accum_param.is_a?(Proc)
begin
@serialized_accum_param = @accum_param.to_source
return
rescue
raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.'
end
end
if @accum_param.is_a?(String)
@serialized_accum_param = @accum_param
@accum_param = eval(@accum_param)
unless @accum_param.is_a?(Proc)
raise Spark::SerializeError, 'Your param is not a Proc.'
end
return
end
raise Spark::AccumulatorError, 'Unsupported param. Use Symbol, Proc or String.'
end
# Driver process or worker
def driver?
@driver
end
# =========================================================================
# Operations
def add(term)
if !driver? && !@@changed.include?(self)
@@changed << self
end
if @accum_param.is_a?(Proc)
@value = @accum_param.call(@value, term)
else
add_by_symbol(term)
end
end
def +(term)
add(term)
self
end
def add_by_symbol(term)
case @accum_param
when :+
@value += term
when :-
@value -= term
when :*
@value *= term
when :/
@value /= term
when :**
@value **= term
end
end
# =========================================================================
# Dump and load
def marshal_dump
[@id, @zero_value, @serialized_accum_param]
end
def marshal_load(array)
@id, @zero_value, @serialized_accum_param = array
@value = @zero_value
@driver = false
load_accum_param
end
def load_accum_param
if @serialized_accum_param.is_a?(String)
@accum_param = eval(@serialized_accum_param)
else
@accum_param = @serialized_accum_param
end
end
end
end
# =============================================================================
# Server for handling Accumulator updates
#
module Spark
class Accumulator
class Server
attr_reader :server, :host, :port
def self.start
@instance ||= Spark::Accumulator::Server.new
end
def self.stop
@instance && @instance.stop
end
def self.host
start
@instance.host
end
def self.port
start
@instance.port
end
def initialize
@server = TCPServer.new(0)
@host = @server.hostname
@port = @server.port
@threads = []
handle_accept
end
def stop
@threads.each(&:kill)
rescue
nil
end
def handle_accept
@threads << Thread.new do
loop {
handle_connection(@server.accept)
}
end
end
def handle_connection(socket)
@threads << Thread.new do
until socket.closed?
count = socket.read_int
count.times do
data = socket.read_data
accum = Spark::Accumulator.instances[data[0]]
if accum
accum.add(data[1])
else
Spark.logger.warn("Accumulator with id #{data[0]} does not exist.")
end
end
# http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
# socket.write_int(Spark::Constant::ACCUMULATOR_ACK)
end
end
end
end
end
end
================================================
FILE: lib/spark/broadcast.rb
================================================
module Spark
##
# Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
# object for reading it in distributed functions. The variable will
# be sent to each cluster only once.
#
# == Example:
#
# broadcast1 = $sc.broadcast('a')
# broadcast2 = $sc.broadcast('b')
# broadcast3 = $sc.broadcast([1,2,3])
#
# func = Proc.new do |part, index|
# [
# broadcast1.value * index,
# broadcast2.value * index,
# broadcast3.value.reduce(:+)
# ]
# end
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3)
# rdd = rdd.map_partitions_with_index(func)
# rdd.collect
# # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6]
#
class Broadcast
LOADED = 0 # id, value, path
NOT_LOADED = 1 # id, path
WITHOUT_PATH = 2 # id
attr_reader :id, :state, :path, :jbroadcast
@@registered = {}
# =========================================================================
# Creating broadcast for SparkContext
# Create new Broadcast and dump value to the disk
#
# b = $sc.broadcast('a')
#
# b.value # => 'a'
# b.path
# b.jbroadcast
#
def initialize(sc, value)
@id = object_id
@value = value
@state = LOADED
file = Tempfile.create('broadcast', sc.temp_dir)
file.binmode
file.write(Marshal.dump(value))
file.close
@path = file.path
@jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id))
ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{ ID: #{@id}\n}
result << %{Value: #{@value.to_s[0, 10]}>}
result
end
def self.register(id, path)
@@registered[id] = path
end
def value
case state
when LOADED
@value
when NOT_LOADED
@value = Marshal.load(File.read(@path))
@state = LOADED
@value
when WITHOUT_PATH
@path = @@registered[id]
if @path
@state = NOT_LOADED
value
else
raise Spark::BroadcastError, "Broadcast #{@id} does not have a registered path."
end
end
end
def marshal_dump
@id
end
def marshal_load(id)
@id = id
@state = WITHOUT_PATH
end
end
end
================================================
FILE: lib/spark/build.rb
================================================
module Spark
module Build
DEFAULT_SCALA_VERSION = '2.10.4'
DEFAULT_CORE_VERSION = '2.10'
DEFAULT_SPARK_VERSION = '1.6.0'
DEFAULT_HADOOP_VERSION = '1.0.4'
SBT = 'sbt/sbt'
SBT_DEPS = 'assemblyPackageDependency'
SBT_EXT = 'package'
SBT_CLEAN = 'clean'
def self.build(options={})
scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION
spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION
spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION
hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION
target = options[:target] || Spark.target_dir
only_ext = options[:only_ext] || false
env = {
'SCALA_VERSION' => scala_version,
'SPARK_VERSION' => spark_version,
'SPARK_CORE_VERSION' => spark_core_version,
'HADOOP_VERSION' => hadoop_version,
'TARGET_DIR' => target
}
cmd = [SBT]
cmd << SBT_EXT
cmd << SBT_DEPS unless only_ext
cmd << SBT_CLEAN unless $DEBUG
Dir.chdir(Spark.spark_ext_dir) do
unless Kernel.system(env, cmd.join(' '))
raise Spark::BuildError, 'Spark cannot be assembled.'
end
end
end
end
end
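A minimal sketch of calling the builder directly (it is normally reached through the `ruby-spark build` CLI command); the option keys map onto the environment variables passed to sbt, and the values shown are only illustrative:

require 'ruby-spark'

Spark::Build.build(
  spark_version:  '1.6.0',
  hadoop_version: '1.0.4',
  target:         'target',   # directory where ruby-spark.jar ends up
  only_ext:       false       # false => also run assemblyPackageDependency
)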
================================================
FILE: lib/spark/cli.rb
================================================
require 'commander'
module Commander
module UI
# Disable paging
# for 'classic' help
def self.enable_paging
end
end
end
module Spark
class CLI
include Commander::Methods
# IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
# IRB_HISTORY_SIZE = 100
def run
program :name, 'RubySpark'
program :version, Spark::VERSION
program :description, 'Ruby wrapper for Spark'
global_option('-d', '--debug', 'Log messages to stdout'){ $DEBUG = true }
default_command :help
# Build ---------------------------------------------------------------
command :build do |c|
c.syntax = 'build [options]'
c.description = 'Build spark and gem extensions'
c.option '--hadoop-version STRING', String, 'Version of Hadoop which will be assembled with Spark'
c.option '--spark-core-version STRING', String, 'Version of Spark core'
c.option '--spark-version STRING', String, 'Version of Spark'
c.option '--scala-version STRING', String, 'Version of Scala'
c.option '--target STRING', String, 'Directory where Spark will be stored'
c.option '--only-ext', 'Build only extension for RubySpark'
c.action do |args, options|
Spark::Build.build(options.__hash__)
puts
puts 'Everything is OK'
end
end
alias_command :install, :build
# Shell -----------------------------------------------------------------
command :shell do |c|
c.syntax = 'shell [options]'
c.description = 'Start a Ruby shell for Spark'
c.option '--target STRING', String, 'Directory where Spark is stored'
c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
c.option '--[no-]start', 'Start Spark immediately'
c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
c.option '--auto-reload', 'Autoreload changed files'
c.action do |args, options|
options.default start: true, logger: true
Spark.load_lib(options.target)
Spark.logger.disable unless options.logger
Spark.config do
set_app_name 'RubySpark'
end
Spark.config.from_file(options.properties_file)
if options.auto_reload
require 'listen'
listener = Listen.to(File.join(Spark.root, 'lib')) do |modified, added, removed|
(modified+added).each do |file|
silence_warnings { load(file) }
end
end
listener.start
end
if options.start
# Load Java and Spark
Spark.start
$sc = Spark.context
Spark.print_logo('Spark context is loaded as $sc')
else
Spark.print_logo('You can start Spark with Spark.start')
end
# Load Pry
require 'pry'
Pry.start
end
end
# # IRB -------------------------------------------------------------------
# command :irb do |c|
# c.syntax = 'irb [options]'
# c.description = 'Start ruby shell for spark'
# c.option '--spark-home STRING', String, 'Directory where Spark is stored'
# c.option '--[no-]start', 'Start Spark immediately'
# c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
#
# c.action do |args, options|
# options.default start: true, logger: true
#
# Spark.load_lib(options.spark_home)
# Spark::Logger.disable unless options.logger
#
# Spark.config do
# set_app_name 'Pry RubySpark'
# end
#
# if options.start
# # Load Java and Spark
# Spark.start
# $sc = Spark.context
#
# Spark.print_logo('Spark context is loaded as $sc')
# else
# Spark.print_logo('You can start Spark with Spark.start')
# end
#
# # Load IRB
# require 'irb'
# require 'irb/completion'
# require 'irb/ext/save-history'
#
# begin
# file = File.expand_path(IRB_HISTORY_FILE)
# if File.exists?(file)
# lines = IO.readlines(file).collect { |line| line.chomp }
# Readline::HISTORY.push(*lines)
# end
# Kernel.at_exit do
# lines = Readline::HISTORY.to_a.reverse.uniq.reverse
# lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
# File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
# end
# rescue
# end
#
# ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
# ARGV.concat ['--readline', '--prompt-mode', 'simple']
# IRB.start
# end
# end
# Home ------------------------------------------------------------------
command :home do |c|
c.action do |args, options|
puts Spark.home
exit(0)
end
end
# Ruby spark jar --------------------------------------------------------
command :ruby_spark_jar do |c|
c.action do |args, options|
puts Spark.ruby_spark_jar
exit(0)
end
end
run!
end
end
end
================================================
FILE: lib/spark/command/base.rb
================================================
##
# Spark::Command::Base
#
# Parent for all commands (Map, FlatMap, Sort, ...)
#
class Spark::Command::Base
DEFAULT_VARIABLE_OPTIONS = {
type: Hash,
function: true
}
def initialize(*args)
settings.variables.each do |name, options|
instance_variable_set("@#{name}", args.shift)
end
end
def to_s
self.class.name.split('::').last
end
def self.error(message)
raise Spark::CommandError, message
end
def error(message)
self.class.error(message)
end
def log(message=nil)
$stdout.puts %{==> #{Time.now.strftime("%H:%M:%S")} [#{self.class.name}] #{message}}
$stdout.flush
end
# ===============================================================================================
# Methods called during class loading
# This is not the nicest way, but these methods set/get class variables for the child classes
# Settings for command (variables)
def self.settings
init_settings
class_variable_get(:@@settings)
end
def settings
self.class.settings
end
# Init empty settings
def self.init_settings
if !class_variable_defined?(:@@settings)
struct = Struct.new(:variables)
class_variable_set(:@@settings, struct.new)
settings.variables = {}
end
end
# New variable for command
#
# == Example:
#
# class Map < Spark::Command::Base
# variable :map_function
# end
#
# command = Map.new(1)
#
# command.instance_variables
# # => [:@map_function]
# command.instance_variable_get(:@map_function)
# # => 1
#
def self.variable(name, options={})
if settings.variables.has_key?(name)
error "Function #{name} already exist."
end
settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options)
end
# ===============================================================================================
# Executing methods
# Execute command for data and split index
def execute(iterator, split_index)
# Implemented on Base but can be overridden
before_run
# Run has to be implemented on the child class
if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run)
return lazy_run(iterator, split_index)
end
iterator = iterator.to_a
run(iterator, split_index)
end
def prepared?
!!@prepared
end
# This is called before execution. Execution will be stopped if
# some command contains an error (e.g. a badly serialized lambda).
#
# == What does it do?
# * evaluates lambdas
# * evaluates methods
# * makes new lambdas
#
def prepare
return if prepared?
to_function = settings.variables.select {|_, options| options[:function]}
to_function.each do |name, options|
name = "@#{name}"
data = instance_variable_get(name)
case data[:type]
when 'proc'
result = eval(data[:content])
when 'symbol'
result = lambda(&data[:content])
when 'method'
# Method must be added to the instance, not the Class
instance_eval(data[:content])
# Method will be available as Proc
result = lambda(&method(data[:name]))
end
instance_variable_set(name, result)
end
@prepared = true
end
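# == Example (a hedged sketch of what prepare does on the worker):
#
# A function arrives as a plain Hash produced by
# Spark::CommandBuilder#serialize_function and is turned back into
# something callable:
#
#   command = Spark::Command::Map.new({type: 'proc', content: 'lambda{|x| x * 2}'})
#   command.prepare
#   command.instance_variable_get(:@map_function).call(3)
#   # => 6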
# This method is called before every execution.
def before_run
end
# ===============================================================================================
# Bound objects
attr_accessor :__objects__
def method_missing(method, *args, &block)
if __objects__ && __objects__.has_key?(method)
return __objects__[method]
end
super
end
end
================================================
FILE: lib/spark/command/basic.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Map
class Spark::Command::Map < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |item|
@map_function.call(item)
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
@map_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# FlatMap
class Spark::Command::FlatMap < Spark::Command::Map
def run(iterator, *)
iterator = super
iterator.flatten!(1)
iterator
end
def lazy_run(iterator, *)
iterator.flat_map do |item|
@map_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# MapPartitionsWithIndex
class Spark::Command::MapPartitionsWithIndex < _Base
variable :partition_function
def run(iterator, index)
iterator = @partition_function.call(iterator, index)
iterator
end
# The user should control whether there is an Enumerator or not
# alias_method :lazy_run, :run
end
# -------------------------------------------------------------------------------------------------
# MapPartitions
class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex
def run(iterator, *)
# Do not use `super` because `@partition_function` can be method with 1 argument
iterator = @partition_function.call(iterator)
iterator
end
# alias_method :lazy_run, :run
end
# -------------------------------------------------------------------------------------------------
# Filter
class Spark::Command::Filter < _Base
variable :filter_function
def run(iterator, *)
iterator.select! do |item|
@filter_function.call(item)
end
iterator
end
def lazy_run(iterator, *)
iterator.select do |item|
@filter_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# Compact
class Spark::Command::Compact < _Base
def run(iterator, *)
iterator.compact!
iterator
end
def lazy_run(iterator, *)
iterator.select do |item|
!item.nil?
end
end
end
# -------------------------------------------------------------------------------------------------
# Glom
class Spark::Command::Glom < _Base
def run(iterator, *)
[iterator]
end
def lazy_run(iterator, *)
run(iterator.to_a)
end
end
# -------------------------------------------------------------------------------------------------
# Shuffle
class Spark::Command::Shuffle < _Base
variable :seed, function: false, type: Integer
def run(iterator, *)
iterator.shuffle!(random: rng)
iterator
end
def rng
Random.new(@seed)
end
end
# -------------------------------------------------------------------------------------------------
# PartitionBy
class Spark::Command::PartitionBy
class Base < Spark::Command::Base
include Spark::Helper::Serialize
def prepare
super
# Default. Keep it after super because Sorting has its own key_function.
@key_function ||= lambda{|x| x[0]}
end
def run(iterator, *)
iterator.map! do |item|
make_partition_item(item)
end
iterator.flatten!(1)
iterator
end
def lazy_run(iterator, *)
iterator.flat_map do |item|
make_partition_item(item)
end
end
private
def make_partition_item(item)
[
pack_long(@partition_func.call(@key_function[item])),
item
]
end
end
class Basic < Base
variable :partition_func
end
class Sorting < Base
variable :key_function
variable :bounds, function: false, type: Array
variable :ascending, function: false, type: [TrueClass, FalseClass]
variable :num_partitions, function: false, type: Numeric
def prepare
super
# Index by the bisect algorithm
@partition_func ||= Proc.new do |key|
count = 0
@bounds.each{|i|
break if i >= key
count += 1
}
if @ascending
count
else
@num_partitions - 1 - count
end
end
end
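# == Example (a hedged sketch of the default @partition_func above):
#
#   With @bounds = [3, 7], @ascending = true and @num_partitions = 3,
#   keys <= 3 go to partition 0, keys 4..7 to partition 1 and
#   keys > 7 to partition 2. With @ascending = false the partition
#   indexes are reversed.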
end # Sorting
end # PartitionBy
# -------------------------------------------------------------------------------------------------
# Aggregate
class Spark::Command::Aggregate < _Base
variable :reduce_func
variable :zero_value, function: false, type: Object
def run(iterator, *)
[iterator.reduce(@zero_value, &@reduce_func)]
end
def lazy_run(iterator, *)
run(iterator)
end
end
# -------------------------------------------------------------------------------------------------
# Reduce
class Spark::Command::Reduce < Spark::Command::Aggregate
def run(iterator, *)
[iterator.reduce(&@reduce_func)]
end
end
# -------------------------------------------------------------------------------------------------
# Foreach
class Spark::Command::Foreach < _Base
variable :each_function
def run(iterator, *)
iterator.each do |item|
@each_function.call(item)
end
nil
end
end
# -------------------------------------------------------------------------------------------------
# ForeachPartition
class Spark::Command::ForeachPartition < _Base
variable :partition_function
def run(iterator, *)
@partition_function.call(iterator)
nil
end
end
# -------------------------------------------------------------------------------------------------
# KeyBy
class Spark::Command::KeyBy < _Base
variable :key_function
def run(iterator, *)
iterator.map! do |item|
[@key_function.call(item), item]
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
[@key_function.call(item), item]
end
end
end
# -------------------------------------------------------------------------------------------------
# Take
class Spark::Command::Take < _Base
variable :total, function: false, type: Numeric
variable :last_part, function: false, type: Numeric
def run(iterator, index)
if index == @last_part && iterator.size > @total
return iterator.slice!(0, @total)
end
iterator
end
end
# -------------------------------------------------------------------------------------------------
# Pipe
class Spark::Command::Pipe < _Base
variable :cmds, function: false, type: Array
def before_run
require 'open3'
@in, @out, @threads = Open3.pipeline_rw(*@cmds)
end
def run(iterator, *)
create_writing_thread(iterator)
new_iterator = []
# Read full input
begin
loop {
new_iterator << @out.readline.rstrip
}
rescue EOFError
end
new_iterator
end
def lazy_run(iterator, *)
create_writing_thread(iterator)
Enumerator::Lazy.new([nil]) do |yielder, _|
begin
loop {
yielder << @out.readline.rstrip
}
rescue EOFError
end
end
end
private
def create_writing_thread(iterator)
@writing_thread = Thread.new do
# Send complete iterator to the pipe
iterator.each do |item|
@in.puts(item.to_s.rstrip)
end
# Input must be closed for EOFError
@in.close
end
end
end
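# == Example (a hedged sketch; assumes RDD#pipe builds this command):
#
#   $sc.parallelize(['a', 'b', 'c']).pipe('cat', "tr 'a-z' 'A-Z'").collect
#   # => ["A", "B", "C"]
#
# Each item is written to the first process, the processes are chained
# with Open3.pipeline_rw and every output line becomes one item.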
================================================
FILE: lib/spark/command/pair.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# CombineByKey
class Spark::Command::CombineByKey
# ---------------
class Base < Spark::Command::Base
def run(iterator, *)
_run(iterator).to_a
end
def lazy_run(iterator, *)
_run(iterator).lazy
end
end
# ---------------
class Combine < Base
variable :create_combiner
variable :merge_value
def _run(iterator)
# Do not use combiners[key] ||= ...
# because it tests for nil, not has_key?
combiners = {}
iterator.each do |key, value|
if combiners.has_key?(key)
combiners[key] = @merge_value.call(combiners[key], value)
else
combiners[key] = @create_combiner.call(value)
end
end
combiners
end
end
# ---------------
class Merge < Base
variable :merge_combiners
def _run(iterator, *)
combiners = {}
iterator.each do |key, value|
if combiners.has_key?(key)
combiners[key] = @merge_combiners.call(combiners[key], value)
else
combiners[key] = value
end
end
combiners
end
end
# ---------------
class CombineWithZero < Base
variable :zero_value, function: false, type: Object
variable :merge_value
def _run(iterator)
# Do not use combiners[key] ||= ...
# because it tests for nil, not has_key?
combiners = {}
iterator.each do |key, value|
unless combiners.has_key?(key)
combiners[key] = @zero_value
end
combiners[key] = @merge_value.call(combiners[key], value)
end
combiners
end
end
# ---------------
end
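# == Example (a hedged sketch of the combiner semantics above):
#
#   create_combiner = lambda{|value| [value]}
#   merge_value     = lambda{|combiner, value| combiner << value}
#
#   Applying Combine to [['a', 1], ['b', 2], ['a', 3]] within one partition
#   builds {'a' => [1, 3], 'b' => [2]}; Merge then joins the per-partition
#   hashes with merge_combiners on the reduce side.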
# -------------------------------------------------------------------------------------------------
# MapValues
class Spark::Command::MapValues < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |item|
item[1] = @map_function.call(item[1])
item
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
item[1] = @map_function.call(item[1])
item
end
end
end
# -------------------------------------------------------------------------------------------------
# FlatMapValues
class Spark::Command::FlatMapValues < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |(key, values)|
values = @map_function.call(values)
values.flatten!(1)
values.map! do |value|
[key, value]
end
end
iterator.flatten!(1)
iterator
end
end
================================================
FILE: lib/spark/command/sort.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Sort
class Spark::Command::SortByKey < _Base
variable :key_function
variable :ascending, function: false, type: [TrueClass, FalseClass]
variable :spilling, function: false, type: [TrueClass, FalseClass]
variable :memory, function: false, type: [Numeric, NilClass]
variable :serializer, function: false, type: Spark::Serializer::Base
# Currently disabled
def before_run
@spilling = false
end
def run(iterator, _)
if @spilling
iterator = run_with_spilling(iterator.each)
else
run_without_spilling(iterator)
end
iterator
end
def run_with_enum(iterator, _)
if @spilling
iterator = run_with_spilling(iterator)
else
iterator = iterator.to_a
run_without_spilling(iterator)
end
iterator
end
private
def run_with_spilling(iterator)
sorter = Spark::ExternalSorter.new(@memory, @serializer)
sorter.sort_by(iterator, @ascending, @key_function)
end
def run_without_spilling(iterator)
iterator.sort_by!(&@key_function)
iterator.reverse! unless @ascending
end
end
================================================
FILE: lib/spark/command/statistic.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Sample
class Spark::Command::Sample < _Base
variable :with_replacement, function: false, type: [TrueClass, FalseClass]
variable :fraction, function: false, type: Numeric
variable :seed, function: false, type: [NilClass, Numeric]
def run(iterator, _)
sampler.sample(iterator)
end
def lazy_run(iterator, _)
sampler.lazy_sample(iterator)
end
def sampler
@sampler ||= _sampler
end
def _sampler
if @with_replacement
sampler = Spark::Sampler::Poisson
else
sampler = Spark::Sampler::Uniform
end
sampler = sampler.new(@fraction, @seed)
end
end
# -------------------------------------------------------------------------------------------------
# Stats
class Spark::Command::Stats < _Base
def run(iterator, *)
[Spark::StatCounter.new(iterator)]
end
def lazy_run(iterator, *)
run(iterator)
end
end
# -------------------------------------------------------------------------------------------------
# Histogram
class Spark::Command::Histogram < _Base
include Spark::Helper::Statistic
variable :even, function: false, type: [TrueClass, FalseClass]
variable :buckets, function: false, type: Array
def run(iterator, *)
counters = Array.new(counter_size) { 0 }
iterator.each do |item|
if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min
next
end
x = bucket_function.call(item)
if x.nil?
# next
else
counters[x] += 1
end
end
[counters]
end
def lazy_run(iterator, *)
run(iterator)
end
private
def min
@buckets.first
end
def max
@buckets.last
end
def counter_size
@buckets.size-1
end
def increment
@buckets[1]-@buckets[0]
end
# Decide which bucket function to pass. We decide here rather than having
# a general function so that the decision need only be made once.
def bucket_function
@bucket_function ||= _bucket_function
end
def _bucket_function
if @even
fast_bucket_function
else
basic_bucket_function
end
end
# Determine the bucket function in constant time.
# Requires that buckets are evenly spaced
def fast_bucket_function
Proc.new do |item|
if item.is_a?(Float) && item.nan?
nil
else
bucket_number = (item - min)/increment
if bucket_number > counter_size || bucket_number < 0
nil
else
[bucket_number.to_i, counter_size-1].min
end
end
end
end
# Basic bucket function. Same as right bisect.
def basic_bucket_function
Proc.new do |item|
bucket_number = bisect_right(@buckets, item) - 1
# Counters is @buckets.size - 1
# [bucket_number, counter_size-1].min
if bucket_number > counter_size-1
counter_size-1
else
bucket_number
end
end
end
end
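# == Example (a hedged sketch):
#
#   With @buckets = [0, 5, 10] and @even = true, counter_size is 2 and
#   increment is 5. For the items [1, 6, 6, 10, 11] the result is [[1, 3]]:
#   1 falls into the first bucket, 6, 6 and 10 into the second (the last
#   bucket is right-inclusive) and 11 is skipped because it is greater
#   than max.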
================================================
FILE: lib/spark/command.rb
================================================
module Spark
##
# Container which includes all commands and other things for the worker.
# Every RDD has its own copy of Command.
#
class Command
attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects
def initialize
@serializer = nil
@deserializer = nil
@commands = []
@libraries = []
@bound_objects = {}
end
def execute(iterator, split_index)
# Require necessary libraries
libraries.each{|lib| require lib}
# Prepare bound objects
@commands.each do |command|
command.__objects__ = bound_objects
end
# Prepare for running
@commands.each(&:prepare)
# Run all tasks
@commands.each do |command|
iterator = command.execute(iterator, split_index)
end
# Return the changed iterator. This is not necessary for tasks
# that change the iterator in place, but some tasks can return
# only one value (for example reduce).
iterator
end
def last
@commands.last
end
def bound_objects
# Objects bound by the user.
# Return them directly if they are already initialized on the worker.
return @bound_objects if @bound_objects
if @serialized_bound_objects
# Still serialized
@bound_objects = Marshal.load(@serialized_bound_objects)
else
# Something else
@bound_objects = {}
end
end
# Bound objects can depend on a library which is loaded during execute.
# In that case the worker would raise "undefined class/module" if they were
# deserialized too early, so they stay serialized until bound_objects is called.
def marshal_dump
[@serializer, @deserializer, @commands, @libraries, serialized_bound_objects]
end
def marshal_load(array)
@serializer = array.shift
@deserializer = array.shift
@commands = array.shift
@libraries = array.shift
@serialized_bound_objects = array.shift
end
private
def serialized_bound_objects
@serialized_bound_objects ||= Marshal.dump(@bound_objects)
end
end
end
require 'spark/command/base'
require 'spark/command/basic'
require 'spark/command/pair'
require 'spark/command/statistic'
require 'spark/command/sort'
================================================
FILE: lib/spark/command_builder.rb
================================================
require 'spark/command_validator'
module Spark
##
# Builder for building correct {Spark::Command}
#
class CommandBuilder
extend Forwardable
include Spark::Helper::Serialize
include Spark::Helper::System
include Spark::CommandValidator
attr_reader :command
def_delegators :@command, :serializer, :serializer=, :deserializer, :deserializer=, :commands,
:commands=, :libraries, :libraries=, :bound_objects, :bound_objects=
def initialize(serializer, deserializer=nil)
create_command
self.serializer = serializer
self.deserializer = deserializer || serializer.dup
end
def create_command
@command = Spark::Command.new
end
# Do not use Marshal.load(Marshal.dump(self)) because some variables
# have marshal_dump prepared for the worker.
def deep_copy
copy = self.dup
copy.create_command
copy.serializer = self.serializer.deep_copy
copy.deserializer = self.deserializer.deep_copy
copy.commands = self.commands.dup
copy.libraries = self.libraries.dup
copy.bound_objects = self.bound_objects.dup
copy
end
# Serialize the Command class for the worker.
# Java uses signed numbers.
def build
unpack_chars(Marshal.dump(@command))
end
def add_command(klass, *args)
variables = klass.settings.variables
validate_size(variables, args)
built_args = []
variables.values.zip(args) do |var, arg|
if var[:function]
arg = serialize_function(arg)
end
validate(arg, var)
built_args << arg
end
comm = klass.new(*built_args)
@command.commands << comm
self
end
def add_library(*libraries)
@command.libraries += libraries
end
def bind(objects)
objects.symbolize_keys!
@command.bound_objects.merge!(objects)
end
private
# A function to be serialized can be given in several forms:
#
# === Func
# * *string:* already serialized proc
# * *proc:* proc
# * *symbol:* name of method
# * *method:* Method class
#
def serialize_function(func)
case func
when String
serialize_function_from_string(func)
when Symbol
serialize_function_from_symbol(func)
when Proc
serialize_function_from_proc(func)
when Method
serialize_function_from_method(func)
else
raise Spark::CommandError, 'You must enter String, Symbol, Proc or Method.'
end
end
def serialize_function_from_string(string)
{type: 'proc', content: string}
end
def serialize_function_from_symbol(symbol)
{type: 'symbol', content: symbol}
end
# Serialize Proc as String
#
# lambda{|x| x*x}.to_source
# # => "proc { |x| (x * x) }"
#
def serialize_function_from_proc(proc)
serialize_function_from_string(proc.to_source)
rescue
raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.'
end
# Serialize method as string
#
# def test(x)
# x*x
# end
# serialize_function_from_method(method(:test))
#
# # => "def test(x)\n x*x\nend\n"
#
def serialize_function_from_method(meth)
if pry?
meth = Pry::Method.new(meth)
end
{type: 'method', name: meth.name, content: meth.source}
rescue
raise Spark::SerializeError, 'Method can not be serialized. Use full path or Proc.'
end
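# == Example (a hedged sketch; the serializer, library and names below are
# only illustrative):
#
#   serializer = Spark::Serializer.build { __batched__(__marshal__) }
#   builder    = Spark::CommandBuilder.new(serializer)
#
#   builder.add_library('bigdecimal')
#   builder.add_command(Spark::Command::Map, 'lambda{|x| x * multiplier}')
#   builder.bind(multiplier: 2)
#
#   bytes = builder.build   # Array of signed chars for the JVM side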
end
end
================================================
FILE: lib/spark/command_validator.rb
================================================
module Spark
module CommandValidator
def validate(value, options)
validate_type(value, options[:type])
end
def valid?(value, options)
begin
validate(value, options)
return true
rescue
return false
end
end
def validate_type(value, types)
types = [types] if !types.is_a?(Array)
types.each do |type|
return if value.is_a?(type)
end
error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
end
def validate_size(array1, array2)
if array1.size != array2.size
error "Wrong number of arguments (#{array1.size} for #{array2.size})"
end
end
end
end
================================================
FILE: lib/spark/config.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
# Common configuration for RubySpark and Spark
class Config
include Spark::Helper::System
TYPES = {
'spark.shuffle.spill' => :boolean,
'spark.ruby.serializer.compress' => :boolean
}
# Initialize java SparkConf and load default configuration.
def initialize
@spark_conf = SparkConf.new(true)
set_default
from_file(Spark::DEFAULT_CONFIG_FILE)
end
def from_file(file)
check_read_only
if file && File.exist?(file)
file = File.expand_path(file)
RubyUtils.loadPropertiesFile(spark_conf, file)
end
end
def [](key)
get(key)
end
def []=(key, value)
set(key, value)
end
def spark_conf
if Spark.started?
# Get latest configuration
Spark.context.jcontext.conf
else
@spark_conf
end
end
def valid!
errors = []
if !contains?('spark.app.name')
errors << 'An application name must be set in your configuration.'
end
if !contains?('spark.master')
errors << 'A master URL must be set in your configuration.'
end
if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
errors << 'Unknown serializer.'
end
scanned = get('spark.ruby.executor.command').scan('%s')
if scanned.size == 0
errors << "Executor command must contain '%s'."
end
if scanned.size > 1
errors << "Executor command can contain only one '%s'."
end
if errors.any?
errors.map!{|error| "- #{error}"}
raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
end
end
def read_only?
Spark.started?
end
# Rescue from NoSuchElementException
def get(key)
value = spark_conf.get(key.to_s)
case TYPES[key]
when :boolean
parse_boolean(value)
when :integer
parse_integer(value)
else
value
end
rescue
nil
end
def get_all
Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
end
def contains?(key)
spark_conf.contains(key.to_s)
end
def set(key, value)
check_read_only
spark_conf.set(key.to_s, value.to_s)
end
def set_app_name(name)
set('spark.app.name', name)
end
def set_master(master)
set('spark.master', master)
end
def parse_boolean(value)
case value
when 'true'
true
when 'false'
false
end
end
def parse_integer(value)
value.to_i
end
# =============================================================================
# Defaults
def set_default
set_app_name('RubySpark')
set_master('local[*]')
set('spark.ruby.driver_home', Spark.home)
set('spark.ruby.serializer', default_serializer)
set('spark.ruby.serializer.compress', default_serializer_compress)
set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
set('spark.ruby.executor.command', default_executor_command)
set('spark.ruby.executor.options', default_executor_options)
set('spark.ruby.worker.type', default_worker_type)
load_executor_envs
# set('spark.ruby.executor.install', default_executor_install)
end
def default_serializer
ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
end
def default_serializer_compress
ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
end
def default_serializer_batch_size
ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
end
# Command template which is applied when Scala wants to create a Ruby
# process (e.g. master, home request). The command is represented by '%s'.
#
# == Example:
# bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
#
def default_executor_command
ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
end
# Options for every worker.
#
# == Example:
# -J-Xmx512m
#
def default_executor_options
ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
end
# # Install command which is triggered before on start.
# # This command using executor command template.
# #
# # == Example:
# # gem install ruby-spark -v 1.2.0
# #
# def default_executor_install
# ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''
# end
# Type of worker.
#
# == Options:
# process:: (default)
# thread:: (experimental)
#
def default_worker_type
ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
end
# Load environment variables for executor from ENV.
#
# == Examples:
# SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
# SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
#
def load_executor_envs
prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
envs = ENV.select{|key, _| key.start_with?(prefix)}
envs.each do |key, value|
key = key.dup # ENV keys are frozen
key.slice!(0, prefix.size)
set("spark.ruby.executor.env.#{key}", value)
end
end
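# == Example (a hedged sketch; the app name and values are only illustrative):
#
#   Spark.config do
#     set_app_name 'MyApp'
#     set_master   'local[2]'
#     set 'spark.ruby.serializer.batch_size', 512
#   end
#
#   Spark.config['spark.app.name']   # => "MyApp"
#   Spark.start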
# Aliases
alias_method :getAll, :get_all
alias_method :setAppName, :set_app_name
alias_method :setMaster, :set_master
private
def check_read_only
if read_only?
raise Spark::ConfigurationError, 'Configuration is read only'
end
end
end
end
================================================
FILE: lib/spark/constant.rb
================================================
module Spark
# Common constants for Ruby and Spark
module Constant
DATA_EOF = -2
WORKER_ERROR = -1
WORKER_DONE = 0
CREATE_WORKER = 1
KILL_WORKER = 2
KILL_WORKER_AND_WAIT = 3
SUCCESSFULLY_KILLED = 4
UNSUCCESSFUL_KILLING = 5
ACCUMULATOR_ACK = 6
end
end
================================================
FILE: lib/spark/context.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
##
# Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
# cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
#
class Context
include Spark::Helper::System
include Spark::Helper::Parser
include Spark::Helper::Logger
attr_reader :jcontext, :jaccumulator, :temp_dir
# Constructor for the Ruby context. Configuration is automatically taken
# from Spark. Config will be set to defaults if the user starts the
# context first.
#
def initialize
Spark.config.valid!
@jcontext = JavaSparkContext.new(Spark.config.spark_conf)
@jcontext.addJar(Spark.ruby_spark_jar)
# Does not work on 1.2
# ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
spark_local_dir = JUtils.getLocalDir(sc.conf)
@temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
accum_server = Spark::Accumulator::Server
accum_server.start
@jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
log_info("Ruby accumulator server is running on port #{accum_server.port}")
set_call_site('Ruby') # description of stage
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{Tempdir: "#{temp_dir}">}
result
end
def stop
Spark::Accumulator::Server.stop
log_info('Ruby accumulator server was stopped')
@jcontext.stop
end
def sc
@jcontext.sc
end
def ui
sc.ui
end
# Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD)
#
def default_parallelism
sc.defaultParallelism
end
# Default serializer
#
# Batch -> Compress -> Basic
#
def default_serializer
# Basic
serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
# Compress
if config('spark.ruby.serializer.compress')
serializer = Spark::Serializer.compressed(serializer)
end
# Batching
batch_size = default_batch_size
if batch_size == 'auto'
serializer = Spark::Serializer.auto_batched(serializer)
else
serializer = Spark::Serializer.batched(serializer, batch_size)
end
# Finally, "container" contains serializers
serializer
end
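# == Example (a hedged sketch of the chain built above, assuming 'marshal'
# is a registered serializer name):
#
#   basic      = Spark::Serializer.find!('marshal').new
#   compressed = Spark::Serializer.compressed(basic)
#   serializer = Spark::Serializer.batched(compressed, 1024)
#
# With an 'auto' batch size the last step is Spark::Serializer.auto_batched(compressed).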
def default_batch_size
size = config('spark.ruby.serializer.batch_size').to_i
if size >= 1
size
else
'auto'
end
end
# Set a local property that affects jobs submitted from this thread, such as the
# Spark fair scheduler pool.
#
def set_local_property(key, value)
jcontext.setLocalProperty(key, value)
end
# Get a local property set in this thread, or nil if it is missing
#
def get_local_property(key)
jcontext.getLocalProperty(key)
end
# Support function for API backtraces.
#
def set_call_site(site)
jcontext.setCallSite(site)
end
def clear_call_site
jcontext.clearCallSite
end
# Return a copy of this SparkContext's configuration. The configuration *cannot*
# be changed at runtime.
#
def config(key=nil)
if key
Spark.config.get(key)
else
Spark.config
end
end
# Add a file to be downloaded with this Spark job on every node.
# The path of file passed can be either a local file, a file in HDFS
# (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
#
# To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
# filename to find its download location.
#
# == Example:
# `echo 10 > test.txt`
#
# $sc.add_file('test.txt')
# $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
# # => [0, 10, 20, 30, 40, 50]
#
def add_file(*files)
files.each do |file|
sc.addFile(file)
end
end
# Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
# object for reading it in distributed functions. The variable will
# be sent to each node only once.
#
# == Example:
# broadcast1 = $sc.broadcast('a')
# broadcast2 = $sc.broadcast('b')
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
# rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
# rdd.collect
# # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
#
def broadcast(value)
Spark::Broadcast.new(self, value)
end
# Create an Accumulator with the given initial value, using a given
# accum_param helper object to define how to add values of the
# data type if provided.
#
# == Example:
# accum = $sc.accumulator(7)
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(accum: accum)
# rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
# rdd = rdd.collect
#
# accum.value
# # => 11
#
def accumulator(value, accum_param=:+, zero_value=0)
Spark::Accumulator.new(value, accum_param, zero_value)
end
# Distribute a local Ruby collection to form an RDD.
# Be careful, this method can be slow and it updates the data in place.
#
# == Parameters:
# data:: Range or Array
# num_slices:: number of slice
# serializer:: custom serializer (default: serializer based on configuration)
#
# == Examples:
# $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
# #=> [1, 2, 3]
#
# $sc.parallelize(1..3).map(:to_s).collect
# #=> ["1", "2", "3"]
#
def parallelize(data, num_slices=nil, serializer=nil)
num_slices ||= default_parallelism
serializer ||= default_serializer
serializer.check_each(data)
# Through file
file = Tempfile.new('to_parallelize', temp_dir)
serializer.dump_to_io(data, file)
file.close # not unlink
jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
Spark::RDD.new(jrdd, self, serializer)
ensure
file && file.unlink
end
# Read a text file from HDFS, a local file system (available on all nodes), or any
# Hadoop-supported file system URI, and return it as an RDD of Strings.
#
# == Example:
# f = Tempfile.new("test")
# f.puts("1")
# f.puts("2")
# f.close
#
# $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
# # => [1, 2]
#
def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
min_partitions ||= default_parallelism
serializer ||= default_serializer
deserializer = Spark::Serializer.build { __text__(encoding) }
Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
end
# Read a directory of text files from HDFS, a local file system (available on all nodes), or any
# Hadoop-supported file system URI. Each file is read as a single record and returned in a
# key-value pair, where the key is the path of each file, the value is the content of each file.
#
# == Example:
# dir = Dir.mktmpdir
# f1 = Tempfile.new("test1", dir)
# f2 = Tempfile.new("test2", dir)
# f1.puts("1"); f1.puts("2");
# f2.puts("3"); f2.puts("4");
# f1.close
# f2.close
#
# $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
# # => ["1", "2", "3", "4"]
#
def whole_text_files(path, min_partitions=nil, serializer=nil)
min_partitions ||= default_parallelism
serializer ||= default_serializer
deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
end
# Executes the given partition function f on the specified set of partitions,
# returning the result as an array of elements.
#
# If partitions is not specified, this will run over all partitions.
#
# == Example:
# rdd = $sc.parallelize(0..10, 5)
# $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
# # => ["[0, 1]", "[4, 5]"]
#
def run_job(rdd, f, partitions=nil, allow_local=false)
run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
end
# Execute the given command on specific set of partitions.
#
def run_job_with_command(rdd, partitions, allow_local, command, *args)
if !partitions.nil? && !partitions.is_a?(Array)
raise Spark::ContextError, 'Partitions must be nil or Array'
end
partitions_size = rdd.partitions_size
# Execute all parts
if partitions.nil?
partitions = (0...partitions_size).to_a
end
# Can happen when you use coalesce
partitions.delete_if {|part| part >= partitions_size}
# Rjb represents Fixnum as Integer but JRuby as Long
partitions = to_java_array_list(convert_to_java_int(partitions))
# File for result
file = Tempfile.new('collect', temp_dir)
mapped = rdd.new_rdd_from_command(command, *args)
RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
mapped.collect_from_file(file)
end
# Aliases
alias_method :textFile, :text_file
alias_method :wholeTextFiles, :whole_text_files
alias_method :defaultParallelism, :default_parallelism
alias_method :setLocalProperty, :set_local_property
alias_method :getLocalProperty, :get_local_property
alias_method :setCallSite, :set_call_site
alias_method :clearCallSite, :clear_call_site
alias_method :runJob, :run_job
alias_method :runJobWithCommand, :run_job_with_command
alias_method :addFile, :add_file
end
end
================================================
FILE: lib/spark/error.rb
================================================
module Spark
# Extension cannot be built
class BuildError < StandardError
end
# Proc.to_source
# Java object cannot be converted
class SerializeError < StandardError
end
# Serializer method
# Non-existing serializer
class NotImplemented < StandardError
end
# Missing app_name or master
class ConfigurationError < StandardError
end
# Wrong parameters
class RDDError < StandardError
end
# Validations
class CommandError < StandardError
end
# Parser helper
# SQL DataType
class ParseError < StandardError
end
# Validation in context
class ContextError < StandardError
end
# Broadcasts
# Missing path
class BroadcastError < StandardError
end
# Accumulators
# Existing keys
# Wrong ID
class AccumulatorError < StandardError
end
# Wrong instances
class MllibError < StandardError
end
# Wrong datatype
class SQLError < StandardError
end
# Missing Java class
class JavaBridgeError < StandardError
end
end
================================================
FILE: lib/spark/ext/hash.rb
================================================
module Spark
module CoreExtension
module Hash
module ClassMethods
end
module InstanceMethods
# Destructively convert all keys to strings.
def stringify_keys_with_spark!
transform_keys!{ |key| key.to_s }
end
# Destructively convert all keys to symbols, as long as they respond to to_sym.
def symbolize_keys_with_spark!
transform_keys!{ |key| key.to_sym rescue key }
end
# Destructively convert all keys using the block operations.
# Same as transform_keys but modifies +self+.
def transform_keys_with_spark!
keys.each do |key|
self[yield(key)] = delete(key)
end
self
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :stringify_keys!, :spark
patch_unless_exist :symbolize_keys!, :spark
patch_unless_exist :transform_keys!, :spark
end
end
end
end
end
Hash.__send__(:include, Spark::CoreExtension::Hash)
================================================
FILE: lib/spark/ext/integer.rb
================================================
module Spark
module CoreExtension
module Integer
module ClassMethods
end
module InstanceMethods
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
# Largest Fixnum: (1 << 62) - 1 on 64-bit MRI (parentheses matter, << binds more loosely than -)
const_set :MAX_WITH_SPARK, (1 << (1.size * 8 - 2)) - 1
const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1
path_const_unless_exist :MAX, :SPARK
path_const_unless_exist :MIN, :SPARK
end
end
end
end
end
Integer.__send__(:include, Spark::CoreExtension::Integer)
================================================
FILE: lib/spark/ext/io.rb
================================================
module Spark
module CoreExtension
module IO
module ClassMethods
end
module InstanceMethods
# Reading
def read_int
unpack_int(read(4))
end
def read_int_or_eof
bytes = read(4)
return Spark::Constant::DATA_EOF if bytes.nil?
unpack_int(bytes)
end
def read_long
unpack_long(read(8))
end
def read_string
read(read_int)
end
def read_data
Marshal.load(read_string)
end
# Writing
def write_int(data)
write(pack_int(data))
end
def write_long(data)
write(pack_long(data))
end
# Size and data can have different encodings
# Marshal: both ASCII
# Oj: ASCII and UTF-8
def write_string(data)
write_int(data.bytesize)
write(data)
end
def write_data(data)
write_string(Marshal.dump(data))
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, Spark::Helper::Serialize)
base.send(:include, InstanceMethods)
end
end
end
end
IO.__send__(:include, Spark::CoreExtension::IO)
StringIO.__send__(:include, Spark::CoreExtension::IO)
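# == Example (a hedged round-trip sketch using the methods above):
#
#   io = StringIO.new
#   io.write_int(42)
#   io.write_data([1, 2, 3])
#   io.rewind
#
#   io.read_int    # => 42
#   io.read_data   # => [1, 2, 3]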
================================================
FILE: lib/spark/ext/ip_socket.rb
================================================
module Spark
module CoreExtension
module IPSocket
module ClassMethods
end
module InstanceMethods
def port
addr[1]
end
def hostname
addr(true)[2]
end
def numeric_address
addr[3]
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
end
end
end
end
IPSocket.__send__(:include, Spark::CoreExtension::IPSocket)
================================================
FILE: lib/spark/ext/module.rb
================================================
module Spark
module CoreExtension
module Module
# Patch a method into a class unless it already exists
#
# == Example:
#
# class Hash
# def a
# 1
# end
# end
#
# module HashExtension
# module InstanceMethods
# def a_with_spark
# 2
# end
#
# def b_with_spark
# 1
# end
# end
#
# def self.included(base)
# base.send(:include, InstanceMethods)
# base.class_eval do
# patch_unless_exist :a, :spark
# patch_unless_exist :b, :spark
# end
# end
# end
#
# Hash.include(HashExtension)
#
# Hash.new.a # => 1
# Hash.new.b # => 1
#
def patch_unless_exist(target, suffix)
unless method_defined?(target)
aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1
alias_method target, "#{aliased_target}_with_#{suffix}#{punctuation}"
end
end
def path_const_unless_exist(target, suffix)
unless const_defined?(target)
const_set(target, const_get("#{target}_WITH_#{suffix}"))
end
end
end
end
end
Module.__send__(:include, Spark::CoreExtension::Module)
================================================
FILE: lib/spark/ext/object.rb
================================================
module Spark
module CoreExtension
module Object
module ClassMethods
end
module InstanceMethods
def deep_copy_with_spark
Marshal.load(Marshal.dump(self))
end
def silence_warnings
old_verbose, $VERBOSE = $VERBOSE, nil
yield
ensure
$VERBOSE = old_verbose
end
def cattr_reader_with_spark(*syms)
syms.each do |sym|
raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
@@#{sym} = nil unless defined? @@#{sym}
def self.#{sym}
@@#{sym}
end
EOS
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
def #{sym}
@@#{sym}
end
EOS
end
end
def cattr_writer_with_spark(*syms)
syms.each do |sym|
raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
@@#{sym} = nil unless defined? @@#{sym}
def self.#{sym}=(obj)
@@#{sym} = obj
end
EOS
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
def #{sym}=(obj)
@@#{sym} = obj
end
EOS
end
end
def cattr_accessor_with_spark(*syms)
cattr_reader_with_spark(*syms)
cattr_writer_with_spark(*syms)
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :deep_copy, :spark
patch_unless_exist :silence_warnings, :spark
patch_unless_exist :cattr_accessor, :spark
end
end
end
end
end
Object.__send__(:include, Spark::CoreExtension::Object)
================================================
FILE: lib/spark/ext/string.rb
================================================
module Spark
module CoreExtension
module String
module ClassMethods
end
module InstanceMethods
def camelize_with_spark
self.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :camelize, :spark
end
end
end
end
end
String.__send__(:include, Spark::CoreExtension::String)
================================================
FILE: lib/spark/helper/logger.rb
================================================
module Spark
module Helper
module Logger
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def log_info(message)
Spark.logger.info(message)
end
def log_debug(message)
Spark.logger.debug(message)
end
def log_trace(message)
Spark.logger.trace(message)
end
def log_warning(message)
Spark.logger.warning(message)
end
def log_error(message)
Spark.logger.error(message)
end
alias_method :logInfo, :log_info
alias_method :logDebug, :log_debug
alias_method :logTrace, :log_trace
alias_method :logWarning, :log_warning
alias_method :logError, :log_error
end # Methods
end # Logger
end # Helper
end # Spark
================================================
FILE: lib/spark/helper/parser.rb
================================================
module Spark
module Helper
module Parser
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def to_java_hash(hash)
hash_map = HashMap.new
hash.each_pair do |key, value|
begin
# RJB raises "Object is NULL" (but the new record is put correctly)
hash_map.put(key, value)
rescue RuntimeError
end
end
hash_map
end
def convert_to_java_int(data)
if data.is_a?(Array)
data.map{|x| JInteger.new(x)}
else
JInteger.new(data)
end
end
def to_java_array_list(array)
array_list = ArrayList.new
array.each do |item|
array_list.add(item)
end
array_list
end
# Parse and convert a memory size. Shifting would be better but Float doesn't support it.
#
# == Examples:
# to_memory_size("512mb")
# # => 524288
#
# to_memory_size("512 MB")
# # => 524288
#
# to_memory_size("512mb", "GB")
# # => 0.5
#
def to_memory_size(memory, result_unit="KB")
match = memory.match(/([\d]+)[\s]*([\w]*)/)
if match.nil?
raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'"
end
size = match[1].to_f
unit = match[2]
size *= memory_multiplier_based_kb(unit)
size /= memory_multiplier_based_kb(result_unit)
size.round(2)
end
# Multiplier relative to KB
def memory_multiplier_based_kb(type)
case type.to_s.upcase
when "G", "GB"
1048576
when "M", "MB"
1024
when "K", "KB"
1
else
raise Spark::ParseError, "Unsupported type #{type}"
end
end
end # Methods
end # Parser
end # Helper
end # Spark
================================================
FILE: lib/spark/helper/serialize.rb
================================================
module Spark
module Helper
module Serialize
DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'
DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*'
DIRECTIVE_LONG_BIG_ENDIAN = 'q>'
DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*'
DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G'
DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*'
DIRECTIVE_UNSIGNED_CHARS = 'C*'
DIRECTIVE_CHARS = 'c*'
# Packing
def pack_int(data)
[data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN)
end
def pack_long(data)
[data].pack(DIRECTIVE_LONG_BIG_ENDIAN)
end
def pack_double(data)
[data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN)
end
def pack_unsigned_chars(data)
data.pack(DIRECTIVE_UNSIGNED_CHARS)
end
def pack_ints(data)
__check_array(data)
data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN)
end
def pack_longs(data)
__check_array(data)
data.pack(DIRECTIVE_LONGS_BIG_ENDIAN)
end
def pack_doubles(data)
__check_array(data)
data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN)
end
# Unpacking
def unpack_int(data)
data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0]
end
def unpack_long(data)
data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0]
end
def unpack_chars(data)
data.unpack(DIRECTIVE_CHARS)
end
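# == Example (a hedged sketch):
#
#   pack_int(1)
#   # => "\x00\x00\x00\x01"
#
#   unpack_long(pack_long(2**40))
#   # => 1099511627776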
private
def __check_array(data)
unless data.is_a?(Array)
raise ArgumentError, 'Data must be an Array.'
end
end
end
end
end
================================================
FILE: lib/spark/helper/statistic.rb
================================================
module Spark
module Helper
module Statistic
# Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
#
# == How the sampling rate is determined:
# Let p = num / total, where num is the sample size and total is the total number of
# datapoints in the RDD. We're trying to compute q > p such that
# * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q),
# where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total),
# i.e. the failure rate of not having a sufficiently large sample < 0.0001.
# Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for
# num > 12, but we need a slightly larger q (9 empirically determined).
# * when sampling without replacement, we're drawing each datapoint with prob_i
# ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success
# rate, where success rate is defined the same as in sampling with replacement.
#
def compute_fraction(lower_bound, total, with_replacement)
lower_bound = lower_bound.to_f
if with_replacement
upper_poisson_bound(lower_bound) / total
else
fraction = lower_bound / total
upper_binomial_bound(0.00001, total, fraction)
end
end
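# == Example (a hedged sketch):
#
#   compute_fraction(100, 1000, true)
#   # => 0.16   ((100 + 6 * Math.sqrt(100)) / 1000)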
def upper_poisson_bound(bound)
num_std = if bound < 6
12
elsif bound < 16
9
else
6
end.to_f
[bound + num_std * Math.sqrt(bound), 1e-10].max
end
def upper_binomial_bound(delta, total, fraction)
gamma = -Math.log(delta) / total
[1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min
end
# Bisect right
#
# == Examples:
# data = [1,5,6,8,96,120,133]
#
# bisect_right(data, 0) # => 0
# bisect_right(data, 1) # => 1
# bisect_right(data, 5) # => 2
# bisect_right(data, 9) # => 4
# bisect_right(data, 150) # => 7
#
def bisect_right(data, value, low=0, high=data.size)
if low < 0
raise ArgumentError, 'Low must be >= 0.'
end
while low < high
mid = (low + high) / 2
if value < data[mid]
high = mid
else
low = mid + 1
end
end
low
end
# Determine bound of partitioning
#
# == Example:
# data = [0,1,2,3,4,5,6,7,8,9,10]
# determine_bounds(data, 3)
# # => [3, 7]
#
def determine_bounds(data, num_partitions)
if num_partitions > data.size
return data
end
bounds = []
count = data.size
(0...(num_partitions-1)).each do |index|
bounds << data[count * (index+1) / num_partitions]
end
bounds
end
end
end
end
================================================
FILE: lib/spark/helper/system.rb
================================================
module Spark
module Helper
module System
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def windows?
RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
end
def mri?
RbConfig::CONFIG['ruby_install_name'] == 'ruby'
end
def jruby?
RbConfig::CONFIG['ruby_install_name'] == 'jruby'
end
def pry?
!!Thread.current[:__pry__]
end
# Memory usage in KB
def memory_usage
if jruby?
runtime = java.lang.Runtime.getRuntime
(runtime.totalMemory - runtime.freeMemory) >> 10
elsif windows?
# not yet
else
`ps -o rss= -p #{Process.pid}`.to_i
end
end
end # Methods
end # System
end # Helper
end # Spark
================================================
FILE: lib/spark/helper.rb
================================================
module Spark
module Helper
autoload :System, "spark/helper/system"
autoload :Logger, "spark/helper/logger"
autoload :Statistic, "spark/helper/statistic"
autoload :Serialize, "spark/helper/serialize"
autoload :Partition, "spark/helper/partition"
autoload :Parser, "spark/helper/parser"
end
end
================================================
FILE: lib/spark/java_bridge/base.rb
================================================
##
# Spark::JavaBridge::Base
#
# Parent for all adapters (Ruby - Java)
#
module Spark
module JavaBridge
class Base
include Spark::Helper::System
JAVA_OBJECTS = [
'java.util.ArrayList',
'scala.collection.mutable.HashMap',
'org.apache.spark.SparkConf',
'org.apache.spark.api.java.JavaSparkContext',
'org.apache.spark.api.ruby.RubyRDD',
'org.apache.spark.api.ruby.RubyUtils',
'org.apache.spark.api.ruby.RubyWorker',
'org.apache.spark.api.ruby.PairwiseRDD',
'org.apache.spark.api.ruby.RubyAccumulatorParam',
'org.apache.spark.api.ruby.RubySerializer',
'org.apache.spark.api.python.PythonRDD',
'org.apache.spark.api.python.PythonPartitioner',
'org.apache.spark.api.python.PythonUtils',
'org.apache.spark.ui.ruby.RubyTab',
'org.apache.spark.mllib.api.ruby.RubyMLLibAPI',
:JInteger => 'java.lang.Integer',
:JLong => 'java.lang.Long',
:JLogger => 'org.apache.log4j.Logger',
:JLevel => 'org.apache.log4j.Level',
:JPriority => 'org.apache.log4j.Priority',
:JUtils => 'org.apache.spark.util.Utils',
:JDataType => 'org.apache.spark.sql.types.DataType',
:JSQLContext => 'org.apache.spark.sql.SQLContext',
:JDenseVector => 'org.apache.spark.mllib.linalg.DenseVector',
:JDenseMatrix => 'org.apache.spark.mllib.linalg.DenseMatrix',
:JStorageLevel => 'org.apache.spark.storage.StorageLevel',
:JSQLFunctions => 'org.apache.spark.sql.functions'
]
JAVA_TEST_OBJECTS = [
'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI'
]
RUBY_TO_JAVA_SKIP = [Fixnum, Integer]
def initialize(target)
@target = target
end
# Import all important classes as constants on Object
def import_all
return if @imported
java_objects.each do |name, klass|
import(name, klass)
end
@imported = true
nil
end
# Import classes for testing
def import_all_test
return if @imported_test
java_test_objects.each do |name, klass|
import(name, klass)
end
@imported_test = true
nil
end
# Call java object
def call(klass, method, *args)
# To java
args.map!{|item| to_java(item)}
# Call java
result = klass.__send__(method, *args)
# To ruby
to_ruby(result)
end
def to_array_list(array)
array_list = ArrayList.new
array.each do |item|
array_list.add(to_java(item))
end
array_list
end
def to_seq(array)
PythonUtils.toSeq(to_array_list(array))
end
def to_long(number)
return nil if number.nil?
JLong.new(number)
end
def to_java(object)
if RUBY_TO_JAVA_SKIP.include?(object.class)
# Some objects are converted automatically.
# This prevents errors.
# For example: JRuby stores an Integer as a Long, so 1.to_java is a Long.
object
elsif object.respond_to?(:to_java)
object.to_java
elsif object.is_a?(Array)
to_array_list(object)
else
object
end
end
# Array problem:
# Rjb: object.toArray -> Array
# Jruby: object.toArray -> java.lang.Object
#
def to_ruby(object)
# Java object
if java_object?(object)
class_name = object.getClass.getSimpleName
case class_name
when 'ArraySeq'
result = []
iterator = object.iterator
while iterator.hasNext
result << to_ruby(iterator.next)
end
result
when 'Map2', 'Map3', 'Map4', 'HashTrieMap'
Hash[
object.toSeq.array.to_a.map!{|item| [item._1, item._2]}
]
when 'SeqWrapper'; object.toArray.to_a.map!{|item| to_ruby(item)}
when 'ofRef'; object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef
when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object)
when 'DenseVector'; Spark::Mllib::DenseVector.from_java(object)
when 'KMeansModel'; Spark::Mllib::KMeansModel.from_java(object)
when 'DenseMatrix'; Spark::Mllib::DenseMatrix.from_java(object)
when 'GenericRowWithSchema'; Spark::SQL::Row.from_java(object, true)
else
# Some RDD
if class_name != 'JavaRDD' && class_name.end_with?('RDD')
object = object.toJavaRDD
class_name = 'JavaRDD'
end
# JavaRDD
if class_name == 'JavaRDD'
jrdd = RubyRDD.toRuby(object)
serializer = Spark::Serializer.build { __batched__(__marshal__) }
deserializer = Spark::Serializer.build { __batched__(__marshal__, 2) }
return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
end
# Unknown object
Spark.logger.warn("Java object '#{object.getClass.name}' was not converted.")
object
end
# An Array is transferred automatically, but its content is not
elsif object.is_a?(Array)
object.map! do |item|
to_ruby(item)
end
object
# Already transferred
else
object
end
end
alias_method :java_to_ruby, :to_ruby
alias_method :ruby_to_java, :to_java
private
def jars
result = Dir.glob(File.join(@target, '*.jar'))
result.flatten!
result
end
def objects_with_names(objects)
hash = {}
objects.each do |object|
if object.is_a?(Hash)
hash.merge!(object)
else
key = object.split('.').last.to_sym
hash[key] = object
end
end
hash
end
def java_objects
objects_with_names(JAVA_OBJECTS)
end
def java_test_objects
objects_with_names(JAVA_TEST_OBJECTS)
end
def raise_missing_class(klass)
raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark are assembled."
end
end
end
end
================================================
FILE: lib/spark/java_bridge/jruby.rb
================================================
require 'java'
module Spark
module JavaBridge
class JRuby < Base
def initialize(*args)
super
jars.each {|jar| require jar}
end
def import(name, klass)
klass = "Java::#{klass}"
Object.const_set(name, eval(klass))
rescue NameError
raise_missing_class(klass)
end
def java_object?(object)
object.is_a?(JavaProxy)
end
end
end
end
================================================
FILE: lib/spark/java_bridge/rjb.rb
================================================
if !ENV.has_key?('JAVA_HOME')
raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set'
end
require 'rjb'
module Spark
module JavaBridge
class RJB < Base
def initialize(*args)
super
Rjb.load(jars)
Rjb.primitive_conversion = true
end
def import(name, klass)
Object.const_set(name, silence_warnings { Rjb.import(klass) })
rescue NoClassDefFoundError
raise_missing_class(klass)
end
def java_object?(object)
object.is_a?(Rjb::Rjb_JavaProxy)
end
private
def jars
separator = windows? ? ';' : ':'
super.join(separator)
end
end
end
end
================================================
FILE: lib/spark/java_bridge.rb
================================================
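##
# Spark::JavaBridge
#
# Chooses the Ruby-Java adapter: the native JRuby bridge when running on
# JRuby, otherwise RJB. Illustrative sketch (the jar directory is hypothetical):
#
#   bridge = Spark::JavaBridge.init('/path/with/assembled/spark/jars')
#   bridge.import_all
#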
module Spark
module JavaBridge
autoload :Base, 'spark/java_bridge/base'
autoload :JRuby, 'spark/java_bridge/jruby'
autoload :RJB, 'spark/java_bridge/rjb'
include Spark::Helper::System
def self.init(*args)
if jruby?
klass = JRuby
else
klass = RJB
end
klass.new(*args)
end
end
end
================================================
FILE: lib/spark/library.rb
================================================
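##
# Spark::Library
#
# Extend a module with Library to get an autoload that also remembers which
# constants should later be copied into another namespace via #import.
# Illustrative sketch (MyLib, Something and Internal are hypothetical names):
#
#   module MyLib
#     extend Spark::Library
#     autoload :Something, 'my_lib/something'
#     autoload_without_import :Internal, 'my_lib/internal'
#   end
#
#   MyLib.import   # defines Object::Something, but not Object::Internal
#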
module Spark
module Library
def autoload(klass, location, import=true)
if import
@for_importing ||= []
@for_importing << klass
end
super(klass, location)
end
def autoload_without_import(klass, location)
autoload(klass, location, false)
end
def import(to=Object)
@for_importing.each do |klass|
to.const_set(klass, const_get(klass))
end
nil
end
end
end
================================================
FILE: lib/spark/logger.rb
================================================
# Necessary libraries
Spark.load_lib
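##
# Spark::Logger
#
# Thin wrapper around the log4j logger used by Spark. Illustrative usage
# (assumes a started context, where the instance is reachable as Spark.logger):
#
#   Spark.logger.disable           # silence Spark/Akka output
#   Spark.logger.warn('slow job')  # forwarded to log4j only when WARN is enabled
#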
module Spark
class Logger
attr_reader :jlogger
def initialize
@jlogger = JLogger.getLogger('Ruby')
end
def level_off
JLevel.toLevel('OFF')
end
# Disable all Spark logging
def disable
jlogger.setLevel(level_off)
JLogger.getLogger('org').setLevel(level_off)
JLogger.getLogger('akka').setLevel(level_off)
JLogger.getRootLogger.setLevel(level_off)
end
def enabled?
!disabled?
end
def info(message)
jlogger.info(message) if info?
end
def debug(message)
jlogger.debug(message) if debug?
end
def trace(message)
jlogger.trace(message) if trace?
end
def warning(message)
jlogger.warn(message) if warning?
end
def error(message)
jlogger.error(message) if error?
end
def info?
level_enabled?('info')
end
def debug?
level_enabled?('debug')
end
def trace?
level_enabled?('trace')
end
def warning?
level_enabled?('warn')
end
def error?
level_enabled?('error')
end
def level_enabled?(type)
jlogger.isEnabledFor(JPriority.toPriority(type.upcase))
end
alias_method :warn, :warning
end
end
================================================
FILE: lib/spark/mllib/classification/common.rb
================================================
module Spark
module Mllib
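##
# ClassificationModel
#
# Common base for binary classification models: a weight vector plus an
# intercept, with an optional decision threshold. Illustrative sketch
# (model stands for any subclass instance, e.g. an SVMModel):
#
#   model.threshold = 0.5    # predict compares the score against 0.5
#   model.clear_threshold    # predict returns the raw score instead
#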
class ClassificationModel
attr_reader :weights, :intercept, :threshold
def initialize(weights, intercept)
@weights = Spark::Mllib::Vectors.to_vector(weights)
@intercept = intercept.to_f
@threshold = nil
end
def threshold=(value)
@threshold = value.to_f
end
def clear_threshold
@threshold = nil
end
end
end
end
module Spark
module Mllib
class ClassificationMethodBase < RegressionMethodBase
end
end
end
================================================
FILE: lib/spark/mllib/classification/logistic_regression.rb
================================================
module Spark
module Mllib
##
# LogisticRegressionModel
#
# A linear binary classification model derived from logistic regression.
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# LabeledPoint.new(0.0, [0.0, 1.0]),
# LabeledPoint.new(1.0, [1.0, 0.0]),
# ]
# lrm = LogisticRegressionWithSGD.train($sc.parallelize(data))
#
================================================
SYMBOL INDEX (1072 symbols across 108 files)
================================================
FILE: benchmark/bisect.rb
function bisect_left1 (line 3) | def bisect_left1(a, x, opts={})
function bisect_left2 (line 22) | def bisect_left2(list, item)
function bisect_left3 (line 31) | def bisect_left3(list, item, lo = 0, hi = list.size)
FILE: benchmark/comparison/python.py
function log (line 12) | def log(*values):
function is_prime (line 56) | def is_prime(x):
function multiplication_func (line 94) | def multiplication_func(matrix):
function pi_func (line 123) | def pi_func(size):
FILE: benchmark/comparison/ruby.rb
function log (line 14) | def log(*values)
FILE: benchmark/custom_marshal.rb
function pack_int (line 4) | def pack_int(data)
function pack_long (line 8) | def pack_long(data)
function pack_doubles (line 12) | def pack_doubles(data)
type Standard (line 16) | module Standard
class LabeledPoint (line 17) | class LabeledPoint
method initialize (line 18) | def initialize(label, features)
method marshal_dump (line 23) | def marshal_dump
method marshal_load (line 27) | def marshal_load(*)
class Vector (line 31) | class Vector
method initialize (line 32) | def initialize(array)
method marshal_dump (line 36) | def marshal_dump
method marshal_load (line 40) | def marshal_load(*)
type Custom (line 45) | module Custom
class LabeledPoint (line 46) | class LabeledPoint
method initialize (line 47) | def initialize(label, features)
method _dump (line 52) | def _dump(*)
method _load (line 56) | def self._load(*)
class Vector (line 60) | class Vector
method initialize (line 61) | def initialize(array)
method _dump (line 65) | def _dump(*)
method _load (line 72) | def self._load(*)
FILE: benchmark/digest.rb
function java? (line 4) | def java?
function murmur3_32_rotl (line 27) | def murmur3_32_rotl(x, r)
function murmur3_32_fmix (line 31) | def murmur3_32_fmix(h)
function murmur3_32__mmix (line 40) | def murmur3_32__mmix(k1)
function murmur3_32_str_hash (line 46) | def murmur3_32_str_hash(str, seed=0)
FILE: benchmark/enumerator.rb
class Enumerator (line 3) | class Enumerator
method defer (line 4) | def defer(&blk)
function type_yield (line 15) | def type_yield
function yield_map_x2 (line 23) | def yield_map_x2(enum)
function type_enumerator_new (line 31) | def type_enumerator_new
function enumerator_new_map_x2 (line 39) | def enumerator_new_map_x2(enum)
function enumerator_defer_x2 (line 47) | def enumerator_defer_x2(enum)
FILE: benchmark/sort.rb
function <=> (line 29) | def <=>(other)
FILE: benchmark/sort2.rb
function words (line 16) | def words
function word (line 20) | def word
function sort1 (line 31) | def sort1(data)
function sort1_2 (line 64) | def sort1_2(data)
function sort2 (line 113) | def sort2(data)
FILE: example/pi.rb
function map (line 14) | def map(_)
FILE: example/website_search.rb
function parse_sitemap (line 48) | def parse_sitemap(doc)
FILE: ext/ruby_c/murmur.c
function MurmurHash64A (line 18) | uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
function MurmurHash64B (line 63) | uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
function VALUE (line 122) | VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
function VALUE (line 137) | VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
function VALUE (line 151) | VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
FILE: ext/ruby_c/ruby-spark.c
function Init_ruby_spark_ext (line 10) | void Init_ruby_spark_ext()
FILE: ext/ruby_java/Digest.java
class Digest (line 12) | @JRubyModule(name="Spark::Digest")
method Digest (line 18) | public Digest(final Ruby ruby, RubyClass rubyClass) {
method portable_hash (line 22) | @JRubyMethod(module=true)
FILE: ext/ruby_java/Murmur2.java
class Murmur2 (line 21) | @JRubyClass(name="Spark::Digest::Murmur2")
method Murmur2 (line 24) | public Murmur2(final Ruby ruby, RubyClass rubyClass) {
method digest (line 28) | @JRubyMethod(required=1, optional=1, module=true)
method hash64 (line 57) | public static long hash64(final byte[] data, int length, long seed) {
FILE: ext/ruby_java/RubySparkExtService.java
class RubySparkExtService (line 8) | public class RubySparkExtService implements BasicLibraryService
method basicLoad (line 10) | public boolean basicLoad(final Ruby ruby) throws java.io.IOException {
method allocate (line 23) | public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) {
FILE: lib/spark.rb
type Spark (line 11) | module Spark
function print_logo (line 43) | def self.print_logo(message=nil)
function config (line 71) | def self.config(&block)
function clear_config (line 83) | def self.clear_config
function context (line 88) | def self.context
function sql_context (line 93) | def self.sql_context
function start (line 100) | def self.start
function start_sql (line 104) | def self.start_sql
function stop (line 108) | def self.stop
function started? (line 120) | def self.started?
function load_defaults (line 131) | def self.load_defaults
function load_defaults_from (line 140) | def self.load_defaults_from(file_path)
function save_defaults_to (line 158) | def self.save_defaults_to(file_path)
function logger (line 181) | def self.logger
function root (line 186) | def self.root
function target_dir (line 191) | def self.target_dir
function worker_dir (line 196) | def self.worker_dir
function ruby_spark_jar (line 200) | def self.ruby_spark_jar
function spark_ext_dir (line 204) | def self.spark_ext_dir
function load_lib (line 219) | def self.load_lib(target=nil)
function java_bridge (line 229) | def self.java_bridge
FILE: lib/spark/accumulator.rb
type Spark (line 1) | module Spark
class Accumulator (line 55) | class Accumulator
method initialize (line 68) | def initialize(value, accum_param=:+, zero_value=0)
method inspect (line 80) | def inspect
method changed (line 88) | def self.changed
method instances (line 92) | def self.instances
method valid_accum_param (line 96) | def valid_accum_param
method driver? (line 127) | def driver?
method add (line 135) | def add(term)
method + (line 147) | def +(term)
method add_by_symbol (line 152) | def add_by_symbol(term)
method marshal_dump (line 171) | def marshal_dump
method marshal_load (line 175) | def marshal_load(array)
method load_accum_param (line 183) | def load_accum_param
class Server (line 199) | class Server
method start (line 203) | def self.start
method stop (line 207) | def self.stop
method host (line 211) | def self.host
method port (line 216) | def self.port
method initialize (line 221) | def initialize
method stop (line 230) | def stop
method handle_accept (line 236) | def handle_accept
method handle_connection (line 245) | def handle_connection(socket)
FILE: lib/spark/broadcast.rb
type Spark (line 1) | module Spark
class Broadcast (line 27) | class Broadcast
method initialize (line 48) | def initialize(sc, value)
method inspect (line 64) | def inspect
method register (line 71) | def self.register(id, path)
method value (line 75) | def value
method marshal_dump (line 95) | def marshal_dump
method marshal_load (line 99) | def marshal_load(id)
FILE: lib/spark/build.rb
type Spark (line 1) | module Spark
type Build (line 2) | module Build
function build (line 14) | def self.build(options={})
FILE: lib/spark/cli.rb
type Commander (line 3) | module Commander
type UI (line 4) | module UI
function enable_paging (line 7) | def self.enable_paging
type Spark (line 12) | module Spark
class CLI (line 13) | class CLI
method run (line 19) | def run
FILE: lib/spark/command.rb
type Spark (line 1) | module Spark
class Command (line 6) | class Command
method initialize (line 10) | def initialize
method execute (line 18) | def execute(iterator, split_index)
method last (line 41) | def last
method bound_objects (line 45) | def bound_objects
method marshal_dump (line 61) | def marshal_dump
method marshal_load (line 65) | def marshal_load(array)
method serialized_bound_objects (line 75) | def serialized_bound_objects
FILE: lib/spark/command/base.rb
class Spark::Command::Base (line 6) | class Spark::Command::Base
method initialize (line 13) | def initialize(*args)
method to_s (line 19) | def to_s
method error (line 23) | def self.error(message)
method error (line 27) | def error(message)
method log (line 31) | def log(message=nil)
method settings (line 42) | def self.settings
method settings (line 47) | def settings
method init_settings (line 52) | def self.init_settings
method variable (line 76) | def self.variable(name, options={})
method execute (line 89) | def execute(iterator, split_index)
method prepared? (line 102) | def prepared?
method prepare (line 114) | def prepare
method before_run (line 141) | def before_run
method method_missing (line 150) | def method_missing(method, *args, &block)
FILE: lib/spark/command/basic.rb
class Spark::Command::Map (line 6) | class Spark::Command::Map < _Base
method run (line 9) | def run(iterator, *)
method lazy_run (line 16) | def lazy_run(iterator, *)
class Spark::Command::FlatMap (line 26) | class Spark::Command::FlatMap < Spark::Command::Map
method run (line 27) | def run(iterator, *)
method lazy_run (line 33) | def lazy_run(iterator, *)
class Spark::Command::MapPartitionsWithIndex (line 43) | class Spark::Command::MapPartitionsWithIndex < _Base
method run (line 46) | def run(iterator, index)
class Spark::Command::MapPartitions (line 58) | class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithI...
method run (line 59) | def run(iterator, *)
class Spark::Command::Filter (line 70) | class Spark::Command::Filter < _Base
method run (line 73) | def run(iterator, *)
method lazy_run (line 80) | def lazy_run(iterator, *)
class Spark::Command::Compact (line 90) | class Spark::Command::Compact < _Base
method run (line 91) | def run(iterator, *)
method lazy_run (line 96) | def lazy_run(iterator, *)
class Spark::Command::Glom (line 106) | class Spark::Command::Glom < _Base
method run (line 107) | def run(iterator, *)
method lazy_run (line 111) | def lazy_run(iterator, *)
class Spark::Command::Shuffle (line 119) | class Spark::Command::Shuffle < _Base
method run (line 122) | def run(iterator, *)
method rng (line 127) | def rng
class Spark::Command::PartitionBy (line 135) | class Spark::Command::PartitionBy
class Base (line 137) | class Base < Spark::Command::Base
method prepare (line 140) | def prepare
method run (line 147) | def run(iterator, *)
method lazy_run (line 155) | def lazy_run(iterator, *)
method make_partition_item (line 163) | def make_partition_item(item)
class Basic (line 171) | class Basic < Base
class Sorting (line 175) | class Sorting < Base
method prepare (line 181) | def prepare
class Spark::Command::Aggregate (line 206) | class Spark::Command::Aggregate < _Base
method run (line 210) | def run(iterator, *)
method lazy_run (line 214) | def lazy_run(iterator, *)
class Spark::Command::Reduce (line 222) | class Spark::Command::Reduce < Spark::Command::Aggregate
method run (line 223) | def run(iterator, *)
class Spark::Command::Foreach (line 231) | class Spark::Command::Foreach < _Base
method run (line 234) | def run(iterator, *)
class Spark::Command::ForeachPartition (line 245) | class Spark::Command::ForeachPartition < _Base
method run (line 248) | def run(iterator, *)
class Spark::Command::KeyBy (line 257) | class Spark::Command::KeyBy < _Base
method run (line 260) | def run(iterator, *)
method lazy_run (line 267) | def lazy_run(iterator, *)
class Spark::Command::Take (line 277) | class Spark::Command::Take < _Base
method run (line 281) | def run(iterator, index)
class Spark::Command::Pipe (line 293) | class Spark::Command::Pipe < _Base
method before_run (line 296) | def before_run
method run (line 302) | def run(iterator, *)
method lazy_run (line 318) | def lazy_run(iterator, *)
method create_writing_thread (line 333) | def create_writing_thread(iterator)
FILE: lib/spark/command/pair.rb
class Spark::Command::CombineByKey (line 6) | class Spark::Command::CombineByKey
class Base (line 10) | class Base < Spark::Command::Base
method run (line 11) | def run(iterator, *)
method lazy_run (line 15) | def lazy_run(iterator, *)
class Combine (line 22) | class Combine < Base
method _run (line 26) | def _run(iterator)
class Merge (line 43) | class Merge < Base
method _run (line 46) | def _run(iterator, *)
class CombineWithZero (line 61) | class CombineWithZero < Base
method _run (line 65) | def _run(iterator)
class Spark::Command::MapValues (line 88) | class Spark::Command::MapValues < _Base
method run (line 91) | def run(iterator, *)
method lazy_run (line 99) | def lazy_run(iterator, *)
class Spark::Command::FlatMapValues (line 110) | class Spark::Command::FlatMapValues < _Base
method run (line 113) | def run(iterator, *)
FILE: lib/spark/command/sort.rb
class Spark::Command::SortByKey (line 6) | class Spark::Command::SortByKey < _Base
method before_run (line 14) | def before_run
method run (line 18) | def run(iterator, _)
method run_with_enum (line 28) | def run_with_enum(iterator, _)
method run_with_spilling (line 41) | def run_with_spilling(iterator)
method run_without_spilling (line 46) | def run_without_spilling(iterator)
FILE: lib/spark/command/statistic.rb
class Spark::Command::Sample (line 6) | class Spark::Command::Sample < _Base
method run (line 11) | def run(iterator, _)
method lazy_run (line 15) | def lazy_run(iterator, _)
method sampler (line 19) | def sampler
method _sampler (line 23) | def _sampler
class Spark::Command::Stats (line 37) | class Spark::Command::Stats < _Base
method run (line 39) | def run(iterator, *)
method lazy_run (line 43) | def lazy_run(iterator, *)
class Spark::Command::Histogram (line 52) | class Spark::Command::Histogram < _Base
method run (line 58) | def run(iterator, *)
method lazy_run (line 75) | def lazy_run(iterator, *)
method min (line 81) | def min
method max (line 85) | def max
method counter_size (line 89) | def counter_size
method increment (line 93) | def increment
method bucket_function (line 99) | def bucket_function
method _bucket_function (line 103) | def _bucket_function
method fast_bucket_function (line 113) | def fast_bucket_function
method basic_bucket_function (line 129) | def basic_bucket_function
FILE: lib/spark/command_builder.rb
type Spark (line 3) | module Spark
class CommandBuilder (line 7) | class CommandBuilder
method initialize (line 20) | def initialize(serializer, deserializer=nil)
method create_command (line 26) | def create_command
method deep_copy (line 32) | def deep_copy
method build (line 45) | def build
method add_command (line 49) | def add_command(klass, *args)
method add_library (line 68) | def add_library(*libraries)
method bind (line 72) | def bind(objects)
method serialize_function (line 87) | def serialize_function(func)
method serialize_function_from_string (line 102) | def serialize_function_from_string(string)
method serialize_function_from_symbol (line 106) | def serialize_function_from_symbol(symbol)
method serialize_function_from_proc (line 115) | def serialize_function_from_proc(proc)
method serialize_function_from_method (line 130) | def serialize_function_from_method(meth)
FILE: lib/spark/command_validator.rb
type Spark (line 1) | module Spark
type CommandValidator (line 2) | module CommandValidator
function validate (line 4) | def validate(value, options)
function valid? (line 8) | def valid?(value, options)
function validate_type (line 17) | def validate_type(value, types)
function validate_size (line 27) | def validate_size(array1, array2)
FILE: lib/spark/config.rb
type Spark (line 4) | module Spark
class Config (line 6) | class Config
method initialize (line 16) | def initialize
method from_file (line 22) | def from_file(file)
method [] (line 31) | def [](key)
method []= (line 35) | def []=(key, value)
method spark_conf (line 39) | def spark_conf
method valid! (line 48) | def valid!
method read_only? (line 80) | def read_only?
method get (line 85) | def get(key)
method get_all (line 100) | def get_all
method contains? (line 104) | def contains?(key)
method set (line 108) | def set(key, value)
method set_app_name (line 113) | def set_app_name(name)
method set_master (line 117) | def set_master(master)
method parse_boolean (line 121) | def parse_boolean(value)
method parse_integer (line 130) | def parse_integer(value)
method set_default (line 137) | def set_default
method default_serializer (line 151) | def default_serializer
method default_serializer_compress (line 155) | def default_serializer_compress
method default_serializer_batch_size (line 159) | def default_serializer_batch_size
method default_executor_command (line 169) | def default_executor_command
method default_executor_options (line 178) | def default_executor_options
method default_worker_type (line 198) | def default_worker_type
method load_executor_envs (line 208) | def load_executor_envs
method check_read_only (line 227) | def check_read_only
FILE: lib/spark/constant.rb
type Spark (line 1) | module Spark
type Constant (line 3) | module Constant
FILE: lib/spark/context.rb
type Spark (line 4) | module Spark
class Context (line 9) | class Context
method initialize (line 21) | def initialize
method inspect (line 41) | def inspect
method stop (line 47) | def stop
method sc (line 53) | def sc
method ui (line 57) | def ui
method default_parallelism (line 63) | def default_parallelism
method default_serializer (line 71) | def default_serializer
method default_batch_size (line 92) | def default_batch_size
method set_local_property (line 104) | def set_local_property(key, value)
method get_local_property (line 110) | def get_local_property(key)
method set_call_site (line 116) | def set_call_site(site)
method clear_call_site (line 120) | def clear_call_site
method config (line 127) | def config(key=nil)
method add_file (line 149) | def add_file(*files)
method broadcast (line 169) | def broadcast(value)
method accumulator (line 188) | def accumulator(value, accum_param=:+, zero_value=0)
method parallelize (line 207) | def parallelize(data, num_slices=nil, serializer=nil)
method text_file (line 236) | def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, se...
method whole_text_files (line 260) | def whole_text_files(path, min_partitions=nil, serializer=nil)
method run_job (line 278) | def run_job(rdd, f, partitions=nil, allow_local=false)
method run_job_with_command (line 284) | def run_job_with_command(rdd, partitions, allow_local, command, *args)
FILE: lib/spark/error.rb
type Spark (line 1) | module Spark
class BuildError (line 3) | class BuildError < StandardError
class SerializeError (line 8) | class SerializeError < StandardError
class NotImplemented (line 13) | class NotImplemented < StandardError
class ConfigurationError (line 17) | class ConfigurationError < StandardError
class RDDError (line 21) | class RDDError < StandardError
class CommandError (line 25) | class CommandError < StandardError
class ParseError (line 30) | class ParseError < StandardError
class ContextError (line 34) | class ContextError < StandardError
class BroadcastError (line 39) | class BroadcastError < StandardError
class AccumulatorError (line 45) | class AccumulatorError < StandardError
class MllibError (line 49) | class MllibError < StandardError
class SQLError (line 53) | class SQLError < StandardError
class JavaBridgeError (line 57) | class JavaBridgeError < StandardError
FILE: lib/spark/ext/hash.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Hash (line 3) | module Hash
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function stringify_keys_with_spark! (line 9) | def stringify_keys_with_spark!
function symbolize_keys_with_spark! (line 14) | def symbolize_keys_with_spark!
function transform_keys_with_spark! (line 20) | def transform_keys_with_spark!
function included (line 28) | def self.included(base)
FILE: lib/spark/ext/integer.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Integer (line 3) | module Integer
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function included (line 10) | def self.included(base)
FILE: lib/spark/ext/io.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type IO (line 3) | module IO
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function read_int (line 11) | def read_int
function read_int_or_eof (line 15) | def read_int_or_eof
function read_long (line 21) | def read_long
function read_string (line 25) | def read_string
function read_data (line 29) | def read_data
function write_int (line 36) | def write_int(data)
function write_long (line 40) | def write_long(data)
function write_string (line 47) | def write_string(data)
function write_data (line 52) | def write_data(data)
function included (line 57) | def self.included(base)
FILE: lib/spark/ext/ip_socket.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type IPSocket (line 3) | module IPSocket
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function port (line 8) | def port
function hostname (line 12) | def hostname
function numeric_address (line 16) | def numeric_address
function included (line 21) | def self.included(base)
FILE: lib/spark/ext/module.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Module (line 3) | module Module
function patch_unless_exist (line 40) | def patch_unless_exist(target, suffix)
function path_const_unless_exist (line 48) | def path_const_unless_exist(target, suffix)
FILE: lib/spark/ext/object.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Object (line 3) | module Object
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function deep_copy_with_spark (line 8) | def deep_copy_with_spark
function silence_warnings (line 12) | def silence_warnings
function cattr_reader_with_spark (line 19) | def cattr_reader_with_spark(*syms)
function cattr_writer_with_spark (line 38) | def cattr_writer_with_spark(*syms)
function cattr_accessor_with_spark (line 57) | def cattr_accessor_with_spark(*syms)
function included (line 63) | def self.included(base)
FILE: lib/spark/ext/string.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type String (line 3) | module String
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function camelize_with_spark (line 8) | def camelize_with_spark
function included (line 13) | def self.included(base)
FILE: lib/spark/helper.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
FILE: lib/spark/helper/logger.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Logger (line 3) | module Logger
function included (line 5) | def self.included(base)
type Methods (line 10) | module Methods
function log_info (line 11) | def log_info(message)
function log_debug (line 15) | def log_debug(message)
function log_trace (line 19) | def log_trace(message)
function log_warning (line 23) | def log_warning(message)
function log_error (line 27) | def log_error(message)
FILE: lib/spark/helper/parser.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Parser (line 3) | module Parser
function included (line 5) | def self.included(base)
type Methods (line 10) | module Methods
function to_java_hash (line 11) | def to_java_hash(hash)
function convert_to_java_int (line 23) | def convert_to_java_int(data)
function to_java_array_list (line 31) | def to_java_array_list(array)
function to_memory_size (line 51) | def to_memory_size(memory, result_unit="KB")
function memory_multiplier_based_kb (line 66) | def memory_multiplier_based_kb(type)
FILE: lib/spark/helper/serialize.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Serialize (line 3) | module Serialize
function pack_int (line 16) | def pack_int(data)
function pack_long (line 20) | def pack_long(data)
function pack_double (line 24) | def pack_double(data)
function pack_unsigned_chars (line 28) | def pack_unsigned_chars(data)
function pack_ints (line 32) | def pack_ints(data)
function pack_longs (line 37) | def pack_longs(data)
function pack_doubles (line 42) | def pack_doubles(data)
function unpack_int (line 49) | def unpack_int(data)
function unpack_long (line 53) | def unpack_long(data)
function unpack_chars (line 57) | def unpack_chars(data)
function __check_array (line 63) | def __check_array(data)
FILE: lib/spark/helper/statistic.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Statistic (line 3) | module Statistic
function compute_fraction (line 19) | def compute_fraction(lower_bound, total, with_replacement)
function upper_poisson_bound (line 30) | def upper_poisson_bound(bound)
function upper_binomial_bound (line 42) | def upper_binomial_bound(delta, total, fraction)
function bisect_right (line 58) | def bisect_right(data, value, low=0, high=data.size)
function determine_bounds (line 82) | def determine_bounds(data, num_partitions)
FILE: lib/spark/helper/system.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type System (line 3) | module System
function included (line 5) | def self.included(base)
type Methods (line 10) | module Methods
function windows? (line 11) | def windows?
function mri? (line 15) | def mri?
function jruby? (line 19) | def jruby?
function pry? (line 23) | def pry?
function memory_usage (line 28) | def memory_usage
FILE: lib/spark/java_bridge.rb
type Spark (line 1) | module Spark
type JavaBridge (line 2) | module JavaBridge
function init (line 10) | def self.init(*args)
FILE: lib/spark/java_bridge/base.rb
type Spark (line 6) | module Spark
type JavaBridge (line 7) | module JavaBridge
class Base (line 8) | class Base
method initialize (line 48) | def initialize(target)
method import_all (line 53) | def import_all
method import_all_test (line 65) | def import_all_test
method call (line 77) | def call(klass, method, *args)
method to_array_list (line 88) | def to_array_list(array)
method to_seq (line 96) | def to_seq(array)
method to_long (line 100) | def to_long(number)
method to_java (line 105) | def to_java(object)
method to_ruby (line 124) | def to_ruby(object)
method jars (line 187) | def jars
method objects_with_names (line 193) | def objects_with_names(objects)
method java_objects (line 206) | def java_objects
method java_test_objects (line 210) | def java_test_objects
method raise_missing_class (line 214) | def raise_missing_class(klass)
FILE: lib/spark/java_bridge/jruby.rb
type Spark (line 3) | module Spark
type JavaBridge (line 4) | module JavaBridge
class JRuby (line 5) | class JRuby < Base
method initialize (line 7) | def initialize(*args)
method import (line 12) | def import(name, klass)
method java_object? (line 19) | def java_object?(object)
FILE: lib/spark/java_bridge/rjb.rb
type Spark (line 7) | module Spark
type JavaBridge (line 8) | module JavaBridge
class RJB (line 9) | class RJB < Base
method initialize (line 11) | def initialize(*args)
method import (line 17) | def import(name, klass)
method java_object? (line 23) | def java_object?(object)
method jars (line 29) | def jars
FILE: lib/spark/library.rb
type Spark (line 1) | module Spark
type Library (line 2) | module Library
function autoload (line 4) | def autoload(klass, location, import=true)
function autoload_without_import (line 13) | def autoload_without_import(klass, location)
function import (line 17) | def import(to=Object)
FILE: lib/spark/logger.rb
type Spark (line 4) | module Spark
class Logger (line 5) | class Logger
method initialize (line 9) | def initialize
method level_off (line 13) | def level_off
method disable (line 18) | def disable
method enabled? (line 25) | def enabled?
method info (line 29) | def info(message)
method debug (line 33) | def debug(message)
method trace (line 37) | def trace(message)
method warning (line 41) | def warning(message)
method error (line 45) | def error(message)
method info? (line 49) | def info?
method debug? (line 53) | def debug?
method trace? (line 57) | def trace?
method warning? (line 61) | def warning?
method error? (line 65) | def error?
method level_enabled? (line 69) | def level_enabled?(type)
FILE: lib/spark/mllib.rb
type Spark (line 1) | module Spark
type Mllib (line 5) | module Mllib
function prepare (line 50) | def self.prepare
function narray? (line 71) | def self.narray?
function mdarray? (line 75) | def self.mdarray?
FILE: lib/spark/mllib/classification/common.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class ClassificationModel (line 3) | class ClassificationModel
method initialize (line 7) | def initialize(weights, intercept)
method threshold= (line 13) | def threshold=(value)
method clear_threshold (line 17) | def clear_threshold
class ClassificationMethodBase (line 27) | class ClassificationMethodBase < RegressionMethodBase
FILE: lib/spark/mllib/classification/logistic_regression.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class LogisticRegressionModel (line 60) | class LogisticRegressionModel < ClassificationModel
method initialize (line 62) | def initialize(*args)
method predict (line 69) | def predict(vector)
class LogisticRegressionWithSGD (line 91) | class LogisticRegressionWithSGD < ClassificationMethodBase
method train (line 150) | def self.train(rdd, options={})
class LogisticRegressionWithLBFGS (line 173) | class LogisticRegressionWithLBFGS < ClassificationMethodBase
method train (line 220) | def self.train(rdd, options={})
FILE: lib/spark/mllib/classification/naive_bayes.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class NaiveBayesModel (line 43) | class NaiveBayesModel
method initialize (line 47) | def initialize(labels, pi, theta)
method predict (line 55) | def predict(vector)
class NaiveBayes (line 69) | class NaiveBayes
method train (line 82) | def self.train(rdd, lambda=1.0)
FILE: lib/spark/mllib/classification/svm.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class SVMModel (line 42) | class SVMModel < ClassificationModel
method initialize (line 44) | def initialize(*args)
method predict (line 51) | def predict(vector)
class SVMWithSGD (line 72) | class SVMWithSGD < ClassificationMethodBase
method train (line 130) | def self.train(rdd, options={})
FILE: lib/spark/mllib/clustering/gaussian_mixture.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class GaussianMixtureModel (line 25) | class GaussianMixtureModel
method initialize (line 29) | def initialize(weights, gaussians)
method predict (line 37) | def predict(rdd)
method predict_soft (line 46) | def predict_soft(rdd)
method means (line 50) | def means
method sigmas (line 54) | def sigmas
class GaussianMixture (line 64) | class GaussianMixture
method train (line 66) | def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100...
FILE: lib/spark/mllib/clustering/kmeans.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class KMeansModel (line 47) | class KMeansModel
method initialize (line 51) | def initialize(centers)
method predict (line 56) | def predict(vector)
method from_java (line 72) | def self.from_java(object)
class KMeans (line 87) | class KMeans
method train (line 113) | def self.train(rdd, k, max_iterations: 100, runs: 1, initializatio...
FILE: lib/spark/mllib/matrix.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
type Matrices (line 3) | module Matrices
function dense (line 5) | def self.dense(*args)
function sparse (line 9) | def self.sparse(*args)
function to_matrix (line 13) | def self.to_matrix(data)
class MatrixBase (line 28) | class MatrixBase < MatrixAdapter
class DenseMatrix (line 41) | class DenseMatrix < MatrixBase
method initialize (line 43) | def initialize(rows, cols, values)
method to_java (line 47) | def to_java
method from_java (line 51) | def self.from_java(object)
class SparseMatrix (line 95) | class SparseMatrix < MatrixBase
method initialize (line 99) | def initialize(rows, cols, col_pointers, row_indices, values)
FILE: lib/spark/mllib/regression/common.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class RegressionModel (line 8) | class RegressionModel
method initialize (line 12) | def initialize(weights, intercept)
method predict (line 29) | def predict(data)
class RegressionMethodBase (line 46) | class RegressionMethodBase
method train (line 48) | def self.train(rdd, options)
FILE: lib/spark/mllib/regression/labeled_point.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class LabeledPoint (line 15) | class LabeledPoint
method initialize (line 19) | def initialize(label, features)
method from_java (line 24) | def self.from_java(object)
method marshal_dump (line 31) | def marshal_dump
method marshal_load (line 35) | def marshal_load(array)
FILE: lib/spark/mllib/regression/lasso.rb
class Spark::Mllib::LassoModel (line 49) | class Spark::Mllib::LassoModel < Spark::Mllib::RegressionModel
type Spark (line 52) | module Spark
type Mllib (line 53) | module Mllib
class LassoWithSGD (line 54) | class LassoWithSGD < RegressionMethodBase
method train (line 104) | def self.train(rdd, options={})
FILE: lib/spark/mllib/regression/linear.rb
class Spark::Mllib::LinearRegressionModel (line 55) | class Spark::Mllib::LinearRegressionModel < Spark::Mllib::RegressionModel
type Spark (line 58) | module Spark
type Mllib (line 59) | module Mllib
class LinearRegressionWithSGD (line 60) | class LinearRegressionWithSGD < RegressionMethodBase
method train (line 119) | def self.train(rdd, options={})
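A sketch of the SGD regression flow using only what the listings for regression/common.rb, regression/labeled_point.rb and this file expose; the `weights`/`intercept` readers and array-to-vector coercion are assumptions, and `sc` is again an initialised Spark::Context.

    data = sc.parallelize([
      Spark::Mllib::LabeledPoint.new(0.0, [0.0]),
      Spark::Mllib::LabeledPoint.new(1.0, [1.0]),
      Spark::Mllib::LabeledPoint.new(2.0, [2.0]),
      Spark::Mllib::LabeledPoint.new(3.0, [3.0])
    ])

    model = Spark::Mllib::LinearRegressionWithSGD.train(data)   # the options hash is optional

    model.weights        # assumed reader for RegressionModel#initialize(weights, intercept)
    model.intercept      # assumed reader
    model.predict([4.0]) # RegressionModel#predict(data)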
FILE: lib/spark/mllib/regression/ridge.rb
class Spark::Mllib::RidgeRegressionModel (line 46) | class Spark::Mllib::RidgeRegressionModel < Spark::Mllib::RegressionModel
type Spark (line 49) | module Spark
type Mllib (line 50) | module Mllib
class RidgeRegressionWithSGD (line 51) | class RidgeRegressionWithSGD < RegressionMethodBase
method train (line 101) | def self.train(rdd, options={})
FILE: lib/spark/mllib/ruby_matrix/matrix_adapter.rb
type Spark (line 3) | module Spark
type Mllib (line 4) | module Mllib
class MatrixAdapter (line 5) | class MatrixAdapter < ::Matrix
method new (line 7) | def self.new(*args)
method initialize (line 23) | def initialize(type, rows, cols, values=nil)
method shape (line 43) | def shape
method values (line 47) | def values
FILE: lib/spark/mllib/ruby_matrix/vector_adapter.rb
class Vector (line 5) | class Vector
method elements (line 6) | def self.elements(array, copy=true)
type Spark (line 11) | module Spark
type Mllib (line 12) | module Mllib
class VectorAdapter (line 13) | class VectorAdapter < ::Vector
method new (line 15) | def self.new(*args)
method initialize (line 21) | def initialize(*args)
method []= (line 34) | def []=(index, value)
method dot (line 38) | def dot(other)
method squared_distance (line 46) | def squared_distance(other)
method values (line 51) | def values
FILE: lib/spark/mllib/vector.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
type Vectors (line 3) | module Vectors
function dense (line 5) | def self.dense(*args)
function sparse (line 9) | def self.sparse(*args)
function parse (line 13) | def self.parse(data)
function to_vector (line 23) | def self.to_vector(data)
class VectorBase (line 38) | class VectorBase < VectorAdapter
class DenseVector (line 57) | class DenseVector < VectorBase
method initialize (line 59) | def initialize(values)
method parse (line 67) | def self.parse(data)
method to_s (line 86) | def to_s
method to_java (line 90) | def to_java
method from_java (line 94) | def self.from_java(object)
method marshal_dump (line 98) | def marshal_dump
method marshal_load (line 102) | def marshal_load(array)
class SparseVector (line 127) | class SparseVector < VectorBase
method initialize (line 131) | def initialize(arg1, arg2=nil, arg3=nil)
method parse (line 151) | def self.parse(data)
method to_s (line 171) | def to_s
method marshal_dump (line 175) | def marshal_dump
method marshal_load (line 179) | def marshal_load(array)
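A short sketch of the vector classes; the three-argument SparseVector form (size, indices, values) is an assumption inferred from initialize(arg1, arg2=nil, arg3=nil).

    dense  = Spark::Mllib::DenseVector.new([1.0, 0.0, 3.0])
    sparse = Spark::Mllib::SparseVector.new(3, [0, 2], [1.0, 3.0])  # assumed argument order

    other = Spark::Mllib::DenseVector.new([2.0, 1.0, 0.0])
    dense.dot(other)               # VectorAdapter#dot
    dense.squared_distance(other)  # VectorAdapter#squared_distance
    dense.values                   # plain Ruby array of the elements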
FILE: lib/spark/rdd.rb
type Spark (line 1) | module Spark
class RDD (line 7) | class RDD
method initialize (line 27) | def initialize(jrdd, context, serializer, deserializer=nil)
method inspect (line 37) | def inspect
method + (line 54) | def +(other)
method add_command (line 62) | def add_command(klass, *args)
method add_library (line 72) | def add_library(*libraries)
method bind (line 89) | def bind(objects)
method new_rdd_from_command (line 98) | def new_rdd_from_command(klass, *args)
method config (line 107) | def config
method default_reduce_partitions (line 111) | def default_reduce_partitions
method partitions_size (line 116) | def partitions_size
method id (line 121) | def id
method cache (line 126) | def cache
method persist (line 136) | def persist(new_level)
method unpersist (line 147) | def unpersist(blocking=true)
method cached? (line 153) | def cached?
method checkpointed? (line 157) | def checkpointed?
method name (line 163) | def name
method set_name (line 170) | def set_name(value)
method name= (line 175) | def name=(value)
method to_java (line 179) | def to_java
method collect (line 199) | def collect(as_enum=false)
method collect_from_file (line 212) | def collect_from_file(file, as_enum=false)
method collect_as_hash (line 232) | def collect_as_hash
method take (line 247) | def take(count)
method first (line 290) | def first
method reduce (line 301) | def reduce(f)
method fold (line 318) | def fold(zero_value, f)
method aggregate (line 342) | def aggregate(zero_value, seq_op, comb_op)
method max (line 353) | def max
method min (line 364) | def min
method sum (line 375) | def sum
method count (line 386) | def count
method stats (line 394) | def stats
method mean (line 404) | def mean
method variance (line 414) | def variance
method stdev (line 424) | def stdev
method sample_stdev (line 436) | def sample_stdev
method sample_variance (line 447) | def sample_variance
method histogram (line 476) | def histogram(buckets)
method foreach (line 576) | def foreach(f, options={})
method foreach_partition (line 588) | def foreach_partition(f, options={})
method map (line 604) | def map(f)
method flat_map (line 616) | def flat_map(f)
method map_partitions (line 627) | def map_partitions(f)
method map_partitions_with_index (line 639) | def map_partitions_with_index(f, options={})
method filter (line 650) | def filter(f)
method compact (line 661) | def compact
method glom (line 672) | def glom
method coalesce (line 683) | def coalesce(num_partitions)
method cartesian (line 705) | def cartesian(other)
method distinct (line 720) | def distinct
method shuffle (line 733) | def shuffle(seed=nil)
method union (line 747) | def union(other)
method reserialize (line 765) | def reserialize(new_serializer)
method intersection (line 785) | def intersection(other)
method partition_by (line 802) | def partition_by(num_partitions, partition_func=nil)
method sample (line 822) | def sample(with_replacement, fraction, seed=nil)
method take_sample (line 837) | def take_sample(with_replacement, num, seed=nil)
method pipe (line 913) | def pipe(*cmds)
method reduce_by_key (line 931) | def reduce_by_key(f, num_partitions=nil)
method combine_by_key (line 959) | def combine_by_key(create_combiner, merge_value, merge_combiners, nu...
method group_by (line 974) | def group_by(f, num_partitions=nil)
method group_by_key (line 989) | def group_by_key(num_partitions=nil)
method fold_by_key (line 1007) | def fold_by_key(zero_value, f, num_partitions=nil)
method aggregate_by_key (line 1026) | def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions...
method group_with (line 1043) | def group_with(other, num_partitions=nil)
method cogroup (line 1057) | def cogroup(*others)
method subtract_by_key (line 1075) | def subtract_by_key(other, num_partitions=nil)
method subtract (line 1094) | def subtract(other, num_partitions=nil)
method sort_by_key (line 1109) | def sort_by_key(ascending=true, num_partitions=nil)
method sort_by_value (line 1120) | def sort_by_value(ascending=true, num_partitions=nil)
method sort_by (line 1139) | def sort_by(key_function=nil, ascending=true, num_partitions=nil)
method key_by (line 1190) | def key_by(f)
method map_values (line 1204) | def map_values(f)
method flat_map_values (line 1218) | def flat_map_values(f)
method keys (line 1229) | def keys
method values (line 1240) | def values
method lookup (line 1258) | def lookup(key)
method _reduce (line 1301) | def _reduce(klass, seq_op, comb_op, zero_value=nil)
method _partition_by (line 1320) | def _partition_by(num_partitions, klass, *args)
method _combine_by_key (line 1341) | def _combine_by_key(combine, merge, num_partitions)
class PipelinedRDD (line 1365) | class PipelinedRDD < RDD
method initialize (line 1369) | def initialize(prev, command)
method pipelinable? (line 1386) | def pipelinable?
method jrdd (line 1391) | def jrdd
method _jrdd (line 1397) | def _jrdd
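The RDD listing above is the core of the gem's public API. A hedged end-to-end sketch follows; the Spark.start / Spark.sc entry points and Spark::Context#parallelize are assumptions taken from the project README rather than from this listing, and functions are passed as arguments (lambdas) as the map(f) / filter(f) signatures suggest.

    require 'ruby-spark'

    Spark.start          # assumed bootstrap from lib/spark.rb
    sc = Spark.sc        # assumed accessor for the Spark::Context

    rdd = sc.parallelize(1..10)
    rdd.map(lambda { |x| x * 2 })
       .filter(lambda { |x| x > 5 })
       .collect

    # Pair-RDD helpers use the same "pass the function as an argument" style
    pairs = sc.parallelize([['a', 1], ['b', 2], ['a', 3]])
    pairs.reduce_by_key(lambda { |a, b| a + b }).collect_as_hash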
FILE: lib/spark/sampler.rb
type Spark (line 4) | module Spark
type RandomGenerator (line 5) | module RandomGenerator
class Poisson (line 6) | class Poisson
method initialize (line 8) | def initialize(mean, seed)
method rand (line 13) | def rand
type Sampler (line 32) | module Sampler
class Base (line 34) | class Base
method initialize (line 37) | def initialize(fraction, seed=nil)
class Poisson (line 45) | class Poisson < Base
method sample (line 47) | def sample(iterator)
method lazy_sample (line 57) | def lazy_sample(iterator)
method rng (line 64) | def rng
class Uniform (line 72) | class Uniform < Base
method sample (line 74) | def sample(iterator)
method lazy_sample (line 79) | def lazy_sample(iterator)
method rng (line 85) | def rng
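A small sketch of the samplers; passing a plain Array as the iterator and the exact meaning of the fraction argument are assumptions.

    uniform = Spark::Sampler::Uniform.new(0.5, 42)   # fraction, seed
    uniform.sample((1..100).to_a)                    # keeps each item with probability ~0.5

    poisson = Spark::Sampler::Poisson.new(2.0, 42)   # assumed mean for with-replacement sampling
    poisson.sample((1..100).to_a)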
FILE: lib/spark/serializer.rb
type Spark (line 1) | module Spark
type Serializer (line 5) | module Serializer
function register (line 31) | def self.register(*args)
function find (line 40) | def self.find(name)
function find! (line 44) | def self.find!(name)
function build (line 54) | def self.build(text=nil, &block)
FILE: lib/spark/serializer/auto_batched.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class AutoBatched (line 8) | class AutoBatched < Batched
method initialize (line 12) | def initialize(serializer, best_size=65536)
method batched? (line 19) | def batched?
method unbatch! (line 23) | def unbatch!
method name (line 26) | def name
method dump_to_io (line 30) | def dump_to_io(data, io)
FILE: lib/spark/serializer/base.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Base (line 4) | class Base
method load_from_io (line 6) | def load_from_io(io)
method load_from_file (line 17) | def load_from_file(file, *args)
method == (line 28) | def ==(other)
method batched? (line 32) | def batched?
method unbatch! (line 36) | def unbatch!
method check_each (line 39) | def check_each(data)
method error (line 45) | def error(message)
method name (line 49) | def name
method to_s (line 53) | def to_s
method inspect (line 57) | def inspect
FILE: lib/spark/serializer/batched.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Batched (line 3) | class Batched < Base
method initialize (line 7) | def initialize(serializer, batch_size=nil)
method batched? (line 17) | def batched?
method unbatch! (line 21) | def unbatch!
method load (line 25) | def load(data)
method dump (line 29) | def dump(data)
method name (line 33) | def name
method to_s (line 37) | def to_s
method dump_to_io (line 44) | def dump_to_io(data, io)
method load_from_io (line 62) | def load_from_io(io)
FILE: lib/spark/serializer/cartesian.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Cartesian (line 3) | class Cartesian < Pair
method aggregate (line 5) | def aggregate(item1, item2)
FILE: lib/spark/serializer/compressed.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Compressed (line 3) | class Compressed < Base
method initialize (line 5) | def initialize(serializer)
method dump (line 9) | def dump(data)
method load (line 13) | def load(data)
FILE: lib/spark/serializer/marshal.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Marshal (line 3) | class Marshal < Base
method dump (line 5) | def dump(data)
method load (line 9) | def load(data)
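The serializers compose by wrapping one another, as the constructors above suggest; a hedged round-trip sketch:

    marshal    = Spark::Serializer::Marshal.new
    batched    = Spark::Serializer::Batched.new(marshal, 1024)   # serializer, batch_size
    compressed = Spark::Serializer::Compressed.new(marshal)

    payload = compressed.dump([1, 2, 3])
    compressed.load(payload)   # assumed to round-trip back to [1, 2, 3]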
FILE: lib/spark/serializer/message_pack.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class MessagePack (line 3) | class MessagePack < Base
method dump (line 5) | def dump(data)
method load (line 9) | def load(data)
FILE: lib/spark/serializer/oj.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Oj (line 3) | class Oj < Base
method dump (line 5) | def dump(data)
method load (line 9) | def load(data)
FILE: lib/spark/serializer/pair.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Pair (line 3) | class Pair < Base
method initialize (line 5) | def initialize(serializer1, serializer2)
method to_s (line 10) | def to_s
method aggregate (line 14) | def aggregate(item1, item2)
method load_from_io (line 18) | def load_from_io(io)
FILE: lib/spark/serializer/text.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Text (line 3) | class Text < Base
method initialize (line 7) | def initialize(encoding=Encoding::UTF_8)
method load (line 13) | def load(data)
method to_s (line 17) | def to_s
FILE: lib/spark/sort.rb
type Spark (line 1) | module Spark
type InternalSorter (line 2) | module InternalSorter
class Base (line 3) | class Base
method initialize (line 4) | def initialize(key_function)
class Ascending (line 9) | class Ascending < Base
method sort (line 10) | def sort(data)
class Descending (line 15) | class Descending < Ascending
method sort (line 16) | def sort(data)
function get (line 22) | def self.get(ascending, key_function)
class ExternalSorter (line 36) | class ExternalSorter
method initialize (line 61) | def initialize(total_memory, serializer)
method add_memory! (line 68) | def add_memory!
method sort_by (line 72) | def sort_by(iterator, ascending=true, key_function=KEY_FUNCTION)
method create_temp_folder (line 134) | def create_temp_folder
method destroy_temp_folder (line 138) | def destroy_temp_folder
method make_parts (line 144) | def make_parts(iterator, internal_sorter)
FILE: lib/spark/sql.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
FILE: lib/spark/sql/column.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class Column (line 3) | class Column
method to_java (line 8) | def self.to_java(col)
method from_literal (line 16) | def self.from_literal(literal)
method from_name (line 20) | def self.from_name(name)
method when (line 42) | def self.when(condition, value)
method initialize (line 52) | def initialize(jcolumn)
method get_item (line 134) | def get_item(key)
method get_field (line 155) | def get_field(name)
method substr (line 169) | def substr(start, length)
method isin (line 191) | def isin(*cols)
method alias (line 211) | def alias(name)
method cast (line 224) | def cast(data_type)
method between (line 250) | def between(lower, upper)
method when (line 270) | def when(condition, value)
method otherwise (line 295) | def otherwise(value)
method over (line 304) | def over(*)
method method_missing (line 308) | def method_missing(method, item)
method to_s (line 312) | def to_s
method inspect (line 316) | def inspect
method func_op (line 327) | def func_op(name)
method bin_op (line 332) | def bin_op(name, item)
method unary_op (line 343) | def unary_op(name)
FILE: lib/spark/sql/context.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class Context (line 3) | class Context
method initialize (line 7) | def initialize(spark_context)
method read (line 12) | def read
FILE: lib/spark/sql/data_frame.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class DataFrame (line 8) | class DataFrame
method initialize (line 12) | def initialize(jdf, sql_context)
method [] (line 32) | def [](item)
method columns (line 55) | def columns
method schema (line 60) | def schema
method show_string (line 70) | def show_string(n=20, truncate=true)
method show (line 80) | def show(n=20, truncate=true)
method print_schema (line 92) | def print_schema
method explain (line 96) | def explain(extended=false)
method print_explain (line 120) | def print_explain(extended=false)
method dtypes (line 130) | def dtypes
method inspect (line 136) | def inspect
method method_missing (line 145) | def method_missing(method, *args, &block)
method collect (line 164) | def collect
method collect_as_hash (line 168) | def collect_as_hash
method values (line 174) | def values
method count (line 183) | def count
method take (line 188) | def take(num)
method first (line 193) | def first
method select (line 219) | def select(*cols)
method filter (line 243) | def filter(condition)
method limit (line 257) | def limit(num)
FILE: lib/spark/sql/data_frame_reader.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class DataFrameReader (line 3) | class DataFrameReader
method initialize (line 7) | def initialize(sql_context)
method df (line 12) | def df(jdf)
method format (line 18) | def format(source)
method option (line 24) | def option(key, value)
method options (line 30) | def options(options)
method load (line 45) | def load(path=nil, new_format=nil, new_schema=nil, new_options=nil)
method schema (line 65) | def schema(new_schema)
method json (line 89) | def json(path, new_schema=nil)
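A hedged sketch of the SQL entry points; the JSON path is illustrative, `sc` is an initialised Spark::Context, and passing a column name as a plain string to select is an assumption (Column.from_name suggests it is supported).

    sql = Spark::SQL::Context.new(sc)     # SQL::Context#initialize(spark_context)
    df  = sql.read.json('people.json')    # Context#read -> DataFrameReader#json

    df.columns
    df.select('age').show(5)
    df.first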
FILE: lib/spark/sql/data_type.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class DataType (line 6) | class DataType
method parse (line 14) | def self.parse(data)
method class_name (line 33) | def self.class_name
method type_name (line 37) | def self.type_name
method complex (line 41) | def self.complex
method atomic (line 45) | def self.atomic
method == (line 49) | def ==(other)
method type_name (line 53) | def type_name
method simple_string (line 57) | def simple_string
method json_value (line 61) | def json_value
method json (line 65) | def json
method to_s (line 69) | def to_s
method inspect (line 73) | def inspect
class StructType (line 91) | class StructType < DataType
method from_json (line 96) | def self.from_json(json)
method initialize (line 104) | def initialize(fields=[])
method json_value (line 109) | def json_value
method to_s (line 116) | def to_s
class StructField (line 125) | class StructField < DataType
method from_json (line 129) | def self.from_json(json)
method initialize (line 147) | def initialize(name, data_type, nullable=true, metadata={})
method json_value (line 154) | def json_value
method to_s (line 163) | def to_s
class AtomicType (line 174) | class AtomicType < DataType
class BooleanType (line 182) | class BooleanType < AtomicType
class NumericType (line 191) | class NumericType < AtomicType
class IntegralType (line 200) | class IntegralType < NumericType
class StringType (line 209) | class StringType < AtomicType
class LongType (line 222) | class LongType < IntegralType
FILE: lib/spark/sql/row.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class Row (line 6) | class Row
method from_java (line 9) | def self.from_java(object, with_schema=true)
method initialize (line 35) | def initialize(data={})
method [] (line 39) | def [](item)
method to_h (line 43) | def to_h
method inspect (line 47) | def inspect
FILE: lib/spark/stat_counter.rb
type Spark (line 1) | module Spark
class StatCounter (line 2) | class StatCounter
method initialize (line 10) | def initialize(iterator)
method merge (line 20) | def merge(other)
method sum (line 32) | def sum
method variance (line 37) | def variance
method sample_variance (line 47) | def sample_variance
method stdev (line 56) | def stdev
method sample_stdev (line 62) | def sample_stdev
method to_s (line 66) | def to_s
method merge_stat_counter (line 79) | def merge_stat_counter(other)
method merge_array (line 109) | def merge_array(array)
method merge_value (line 115) | def merge_value(value)
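StatCounter is presumably the object returned by RDD#stats; a sketch assuming a plain Array is acceptable as the iterator argument.

    stats = Spark::StatCounter.new([1, 2, 3, 4, 5])
    stats.sum
    stats.stdev
    stats.merge(Spark::StatCounter.new([6, 7, 8]))   # merge also accepts another StatCounter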
FILE: lib/spark/storage_level.rb
type Spark (line 4) | module Spark
class StorageLevel (line 5) | class StorageLevel
method reload (line 7) | def self.reload
method reload! (line 13) | def self.reload!
method java_get (line 28) | def self.java_get(arg)
FILE: lib/spark/version.rb
type Spark (line 1) | module Spark
FILE: lib/spark/worker/master.rb
type Master (line 18) | module Master
function create (line 20) | def self.create
class Base (line 29) | class Base
method initialize (line 32) | def initialize
method run (line 38) | def run
method receive_message (line 47) | def receive_message
method kill_worker_and_wait (line 60) | def kill_worker_and_wait
class Process (line 72) | class Process < Base
method create_worker (line 74) | def create_worker
method kill_worker (line 87) | def kill_worker
method fork? (line 94) | def fork?
method _fork? (line 98) | def _fork?
class Thread (line 113) | class Thread < Base
method initialize (line 115) | def initialize
method create_worker (line 125) | def create_worker
method kill_worker (line 131) | def kill_worker
FILE: lib/spark/worker/spark_files.rb
class SparkFiles (line 1) | class SparkFiles
method get (line 7) | def self.get(file_name)
method get_content (line 11) | def self.get_content(file_name)
FILE: lib/spark/worker/worker.rb
type Worker (line 18) | module Worker
class Base (line 19) | class Base
method initialize (line 27) | def initialize(port)
method run (line 35) | def run
method before_start (line 47) | def before_start
method before_end (line 51) | def before_end
method compute (line 57) | def compute
method send_error (line 88) | def send_error(e)
method successful_finish (line 114) | def successful_finish
method log (line 131) | def log(message=nil)
class Process (line 143) | class Process < Base
method id (line 145) | def id
method before_start (line 151) | def before_start
method kill_worker (line 155) | def kill_worker
class Thread (line 164) | class Thread < Base
method id (line 166) | def id
method load_command (line 172) | def load_command
method load_iterator (line 178) | def load_iterator
method kill_worker (line 189) | def kill_worker
FILE: spec/generator.rb
class Generator (line 1) | class Generator
method numbers (line 2) | def self.numbers(size=1000)
method numbers_with_zero (line 6) | def self.numbers_with_zero(size=1000)
method words (line 10) | def self.words(size=1000)
method word (line 14) | def self.word(size=10)
method lines (line 18) | def self.lines(size=1000, letters=3)
method hash (line 26) | def self.hash(size=1000)
method hash_with_values (line 32) | def self.hash_with_values(size=1000, values_count=10)
FILE: spec/lib/command_spec.rb
function to_s_method (line 3) | def to_s_method(x)
FILE: spec/lib/filter_spec.rb
function func4 (line 3) | def func4(item)
function rdd_numbers (line 48) | def rdd_numbers(workers)
function rdd_words (line 52) | def rdd_words(workers)
function rdd_numbers (line 68) | def rdd_numbers(workers)
function rdd_words (line 72) | def rdd_words(workers)
FILE: spec/lib/flat_map_spec.rb
function rdd (line 62) | def rdd(workers)
function rdd (line 77) | def rdd(workers)
function rdd (line 93) | def rdd(workers)
FILE: spec/lib/group_spec.rb
function make_result (line 39) | def make_result(*hashes)
function rdd_result (line 54) | def rdd_result(workers)
function rdd_1 (line 73) | def rdd_1(workers)
function rdd_2 (line 77) | def rdd_2(workers)
function rdd_3 (line 81) | def rdd_3(workers)
function rdd_numbers (line 98) | def rdd_numbers(workers)
function rdd_words (line 102) | def rdd_words(workers)
FILE: spec/lib/key_spec.rb
function rdd_numbers (line 28) | def rdd_numbers(workers)
function rdd_words (line 32) | def rdd_words(workers)
FILE: spec/lib/map_partitions_spec.rb
function func3 (line 3) | def func3(x)
function func4_with_index (line 7) | def func4_with_index(data, index)
function rdd (line 66) | def rdd(workers)
function rdd (line 80) | def rdd(workers)
FILE: spec/lib/map_spec.rb
function rdd (line 55) | def rdd(workers)
function rdd (line 68) | def rdd(workers)
function rdd (line 84) | def rdd(workers)
FILE: spec/lib/reduce_by_key_spec.rb
function flat_map (line 3) | def flat_map(line)
function map (line 7) | def map(item)
function reduce (line 11) | def reduce(x,y)
function rdd (line 65) | def rdd(workers)
function rdd (line 78) | def rdd(workers)
function fold_by_key (line 104) | def fold_by_key(num_partitions=nil)
FILE: spec/lib/reduce_spec.rb
function longest_words (line 3) | def longest_words(memo, word)
function rdd_numbers (line 99) | def rdd_numbers(workers)
function rdd_lines (line 103) | def rdd_lines(workers)
function rdd_numbers (line 120) | def rdd_numbers(workers)
function rdd_lines (line 124) | def rdd_lines(workers)
FILE: spec/lib/sample_spec.rb
function rdd (line 39) | def rdd(workers)
FILE: spec/lib/sort_spec.rb
function rdd (line 34) | def rdd(workers)
function rdd (line 50) | def rdd(workers)
FILE: spec/lib/whole_text_files_spec.rb
function rdd (line 26) | def rdd(workers)
FILE: spec/spec_helper.rb
function spark_start (line 14) | def spark_start
function windows? (line 23) | def windows?
Condensed preview — 191 files, each showing path, character count, and a content snippet (482K chars of full structured content).
[
{
"path": ".gitignore",
"chars": 421,
"preview": "/.gemtags\n/.tags\n/java/spark.jar\n.jbundler\ntarget/*\n*.class\n*.jar\npom.xml\nvendor/*\n*.gem\n*.rbc\n.bundle\n.config\n.yardoc\nG"
},
{
"path": ".travis.yml",
"chars": 208,
"preview": "language: ruby\n\nrvm:\n - 2.2.0\n\nbefore_script:\n - bundle exec rake compile\n - bundle exec ruby bin/ruby-spark build\n\nc"
},
{
"path": "CHANGELOG.md",
"chars": 323,
"preview": "## Unreleased\n\n## 1.3.0\n\n - new method on RDD (lookup)\n - fix sbt url\n - Spark 1.5.0\n\n## 1.2.0 (15.06.2015)\n\n - targ"
},
{
"path": "Gemfile",
"chars": 956,
"preview": "source 'https://rubygems.org'\n\ngemspec\n\ngem 'sourcify', '0.6.0.rc4'\ngem 'method_source'\ngem 'commander'\ngem 'pry'\ngem 'n"
},
{
"path": "Guardfile",
"chars": 176,
"preview": "guard :rspec, cmd: 'rspec' do\n watch(%r{^spec/.+_spec\\.rb$})\n watch(%r{^lib/(.+)\\.rb$}) { |m| \"spec/lib/#{m[1]}_sp"
},
{
"path": "LICENSE.txt",
"chars": 1072,
"preview": "Copyright (c) 2014 Ondřej Moravčík\n\nMIT License\n\nPermission is hereby granted, free of charge, to any person obtaining\na"
},
{
"path": "README.md",
"chars": 9474,
"preview": "# Ruby-Spark [](https://travis-ci.org/ondra-m"
},
{
"path": "Rakefile",
"chars": 623,
"preview": "#-*- mode: ruby -*-\n\nrequire \"bundler/gem_tasks\"\nrequire \"rspec/core/rake_task\"\n\nRSpec::Core::RakeTask.new\n\ntask default"
},
{
"path": "TODO.md",
"chars": 356,
"preview": "- refactor JavaBridge\n - to_java, from_java\n - every type should have class\n - automatic registration\n- add Streaming"
},
{
"path": "benchmark/aggregate.rb",
"chars": 550,
"preview": "require 'benchmark'\nrequire 'benchmark/ips'\n\ndata = 0..1_000_000\nzero_value = rand(100_000)\nfunction = Proc.new{|sum, n|"
},
{
"path": "benchmark/bisect.rb",
"chars": 1541,
"preview": "require \"benchmark\"\n\ndef bisect_left1(a, x, opts={})\n return nil if a.nil?\n return 0 if a.empty?\n\n lo = (opts[:lo] ||"
},
{
"path": "benchmark/comparison/prepare.sh",
"chars": 422,
"preview": "#!/usr/bin/env bash\n\n# Current dir\ncd \"$(dirname \"$0\")\"\n\n# Exit immediately if a pipeline returns a non-zero status.\nset"
},
{
"path": "benchmark/comparison/python.py",
"chars": 3233,
"preview": "import os\nimport math\nfrom time import time\nfrom random import random\nfrom operator import add\nfrom pyspark import Spark"
},
{
"path": "benchmark/comparison/r.r",
"chars": 1429,
"preview": "library(SparkR)\nsc <- sparkR.init(master=\"local[*]\")\n\nlogFile <- file(Sys.getenv(\"R_LOG\"), \"w\")\n\nlogInfo <- function(..."
},
{
"path": "benchmark/comparison/ruby.rb",
"chars": 3150,
"preview": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../../lib')\n$LOAD_PATH.unshift(lib) if File.direc"
},
{
"path": "benchmark/comparison/run-all.sh",
"chars": 2841,
"preview": "#!/usr/bin/env bash\n\n# Current dir\ncd \"$(dirname \"$0\")\"\n\n# Exit immediately if a pipeline returns a non-zero status.\nset"
},
{
"path": "benchmark/comparison/scala.scala",
"chars": 4419,
"preview": "import java.io._\nimport scala.math\nimport scala.io.Source\nimport org.apache.spark._\n\nobject Scala {\n\n val logFile = new"
},
{
"path": "benchmark/custom_marshal.rb",
"chars": 1519,
"preview": "require 'benchmark'\nrequire 'benchmark/ips'\n\ndef pack_int(data)\n [data].pack('l>')\nend\n\ndef pack_long(data)\n [data].pa"
},
{
"path": "benchmark/digest.rb",
"chars": 3210,
"preview": "lib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PAT"
},
{
"path": "benchmark/enumerator.rb",
"chars": 1424,
"preview": "require \"benchmark\"\n\nclass Enumerator\n def defer(&blk)\n self.class.new do |y|\n each do |*input|\n blk.cal"
},
{
"path": "benchmark/serializer.rb",
"chars": 1978,
"preview": "require \"benchmark\"\nrequire \"yaml\"\nrequire \"msgpack\"\nrequire \"oj\"\n# require \"thrift\"\n \nputs \"Simple\"\n\ndata = (0..100000)"
},
{
"path": "benchmark/sort.rb",
"chars": 1356,
"preview": "require \"benchmark\"\n\narray = []\n1000.times { \n array << {:bar => rand(1000)} \n}\n\nn = 500\nBenchmark.bm(20) do |x|\n x.re"
},
{
"path": "benchmark/sort2.rb",
"chars": 3542,
"preview": "require \"benchmark\"\nrequire \"algorithms\"\n\nNUMBER_OF_SORTING = 1\nNUMBER_OF_ARRAY = 10\nWORDS_IN_ARRAY = 100000\nMAX_WO"
},
{
"path": "benchmark/take.rb",
"chars": 445,
"preview": "require \"benchmark\"\n\nSIZE = 100_000_000\n\n@array1 = (0..SIZE).to_a;\n@array2 = (0..SIZE).to_a;\n@array3 = (0..SIZE).to_a;\n\n"
},
{
"path": "bin/ruby-spark",
"chars": 199,
"preview": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.director"
},
{
"path": "example/pi.rb",
"chars": 482,
"preview": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.director"
},
{
"path": "example/website_search.rb",
"chars": 1655,
"preview": "#!/usr/bin/env ruby\n\n# Parse sitemap and search word on every page\n\nrequire 'optparse'\nrequire 'open-uri'\nrequire 'nokog"
},
{
"path": "ext/ruby_c/extconf.rb",
"chars": 50,
"preview": "require 'mkmf'\n\ncreate_makefile(\"ruby_spark_ext\")\n"
},
{
"path": "ext/ruby_c/murmur.c",
"chars": 3441,
"preview": "#include \"murmur.h\"\n\n#if defined(_MSC_VER)\n#define BIG_CONSTANT(x) (x)\n#else\n#define BIG_CONSTANT(x) (x##LLU)\n#endif\n\n/*"
},
{
"path": "ext/ruby_c/murmur.h",
"chars": 205,
"preview": "#ifndef MURMUR_INCLUDED\n#define MURMUR_INCLUDED\n\n#include \"ruby.h\"\n\nVALUE method_portable_hash(int argc, VALUE *argv, VA"
},
{
"path": "ext/ruby_c/ruby-spark.c",
"chars": 556,
"preview": "#include \"ruby.h\"\n#include \"murmur.h\"\n\n\nVALUE SparkModule;\nVALUE SparkDigestModule;\nVALUE SparkDigestMurmur2Class;\n\n\nvoi"
},
{
"path": "ext/ruby_java/Digest.java",
"chars": 990,
"preview": "import org.jruby.Ruby;\nimport org.jruby.RubyModule;\nimport org.jruby.RubyObject;\nimport org.jruby.RubyClass;\nimport org."
},
{
"path": "ext/ruby_java/Murmur2.java",
"chars": 2700,
"preview": "import org.jruby.Ruby;\nimport org.jruby.RubyClass;\nimport org.jruby.RubyObject;\nimport org.jruby.RubyString;\nimport org."
},
{
"path": "ext/ruby_java/RubySparkExtService.java",
"chars": 988,
"preview": "import org.jruby.Ruby;\nimport org.jruby.RubyClass;\nimport org.jruby.RubyModule;\nimport org.jruby.runtime.ObjectAllocator"
},
{
"path": "ext/ruby_java/extconf.rb",
"chars": 50,
"preview": "require 'mkmf'\n\ncreate_makefile(\"ruby_spark_ext\")\n"
},
{
"path": "ext/spark/build.sbt",
"chars": 3127,
"preview": "import AssemblyKeys._\n\nassemblySettings\n\n// Default values\nval defaultScalaVersion = \"2.10.4\"\nval defaultSparkVersio"
},
{
"path": "ext/spark/project/plugins.sbt",
"chars": 408,
"preview": "resolvers += Resolver.url(\"artifactory\", url(\"http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases\"))(Reso"
},
{
"path": "ext/spark/sbt/sbt",
"chars": 1377,
"preview": "#!/bin/bash\n\n# This script launches sbt for this project. If present it uses the system\n# version of sbt. If there is no"
},
{
"path": "ext/spark/src/main/scala/Exec.scala",
"chars": 2412,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io.{File, FileOutputStream, InputStreamReader, BufferedReader}\n\nimport sc"
},
{
"path": "ext/spark/src/main/scala/MLLibAPI.scala",
"chars": 124,
"preview": "package org.apache.spark.mllib.api.python\n\n// PythonMLLibAPI is private for python\nclass MLLibAPI extends PythonMLLibAPI"
},
{
"path": "ext/spark/src/main/scala/Marshal.scala",
"chars": 1452,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, Byte"
},
{
"path": "ext/spark/src/main/scala/MarshalDump.scala",
"chars": 2560,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, Byte"
},
{
"path": "ext/spark/src/main/scala/MarshalLoad.scala",
"chars": 5419,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, Byte"
},
{
"path": "ext/spark/src/main/scala/RubyAccumulatorParam.scala",
"chars": 2284,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io._\nimport java.net._\nimport java.util.{List, ArrayList}\n\nimport scala.c"
},
{
"path": "ext/spark/src/main/scala/RubyBroadcast.scala",
"chars": 433,
"preview": "package org.apache.spark.api.ruby\n\nimport org.apache.spark.api.python.PythonBroadcast\n\n/**\n * An Wrapper for Ruby Broadc"
},
{
"path": "ext/spark/src/main/scala/RubyConstant.scala",
"chars": 289,
"preview": "package org.apache.spark.api.ruby\n\nobject RubyConstant {\n val DATA_EOF = -2\n val WORKER_ERROR = -1\n val WORKER_DONE ="
},
{
"path": "ext/spark/src/main/scala/RubyMLLibAPI.scala",
"chars": 1799,
"preview": "package org.apache.spark.mllib.api.ruby\n\nimport java.util.ArrayList\n\nimport scala.collection.JavaConverters._\n\nimport or"
},
{
"path": "ext/spark/src/main/scala/RubyMLLibUtilAPI.scala",
"chars": 560,
"preview": "package org.apache.spark.mllib.api.ruby\n\nimport java.util.ArrayList\n\nimport org.apache.spark.mllib.util.LinearDataGenera"
},
{
"path": "ext/spark/src/main/scala/RubyPage.scala",
"chars": 861,
"preview": "package org.apache.spark.ui.ruby\n\n// import javax.servlet.http.HttpServletRequest\n\n// import scala.xml.Node\n\n// import o"
},
{
"path": "ext/spark/src/main/scala/RubyRDD.scala",
"chars": 13384,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io._\nimport java.net._\nimport java.util.{List, ArrayList, Collections}\n\ni"
},
{
"path": "ext/spark/src/main/scala/RubySerializer.scala",
"chars": 462,
"preview": "package org.apache.spark.api.ruby\n\nimport scala.collection.JavaConverters._\nimport scala.reflect.{ClassTag, classTag}\n\ni"
},
{
"path": "ext/spark/src/main/scala/RubyTab.scala",
"chars": 287,
"preview": "package org.apache.spark.ui.ruby\n\nimport scala.collection.mutable.HashMap\n\nimport org.apache.spark.ui._\n\n// class RubyTa"
},
{
"path": "ext/spark/src/main/scala/RubyUtils.scala",
"chars": 334,
"preview": "package org.apache.spark.api.ruby\n\nimport org.apache.spark.util._\nimport org.apache.spark.{SparkConf, Logging}\n\nobject R"
},
{
"path": "ext/spark/src/main/scala/RubyWorker.scala",
"chars": 8385,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStrea"
},
{
"path": "ext/spark/src/test/scala/MarshalSpec.scala",
"chars": 2065,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport org.scalatest._\n\n\nimport org.apache.spark.api.ruby.marshal._\n\nclass Ma"
},
{
"path": "lib/ruby-spark.rb",
"chars": 25,
"preview": "require_relative 'spark'\n"
},
{
"path": "lib/spark/accumulator.rb",
"chars": 6087,
"preview": "module Spark\n ##\n # A shared variable that can be accumulated, i.e., has a commutative and associative \"add\"\n # opera"
},
{
"path": "lib/spark/broadcast.rb",
"chars": 2505,
"preview": "module Spark\n ##\n # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast\n # object for reading"
},
{
"path": "lib/spark/build.rb",
"chars": 1339,
"preview": "module Spark\n module Build\n\n DEFAULT_SCALA_VERSION = '2.10.4'\n DEFAULT_CORE_VERSION = '2.10'\n DEFAULT_SPARK"
},
{
"path": "lib/spark/cli.rb",
"chars": 5488,
"preview": "require 'commander'\n\nmodule Commander\n module UI\n # Disable paging\n # for 'classic' help\n def self.enable_pagi"
},
{
"path": "lib/spark/command/base.rb",
"chars": 3561,
"preview": "##\n# Spark::Command::Base\n#\n# Parent for all commands (Map, FlatMap, Sort, ...)\n#\nclass Spark::Command::Base\n\n DEFAULT_"
},
{
"path": "lib/spark/command/basic.rb",
"chars": 7381,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command/pair.rb",
"chars": 2602,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command/sort.rb",
"chars": 1230,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command/statistic.rb",
"chars": 3154,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command.rb",
"chars": 2147,
"preview": "module Spark\n ##\n # Container which includes all commands and other things for worker\n # Every RDD have own copy of C"
},
{
"path": "lib/spark/command_builder.rb",
"chars": 3742,
"preview": "require 'spark/command_validator'\n\nmodule Spark\n ##\n # Builder for building correct {Spark::Command}\n #\n class Comma"
},
{
"path": "lib/spark/command_validator.rb",
"chars": 707,
"preview": "module Spark\n module CommandValidator\n\n def validate(value, options)\n validate_type(value, options[:type])\n "
},
{
"path": "lib/spark/config.rb",
"chars": 5537,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n # Common configuration for RubySpark and Spark\n class Config\n\n "
},
{
"path": "lib/spark/constant.rb",
"chars": 293,
"preview": "module Spark\n # Commond constant for Ruby and Spark\n module Constant\n DATA_EOF = -2\n WORKER_ERROR = -1\n WORKE"
},
{
"path": "lib/spark/context.rb",
"chars": 10104,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n ##\n # Main entry point for Spark functionality. A SparkContext rep"
},
{
"path": "lib/spark/error.rb",
"chars": 1000,
"preview": "module Spark\n # Extension cannot be built\n class BuildError < StandardError\n end\n\n # Proc.to_source\n # Java object "
},
{
"path": "lib/spark/ext/hash.rb",
"chars": 1130,
"preview": "module Spark\n module CoreExtension\n module Hash\n module ClassMethods\n end\n\n module InstanceMethods\n "
},
{
"path": "lib/spark/ext/integer.rb",
"chars": 594,
"preview": "module Spark\n module CoreExtension\n module Integer\n module ClassMethods\n end\n\n module InstanceMethods"
},
{
"path": "lib/spark/ext/io.rb",
"chars": 1321,
"preview": "module Spark\n module CoreExtension\n module IO\n module ClassMethods\n end\n\n module InstanceMethods\n\n "
},
{
"path": "lib/spark/ext/ip_socket.rb",
"chars": 495,
"preview": "module Spark\n module CoreExtension\n module IPSocket\n module ClassMethods\n end\n\n module InstanceMethod"
},
{
"path": "lib/spark/ext/module.rb",
"chars": 1362,
"preview": "module Spark\n module CoreExtension\n module Module\n\n # Patch method to class unless already exist\n #\n "
},
{
"path": "lib/spark/ext/object.rb",
"chars": 1996,
"preview": "module Spark\n module CoreExtension\n module Object\n module ClassMethods\n end\n\n module InstanceMethods\n"
},
{
"path": "lib/spark/ext/string.rb",
"chars": 539,
"preview": "module Spark\n module CoreExtension\n module String\n module ClassMethods\n end\n\n module InstanceMethods\n"
},
{
"path": "lib/spark/helper/logger.rb",
"chars": 885,
"preview": "module Spark\n module Helper\n module Logger\n\n def self.included(base)\n base.send :extend, Methods\n "
},
{
"path": "lib/spark/helper/parser.rb",
"chars": 2061,
"preview": "module Spark\n module Helper\n module Parser\n \n def self.included(base)\n base.send :extend, Methods\n"
},
{
"path": "lib/spark/helper/serialize.rb",
"chars": 1536,
"preview": "module Spark\n module Helper\n module Serialize\n\n DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'\n DIRECTIVE_INTEGERS_BI"
},
{
"path": "lib/spark/helper/statistic.rb",
"chars": 3005,
"preview": "module Spark\n module Helper\n module Statistic\n\n # Returns a sampling rate that guarantees a sample of size >= s"
},
{
"path": "lib/spark/helper/system.rb",
"chars": 909,
"preview": "module Spark\n module Helper\n module System\n\n def self.included(base)\n base.send :extend, Methods\n "
},
{
"path": "lib/spark/helper.rb",
"chars": 330,
"preview": "module Spark\n module Helper\n autoload :System, \"spark/helper/system\"\n autoload :Logger, \"spark/helper/logge"
},
{
"path": "lib/spark/java_bridge/base.rb",
"chars": 6399,
"preview": "##\n# Spark::JavaBridge::Base\n#\n# Parent for all adapter (ruby - java)\n#\nmodule Spark\n module JavaBridge\n class Base\n"
},
{
"path": "lib/spark/java_bridge/jruby.rb",
"chars": 432,
"preview": "require 'java'\n\nmodule Spark\n module JavaBridge\n class JRuby < Base\n\n def initialize(*args)\n super\n "
},
{
"path": "lib/spark/java_bridge/rjb.rb",
"chars": 702,
"preview": "if !ENV.has_key?('JAVA_HOME')\n raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set'\nend\n\nrequir"
},
{
"path": "lib/spark/java_bridge.rb",
"chars": 355,
"preview": "module Spark\n module JavaBridge\n\n autoload :Base, 'spark/java_bridge/base'\n autoload :JRuby, 'spark/java_bridge/"
},
{
"path": "lib/spark/library.rb",
"chars": 450,
"preview": "module Spark\n module Library\n\n def autoload(klass, location, import=true)\n if import\n @for_importing ||="
},
{
"path": "lib/spark/logger.rb",
"chars": 1268,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n class Logger\n\n attr_reader :jlogger\n\n def initialize\n @j"
},
{
"path": "lib/spark/mllib/classification/common.rb",
"chars": 541,
"preview": "module Spark\n module Mllib\n class ClassificationModel\n\n attr_reader :weights, :intercept, :threshold\n\n def"
},
{
"path": "lib/spark/mllib/classification/logistic_regression.rb",
"chars": 6916,
"preview": "module Spark\n module Mllib\n ##\n # LogisticRegressionModel\n #\n # A linear binary classification model derive"
},
{
"path": "lib/spark/mllib/classification/naive_bayes.rb",
"chars": 2777,
"preview": "module Spark\n module Mllib\n ##\n # NaiveBayesModel\n #\n # Model for Naive Bayes classifiers.\n #\n # Cont"
},
{
"path": "lib/spark/mllib/classification/svm.rb",
"chars": 4150,
"preview": "module Spark\n module Mllib\n ##\n # SVMModel\n #\n # A support vector machine.\n #\n # == Examples:\n #\n "
},
{
"path": "lib/spark/mllib/clustering/gaussian_mixture.rb",
"chars": 2205,
"preview": "module Spark\n module Mllib\n ##\n # GaussianMixtureModel\n #\n # A clustering model derived from the Gaussian M"
},
{
"path": "lib/spark/mllib/clustering/kmeans.rb",
"chars": 3394,
"preview": "module Spark\n module Mllib\n ##\n # KMeansModel\n #\n # A clustering model derived from the k-means method.\n "
},
{
"path": "lib/spark/mllib/matrix.rb",
"chars": 2422,
"preview": "module Spark\n module Mllib\n module Matrices\n\n def self.dense(*args)\n DenseMatrix.new(*args)\n end\n\n "
},
{
"path": "lib/spark/mllib/regression/common.rb",
"chars": 1769,
"preview": "module Spark\n module Mllib\n ##\n # RegressionModel\n #\n # A linear model that has a vector of coefficients an"
},
{
"path": "lib/spark/mllib/regression/labeled_point.rb",
"chars": 767,
"preview": "module Spark\n module Mllib\n ##\n # LabeledPoint\n #\n # The features and labels of a data point.\n #\n # ="
},
{
"path": "lib/spark/mllib/regression/lasso.rb",
"chars": 3706,
"preview": "##\n# LassoModel\n#\n# Train a regression model with L1-regularization using Stochastic Gradient Descent.\n# This solves the"
},
{
"path": "lib/spark/mllib/regression/linear.rb",
"chars": 4268,
"preview": "##\n# LinearRegressionModel\n#\n# Train a linear regression model with no regularization using Stochastic Gradient Descent."
},
{
"path": "lib/spark/mllib/regression/ridge.rb",
"chars": 3716,
"preview": "##\n# RidgeRegressionModel\n#\n# Train a regression model with L2-regularization using Stochastic Gradient Descent.\n# This "
},
{
"path": "lib/spark/mllib/ruby_matrix/matrix_adapter.rb",
"chars": 1139,
"preview": "require 'matrix'\n\nmodule Spark\n module Mllib\n class MatrixAdapter < ::Matrix\n\n def self.new(*args)\n obje"
},
{
"path": "lib/spark/mllib/ruby_matrix/vector_adapter.rb",
"chars": 1027,
"preview": "require 'matrix'\n\n# Based on ruby 2.1\n\nclass Vector\n def self.elements(array, copy=true)\n DenseVector.new(convert_to"
},
{
"path": "lib/spark/mllib/stat/distribution.rb",
"chars": 460,
"preview": "##\n# MultivariateGaussian\n#\n# This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution."
},
{
"path": "lib/spark/mllib/vector.rb",
"chars": 4035,
"preview": "module Spark\n module Mllib\n module Vectors\n\n def self.dense(*args)\n DenseVector.new(*args)\n end\n\n "
},
{
"path": "lib/spark/mllib.rb",
"chars": 3300,
"preview": "module Spark\n # MLlib is Spark’s scalable machine learning library consisting of common learning algorithms and utiliti"
},
{
"path": "lib/spark/rdd.rb",
"chars": 42715,
"preview": "module Spark\n ##\n # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,\n "
},
{
"path": "lib/spark/sampler.rb",
"chars": 1761,
"preview": "require 'distribution'\n\n# Random Generators\nmodule Spark\n module RandomGenerator\n class Poisson\n\n def initializ"
},
{
"path": "lib/spark/serializer/auto_batched.rb",
"chars": 1314,
"preview": "module Spark\n module Serializer\n ##\n # AutoBatched serializator\n #\n # Batch size is computed automatically."
},
{
"path": "lib/spark/serializer/base.rb",
"chars": 1144,
"preview": "module Spark\n module Serializer\n # @abstract Parent for all serializers\n class Base\n\n def load_from_io(io)\n "
},
{
"path": "lib/spark/serializer/batched.rb",
"chars": 1651,
"preview": "module Spark\n module Serializer\n class Batched < Base\n\n attr_writer :serializer\n\n def initialize(serialize"
},
{
"path": "lib/spark/serializer/cartesian.rb",
"chars": 224,
"preview": "module Spark\n module Serializer\n class Cartesian < Pair\n\n def aggregate(item1, item2)\n item1.product(ite"
},
{
"path": "lib/spark/serializer/compressed.rb",
"chars": 503,
"preview": "module Spark\n module Serializer\n class Compressed < Base\n\n def initialize(serializer)\n @serializer = ser"
},
{
"path": "lib/spark/serializer/marshal.rb",
"chars": 266,
"preview": "module Spark\n module Serializer\n class Marshal < Base\n\n def dump(data)\n ::Marshal.dump(data)\n end\n\n"
},
{
"path": "lib/spark/serializer/message_pack.rb",
"chars": 417,
"preview": "module Spark\n module Serializer\n class MessagePack < Base\n\n def dump(data)\n ::MessagePack.dump(data)\n "
},
{
"path": "lib/spark/serializer/oj.rb",
"chars": 328,
"preview": "module Spark\n module Serializer\n class Oj < Base\n\n def dump(data)\n ::Oj.dump(data)\n end\n\n def "
},
{
"path": "lib/spark/serializer/pair.rb",
"chars": 928,
"preview": "module Spark\n module Serializer\n class Pair < Base\n\n def initialize(serializer1, serializer2)\n @serializ"
},
{
"path": "lib/spark/serializer/text.rb",
"chars": 483,
"preview": "module Spark\n module Serializer\n class Text < Base\n\n attr_reader :encoding\n\n def initialize(encoding=Encod"
},
{
"path": "lib/spark/serializer.rb",
"chars": 1773,
"preview": "module Spark\n ##\n # Serializer\n #\n module Serializer\n\n DEFAULT_COMPRESS = false\n DEFAULT_BATCH_SIZE = 1024\n "
},
{
"path": "lib/spark/sort.rb",
"chars": 4802,
"preview": "module Spark\n module InternalSorter\n class Base\n def initialize(key_function)\n @key_function = key_funct"
},
{
"path": "lib/spark/sql/column.rb",
"chars": 9904,
"preview": "module Spark\n module SQL\n class Column\n\n # ===================================================================="
},
{
"path": "lib/spark/sql/context.rb",
"chars": 316,
"preview": "module Spark\n module SQL\n class Context\n\n attr_reader :spark_context, :jsql_context\n\n def initialize(spark"
},
{
"path": "lib/spark/sql/data_frame.rb",
"chars": 6695,
"preview": "module Spark\n module SQL\n ##\n # Spark::SQL::DataFrame\n #\n # All example are base on people.json\n #\n c"
},
{
"path": "lib/spark/sql/data_frame_reader.rb",
"chars": 2943,
"preview": "module Spark\n module SQL\n class DataFrameReader\n\n attr_reader :sql_context, :jreader\n\n def initialize(sql_"
},
{
"path": "lib/spark/sql/data_type.rb",
"chars": 4685,
"preview": "module Spark\n module SQL\n ##\n # Spark::SQL::DataType\n #\n class DataType\n\n cattr_accessor :atomic_types"
},
{
"path": "lib/spark/sql/row.rb",
"chars": 1097,
"preview": "module Spark\n module SQL\n ##\n # Spark::SQL::Row\n #\n class Row\n attr_reader :data\n\n def self.from_"
},
{
"path": "lib/spark/sql.rb",
"chars": 821,
"preview": "module Spark\n module SQL\n extend Spark::Library\n\n autoload_without_import :Context, 'spark/sql/context'\n "
},
{
"path": "lib/spark/stat_counter.rb",
"chars": 2853,
"preview": "module Spark\n class StatCounter\n\n attr_reader :n # count of our values\n attr_reader :mu # mean of our values\n "
},
{
"path": "lib/spark/storage_level.rb",
"chars": 1237,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n class StorageLevel\n\n def self.reload\n return if @reloaded\n "
},
{
"path": "lib/spark/version.rb",
"chars": 37,
"preview": "module Spark\n VERSION = '1.2.1'\nend\n"
},
{
"path": "lib/spark/worker/master.rb",
"chars": 2871,
"preview": "#!/usr/bin/env ruby\n\n$PROGRAM_NAME = 'RubySparkMaster'\n\nrequire 'socket'\nrequire 'io/wait'\nrequire 'nio'\n\nrequire_relati"
},
{
"path": "lib/spark/worker/spark_files.rb",
"chars": 224,
"preview": "class SparkFiles\n\n class << self\n attr_accessor :root_directory\n end\n\n def self.get(file_name)\n File.join(root_"
},
{
"path": "lib/spark/worker/worker.rb",
"chars": 4264,
"preview": "#!/usr/bin/env ruby\n\n# Load root of the gem\nlib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__))\n$LOAD_"
},
{
"path": "lib/spark.rb",
"chars": 6566,
"preview": "# Gems and libraries\nrequire 'method_source'\nrequire 'securerandom'\nrequire 'forwardable'\nrequire 'sourcify'\nrequire 'so"
},
{
"path": "ruby-spark.gemspec",
"chars": 1328,
"preview": "# coding: utf-8\n\nlib = File.expand_path('../lib', __FILE__)\n$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)\n\nreq"
},
{
"path": "spec/generator.rb",
"chars": 758,
"preview": "class Generator\n def self.numbers(size=1000)\n Array.new(size){ rand(1..1000) }\n end\n\n def self.numbers_with_zero(s"
},
{
"path": "spec/inputs/lorem_300.txt",
"chars": 30820,
"preview": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ligula neque, ultricies et lorem\nvel, accumsan cursus fe"
},
{
"path": "spec/inputs/numbers/1.txt",
"chars": 141,
"preview": "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n"
},
{
"path": "spec/inputs/numbers/10.txt",
"chars": 200,
"preview": "451\n452\n453\n454\n455\n456\n457\n458\n459\n460\n461\n462\n463\n464\n465\n466\n467\n468\n469\n470\n471\n472\n473\n474\n475\n476\n477\n478\n479\n480\n"
},
{
"path": "spec/inputs/numbers/11.txt",
"chars": 200,
"preview": "501\n502\n503\n504\n505\n506\n507\n508\n509\n510\n511\n512\n513\n514\n515\n516\n517\n518\n519\n520\n521\n522\n523\n524\n525\n526\n527\n528\n529\n530\n"
},
{
"path": "spec/inputs/numbers/12.txt",
"chars": 200,
"preview": "551\n552\n553\n554\n555\n556\n557\n558\n559\n560\n561\n562\n563\n564\n565\n566\n567\n568\n569\n570\n571\n572\n573\n574\n575\n576\n577\n578\n579\n580\n"
},
{
"path": "spec/inputs/numbers/13.txt",
"chars": 200,
"preview": "601\n602\n603\n604\n605\n606\n607\n608\n609\n610\n611\n612\n613\n614\n615\n616\n617\n618\n619\n620\n621\n622\n623\n624\n625\n626\n627\n628\n629\n630\n"
},
{
"path": "spec/inputs/numbers/14.txt",
"chars": 200,
"preview": "651\n652\n653\n654\n655\n656\n657\n658\n659\n660\n661\n662\n663\n664\n665\n666\n667\n668\n669\n670\n671\n672\n673\n674\n675\n676\n677\n678\n679\n680\n"
},
{
"path": "spec/inputs/numbers/15.txt",
"chars": 200,
"preview": "701\n702\n703\n704\n705\n706\n707\n708\n709\n710\n711\n712\n713\n714\n715\n716\n717\n718\n719\n720\n721\n722\n723\n724\n725\n726\n727\n728\n729\n730\n"
},
{
"path": "spec/inputs/numbers/16.txt",
"chars": 200,
"preview": "751\n752\n753\n754\n755\n756\n757\n758\n759\n760\n761\n762\n763\n764\n765\n766\n767\n768\n769\n770\n771\n772\n773\n774\n775\n776\n777\n778\n779\n780\n"
},
{
"path": "spec/inputs/numbers/17.txt",
"chars": 200,
"preview": "801\n802\n803\n804\n805\n806\n807\n808\n809\n810\n811\n812\n813\n814\n815\n816\n817\n818\n819\n820\n821\n822\n823\n824\n825\n826\n827\n828\n829\n830\n"
},
{
"path": "spec/inputs/numbers/18.txt",
"chars": 200,
"preview": "851\n852\n853\n854\n855\n856\n857\n858\n859\n860\n861\n862\n863\n864\n865\n866\n867\n868\n869\n870\n871\n872\n873\n874\n875\n876\n877\n878\n879\n880\n"
},
{
"path": "spec/inputs/numbers/19.txt",
"chars": 200,
"preview": "901\n902\n903\n904\n905\n906\n907\n908\n909\n910\n911\n912\n913\n914\n915\n916\n917\n918\n919\n920\n921\n922\n923\n924\n925\n926\n927\n928\n929\n930\n"
},
{
"path": "spec/inputs/numbers/2.txt",
"chars": 151,
"preview": "51\n52\n53\n54\n55\n56\n57\n58\n59\n60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n90\n"
},
{
"path": "spec/inputs/numbers/20.txt",
"chars": 201,
"preview": "951\n952\n953\n954\n955\n956\n957\n958\n959\n960\n961\n962\n963\n964\n965\n966\n967\n968\n969\n970\n971\n972\n973\n974\n975\n976\n977\n978\n979\n980\n"
},
{
"path": "spec/inputs/numbers/3.txt",
"chars": 200,
"preview": "101\n102\n103\n104\n105\n106\n107\n108\n109\n110\n111\n112\n113\n114\n115\n116\n117\n118\n119\n120\n121\n122\n123\n124\n125\n126\n127\n128\n129\n130\n"
},
{
"path": "spec/inputs/numbers/4.txt",
"chars": 200,
"preview": "151\n152\n153\n154\n155\n156\n157\n158\n159\n160\n161\n162\n163\n164\n165\n166\n167\n168\n169\n170\n171\n172\n173\n174\n175\n176\n177\n178\n179\n180\n"
},
{
"path": "spec/inputs/numbers/5.txt",
"chars": 200,
"preview": "201\n202\n203\n204\n205\n206\n207\n208\n209\n210\n211\n212\n213\n214\n215\n216\n217\n218\n219\n220\n221\n222\n223\n224\n225\n226\n227\n228\n229\n230\n"
},
{
"path": "spec/inputs/numbers/6.txt",
"chars": 200,
"preview": "251\n252\n253\n254\n255\n256\n257\n258\n259\n260\n261\n262\n263\n264\n265\n266\n267\n268\n269\n270\n271\n272\n273\n274\n275\n276\n277\n278\n279\n280\n"
},
{
"path": "spec/inputs/numbers/7.txt",
"chars": 200,
"preview": "301\n302\n303\n304\n305\n306\n307\n308\n309\n310\n311\n312\n313\n314\n315\n316\n317\n318\n319\n320\n321\n322\n323\n324\n325\n326\n327\n328\n329\n330\n"
},
{
"path": "spec/inputs/numbers/8.txt",
"chars": 200,
"preview": "351\n352\n353\n354\n355\n356\n357\n358\n359\n360\n361\n362\n363\n364\n365\n366\n367\n368\n369\n370\n371\n372\n373\n374\n375\n376\n377\n378\n379\n380\n"
},
{
"path": "spec/inputs/numbers/9.txt",
"chars": 200,
"preview": "401\n402\n403\n404\n405\n406\n407\n408\n409\n410\n411\n412\n413\n414\n415\n416\n417\n418\n419\n420\n421\n422\n423\n424\n425\n426\n427\n428\n429\n430\n"
},
{
"path": "spec/inputs/numbers_0_100.txt",
"chars": 293,
"preview": "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n4"
},
{
"path": "spec/inputs/numbers_1_100.txt",
"chars": 291,
"preview": "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n"
},
{
"path": "spec/inputs/people.json",
"chars": 12535,
"preview": "{\"id\":1,\"name\":\"Matthew Fuller\",\"age\":49,\"email\":\"mfuller0@blogger.com\",\"active\":false}\n{\"id\":2,\"name\":\"Pamela Thomas\",\""
},
{
"path": "spec/lib/collect_spec.rb",
"chars": 859,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::RDD do\n\n let(:mapping) { lambda{|x| [x, 1]} }\n let(:numbers) { Generator."
},
{
"path": "spec/lib/command_spec.rb",
"chars": 1495,
"preview": "require 'spec_helper'\n\ndef to_s_method(x)\n x.to_s\nend\n\nRSpec::describe Spark::CommandBuilder do\n let(:numbers) { Gener"
},
{
"path": "spec/lib/config_spec.rb",
"chars": 1159,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::Config do\n\n before(:context) do\n Spark.stop\n end\n\n after(:context) do"
},
{
"path": "spec/lib/context_spec.rb",
"chars": 4617,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::Context do\n\n it '.run_job' do\n workers = 5\n numbers = (0...100).to_a"
},
{
"path": "spec/lib/ext_spec.rb",
"chars": 1031,
"preview": "require 'spec_helper'\n\nRSpec.describe Array do\n\n it '.deep_copy' do\n data = ['a', 'b', 'c']\n new_data = data.dup\n"
},
{
"path": "spec/lib/external_apps_spec.rb",
"chars": 863,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::RDD do\n\n context '.pipe' do\n let(:words) { Generator.words }\n let("
},
{
"path": "spec/lib/filter_spec.rb",
"chars": 2079,
"preview": "require 'spec_helper'\n\ndef func4(item)\n item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106\nend\n\nRSpec.sha"
},
{
"path": "spec/lib/flat_map_spec.rb",
"chars": 2793,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a flat mapping' do |workers|\n it \"with #{workers || 'default'} worker\" do"
},
{
"path": "spec/lib/group_spec.rb",
"chars": 2769,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a groupping by key' do |workers|\n it \"with #{workers || 'default'} worker"
},
{
"path": "spec/lib/helper_spec.rb",
"chars": 2616,
"preview": "require 'spec_helper'\n\nRSpec.configure do |c|\n c.include Spark::Helper::Parser\n c.include Spark::Helper::Statistic\nend"
},
{
"path": "spec/lib/key_spec.rb",
"chars": 1393,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a keying by' do |workers|\n it \"with #{workers || 'default'} worker\" do\n "
},
{
"path": "spec/lib/manipulation_spec.rb",
"chars": 3261,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::RDD' do\n let(:numbers) { 1..100 }\n let(:rand_numbers) { Generator.number"
},
{
"path": "spec/lib/map_partitions_spec.rb",
"chars": 2254,
"preview": "require 'spec_helper'\n\ndef func3(x)\n x.map(&:to_i).reduce(:+)\nend\n\ndef func4_with_index(data, index)\n [{\n index => "
},
{
"path": "spec/lib/map_spec.rb",
"chars": 2468,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a mapping' do |workers|\n it \"with #{workers || 'default'} worker\" do\n "
},
{
"path": "spec/lib/mllib/classification_spec.rb",
"chars": 1280,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib classification' do\n\n let(:data1) do\n [\n LabeledPoint.new(0."
},
{
"path": "spec/lib/mllib/clustering_spec.rb",
"chars": 1005,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib clustering' do\n context 'kmeans' do\n it 'test' do\n data = ["
},
{
"path": "spec/lib/mllib/matrix_spec.rb",
"chars": 854,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib::Matrix' do\n context 'dense' do\n it 'construct' do\n values "
},
{
"path": "spec/lib/mllib/regression_spec.rb",
"chars": 3223,
"preview": "require 'spec_helper'\n\n# Mllib functions are tested on Spark\n# This just test if ruby call proper methods\n\nRSpec.describ"
},
{
"path": "spec/lib/mllib/vector_spec.rb",
"chars": 2236,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib::Vector' do\n\n context 'parsing' do\n it 'dense vector' do\n d"
},
{
"path": "spec/lib/reduce_by_key_spec.rb",
"chars": 2864,
"preview": "require 'spec_helper'\n\ndef flat_map(line)\n line.split\nend\n\ndef map(item)\n [item, 1]\nend\n\ndef reduce(x,y)\n x+y\nend\n\nRS"
},
{
"path": "spec/lib/reduce_spec.rb",
"chars": 3346,
"preview": "require 'spec_helper'\n\ndef longest_words(memo, word)\n memo.length > word.length ? memo : word\nend\n\nRSpec.shared_example"
},
{
"path": "spec/lib/sample_spec.rb",
"chars": 1164,
"preview": "require 'spec_helper'\n\n# Sample method can not be tested because of random generator\n# Just test it for raising error\n\nR"
},
{
"path": "spec/lib/serializer_spec.rb",
"chars": 4027,
"preview": "require 'spec_helper'\nrequire 'zlib'\n\nRSpec.describe Spark::Serializer do\n let(:data) { [1, 'test', 2.0, [3], {key: 'va"
},
{
"path": "spec/lib/sort_spec.rb",
"chars": 1479,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a sorting' do |workers|\n it \"with #{workers || 'default'} worker\" do\n "
},
{
"path": "spec/lib/sql/column_spec.rb",
"chars": 3459,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'binary comparison' do |op|\n it \"#{op}\" do\n to_test = 20\n\n result = "
},
{
"path": "spec/lib/sql/data_frame_spec.rb",
"chars": 2501,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::SQL::DataFrame do\n\n let(:file) { File.join('spec', 'inputs', 'people.json'"
},
{
"path": "spec/lib/statistic_spec.rb",
"chars": 5828,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a stats' do |workers|\n let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }\n\n"
},
{
"path": "spec/lib/whole_text_files_spec.rb",
"chars": 904,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a whole_text_files' do |workers|\n it \"with #{workers || 'default'} worker"
},
{
"path": "spec/spec_helper.rb",
"chars": 694,
"preview": "require 'simplecov'\nSimpleCov.start\n\n$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'\nrequire 'ruby-spark'\nrequire "
}
]
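Each entry in the listing above is a JSON object with three fields: path, chars (the file's size in characters), and preview (a truncated snippet of its content). As a minimal sketch — assuming the array has been saved locally under the hypothetical name files.json — the index can be read with plain Ruby to answer simple questions such as which files dominate the repository:

require 'json'

# Load the condensed-preview index; 'files.json' is an assumed local filename.
entries = JSON.parse(File.read('files.json'))

# Print the ten largest files by character count.
entries
  .sort_by { |entry| -entry['chars'] }
  .first(10)
  .each { |entry| puts format('%-50s %8d chars', entry['path'], entry['chars']) }

With the data shown above, lib/spark/rdd.rb (42,715 characters) would top the list, followed by spec/inputs/lorem_300.txt and ext/spark/src/main/scala/RubyRDD.scala.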