Repository: ondra-m/ruby-spark
Branch: master
Commit: d1b9787642fe
Files: 191
Total size: 440.0 KB
Directory structure:
gitextract_h83fh3m2/
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── Gemfile
├── Guardfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── TODO.md
├── benchmark/
│ ├── aggregate.rb
│ ├── bisect.rb
│ ├── comparison/
│ │ ├── prepare.sh
│ │ ├── python.py
│ │ ├── r.r
│ │ ├── ruby.rb
│ │ ├── run-all.sh
│ │ └── scala.scala
│ ├── custom_marshal.rb
│ ├── digest.rb
│ ├── enumerator.rb
│ ├── serializer.rb
│ ├── sort.rb
│ ├── sort2.rb
│ └── take.rb
├── bin/
│ └── ruby-spark
├── example/
│ ├── pi.rb
│ └── website_search.rb
├── ext/
│ ├── ruby_c/
│ │ ├── extconf.rb
│ │ ├── murmur.c
│ │ ├── murmur.h
│ │ └── ruby-spark.c
│ ├── ruby_java/
│ │ ├── Digest.java
│ │ ├── Murmur2.java
│ │ ├── RubySparkExtService.java
│ │ └── extconf.rb
│ └── spark/
│ ├── build.sbt
│ ├── project/
│ │ └── plugins.sbt
│ ├── sbt/
│ │ └── sbt
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ ├── Exec.scala
│ │ ├── MLLibAPI.scala
│ │ ├── Marshal.scala
│ │ ├── MarshalDump.scala
│ │ ├── MarshalLoad.scala
│ │ ├── RubyAccumulatorParam.scala
│ │ ├── RubyBroadcast.scala
│ │ ├── RubyConstant.scala
│ │ ├── RubyMLLibAPI.scala
│ │ ├── RubyMLLibUtilAPI.scala
│ │ ├── RubyPage.scala
│ │ ├── RubyRDD.scala
│ │ ├── RubySerializer.scala
│ │ ├── RubyTab.scala
│ │ ├── RubyUtils.scala
│ │ └── RubyWorker.scala
│ └── test/
│ └── scala/
│ └── MarshalSpec.scala
├── lib/
│ ├── ruby-spark.rb
│ ├── spark/
│ │ ├── accumulator.rb
│ │ ├── broadcast.rb
│ │ ├── build.rb
│ │ ├── cli.rb
│ │ ├── command/
│ │ │ ├── base.rb
│ │ │ ├── basic.rb
│ │ │ ├── pair.rb
│ │ │ ├── sort.rb
│ │ │ └── statistic.rb
│ │ ├── command.rb
│ │ ├── command_builder.rb
│ │ ├── command_validator.rb
│ │ ├── config.rb
│ │ ├── constant.rb
│ │ ├── context.rb
│ │ ├── error.rb
│ │ ├── ext/
│ │ │ ├── hash.rb
│ │ │ ├── integer.rb
│ │ │ ├── io.rb
│ │ │ ├── ip_socket.rb
│ │ │ ├── module.rb
│ │ │ ├── object.rb
│ │ │ └── string.rb
│ │ ├── helper/
│ │ │ ├── logger.rb
│ │ │ ├── parser.rb
│ │ │ ├── serialize.rb
│ │ │ ├── statistic.rb
│ │ │ └── system.rb
│ │ ├── helper.rb
│ │ ├── java_bridge/
│ │ │ ├── base.rb
│ │ │ ├── jruby.rb
│ │ │ └── rjb.rb
│ │ ├── java_bridge.rb
│ │ ├── library.rb
│ │ ├── logger.rb
│ │ ├── mllib/
│ │ │ ├── classification/
│ │ │ │ ├── common.rb
│ │ │ │ ├── logistic_regression.rb
│ │ │ │ ├── naive_bayes.rb
│ │ │ │ └── svm.rb
│ │ │ ├── clustering/
│ │ │ │ ├── gaussian_mixture.rb
│ │ │ │ └── kmeans.rb
│ │ │ ├── matrix.rb
│ │ │ ├── regression/
│ │ │ │ ├── common.rb
│ │ │ │ ├── labeled_point.rb
│ │ │ │ ├── lasso.rb
│ │ │ │ ├── linear.rb
│ │ │ │ └── ridge.rb
│ │ │ ├── ruby_matrix/
│ │ │ │ ├── matrix_adapter.rb
│ │ │ │ └── vector_adapter.rb
│ │ │ ├── stat/
│ │ │ │ └── distribution.rb
│ │ │ └── vector.rb
│ │ ├── mllib.rb
│ │ ├── rdd.rb
│ │ ├── sampler.rb
│ │ ├── serializer/
│ │ │ ├── auto_batched.rb
│ │ │ ├── base.rb
│ │ │ ├── batched.rb
│ │ │ ├── cartesian.rb
│ │ │ ├── compressed.rb
│ │ │ ├── marshal.rb
│ │ │ ├── message_pack.rb
│ │ │ ├── oj.rb
│ │ │ ├── pair.rb
│ │ │ └── text.rb
│ │ ├── serializer.rb
│ │ ├── sort.rb
│ │ ├── sql/
│ │ │ ├── column.rb
│ │ │ ├── context.rb
│ │ │ ├── data_frame.rb
│ │ │ ├── data_frame_reader.rb
│ │ │ ├── data_type.rb
│ │ │ └── row.rb
│ │ ├── sql.rb
│ │ ├── stat_counter.rb
│ │ ├── storage_level.rb
│ │ ├── version.rb
│ │ └── worker/
│ │ ├── master.rb
│ │ ├── spark_files.rb
│ │ └── worker.rb
│ └── spark.rb
├── ruby-spark.gemspec
└── spec/
├── generator.rb
├── inputs/
│ ├── lorem_300.txt
│ ├── numbers/
│ │ ├── 1.txt
│ │ ├── 10.txt
│ │ ├── 11.txt
│ │ ├── 12.txt
│ │ ├── 13.txt
│ │ ├── 14.txt
│ │ ├── 15.txt
│ │ ├── 16.txt
│ │ ├── 17.txt
│ │ ├── 18.txt
│ │ ├── 19.txt
│ │ ├── 2.txt
│ │ ├── 20.txt
│ │ ├── 3.txt
│ │ ├── 4.txt
│ │ ├── 5.txt
│ │ ├── 6.txt
│ │ ├── 7.txt
│ │ ├── 8.txt
│ │ └── 9.txt
│ ├── numbers_0_100.txt
│ ├── numbers_1_100.txt
│ └── people.json
├── lib/
│ ├── collect_spec.rb
│ ├── command_spec.rb
│ ├── config_spec.rb
│ ├── context_spec.rb
│ ├── ext_spec.rb
│ ├── external_apps_spec.rb
│ ├── filter_spec.rb
│ ├── flat_map_spec.rb
│ ├── group_spec.rb
│ ├── helper_spec.rb
│ ├── key_spec.rb
│ ├── manipulation_spec.rb
│ ├── map_partitions_spec.rb
│ ├── map_spec.rb
│ ├── mllib/
│ │ ├── classification_spec.rb
│ │ ├── clustering_spec.rb
│ │ ├── matrix_spec.rb
│ │ ├── regression_spec.rb
│ │ └── vector_spec.rb
│ ├── reduce_by_key_spec.rb
│ ├── reduce_spec.rb
│ ├── sample_spec.rb
│ ├── serializer_spec.rb
│ ├── sort_spec.rb
│ ├── sql/
│ │ ├── column_spec.rb
│ │ └── data_frame_spec.rb
│ ├── statistic_spec.rb
│ └── whole_text_files_spec.rb
└── spec_helper.rb
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
/.gemtags
/.tags
/java/spark.jar
.jbundler
target/*
*.class
*.jar
pom.xml
vendor/*
*.gem
*.rbc
.bundle
.config
.yardoc
Gemfile.lock
InstalledFiles
_yardoc
coverage
doc/
lib/bundler/man
pkg
rdoc
spec/reports
test/tmp
test/version_tmp
tmp
*.bundle
*.so
*.o
*.a
mkmf.log
ext/spark/target/*
ext/spark/project/target/*
ext/spark/project/project/target/*
wiki
/benchmark/performance/spark/*
/benchmark/performance/rspark/*
/_*
================================================
FILE: .travis.yml
================================================
language: ruby
rvm:
- 2.2.0
before_script:
- bundle exec rake compile
- bundle exec ruby bin/ruby-spark build
cache:
bundler: true
directories:
- $HOME/.m2
- $HOME/.ivy2
- $HOME/.sbt
================================================
FILE: CHANGELOG.md
================================================
## Unreleased
## 1.3.0
- new method on RDD (lookup)
- fix sbt url
- Spark 1.5.0
## 1.2.0 (15.06.2015)
- target folder is now located at HOME
- better serializers
- error when java class does not exist
- default setting at ~/.ruby-spark.conf
- compatible with Spark 1.4.0
- added call site to RDD
================================================
FILE: Gemfile
================================================
source 'https://rubygems.org'
gemspec
gem 'sourcify', '0.6.0.rc4'
gem 'method_source'
gem 'commander'
gem 'pry'
gem 'nio4r'
gem 'distribution'
platform :mri do
gem 'rjb'
gem 'msgpack'
gem 'oj'
gem 'narray'
end
platform :jruby do
gem 'msgpack-jruby', require: 'msgpack'
# NameError: no constructorfor arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime
# gem 'mdarray'
end
group :stats do
# gem 'nmatrix'
# gem 'statsample'
# gem 'statsample-glm'
# gem 'statsample-timeseries'
# gem 'statistics2'
# gem 'statsample-optimization' # libgsl0-dev
# gem 'narray'
# gem 'gsl-nmatrix'
end
group :development do
gem 'benchmark-ips'
gem 'rspec'
gem 'rake-compiler'
gem 'guard'
gem 'guard-rspec'
gem 'listen'
end
group :test do
gem 'simplecov', require: false
end
================================================
FILE: Guardfile
================================================
guard :rspec, cmd: 'rspec' do
watch(%r{^spec/.+_spec\.rb$})
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
watch('spec/spec_helper.rb') { "spec" }
end
================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2014 Ondřej Moravčík
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
# Ruby-Spark [](https://travis-ci.org/ondra-m/ruby-spark)
Apache Spark™ is a fast and general engine for large-scale data processing.
This gem allows you to use Spark functionality from Ruby.
> Word count in Spark's Ruby API
```ruby
file = spark.text_file("hdfs://...")
file.flat_map(:split)
.map(lambda{|word| [word, 1]})
.reduce_by_key(lambda{|a, b| a+b})
```
- [Apache Spark](http://spark.apache.org)
- [Wiki](https://github.com/ondra-m/ruby-spark/wiki)
- [Rubydoc](http://www.rubydoc.info/gems/ruby-spark)
## Installation
### Requirements
- Java 7+
- Ruby 2+
- wget or curl
- MRI or JRuby
Add this line to your application's Gemfile:
```ruby
gem 'ruby-spark'
```
And then execute:
```
$ bundle
```
Or install it yourself as:
```
$ gem install ruby-spark
```
Run `rake compile` if you are using the gem from a local filesystem.
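A minimal sketch of working from a local checkout (repository URL as above; assuming Bundler is installed):
```
$ git clone https://github.com/ondra-m/ruby-spark.git
$ cd ruby-spark
$ bundle install
$ bundle exec rake compile
```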
### Build Apache Spark
This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information, check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars will be stored in your HOME directory.
```
$ ruby-spark build
```
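The SBT build reads the Scala, Spark and Hadoop versions from environment variables (see [build.sbt](ext/spark/build.sbt)). Assuming `ruby-spark build` invokes SBT in the same shell environment, a specific version could be pinned like this (the values shown are the build's defaults):
```
$ SPARK_VERSION=1.6.0 SCALA_VERSION=2.10.4 ruby-spark build
```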
## Usage
You can use Ruby-Spark via an interactive shell (Pry is used):
```
$ ruby-spark shell
```
Or in an existing project.
If you want to configure Spark first, see [configuration](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
```ruby
require 'ruby-spark'
# Configuration
Spark.config do
set_app_name "RubySpark"
set 'spark.ruby.serializer', 'oj'
set 'spark.ruby.serializer.batch_size', 100
end
# Start Apache Spark
Spark.start
# Context reference
Spark.sc
```
Finally, stop the cluster. In the shell, Spark is stopped automatically when the environment exits.
```ruby
Spark.stop
```
After the first use, a global configuration file is created at **~/.ruby-spark.conf**. Properties for Spark and RubySpark can be specified there.
## Creating RDD (a new collection)
Single text file:
```ruby
rdd = sc.text_file(FILE, workers_num, serializer=nil)
```
All files in a directory:
```ruby
rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
```
Uploading data structures directly from Ruby:
```ruby
rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
rdd = sc.parallelize(1..5, workers_num, serializer=nil)
```
There are two conditions:
1. the chosen serializer must be able to serialize the data
2. the data must be iterable
If you do not specify a serializer, the default one is used (defined by the spark.ruby.serializer.* options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want to create a custom serializer.
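For example, a batched Marshal serializer can be built and passed explicitly (the same `Spark::Serializer.build` call is used in the Examples section below):
```ruby
# Marshal serializer, batches of 1024 items
ser = Spark::Serializer.build('batched(marshal, 1024)')

# 2 workers, explicit serializer
rdd = sc.parallelize(1..1000, 2, ser)
```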
## Operations
All operations can be divided into 2 groups:
- **Transformations:** append a new operation to the current RDD and return a new RDD
- **Actions:** add an operation and start the computation
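A small illustration using methods from the Examples below: the transformation only builds up the pipeline, the action triggers the computation.
```ruby
rdd = sc.parallelize(0..100, 2)

# Transformation: returns a new RDD, nothing is computed yet
evens = rdd.filter(lambda{|x| x.even?})

# Action: starts the computation and returns the result
evens.collect
# => [0, 2, 4, 6, ...]
```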
More information:
- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)
- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)
- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)
You can also check the official Spark documentation. First make sure that the method is implemented here.
- [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)
- [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)
#### Transformations
<dl>
<dt><code>rdd.map(function)</code></dt>
<dd>Return a new RDD by applying a function to all elements of this RDD.</dd>
<dt><code>rdd.flat_map(function)</code></dt>
<dd>Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.</dd>
<dt><code>rdd.map_partitions(function)</code></dt>
<dd>Return a new RDD by applying a function to each partition of this RDD.</dd>
<dt><code>rdd.filter(function)</code></dt>
<dd>Return a new RDD containing only the elements that satisfy a predicate.</dd>
<dt><code>rdd.cartesian(other)</code></dt>
<dd>Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.</dd>
<dt><code>rdd.intersection(other)</code></dt>
<dd>Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.</dd>
<dt><code>rdd.sample(with_replacement, fraction, seed)</code></dt>
<dd>Return a sampled subset of this RDD. Operations are based on Poisson and uniform distributions.</dd>
<dt><code>rdd.group_by_key(num_partitions)</code></dt>
<dd>Group the values for each key in the RDD into a single sequence.</dd>
<dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
<dd></dd>
</dl>
#### Actions
<dl>
<dt><code>rdd.take(count)</code></dt>
<dd>Take the first <code>count</code> elements of the RDD.</dd>
<dt><code>rdd.reduce(function)</code></dt>
<dd>Reduces the elements of this RDD using the specified lambda or method.</dd>
<dt><code>rdd.aggregate(zero_value, seq_op, comb_op)</code></dt>
<dd>Aggregate the elements of each partition, and then the results for all the partitions, using given combine functions and a neutral “zero value”.</dd>
<dt><code>rdd.histogram(buckets)</code></dt>
<dd>Compute a histogram using the provided buckets.</dd>
<dt><code>rdd.collect</code></dt>
<dd>Return an array that contains all of the elements in this RDD.</dd>
<dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
<dd></dd>
</dl>
## Examples
##### Basic methods
```ruby
# Every batch will be serialized by Marshal and will have size 10
ser = Spark::Serializer.build('batched(marshal, 10)')
# Range 0..100, 2 workers, custom serializer
rdd = Spark.sc.parallelize(0..100, 2, ser)
# Take first 5 items
rdd.take(5)
# => [0, 1, 2, 3, 4]
# Numbers reducing
rdd.reduce(lambda{|sum, x| sum+x})
rdd.reduce(:+)
rdd.sum
# => 5050
# Aggregating with a zero value
seq = lambda{|x,y| x+y}
com = lambda{|x,y| x*y}
rdd.aggregate(1, seq, com)
# 1. Every worker adds up its numbers
# => [1226, 3826]
# 2. Results are multiplied
# => 4690676
# Statistic method
rdd.stats
# => StatCounter: (count, mean, max, min, variance,
# sample_variance, stdev, sample_stdev)
# Compute a histogram using the provided buckets.
rdd.histogram(2)
# => [[0.0, 50.0, 100], [50, 51]]
# Mapping
rdd.map(lambda {|x| x*2}).collect
# => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...]
rdd.map(:to_f).collect
# => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...]
# Mapping over whole partitions
rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
# => [1225, 3825]
# Selecting
rdd.filter(lambda{|x| x.even?}).collect
# => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...]
# Sampling
rdd.sample(true, 10).collect
# => [3, 36, 40, 54, 58, 82, 86, 95, 98]
# Sampling X items
rdd.take_sample(true, 10)
# => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32]
# Using external process
rdd.pipe('cat', "awk '{print $1*10}'")
# => ["0", "10", "20", "30", "40", "50", ...]
```
##### Word count using methods
```ruby
# Content:
# "first line"
# "second line"
rdd = sc.text_file(PATH)
# ["first", "line", "second", "line"]
rdd = rdd.flat_map(lambda{|line| line.split})
# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
rdd = rdd.map(lambda{|word| [word, 1]})
# [["first", 1], ["line", 2], ["second", 1]]
rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
# {"first"=>1, "line"=>2, "second"=>1}
rdd.collect_as_hash
```
##### Estimating PI with a custom serializer
```ruby
slices = 3
n = 100000 * slices
def map(_)
x = rand * 2 - 1
y = rand * 2 - 1
if x**2 + y**2 < 1
return 1
else
return 0
end
end
rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
rdd = rdd.map(method(:map))
puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
```
##### Estimating PI
```ruby
rdd = sc.parallelize([10_000], 1)
rdd = rdd.add_library('bigdecimal/math')
rdd = rdd.map(lambda{|x| BigMath.PI(x)})
rdd.collect # => #<BigDecimal, '0.31415926...'>
```
### Mllib (Machine Learning Library)
Mllib functions use Spark's Machine Learning Library (MLlib). Ruby objects are serialized and deserialized in Java, so you cannot use custom classes. Only primitive types such as strings or integers are supported.
All supported methods/models:
- [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib)
- [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib)
##### Linear regression
```ruby
# Import Mllib classes into Object
# Otherwise they are accessible via Spark::Mllib::LinearRegressionWithSGD
Spark::Mllib.import(Object)
# Training data
data = [
LabeledPoint.new(0.0, [0.0]),
LabeledPoint.new(1.0, [1.0]),
LabeledPoint.new(3.0, [2.0]),
LabeledPoint.new(2.0, [3.0])
]
# Train a model
lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
lrm.predict([0.0])
```
##### K-Means
```ruby
Spark::Mllib.import
# Dense vectors
data = [
DenseVector.new([0.0,0.0]),
DenseVector.new([1.0,1.0]),
DenseVector.new([9.0,8.0]),
DenseVector.new([8.0,9.0])
]
model = KMeans.train(sc.parallelize(data), 2)
model.predict([0.0, 0.0]) == model.predict([1.0, 1.0])
# => true
model.predict([8.0, 9.0]) == model.predict([9.0, 8.0])
# => true
```
## Benchmarks
================================================
FILE: Rakefile
================================================
#-*- mode: ruby -*-
require "bundler/gem_tasks"
require "rspec/core/rake_task"
RSpec::Core::RakeTask.new
task default: :spec
task test: :spec
def java?
RUBY_PLATFORM =~ /java/
end
if java?
require "rake/javaextensiontask"
Rake::JavaExtensionTask.new("ruby_java") do |ext|
ext.name = "ruby_spark_ext"
end
else
require "rake/extensiontask"
Rake::ExtensionTask.new("ruby_c") do |ext|
ext.name = "ruby_spark_ext"
end
end
task :clean do
Dir['lib/*.{jar,o,so}'].each do |path|
puts "Deleting #{path} ..."
File.delete(path)
end
FileUtils.rm_rf('./pkg')
FileUtils.rm_rf('./tmp')
end
================================================
FILE: TODO.md
================================================
- refactor JavaBridge
- to_java, from_java
- every type should have class
- automatic registration
- add Streaming
- worker information (time, memory, ...)
- killing zombie workers
- add_rb, add_inline_rb to Spark::{Context, RDD}
- fix broadcast for cluster
- dump to disk if there is memory limit
- Add Partitioner to RDD
- add NonExist serializer
================================================
FILE: benchmark/aggregate.rb
================================================
require 'benchmark'
require 'benchmark/ips'
data = 0..1_000_000
zero_value = rand(100_000)
function = Proc.new{|sum, n| sum+n}
Benchmark.ips do |r|
r.report('each') do
sum = zero_value
data.each do |n|
sum += n
end
end
r.report('reduce') do
data.reduce(zero_value){|sum, n| sum+n}
end
r.report('each with function') do
sum = zero_value
data.each do |n|
sum = function.call(sum, n)
end
end
r.report('reduce with function') do
data.reduce(zero_value, &function)
end
r.compare!
end
================================================
FILE: benchmark/bisect.rb
================================================
require "benchmark"
def bisect_left1(a, x, opts={})
return nil if a.nil?
return 0 if a.empty?
lo = (opts[:lo] || opts[:low]).to_i
hi = opts[:hi] || opts[:high] || a.length
while lo < hi
mid = (lo + hi) / 2
v = a[mid]
if v < x
lo = mid + 1
else
hi = mid
end
end
return lo
end
def bisect_left2(list, item)
count = 0
list.each{|i|
return count if i >= item
count += 1
}
nil
end
def bisect_left3(list, item, lo = 0, hi = list.size)
while lo < hi
i = (lo + hi - 1) >> 1
if 0 <= (list[i] <=> item)
hi = i
else
lo = i + 1
end
end
return hi
end
array = Array.new(1000000) { rand(0..1000000) };
to_find = Array.new(500) { rand(0..10000) };
Benchmark.bm(20) do |x|
x.report("bisect_left1") do
to_find.each do |item|
bisect_left1(array, item)
end
end
x.report("bisect_left2") do
to_find.each do |item|
bisect_left2(array, item)
end
end
x.report("bisect_left3") do
to_find.each do |item|
bisect_left3(array, item)
end
end
end
array = Array.new(100000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join };
to_find = Array.new(500) { (97+rand(26)).chr };
Benchmark.bm(20) do |x|
x.report("bisect_left1") do
to_find.each do |item|
bisect_left1(array, item)
end
end
x.report("bisect_left2") do
to_find.each do |item|
bisect_left2(array, item)
end
end
x.report("bisect_left3") do
to_find.each do |item|
bisect_left3(array, item)
end
end
end
================================================
FILE: benchmark/comparison/prepare.sh
================================================
#!/usr/bin/env bash
# Current dir
cd "$(dirname "$0")"
# Exit immediately if a pipeline returns a non-zero status.
set -e
# Spark
wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz" -O spark.tgz
tar xvzf spark.tgz
mv spark-1.3.0-bin-hadoop2.4 spark
rm spark.tgz
# RSpark (only for 1.3.0)
git clone git@github.com:amplab-extras/SparkR-pkg.git rspark
cd rspark
SPARK_VERSION=1.3.0 ./install-dev.sh
================================================
FILE: benchmark/comparison/python.py
================================================
import os
import math
from time import time
from random import random
from operator import add
from pyspark import SparkContext
sc = SparkContext(appName="Python", master="local[*]")
log_file = open(os.environ.get('PYTHON_LOG'), 'w')
def log(*values):
values = map(lambda x: str(x), values)
log_file.write(';'.join(values))
log_file.write('\n')
workers = int(os.environ.get('WORKERS'))
numbers_count = int(os.environ.get('NUMBERS_COUNT'))
text_file = os.environ.get('TEXT_FILE')
numbers = range(numbers_count)
floats = [float(i) for i in numbers]
with open(text_file) as t:
strings = t.read().split("\n")
# =============================================================================
# Serialization
# =============================================================================
t = time()
rdd_numbers = sc.parallelize(numbers, workers)
t = time() - t
log('NumbersSerialization', t)
t = time()
rdd_floats = sc.parallelize(floats, workers)
t = time() - t
log('FloatsSerialization', t)
t = time()
rdd_strings = sc.parallelize(strings, workers)
t = time() - t
log('StringsSerialization', t)
# =============================================================================
# Computing
# =============================================================================
# --- Is prime? ---------------------------------------------------------------
def is_prime(x):
if x < 2:
return [x, False]
elif x == 2:
return [x, True]
elif x % 2 == 0:
return [x, False]
else:
upper = int(math.sqrt(float(x)))
result = True
i = 3
while i <= upper:
if x % i == 0:
result = False
break
i += 2
return [x, result]
t = time()
rdd_numbers.map(is_prime).collect()
t = time() - t
log('IsPrime', t)
# --- Matrix multiplication ---------------------------------------------------
matrix_size = int(os.environ.get('MATRIX_SIZE'))
matrix = []
for row in range(matrix_size):
matrix.append([])
for col in range(matrix_size):
matrix[row].append(row+col)
def multiplication_func(matrix):
matrix = list(matrix)
size = len(matrix)
new_matrix = []
for row in range(size):
new_matrix.append([])
for col in range(size):
result = 0
for i in range(size):
result += matrix[row][i] * matrix[col][i]
new_matrix[row].append(result)
return new_matrix
t = time()
rdd = sc.parallelize(matrix, 1)
rdd.mapPartitions(multiplication_func).collect()
t = time() - t
log('MatrixMultiplication', t)
# --- Pi digits ---------------------------------------------------------------
# http://rosettacode.org/wiki/Pi#Python
pi_digit = int(os.environ.get('PI_DIGIT'))
def pi_func(size):
size = size.next()
result = ''
q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
while size > 0:
if 4*q+r-t < n*t:
result += str(n)
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r))//t)-10*n
q *= 10
r = nr
else:
nr = (2*q+r)*l
nn = (q*(7*k)+2+(r*l))//(t*l)
q *= k
t *= l
l += 2
k += 1
n = nn
r = nr
return [result]
t = time()
rdd = sc.parallelize([pi_digit], 1)
rdd.mapPartitions(pi_func).collect()
t = time() - t
log('PiDigit', t)
log_file.close()
================================================
FILE: benchmark/comparison/r.r
================================================
library(SparkR)
sc <- sparkR.init(master="local[*]")
logFile <- file(Sys.getenv("R_LOG"), "w")
logInfo <- function(...){
args <- list(...)
line <- paste(args, collapse = ";")
writeLines(line, logFile)
}
workers <- as.integer(Sys.getenv('WORKERS'))
numbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT'))
textFile <- Sys.getenv('TEXT_FILE')
# =============================================================================
# Serialization
# =============================================================================
time <- proc.time()
rddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers)
time <- as.double(proc.time()-time)[3]
logInfo('NumbersSerialization', time)
# =============================================================================
# Computing
# =============================================================================
isPrime = function(x) {
if(x < 2){
c(x, FALSE)
}
else if(x == 2){
c(x, TRUE)
}
else if(x %% 2 == 0){
c(x, FALSE)
}
else{
upper <- as.numeric(sqrt(as.double(x)))
result <- TRUE
i <- 3
while(i <= upper){
if(x %% i == 0){
result = FALSE
break
}
i <- i+2
}
c(x, result)
}
}
time <- proc.time()
rdd <- map(rddNumbers, isPrime)
capture.output(collect(rdd), file='/dev/null')
time <- as.double(proc.time()-time)[3]
logInfo('IsPrime', time)
close(logFile)
sparkR.stop()
================================================
FILE: benchmark/comparison/ruby.rb
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
require 'benchmark'
Spark.start
sc = Spark.context
$log_file = File.open(ENV['RUBY_LOG'], 'w')
def log(*values)
$log_file.puts(values.join(';'))
end
workers = ENV['WORKERS'].to_i
numbers_count = ENV['NUMBERS_COUNT'].to_i
text_file = ENV['TEXT_FILE']
numbers = (0...numbers_count).to_a
floats = numbers.map(&:to_f)
strings = File.read(text_file).split("\n")
# =============================================================================
# Serialization
# =============================================================================
time = Benchmark.realtime do
@rdd_numbers = sc.parallelize(numbers, workers)
end
log('NumbersSerialization', time)
time = Benchmark.realtime do
@rdd_floats = sc.parallelize(floats, workers)
end
log('FloatsSerialization', time)
time = Benchmark.realtime do
@rdd_strings = sc.parallelize(strings, workers)
end
log('StringsSerialization', time)
# =============================================================================
# Computing
# =============================================================================
# --- Is prime? ---------------------------------------------------------------
is_prime = Proc.new do |x|
case
when x < 2
[x, false]
when x == 2
[x, true]
when x % 2 == 0
[x, false]
else
upper = Math.sqrt(x.to_f).to_i
result = true
i = 3
while i <= upper
if x % i == 0
result = false
break
end
i += 2
end
[x, result]
end
end
time = Benchmark.realtime do
@rdd_numbers.map(is_prime).collect
end
log('IsPrime', time)
# --- Matrix multiplication ---------------------------------------------------
matrix_size = ENV['MATRIX_SIZE'].to_i
matrix = Array.new(matrix_size) do |row|
Array.new(matrix_size) do |col|
row+col
end
end;
multiplication_func = Proc.new do |matrix|
size = matrix.size
Array.new(size) do |row|
Array.new(size) do |col|
matrix[row]
result = 0
size.times do |i|
result += matrix[row][i] * matrix[col][i]
end
result
end
end
end
time = Benchmark.realtime do
rdd = sc.parallelize(matrix, 1)
rdd.map_partitions(multiplication_func).collect
end
log('MatrixMultiplication', time)
# --- Pi digits ---------------------------------------------------------------
# http://rosettacode.org/wiki/Pi#Ruby
pi_digit = ENV['PI_DIGIT'].to_i
pi_func = Proc.new do |size|
size = size.first
result = ''
q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
while size > 0
if 4*q+r-t < n*t
result << n.to_s
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r)) / t) - 10*n
q *= 10
r = nr
else
nr = (2*q+r) * l
nn = (q*(7*k+2)+r*l) / (t*l)
q *= k
t *= l
l += 2
k += 1
n = nn
r = nr
end
end
[result]
end
time = Benchmark.realtime do
rdd = sc.parallelize([pi_digit], 1)
rdd.map_partitions(pi_func).collect
end
log('PiDigit', time)
$log_file.close
================================================
FILE: benchmark/comparison/run-all.sh
================================================
#!/usr/bin/env bash
# Current dir
cd "$(dirname "$0")"
# Exit immediately if a pipeline returns a non-zero status.
set -e
# Settings
export WORKERS=2
export MATRIX_SIZE=100
export NUMBERS_COUNT=1000000
export TEXT_FILE=$(mktemp)
export PI_DIGIT=1000
export RUBY_BATCH_SIZE=2048
text_file_rows=10
text_file_per_line=10
text_file_duplicates=50
mx="4096m"
ms="4096m"
# Parse arguments
while (( "$#" )); do
case $1 in
--workers)
WORKERS="$2"
shift
;;
--matrix-size)
MATRIX_SIZE="$2"
shift
;;
--numbers-count)
NUMBERS_COUNT="$2"
shift
;;
--random-file-rows)
text_file_rows="$2"
shift
;;
--text-file-per-line)
text_file_per_line="$2"
shift
;;
--text-file-duplicates)
text_file_duplicates="$2"
shift
;;
--pi-digit)
PI_DIGIT="$2"
shift
;;
--ruby-batch-size)
RUBY_BATCH_SIZE="$2"
shift
;;
--mx)
mx="$2"
shift
;;
--ms)
ms="$2"
shift
;;
*)
break
;;
esac
shift
done
# Generating
file=$(mktemp)
for (( i=0; i<$text_file_rows; i++ ))
do
shuf -n $text_file_per_line /usr/share/dict/words | tr '\n' ' ' >> $file
echo >> $file
done
for (( i=0; i<$text_file_duplicates; i++ ))
do
cat $file >> $TEXT_FILE
done
# Before run
if [[ -z "$SPARK_HOME" ]]; then
export SPARK_HOME=$(pwd)/spark
fi
if [[ -z "$RSPARK_HOME" ]]; then
export RSPARK_HOME=$(pwd)/rspark
fi
export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE"
SPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null)
export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx"
# Log files
export RUBY_MARSHAL_LOG=$(mktemp)
export RUBY_OJ_LOG=$(mktemp)
export PYTHON_LOG=$(mktemp)
export SCALA_LOG=$(mktemp)
export R_LOG=$(mktemp)
# Run:
echo "Workers: $WORKERS"
echo "Matrix size: $MATRIX_SIZE"
echo "Numbers count: $NUMBERS_COUNT"
echo "Pi digits: $PI_DIGIT"
echo "File: rows = $(($text_file_rows * $text_file_duplicates))"
echo " per line = $text_file_per_line"
# --- Ruby
export SPARK_RUBY_SERIALIZER='marshal'
export RUBY_LOG="$RUBY_MARSHAL_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null
export SPARK_RUBY_SERIALIZER='oj'
export RUBY_LOG="$RUBY_OJ_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null
# # --- Python
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null
# # --- Scala
/usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null
# --- R
# "$RSPARK_HOME"/sparkR r.r #&>/dev/null
# Parse results
echo "# Ruby (Marshal)"
cat $RUBY_MARSHAL_LOG
echo ""
echo "# Ruby (Oj)"
cat $RUBY_OJ_LOG
echo ""
echo "# Python"
cat $PYTHON_LOG
echo ""
echo "# Scala"
cat $SCALA_LOG
echo ""
echo "# R"
cat $R_LOG
================================================
FILE: benchmark/comparison/scala.scala
================================================
import java.io._
import scala.math
import scala.io.Source
import org.apache.spark._
object Scala {
val logFile = new PrintWriter(new File(System.getenv("SCALA_LOG")))
def log(args: Any*) {
logFile.write(args.mkString(";"))
logFile.write("\n")
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Scala")
val sc = new SparkContext(conf)
val workers = System.getenv("WORKERS").toInt
val numbersCount = System.getenv("NUMBERS_COUNT").toInt
val textFile = System.getenv("TEXT_FILE")
val numbers = 0 until numbersCount
val floats = numbers.map(_.toDouble)
val strings = Source.fromFile(textFile).mkString.split("\n")
// =============================================================================
// Serialization
// =============================================================================
var time: Long = 0
time = System.currentTimeMillis
val rddNumbers = sc.parallelize(numbers, workers)
time = System.currentTimeMillis - time
log("NumbersSerialization", time/1000.0)
time = System.currentTimeMillis
val rddFloats = sc.parallelize(floats, workers)
time = System.currentTimeMillis - time
log("FloatsSerialization", time/1000.0)
time = System.currentTimeMillis
val rddStrings = sc.parallelize(strings, workers)
time = System.currentTimeMillis - time
log("StringsSerialization", time/1000.0)
// =============================================================================
// Computing
// =============================================================================
// --- Is prime? ---------------------------------------------------------------
time = System.currentTimeMillis
val primes = rddNumbers.map{ x =>
if(x < 2){
(x, false)
}
else if(x == 2){
(x, true)
}
else if(x % 2 == 0){
(x, false)
}
else{
val upper = math.sqrt(x.toDouble).toInt
var result = true
var i = 3
while(i <= upper && result == true){
if(x % i == 0){
result = false
}
else{
i += 2
}
}
(x, result)
}
}
primes.collect()
time = System.currentTimeMillis - time
log("IsPrime", time/1000.0)
// --- Matrix multiplication ---------------------------------------------------
val matrixSize = System.getenv("MATRIX_SIZE").toInt
val matrix = new Array[Array[Long]](matrixSize)
for( row <- 0 until matrixSize ) {
matrix(row) = new Array[Long](matrixSize)
for( col <- 0 until matrixSize ) {
matrix(row)(col) = row + col
}
}
time = System.currentTimeMillis
val rdd = sc.parallelize(matrix, 1)
rdd.mapPartitions { it =>
val matrix = it.toArray
val size = matrix.size
val newMatrix = new Array[Array[Long]](size)
for( row <- 0 until size ) {
newMatrix(row) = new Array[Long](size)
for( col <- 0 until size ) {
var result: Long = 0
for( i <- 0 until size ) {
result += matrix(row)(i) * matrix(col)(i)
}
newMatrix(row)(col) = result
}
}
newMatrix.toIterator
}
time = System.currentTimeMillis - time
log("MatrixMultiplication", time/1000.0)
// --- Pi digits ---------------------------------------------------------------
// http://rosettacode.org/wiki/Pi#Scala
val piDigit = System.getenv("PI_DIGIT").toInt
time = System.currentTimeMillis
val piDigits = sc.parallelize(Array(piDigit), 1)
piDigits.mapPartitions { it =>
var size = it.toArray.asInstanceOf[Array[Int]](0)
var result = ""
var r: BigInt = 0
var q, t, k: BigInt = 1
var n, l: BigInt = 3
var nr, nn: BigInt = 0
while(size > 0){
while((4*q+r-t) >= (n*t)){
nr = (2*q+r)*l
nn = (q*(7*k)+2+(r*l))/(t*l)
q = q * k
t = t * l
l = l + 2
k = k + 1
n = nn
r = nr
}
result += n.toString
size -= 1
nr = 10*(r-n*t)
n = ((10*(3*q+r))/t)-(10*n)
q = q * 10
r = nr
}
Iterator(result)
}
time = System.currentTimeMillis - time
log("PiDigit", time/1000.0)
sc.stop()
logFile.close()
}
}
================================================
FILE: benchmark/custom_marshal.rb
================================================
require 'benchmark'
require 'benchmark/ips'
def pack_int(data)
[data].pack('l>')
end
def pack_long(data)
[data].pack('q>')
end
def pack_doubles(data)
data.pack('G*')
end
module Standard
class LabeledPoint
def initialize(label, features)
@label = label
@features = Standard::Vector.new(features)
end
def marshal_dump
[@label, @features]
end
def marshal_load(*)
end
end
class Vector
def initialize(array)
@values = array
end
def marshal_dump
[@values]
end
def marshal_load(*)
end
end
end
module Custom
class LabeledPoint
def initialize(label, features)
@label = label
@features = Custom::Vector.new(features)
end
def _dump(*)
pack_long(@label) + @features._dump
end
def self._load(*)
end
end
class Vector
def initialize(array)
@values = array
end
def _dump(*)
result = 'v'
result << pack_int(@values.size)
result << pack_doubles(@values)
result.encode(Encoding::ASCII_8BIT)
end
def self._load(*)
end
end
end
data_size = 10_000
vector_size = 1_000
values = Array.new(vector_size) { |x| rand(10_000..100_000) }
@data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)}
@data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)}
Benchmark.ips do |r|
r.report('standard') do
Marshal.dump(@data1)
end
r.report('custom') do
Marshal.dump(@data2)
end
r.compare!
end
================================================
FILE: benchmark/digest.rb
================================================
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
def java?
RUBY_PLATFORM =~ /java/
end
unless java?
require 'murmurhash3'
end
require 'digest'
require 'benchmark'
require 'ruby-spark'
TEST = 5_000_000
WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"]
puts "TEST COUNT = #{TEST*WORDS.size}"
# =================================================================================================
# Pure ruby murmur
# funny-falcon/murmurhash3-ruby
MASK32 = 0xffffffff
def murmur3_32_rotl(x, r)
((x << r) | (x >> (32 - r))) & MASK32
end
def murmur3_32_fmix(h)
h &= MASK32
h ^= h >> 16
h = (h * 0x85ebca6b) & MASK32
h ^= h >> 13
h = (h * 0xc2b2ae35) & MASK32
h ^ (h >> 16)
end
def murmur3_32__mmix(k1)
k1 = (k1 * 0xcc9e2d51) & MASK32
k1 = murmur3_32_rotl(k1, 15)
(k1 * 0x1b873593) & MASK32
end
def murmur3_32_str_hash(str, seed=0)
h1 = seed
numbers = str.unpack('V*C*')
tailn = str.bytesize % 4
tail = numbers.slice!(numbers.size - tailn, tailn)
for k1 in numbers
h1 ^= murmur3_32__mmix(k1)
h1 = murmur3_32_rotl(h1, 13)
h1 = (h1*5 + 0xe6546b64) & MASK32
end
unless tail.empty?
k1 = 0
tail.reverse_each do |c1|
k1 = (k1 << 8) | c1
end
h1 ^= murmur3_32__mmix(k1)
end
h1 ^= str.bytesize
murmur3_32_fmix(h1)
end
# =================================================================================================
# Benchmark
Benchmark.bm(18) do |x|
x.report("ruby hash"){
TEST.times{
WORDS.each{ |word|
word.hash
}
}
}
x.report("ext portable"){
TEST.times{
WORDS.each{ |word|
Spark::Digest.portable_hash(word)
}
}
}
x.report("murmur3 32"){
TEST.times{
WORDS.each{ |word|
# MurmurHash3::V128.str_hash(word)
# [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# MurmurHash3::V128.str_hash(word)
# a = MurmurHash3::V32.str_hash(word).to_s
# a.slice!(0,8)
MurmurHash3::V32.str_hash(word)
}
}
} unless java?
# Too slow
# x.report("murmur3 32 (ruby)"){
# TEST.times{
# WORDS.each{ |word|
# # MurmurHash3::V128.str_hash(word)
# # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# # MurmurHash3::V128.str_hash(word)
# # a = murmur3_32_str_hash(word).to_s
# # a.slice!(0,8)
# murmur3_32_str_hash(word)
# }
# }
# }
x.report("murmur3 128"){
TEST.times{
WORDS.each{ |word|
# MurmurHash3::V128.str_hash(word)
# [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
# a = MurmurHash3::V128.str_hash(word).to_s
# a.slice!(0,8)
MurmurHash3::V128.str_hash(word)
}
}
} unless java?
# x.report("sha256"){
# TEST.times{
# WORDS.each{ |word|
# a = Digest::SHA256.digest(word)
# # a.slice!(0,8)
# }
# }
# }
# x.report("md5"){
# TEST.times{
# WORDS.each{ |word|
# a = Digest::MD5.digest(word)
# # a.slice!(0,8)
# }
# }
# }
end
================================================
FILE: benchmark/enumerator.rb
================================================
require "benchmark"
class Enumerator
def defer(&blk)
self.class.new do |y|
each do |*input|
blk.call(y, *input)
end
end
end
end
ARRAY_SIZE = 50_000_000
def type_yield
return to_enum(__callee__) unless block_given?
ARRAY_SIZE.times { |i|
yield i
}
end
def yield_map_x2(enum)
return to_enum(__callee__, enum) unless block_given?
enum.each do |item|
yield item*2
end
end
def type_enumerator_new
Enumerator.new do |e|
ARRAY_SIZE.times { |i|
e << i
}
end
end
def enumerator_new_map_x2(enum)
Enumerator.new do |e|
enum.each do |item|
e << item*2
end
end
end
def enumerator_defer_x2(enum)
enum.defer do |out, inp|
out << inp*2
end
end
Benchmark.bm(26) do |x|
x.report("yield max") do
type_yield.max
end
x.report("yield sum") do
type_yield.reduce(:+)
end
x.report("yield map x*2 sum") do
yield_map_x2(type_yield).reduce(:+)
end
x.report("yield defer map x*2 sum") do
enumerator_defer_x2(type_yield).reduce(:+)
end
x.report("-----"){}
x.report("Enum.new max") do
type_enumerator_new.max
end
x.report("Enum.new sum") do
type_enumerator_new.reduce(:+)
end
x.report("Enum.new map x*2 sum") do
enumerator_new_map_x2(type_enumerator_new).reduce(:+)
end
x.report("Enum.new defer map x*2 sum") do
enumerator_defer_x2(type_enumerator_new).reduce(:+)
end
end
================================================
FILE: benchmark/serializer.rb
================================================
require "benchmark"
require "yaml"
require "msgpack"
require "oj"
# require "thrift"
puts "Simple"
data = (0..100000).to_a
Benchmark.bmbm do |x|
x.report("YAML") do
serialized = YAML.dump(data)
deserialized = YAML.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Marshal") do
serialized = Marshal.dump(data)
deserialized = Marshal.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("MessagePack") do
serialized = MessagePack.dump(data)
deserialized = MessagePack.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Oj") do
serialized = Oj.dump(data)
deserialized = Oj.load(serialized)
puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
end
# x.report("Thrift") do
# serializer = Thrift::Serializer.new
# deserializer = Thrift::Deserializer.new
# serialized = serializer.serialize(data)
# end
end
puts ""
puts "More complex"
data = Array.new(10000000) {
[rand(97..122).chr, rand(10000000)]
}
Benchmark.bm do |x|
# Take too long
# x.report("YAML") do
# serialized = YAML.dump(data)
# YAML.load(serialized)
# end
x.report("Marshal") do
serialized = Marshal.dump(data)
deserialized = Marshal.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("MessagePack") do
serialized = MessagePack.dump(data)
deserialized = MessagePack.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
x.report("Oj") do
serialized = Oj.dump(data)
deserialized = Oj.load(serialized)
puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
end
# x.report("Thrift") do
# serializer = Thrift::Serializer.new
# deserializer = Thrift::Deserializer.new
# serialized = serializer.serialize(data)
# end
end
================================================
FILE: benchmark/sort.rb
================================================
require "benchmark"
array = []
1000.times {
array << {:bar => rand(1000)}
}
n = 500
Benchmark.bm(20) do |x|
x.report("sort") { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } }
x.report("sort reverse") { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } }
x.report("sort_by -a[:bar]") { n.times { array.sort_by{ |a| -a[:bar] } } }
x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } }
x.report("sort_by.reverse!") { n.times { array.sort_by{ |a| a[:bar] }.reverse } }
end
array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }
Benchmark.bm(20) do |x|
x.report("sort asc") { n.times { array.sort } }
x.report("sort asc block") { n.times { array.sort{|a,b| a <=> b} } }
x.report("sort desc") { n.times { array.sort{|a,b| b <=> a} } }
x.report("sort asc reverse") { n.times { array.sort.reverse } }
end
key_value = Struct.new(:key, :value) do
def <=>(other)
key <=> other.key
end
end
count = 10000
item_range = 1000000
array1 = Array.new(count) { [rand(item_range), rand(item_range)] }
array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) }
Benchmark.bm(20) do |x|
x.report("sort_by") { n.times { array1.sort_by {|a| a[0]} } }
x.report("sort struct") { n.times { array2.sort } }
end
================================================
FILE: benchmark/sort2.rb
================================================
require "benchmark"
require "algorithms"
NUMBER_OF_SORTING = 1
NUMBER_OF_ARRAY = 10
WORDS_IN_ARRAY = 100000
MAX_WORD_SIZE = 10
EVAL_N_VALUES = 10
puts "NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}"
puts "NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}"
puts "WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}"
puts "MAX_WORD_SIZE: #{MAX_WORD_SIZE}"
puts "EVAL_N_VALUES: #{EVAL_N_VALUES}"
def words
Array.new(WORDS_IN_ARRAY) { word }
end
def word
Array.new(rand(1..MAX_WORD_SIZE)){(97+rand(26)).chr}.join
end
@array = Array.new(NUMBER_OF_ARRAY) { words.sort }
# =================================================================================================
# Sort1
# Returns a new (unevaluated) enumerator
def sort1(data)
return to_enum(__callee__, data) unless block_given?
heap = []
# Initialize the heap with the first items
# and attach the enumerators themselves so .next can be called
data.each do |a|
heap << [a.next, a]
end
while data.any?
begin
# Sort the array by value
heap.sort_by!{|(item,_)| item}
# Take the value and its enumerator
item, enum = heap.shift
# The value goes into the result
yield item
# The removed item is replaced by the next one from the same list
heap << [enum.next, enum]
rescue StopIteration
# The enumerator is empty
data.delete(enum)
end
end
end
# =================================================================================================
# Sort1_2
# Returns a new (unevaluated) enumerator
def sort1_2(data)
return to_enum(__callee__, data) unless block_given?
heap = []
enums = []
# Initialize the heap with the first items
# and attach the enumerators themselves so .next can be called
data.each do |a|
EVAL_N_VALUES.times {
begin
heap << [a.next, a]
rescue StopIteration
end
}
end
while data.any? || heap.any?
# Sort the array by value
heap.sort_by!{|(item,_)| item}
# At least EVAL_N_VALUES items can be taken
EVAL_N_VALUES.times {
break if heap.empty?
# Take the value and its enumerator
item, enum = heap.shift
# The value goes into the result
yield item
enums << enum
}
while (enum = enums.shift)
begin
heap << [enum.next, enum]
rescue StopIteration
data.delete(enum)
enums.delete(enum)
end
end
end
end
# =================================================================================================
# Sort 2
def sort2(data)
return to_enum(__callee__, data) unless block_given?
heap = Containers::Heap.new
data.each do |enum|
item = enum.next
heap.push(item, [item, enum])
end
while data.any?
begin
item, enum = heap.pop
yield item
item = enum.next
heap.push(item, [item, enum])
rescue StopIteration
data.delete(enum)
end
end
end
# =================================================================================================
# Benchmark
Benchmark.bm(10) do |x|
x.report("sort") do
NUMBER_OF_SORTING.times {
@result = @array.flatten.sort
}
end
x.report("sort 1") do
NUMBER_OF_SORTING.times {
raise "Bad sorting" if @result != sort1(@array.map(&:each)).to_a
}
end
x.report("sort 1_2") do
NUMBER_OF_SORTING.times {
raise "Bad sorting" if @result != sort1_2(@array.map(&:each)).to_a
}
end
# x.report("sort 2") do
# NUMBER_OF_SORTING.times {
# raise "Bad sorting" if @result != sort2(@array.map(&:each)).to_a
# }
# end
end
================================================
FILE: benchmark/take.rb
================================================
require "benchmark"
SIZE = 100_000_000
@array1 = (0..SIZE).to_a;
@array2 = (0..SIZE).to_a;
@array3 = (0..SIZE).to_a;
TAKE = 100_000
Benchmark.bm(15) do |x|
# Fastest
x.report("take"){
a=@array1.take(TAKE)
}
# Slowest and takes the most memory
x.report("reverse drop"){
@array2.reverse!
@array2.drop(@array2.size - TAKE)
@array2.reverse!
}
# Least memory
x.report("splice"){
a=@array3.slice!(0, TAKE)
}
end
================================================
FILE: bin/ruby-spark
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
Spark::CLI.new.run
================================================
FILE: example/pi.rb
================================================
#!/usr/bin/env ruby
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
require 'ruby-spark'
Spark.logger.disable
Spark.start
slices = 3
n = 100000 * slices
def map(_)
x = rand * 2 - 1
y = rand * 2 - 1
if x**2 + y**2 < 1
return 1
else
return 0
end
end
rdd = Spark.context.parallelize(1..n, slices)
rdd = rdd.map(method(:map))
puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
================================================
FILE: example/website_search.rb
================================================
#!/usr/bin/env ruby
# Parse sitemap and search word on every page
require 'optparse'
require 'open-uri'
require 'nokogiri'
require 'ruby-spark'
options = {
sitemap: 'http://fit.cvut.cz/sitemap.xml',
query: 'cvut',
workers: 2
}
opt_parser = OptionParser.new do |opts|
opts.banner = 'Usage: website_search.rb [options]'
opts.separator ''
opts.separator 'Specific options:'
opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
options[:sitemap] = sitemap
end
opts.on('-q', '--query QUERY', 'Query to search') do |query|
options[:query] = query
end
opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
options[:workers] = workers
end
opts.on('--quiet', 'Run quietly') do |v|
Spark.logger.disable
end
opts.on_tail('-h', '--help', 'Show this message') do
puts opts
exit
end
end
opt_parser.parse!
@links = []
def parse_sitemap(doc)
doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
next_doc = Nokogiri::HTML(open(loc.text))
parse_sitemap(next_doc)
end
doc.xpath('//url/loc').each do |loc|
@links << loc.text
end
end
doc = Nokogiri::HTML(open(options[:sitemap]))
parse_sitemap(doc)
# Map function
func = Proc.new do |url|
begin
open(url) {|f|
[url, f.read.scan(query).count]
}
rescue
[url, 0]
end
end
Spark.start
rdd = Spark.sc.parallelize(@links, options[:workers])
.add_library('open-uri')
.bind(query: options[:query])
.map(func)
.sort_by(lambda{|(_, value)| value}, false)
rdd.collect.each do |(url, count)|
puts "#{url} => #{count}"
end
================================================
FILE: ext/ruby_c/extconf.rb
================================================
require 'mkmf'
create_makefile("ruby_spark_ext")
================================================
FILE: ext/ruby_c/murmur.c
================================================
#include "murmur.h"
#if defined(_MSC_VER)
#define BIG_CONSTANT(x) (x)
#else
#define BIG_CONSTANT(x) (x##LLU)
#endif
/*-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
//
// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
//
// 64-bit hash for 64-bit platforms
*/
uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
{
const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
const int r = 47;
uint64_t h = seed ^ (len * m);
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
while(data != end)
{
uint64_t k = *data++;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
const unsigned char * data2 = (const unsigned char*)data;
switch(len & 7)
{
case 7: h ^= ((uint64_t) data2[6]) << 48;
case 6: h ^= ((uint64_t) data2[5]) << 40;
case 5: h ^= ((uint64_t) data2[4]) << 32;
case 4: h ^= ((uint64_t) data2[3]) << 24;
case 3: h ^= ((uint64_t) data2[2]) << 16;
case 2: h ^= ((uint64_t) data2[1]) << 8;
case 1: h ^= ((uint64_t) data2[0]);
h *= m;
};
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
/* 64-bit hash for 32-bit platforms */
uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
{
const uint32_t m = 0x5bd1e995;
const int r = 24;
uint32_t h1 = ((uint32_t) seed) ^ len;
uint32_t h2 = ((uint32_t) (seed >> 32));
const uint32_t * data = (const uint32_t *)key;
while(len >= 8)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
uint32_t k2 = *data++;
k2 *= m; k2 ^= k2 >> r; k2 *= m;
h2 *= m; h2 ^= k2;
len -= 4;
}
if(len >= 4)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
}
switch(len)
{
case 3: h2 ^= ((unsigned char*)data)[2] << 16;
case 2: h2 ^= ((unsigned char*)data)[1] << 8;
case 1: h2 ^= ((unsigned char*)data)[0];
h2 *= m;
};
h1 ^= h2 >> 18; h1 *= m;
h2 ^= h1 >> 22; h2 *= m;
h1 ^= h2 >> 17; h1 *= m;
h2 ^= h1 >> 19; h2 *= m;
uint64_t h = h1;
h = (h << 32) | h2;
return h;
}
// ================================================================================================
// Ruby methods
#define PORTABLE_HASH_SEED 16154832
VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
{
StringValue(rb_str);
void * key = RSTRING_PTR(rb_str);
long len = RSTRING_LEN(rb_str);
uint64_t result = MurmurHash64A(key, len, seed);
return LONG2FIX(result);
}
// ------------------------------------------------------------------------------------------------
// Spark::Digest::Murmur2.digest
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
{
if(argc == 0 || argc > 2){
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
}
uint64_t seed = (argc == 1 ? 0 : NUM2UINT(argv[1]));
return murmur2_digest(argv[0], seed);
}
// ------------------------------------------------------------------------------------------------
// Spark::Digest.portable_hash
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
{
if(argc != 1){
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
}
return murmur2_digest(argv[0], PORTABLE_HASH_SEED);
}
================================================
FILE: ext/ruby_c/murmur.h
================================================
#ifndef MURMUR_INCLUDED
#define MURMUR_INCLUDED
#include "ruby.h"
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass);
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass);
#endif
================================================
FILE: ext/ruby_c/ruby-spark.c
================================================
#include "ruby.h"
#include "murmur.h"
VALUE SparkModule;
VALUE SparkDigestModule;
VALUE SparkDigestMurmur2Class;
void Init_ruby_spark_ext()
{
SparkModule = rb_define_module("Spark");
SparkDigestModule = rb_define_module_under(SparkModule, "Digest");
SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, "Murmur2", rb_cObject);
rb_define_singleton_method(SparkDigestModule, "portable_hash", method_portable_hash, -1);
rb_define_singleton_method(SparkDigestMurmur2Class, "digest", method_murmur2_digest, -1);
}
================================================
FILE: ext/ruby_java/Digest.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyModule;
import org.jruby.RubyObject;
import org.jruby.RubyClass;
import org.jruby.RubyString;
import org.jruby.RubyFixnum;
import org.jruby.anno.JRubyModule;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
@JRubyModule(name="Spark::Digest")
public class Digest extends RubyObject{
// Has to be the same as in the C extension
final static long PORTABLE_HASH_SEED = 16154832;
public Digest(final Ruby ruby, RubyClass rubyClass) {
super(ruby, rubyClass);
}
@JRubyMethod(module=true)
public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) {
Ruby ruby = self.getRuntime();
RubyString keyString = (RubyString)arg;
long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED);
RubyFixnum result = new RubyFixnum(ruby, hash);
return result;
}
}
================================================
FILE: ext/ruby_java/Murmur2.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
import org.jruby.RubyString;
import org.jruby.RubyFixnum;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
/** Murmur hash 2.0.
*
* The murmur hash is a relatively fast hash function from
* http://murmurhash.googlepages.com/ for platforms with efficient
* multiplication.
*
* http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/
*
*/
@JRubyClass(name="Spark::Digest::Murmur2")
public class Murmur2 extends RubyObject {
public Murmur2(final Ruby ruby, RubyClass rubyClass) {
super(ruby, rubyClass);
}
@JRubyMethod(required=1, optional=1, module=true)
public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) {
Ruby ruby = context.getRuntime();
RubyString keyString = (RubyString)args[0];
long seed;
if(args.length > 1){
RubyFixnum rb_seed = (RubyFixnum)args[1];
seed = rb_seed.getLongValue();
}
else{
seed = 0;
}
long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed);
RubyFixnum result = new RubyFixnum(ruby, hash);
return result;
}
/** Generates 64 bit hash from byte array of the given length and seed.
*
* @param data byte array to hash
* @param length length of the array to hash
* @param seed initial seed value
* @return 64 bit hash of the given array
*/
public static long hash64(final byte[] data, int length, long seed) {
final long m = 0xc6a4a7935bd1e995L;
final int r = 47;
long h = (seed&0xffffffffl)^(length*m);
int length8 = length/8;
for (int i=0; i<length8; i++) {
final int i8 = i*8;
long k = ((long)data[i8+0]&0xff) +(((long)data[i8+1]&0xff)<<8)
+(((long)data[i8+2]&0xff)<<16) +(((long)data[i8+3]&0xff)<<24)
+(((long)data[i8+4]&0xff)<<32) +(((long)data[i8+5]&0xff)<<40)
+(((long)data[i8+6]&0xff)<<48) +(((long)data[i8+7]&0xff)<<56);
k *= m;
k ^= k >>> r;
k *= m;
h ^= k;
h *= m;
}
switch (length%8) {
case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48;
case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40;
case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32;
case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24;
case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16;
case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8;
case 1: h ^= (long)(data[length&~7]&0xff);
h *= m;
};
h ^= h >>> r;
h *= m;
h ^= h >>> r;
return h;
}
}
================================================
FILE: ext/ruby_java/RubySparkExtService.java
================================================
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyModule;
import org.jruby.runtime.ObjectAllocator;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.runtime.load.BasicLibraryService;
public class RubySparkExtService implements BasicLibraryService
{
public boolean basicLoad(final Ruby ruby) throws java.io.IOException {
RubyModule sparkModule = ruby.defineModule("Spark");
RubyModule sparkDigestModule = sparkModule.defineModuleUnder("Digest");
RubyClass sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder("Murmur2", ruby.getObject(), sparkDigestMurmur2Allocator);
sparkDigestModule.defineAnnotatedMethods(Digest.class);
sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class);
return true;
}
public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() {
public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) {
return new Murmur2(ruby, rubyClass);
}
};
}
================================================
FILE: ext/ruby_java/extconf.rb
================================================
require 'mkmf'
create_makefile("ruby_spark_ext")
================================================
FILE: ext/spark/build.sbt
================================================
import AssemblyKeys._
assemblySettings
// Default values
val defaultScalaVersion = "2.10.4"
val defaultSparkVersion = "1.6.0"
val defaultSparkCoreVersion = "2.10"
val defaultTargetDir = "target"
val defaultHadoopVersion = "1.0.4"
// Values
val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion)
val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion)
val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion)
val _sparkCoreVersion = scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion)
val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir)
// Project settings
name := "ruby-spark"
version := "1.0.0"
scalaVersion := _scalaVersion
javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
// Jar target folder
artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar")
outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar")
// Protocol buffer support
seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*)
// Additional libraries
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % _sparkVersion excludeAll(ExclusionRule(organization = "org.apache.hadoop")),
"org.apache.spark" %% "spark-graphx" % _sparkVersion,
"org.apache.spark" %% "spark-mllib" % _sparkVersion,
"org.apache.spark" %% "spark-sql" % _sparkVersion,
"org.apache.hadoop" % "hadoop-client" % _hadoopVersion,
"com.github.fommil.netlib" % "all" % "1.1.2",
"org.scalatest" % "scalatest_2.10" % "2.2.1" % "test"
)
// Repositories
resolvers ++= Seq(
"JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
"Spray Repository" at "http://repo.spray.io/",
"Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
"Akka Repository" at "http://repo.akka.io/releases/",
"Twitter4J Repository" at "http://twitter4j.org/maven2/",
"Apache HBase" at "https://repository.apache.org/content/repositories/releases",
"Twitter Maven Repo" at "http://maven.twttr.com/",
"scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools",
"Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/",
"Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/",
"Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven",
Resolver.sonatypeRepo("public")
)
// Merge strategy
mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
{
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.startsWith("META-INF") => MergeStrategy.discard
case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first
case PathList("org", "apache", xs @ _*) => MergeStrategy.first
case PathList("org", "jboss", xs @ _*) => MergeStrategy.first
case "about.html" => MergeStrategy.rename
case "reference.conf" => MergeStrategy.concat
case _ => MergeStrategy.first
}
}
================================================
FILE: ext/spark/project/plugins.sbt
================================================
resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
resolvers += "Spray Repository" at "http://repo.spray.io/"
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3")
================================================
FILE: ext/spark/sbt/sbt
================================================
#!/bin/bash
# This script launches sbt for this project. If present it uses the system
# version of sbt. If there is no system version of sbt it attempts to download
# sbt locally.
SBT_VERSION=0.13.9
URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
JAR=sbt/sbt-launch-${SBT_VERSION}.jar
# Download sbt launch jar if it hasn't been downloaded yet
if [ ! -f ${JAR} ]; then
# Download
printf "Attempting to fetch sbt\n"
JAR_DL=${JAR}.part
if hash wget 2>/dev/null; then
(wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
elif hash curl 2>/dev/null; then
(curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
else
printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
exit -1
fi
fi
if [ ! -f ${JAR} ]; then
# We failed to download
printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
exit -1
fi
printf "Launching sbt from ${JAR}\n"
java \
-Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
-jar ${JAR} \
"$@"
================================================
FILE: ext/spark/src/main/scala/Exec.scala
================================================
package org.apache.spark.api.ruby
import java.io.{File, FileOutputStream, InputStreamReader, BufferedReader}
import scala.collection.JavaConversions._
import org.apache.spark.{SparkEnv, Logging}
import org.apache.spark.util._
/* =================================================================================================
* class FileCommand
* =================================================================================================
*
* Save the command to a file and then execute it, because from Scala you cannot simply run
* something like "bash --norc -i -c 'source .zshrc; ruby master.rb'"
*/
class FileCommand(command: String) extends Logging {
var pb: ProcessBuilder = null
var file: File = null
// Command is complete.
def this(command: String, env: SparkEnv) = {
this(command)
create(env)
}
// Template must contain %s, which will be replaced with the command
def this(template: String, command: String, env: SparkEnv, envVars: Map[String, String]) = {
this(template.format(command), env)
setEnvVars(envVars)
}
private def create(env: SparkEnv) {
val dir = new File(env.sparkFilesDir)
val ext = if(Utils.isWindows) ".cmd" else ".sh"
val shell = if(Utils.isWindows) "cmd" else "bash"
file = File.createTempFile("command", ext, dir)
val out = new FileOutputStream(file)
out.write(command.getBytes)
out.close
logInfo(s"New FileCommand at ${file.getAbsolutePath}")
pb = new ProcessBuilder(shell, file.getAbsolutePath)
}
def setEnvVars(vars: Map[String, String]) {
pb.environment().putAll(vars)
}
def run = {
new ExecutedFileCommand(pb.start)
}
}
/* =================================================================================================
* class ExecutedFileCommand
* =================================================================================================
*
* Represents a process executed from a file.
*/
class ExecutedFileCommand(process: Process) {
var reader: BufferedReader = null
def readLine = {
openInput
reader.readLine.toString.trim
}
def openInput {
if(reader != null){
return
}
val input = new InputStreamReader(process.getInputStream)
reader = new BufferedReader(input)
}
// Delegation
def destroy = process.destroy
def getInputStream = process.getInputStream
def getErrorStream = process.getErrorStream
}
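The same temp-file trick, sketched here in Ruby purely for illustration (the file name and command are made up): write the whole shell command into a script and let bash run the file, rather than trying to quote it inline.

require 'tempfile'

# Save the command into a temporary script, as FileCommand#create does
script = Tempfile.create(['command', '.sh'])
script.write("source ~/.zshrc; ruby master.rb\n")
script.close

# Execute the saved script through a shell (the ProcessBuilder equivalent)
system('bash', script.path)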
================================================
FILE: ext/spark/src/main/scala/MLLibAPI.scala
================================================
package org.apache.spark.mllib.api.python
// PythonMLLibAPI is private to the Python package
class MLLibAPI extends PythonMLLibAPI {}
================================================
FILE: ext/spark/src/main/scala/Marshal.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
/* =================================================================================================
* object Marshal
* =================================================================================================
*/
object Marshal {
def load(bytes: Array[Byte]) = {
val is = new DataInputStream(new ByteArrayInputStream(bytes))
val majorVersion = is.readUnsignedByte // 4
val minorVersion = is.readUnsignedByte // 8
(new MarshalLoad(is)).load
}
def dump(data: Any) = {
val aos = new ByteArrayOutputStream
val os = new DataOutputStream(aos)
os.writeByte(4)
os.writeByte(8)
(new MarshalDump(os)).dump(data)
aos.toByteArray
}
}
/* =================================================================================================
* class IterableMarshaller
* =================================================================================================
*/
class IterableMarshaller(iter: Iterator[Any]) extends Iterator[Array[Byte]] {
private val buffer = new ArrayBuffer[Any]
override def hasNext: Boolean = iter.hasNext
override def next(): Array[Byte] = {
while (iter.hasNext) {
buffer += iter.next()
}
Marshal.dump(buffer)
}
}
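For reference, the two bytes read and written above are Ruby's Marshal format version (major 4, minor 8). The payloads used in MarshalSpec below can be reproduced directly from Ruby:

Marshal.dump(1).bytes                    # => [4, 8, 105, 6]              version header, 'i', 1
Marshal.dump(1.2).bytes                  # => [4, 8, 102, 8, 49, 46, 50]  version header, 'f', "1.2"
Marshal.load([4, 8, 105, 6].pack('C*'))  # => 1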
================================================
FILE: ext/spark/src/main/scala/MarshalDump.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}
/* =================================================================================================
* class MarshalDump
* =================================================================================================
*/
class MarshalDump(os: DataOutputStream) {
val NAN_BYTELIST = "nan".getBytes
val NEGATIVE_INFINITY_BYTELIST = "-inf".getBytes
val INFINITY_BYTELIST = "inf".getBytes
def dump(data: Any) {
data match {
case null =>
os.writeByte('0')
case item: Boolean =>
val char = if(item) 'T' else 'F'
os.writeByte(char)
case item: Int =>
os.writeByte('i')
dumpInt(item)
case item: Array[_] =>
os.writeByte('[')
dumpArray(item)
case item: Double =>
os.writeByte('f')
dumpFloat(item)
case item: ArrayBuffer[Any] => dump(item.toArray)
}
}
def dumpInt(data: Int) {
if(data == 0){
os.writeByte(0)
}
else if (0 < data && data < 123) {
os.writeByte(data + 5)
}
else if (-124 < data && data < 0) {
os.writeByte((data - 5) & 0xff)
}
else {
val buffer = new Array[Byte](4)
var value = data
var i = 0
while(i != 4 && value != 0 && value != -1){
buffer(i) = (value & 0xff).toByte
value = value >> 8
i += 1
}
// Number of significant little-endian bytes written above
val length = i
if(value < 0){
os.writeByte(-length)
}
else{
os.writeByte(length)
}
os.write(buffer, 0, length)
}
}
def dumpArray(array: Array[_]) {
dumpInt(array.size)
for(item <- array) {
dump(item)
}
}
def dumpFloat(value: Double) {
if(value.isPosInfinity){
dumpString(INFINITY_BYTELIST)
}
else if(value.isNegInfinity){
dumpString(NEGATIVE_INFINITY_BYTELIST)
}
else if(value.isNaN){
dumpString(NAN_BYTELIST)
}
else{
// dumpString("%.17g".format(value))
dumpString(value.toString)
}
}
def dumpString(data: String) {
dumpString(data.getBytes)
}
def dumpString(data: Array[Byte]) {
dumpInt(data.size)
os.write(data)
}
}
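dumpInt mirrors Ruby's Marshal integer packing: zero is a single 0 byte, small magnitudes are stored offset by +/-5, and anything larger gets a signed length byte followed by little-endian value bytes. Comparing against Ruby itself:

Marshal.dump(0).bytes     # => [4, 8, 105, 0]
Marshal.dump(1).bytes     # => [4, 8, 105, 6]           1 + 5
Marshal.dump(-1).bytes    # => [4, 8, 105, 250]         (-1 - 5) & 0xff
Marshal.dump(300).bytes   # => [4, 8, 105, 2, 44, 1]    length 2, then 0x012C little-endian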
================================================
FILE: ext/spark/src/main/scala/MarshalLoad.scala
================================================
package org.apache.spark.api.ruby.marshal
import java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}
/* =================================================================================================
* class MarshalLoad
* =================================================================================================
*/
class MarshalLoad(is: DataInputStream) {
case class WaitForObject()
val registeredSymbols = ArrayBuffer[String]()
val registeredLinks = ArrayBuffer[Any]()
def load: Any = {
load(is.readUnsignedByte.toChar)
}
def load(dataType: Char): Any = {
dataType match {
case '0' => null
case 'T' => true
case 'F' => false
case 'i' => loadInt
case 'f' => loadAndRegisterFloat
case ':' => loadAndRegisterSymbol
case '[' => loadAndRegisterArray
case 'U' => loadAndRegisterUserObject
case _ =>
throw new IllegalArgumentException(s"Format is not supported: $dataType.")
}
}
// ----------------------------------------------------------------------------------------------
// Load by type
def loadInt: Int = {
var c = is.readByte.toInt
if (c == 0) {
return 0
} else if (4 < c && c < 128) {
return c - 5
} else if (-129 < c && c < -4) {
return c + 5
}
var result: Long = 0
if (c > 0) {
result = 0
for( i <- 0 until c ) {
result |= (is.readUnsignedByte << (8 * i)).toLong
}
} else {
c = -c
result = -1
for( i <- 0 until c ) {
result &= ~((0xff << (8 * i)).toLong)
result |= (is.readUnsignedByte << (8 * i)).toLong
}
}
result.toInt
}
def loadAndRegisterFloat: Double = {
val result = loadFloat
registeredLinks += result
result
}
def loadFloat: Double = {
val string = loadString
string match {
case "nan" => Double.NaN
case "inf" => Double.PositiveInfinity
case "-inf" => Double.NegativeInfinity
case _ => string.toDouble
}
}
def loadString: String = {
new String(loadStringBytes)
}
def loadStringBytes: Array[Byte] = {
val size = loadInt
val buffer = new Array[Byte](size)
var readSize = 0
while(readSize < size){
val read = is.read(buffer, readSize, size-readSize)
if(read == -1){
throw new IllegalArgumentException("Marshal too short.")
}
readSize += read
}
buffer
}
def loadAndRegisterSymbol: String = {
val result = loadString
registeredSymbols += result
result
}
def loadAndRegisterArray: Array[Any] = {
val size = loadInt
val array = new Array[Any](size)
registeredLinks += array
for( i <- 0 until size ) {
array(i) = loadNextObject
}
array
}
def loadAndRegisterUserObject: Any = {
val klass = loadNextObject.asInstanceOf[String]
// Register a placeholder before loading the next object
registeredLinks += WaitForObject()
val index = registeredLinks.size - 1
val data = loadNextObject
val result = klass match {
case "Spark::Mllib::LabeledPoint" => createLabeledPoint(data)
case "Spark::Mllib::DenseVector" => createDenseVector(data)
case "Spark::Mllib::SparseVector" => createSparseVector(data)
case other =>
throw new IllegalArgumentException(s"Object $other is not supported.")
}
registeredLinks(index) = result
result
}
// ----------------------------------------------------------------------------------------------
// Other loads
def loadNextObject: Any = {
val dataType = is.readUnsignedByte.toChar
if(isLinkType(dataType)){
readLink(dataType)
}
else{
load(dataType)
}
}
// ----------------------------------------------------------------------------------------------
// To java objects
def createLabeledPoint(data: Any): LabeledPoint = {
val array = data.asInstanceOf[Array[_]]
new LabeledPoint(array(0).asInstanceOf[Double], array(1).asInstanceOf[Vector])
}
def createDenseVector(data: Any): DenseVector = {
new DenseVector(data.asInstanceOf[Array[_]].map(toDouble(_)))
}
def createSparseVector(data: Any): SparseVector = {
val array = data.asInstanceOf[Array[_]]
val size = array(0).asInstanceOf[Int]
val indices = array(1).asInstanceOf[Array[_]].map(_.asInstanceOf[Int])
val values = array(2).asInstanceOf[Array[_]].map(toDouble(_))
new SparseVector(size, indices, values)
}
// ----------------------------------------------------------------------------------------------
// Helpers
def toDouble(data: Any): Double = data match {
case x: Int => x.toDouble
case x: Double => x
case _ => 0.0
}
// ----------------------------------------------------------------------------------------------
// Cache
def readLink(dataType: Char): Any = {
val index = loadInt
dataType match {
case '@' => registeredLinks(index)
case ';' => registeredSymbols(index)
}
}
def isLinkType(dataType: Char): Boolean = {
dataType == ';' || dataType == '@'
}
}
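The 'U' branch corresponds to Ruby objects serialized through marshal_dump: a symbol with the class name comes first, followed by whatever marshal_dump returned (an array for the MLlib wrappers above). A small Ruby illustration using a hypothetical Point class:

class Point
  def initialize(x)
    @x = x
  end

  def marshal_dump
    [@x]
  end

  def marshal_load(array)
    @x = array.first
  end
end

Marshal.dump(Point.new(1)).bytes
# => [4, 8, 85, 58, 10, 80, 111, 105, 110, 116, 91, 6, 105, 6]
#    version header, 'U', symbol "Point", then the dumped array [1]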
================================================
FILE: ext/spark/src/main/scala/RubyAccumulatorParam.scala
================================================
package org.apache.spark.api.ruby
import java.io._
import java.net._
import java.util.{List, ArrayList}
import scala.collection.JavaConversions._
import scala.collection.immutable._
import org.apache.spark._
import org.apache.spark.util.Utils
/**
* Internal class that acts as an `AccumulatorParam` for Ruby accumulators. Inside, it
* collects a list of serialized byte arrays that we pass to Ruby through a socket.
*/
private class RubyAccumulatorParam(serverHost: String, serverPort: Int)
extends AccumulatorParam[List[Array[Byte]]] {
// Utils.checkHost(serverHost, "Expected hostname")
val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536)
// Socket should not be serialized
// Otherwise: SparkException: Task not serializable
@transient var socket: Socket = null
@transient var socketOutputStream: DataOutputStream = null
@transient var socketInputStream: DataInputStream = null
def openSocket(){
synchronized {
if (socket == null || socket.isClosed) {
socket = new Socket(serverHost, serverPort)
socketInputStream = new DataInputStream(new BufferedInputStream(socket.getInputStream, bufferSize))
socketOutputStream = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize))
}
}
}
override def zero(value: List[Array[Byte]]): List[Array[Byte]] = new ArrayList
override def addInPlace(val1: List[Array[Byte]], val2: List[Array[Byte]]) : List[Array[Byte]] = synchronized {
if (serverHost == null) {
// This happens on the worker node, where we just want to remember all the updates
val1.addAll(val2)
val1
} else {
// This happens on the master, where we pass the updates to Ruby through a socket
openSocket()
socketOutputStream.writeInt(val2.size)
for (array <- val2) {
socketOutputStream.writeInt(array.length)
socketOutputStream.write(array)
}
socketOutputStream.flush()
// Wait for acknowledgement
// http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
//
// if(in.readInt() != RubyConstant.ACCUMULATOR_ACK){
// throw new SparkException("Accumulator was not acknowledged")
// }
new ArrayList
}
}
}
================================================
FILE: ext/spark/src/main/scala/RubyBroadcast.scala
================================================
package org.apache.spark.api.ruby
import org.apache.spark.api.python.PythonBroadcast
/**
* A wrapper for a Ruby Broadcast, which is written to disk by Ruby. It will also
* write the data to disk after deserialization, so Ruby can read it back from disk.
*
* The class reuses the Python logic - kept only for its semantics.
*/
class RubyBroadcast(@transient var _path: String, @transient var id: java.lang.Long) extends PythonBroadcast(_path) {
}
================================================
FILE: ext/spark/src/main/scala/RubyConstant.scala
================================================
package org.apache.spark.api.ruby
object RubyConstant {
val DATA_EOF = -2
val WORKER_ERROR = -1
val WORKER_DONE = 0
val CREATE_WORKER = 1
val KILL_WORKER = 2
val KILL_WORKER_AND_WAIT = 3
val SUCCESSFULLY_KILLED = 4
val UNSUCCESSFUL_KILLING = 5
val ACCUMULATOR_ACK = 6
}
================================================
FILE: ext/spark/src/main/scala/RubyMLLibAPI.scala
================================================
package org.apache.spark.mllib.api.ruby
import java.util.ArrayList
import scala.collection.JavaConverters._
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.api.python.MLLibAPI
class RubyMLLibAPI extends MLLibAPI {
// trainLinearRegressionModelWithSGD
// trainLassoModelWithSGD
// trainRidgeModelWithSGD
// trainLogisticRegressionModelWithSGD
// trainLogisticRegressionModelWithLBFGS
// trainSVMModelWithSGD
// trainKMeansModel
// trainGaussianMixtureModel
// Rjb has a problem with theta: Array[Array[Double]]
override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {
val model = NaiveBayes.train(data.rdd, lambda)
List(
Vectors.dense(model.labels),
Vectors.dense(model.pi),
model.theta.toSeq
).map(_.asInstanceOf[Object]).asJava
}
// In Python, wt is just an Object
def predictSoftGMM(
data: JavaRDD[Vector],
wt: ArrayList[Object],
mu: ArrayList[Object],
si: ArrayList[Object]): RDD[Array[Double]] = {
// val weight = wt.asInstanceOf[Array[Double]]
val weight = wt.toArray.map(_.asInstanceOf[Double])
val mean = mu.toArray.map(_.asInstanceOf[DenseVector])
val sigma = si.toArray.map(_.asInstanceOf[DenseMatrix])
val gaussians = Array.tabulate(weight.length){
i => new MultivariateGaussian(mean(i), sigma(i))
}
val model = new GaussianMixtureModel(weight, gaussians)
model.predictSoft(data)
}
}
================================================
FILE: ext/spark/src/main/scala/RubyMLLibUtilAPI.scala
================================================
package org.apache.spark.mllib.api.ruby
import java.util.ArrayList
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
object RubyMLLibUtilAPI {
// Ruby does have a problem with creating Array[Double]
def generateLinearInput(
intercept: Double,
weights: ArrayList[String],
nPoints: Int,
seed: Int,
eps: Double = 0.1): Seq[LabeledPoint] = {
LinearDataGenerator.generateLinearInput(intercept, weights.toArray.map(_.toString.toDouble), nPoints, seed, eps)
}
}
================================================
FILE: ext/spark/src/main/scala/RubyPage.scala
================================================
package org.apache.spark.ui.ruby
// import javax.servlet.http.HttpServletRequest
// import scala.xml.Node
// import org.apache.spark.ui.{WebUIPage, UIUtils}
// import org.apache.spark.util.Utils
// private[ui] class RubyPage(parent: RubyTab, rbConfig: Array[Tuple2[String, String]]) extends WebUIPage("") {
// def render(request: HttpServletRequest): Seq[Node] = {
// val content = UIUtils.listingTable(header, row, rbConfig)
// UIUtils.headerSparkPage("Ruby Config", content, parent)
// }
// private def header = Seq(
// "Number"
// )
// private def row(keyValue: (String, String)): Seq[Node] = {
// // scalastyle:off
// keyValue match {
// case (key, value) =>
// <tr>
// <td>{key}</td>
// <td>{value}</td>
// </tr>
// }
// // scalastyle:on
// }
// }
class RubyPage {}
================================================
FILE: ext/spark/src/main/scala/RubyRDD.scala
================================================
package org.apache.spark.api.ruby
import java.io._
import java.net._
import java.util.{List, ArrayList, Collections}
import scala.util.Try
import scala.reflect.ClassTag
import scala.collection.JavaConversions._
import org.apache.spark._
import org.apache.spark.{SparkEnv, Partition, SparkException, TaskContext}
import org.apache.spark.api.ruby._
import org.apache.spark.api.ruby.marshal._
import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
import org.apache.spark.InterruptibleIterator
/* =================================================================================================
* Class RubyRDD
* =================================================================================================
*/
class RubyRDD(
@transient parent: RDD[_],
command: Array[Byte],
broadcastVars: ArrayList[Broadcast[RubyBroadcast]],
accumulator: Accumulator[List[Array[Byte]]])
extends RDD[Array[Byte]](parent){
val bufferSize = conf.getInt("spark.buffer.size", 65536)
val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this)
override def getPartitions: Array[Partition] = firstParent.partitions
override val partitioner = None
/* ------------------------------------------------------------------------------------------ */
override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
val env = SparkEnv.get
// Get worker and id
val (worker, workerId) = RubyWorker.create(env)
// Start a thread to feed the process input from our parent's iterator
val writerThread = new WriterThread(env, worker, split, context)
context.addTaskCompletionListener { context =>
writerThread.shutdownOnTaskCompletion()
writerThread.join()
// Cleanup the worker socket. This will also cause the worker to exit.
try {
RubyWorker.remove(worker, workerId)
worker.close()
} catch {
case e: Exception => logWarning("Failed to close worker socket", e)
}
}
val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize))
// Send data
writerThread.start()
// For violent termination of worker
new MonitorThread(workerId, worker, context).start()
// Return an iterator that reads lines from the process's stdout
val stdoutIterator = new StreamReader(stream, writerThread, context)
// An iterator that wraps around an existing iterator to provide task killing functionality.
new InterruptibleIterator(context, stdoutIterator)
} // end compute
/* ------------------------------------------------------------------------------------------ */
class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext)
extends Thread("stdout writer for worker") {
@volatile private var _exception: Exception = null
setDaemon(true)
// Contains the exception thrown while writing the parent iterator to the process.
def exception: Option[Exception] = Option(_exception)
// Terminates the writer thread, ignoring any exceptions that may occur due to cleanup.
def shutdownOnTaskCompletion() {
assert(context.isCompleted)
this.interrupt()
}
// -------------------------------------------------------------------------------------------
// Send the necessary data for worker
// - split index
// - command
// - iterator
override def run(): Unit = Utils.logUncaughtExceptions {
try {
SparkEnv.set(env)
val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize)
val dataOut = new DataOutputStream(stream)
// Partition index
dataOut.writeInt(split.index)
// Spark files
PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut)
// Broadcast variables
dataOut.writeInt(broadcastVars.length)
for (broadcast <- broadcastVars) {
dataOut.writeLong(broadcast.value.id)
PythonRDD.writeUTF(broadcast.value.path, dataOut)
}
// Serialized command
dataOut.writeInt(command.length)
dataOut.write(command)
// Send it
dataOut.flush()
// Data
PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut)
dataOut.writeInt(RubyConstant.DATA_EOF)
dataOut.flush()
} catch {
case e: Exception if context.isCompleted || context.isInterrupted =>
logDebug("Exception thrown after task completion (likely due to cleanup)", e)
case e: Exception =>
// We must avoid throwing exceptions here, because the thread uncaught exception handler
// will kill the whole executor (see org.apache.spark.executor.Executor).
_exception = e
} finally {
Try(worker.shutdownOutput()) // kill worker process
}
}
} // end WriterThread
/* ------------------------------------------------------------------------------------------ */
class StreamReader(stream: DataInputStream, writerThread: WriterThread, context: TaskContext) extends Iterator[Array[Byte]] {
def hasNext = _nextObj != null
var _nextObj = read()
// -------------------------------------------------------------------------------------------
def next(): Array[Byte] = {
val obj = _nextObj
if (hasNext) {
_nextObj = read()
}
obj
}
// -------------------------------------------------------------------------------------------
private def read(): Array[Byte] = {
if (writerThread.exception.isDefined) {
throw writerThread.exception.get
}
try {
stream.readInt() match {
case length if length > 0 =>
val obj = new Array[Byte](length)
stream.readFully(obj)
obj
case RubyConstant.WORKER_DONE =>
val numAccumulatorUpdates = stream.readInt()
(1 to numAccumulatorUpdates).foreach { _ =>
val updateLen = stream.readInt()
val update = new Array[Byte](updateLen)
stream.readFully(update)
accumulator += Collections.singletonList(update)
}
null
case RubyConstant.WORKER_ERROR =>
// Exception from worker
// message
val length = stream.readInt()
val obj = new Array[Byte](length)
stream.readFully(obj)
// stackTrace
val stackTraceLen = stream.readInt()
val stackTrace = new Array[String](stackTraceLen)
(0 until stackTraceLen).foreach { i =>
val length = stream.readInt()
val obj = new Array[Byte](length)
stream.readFully(obj)
stackTrace(i) = new String(obj, "utf-8")
}
// Worker will be killed
stream.close
// exception
val exception = new RubyException(new String(obj, "utf-8"), writerThread.exception.getOrElse(null))
exception.appendToStackTrace(stackTrace)
throw exception
}
} catch {
case e: Exception if context.isInterrupted =>
logDebug("Exception thrown after task interruption", e)
throw new TaskKilledException
case e: Exception if writerThread.exception.isDefined =>
logError("Worker exited unexpectedly (crashed)", e)
throw writerThread.exception.get
case eof: EOFException =>
throw new SparkException("Worker exited unexpectedly (crashed)", eof)
}
}
} // end StreamReader
/* ---------------------------------------------------------------------------------------------
* Monitor thread for controlling the worker. Kills the worker if the task is interrupted.
*/
class MonitorThread(workerId: Long, worker: Socket, context: TaskContext)
extends Thread("Worker Monitor for worker") {
setDaemon(true)
override def run() {
// Kill the worker if it is interrupted, checking until task completion.
while (!context.isInterrupted && !context.isCompleted) {
Thread.sleep(2000)
}
if (!context.isCompleted) {
try {
logWarning("Incomplete task interrupted: Attempting to kill Worker "+workerId.toString())
RubyWorker.kill(workerId)
} catch {
case e: Exception =>
logError("Exception when trying to kill worker "+workerId.toString(), e)
}
}
}
} // end MonitorThread
} // end RubyRDD
/* =================================================================================================
* Class PairwiseRDD
* =================================================================================================
*
* Form an RDD[(Long, Array[Byte])] from key-value pairs returned from Ruby.
* This is used by RubySpark's shuffle operations.
* Borrowed from the Python package -> needs its own deserializeLongValue handling ->
* Marshal adds the same 4-byte header
*/
class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) {
override def getPartitions = prev.partitions
override def compute(split: Partition, context: TaskContext) =
prev.iterator(split, context).grouped(2).map {
case Seq(a, b) => (Utils.deserializeLongValue(a.reverse), b)
case x => throw new SparkException("PairwiseRDD: unexpected value: " + x)
}
val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this)
}
/* =================================================================================================
* Object RubyRDD
* =================================================================================================
*/
object RubyRDD extends Logging {
def runJob(
sc: SparkContext,
rdd: JavaRDD[Array[Byte]],
partitions: ArrayList[Int],
allowLocal: Boolean,
filename: String): String = {
type ByteArray = Array[Byte]
type UnrolledPartition = Array[ByteArray]
val allPartitions: Array[UnrolledPartition] =
sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
writeRDDToFile(flattenedPartition.iterator, filename)
}
def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
try {
while (true) {
val length = file.readInt()
val obj = new Array[Byte](length)
file.readFully(obj)
objs.append(obj)
}
} catch {
case eof: EOFException => {}
}
JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
}
def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
try {
PythonRDD.writeIteratorToStream(items, file)
} finally {
file.close()
}
filename
}
def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
writeRDDToFile(rdd.collect.iterator, filename)
}
def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
sc.broadcast(new RubyBroadcast(path, id))
}
/**
* Convert an RDD of serialized Ruby objects into an RDD of objects that are usable from Java.
*/
def toJava(rbRDD: JavaRDD[Array[Byte]], batched: Boolean): JavaRDD[Any] = {
rbRDD.rdd.mapPartitions { iter =>
iter.flatMap { item =>
val obj = Marshal.load(item)
if(batched){
obj.asInstanceOf[Array[_]]
}
else{
Seq(obj)
}
}
}.toJavaRDD()
}
/**
* Convert an RDD of Java objects into an RDD of serialized Ruby objects that is usable from Ruby.
*/
def toRuby(jRDD: JavaRDD[_]): JavaRDD[Array[Byte]] = {
jRDD.rdd.mapPartitions { iter => new IterableMarshaller(iter) }
}
}
/* =================================================================================================
* Class RubyException
* =================================================================================================
*/
class RubyException(msg: String, cause: Exception) extends RuntimeException(msg, cause) {
def appendToStackTrace(toAdd: Array[String]) {
val newStackTrace = getStackTrace.toBuffer
val regexpMatch = "(.*):([0-9]+):in `([a-z]+)'".r
for(item <- toAdd) {
item match {
case regexpMatch(fileName, lineNumber, methodName) =>
newStackTrace += new StackTraceElement("RubyWorker", methodName, fileName, lineNumber.toInt)
case _ =>
}
}
setStackTrace(newStackTrace.toArray)
}
}
================================================
FILE: ext/spark/src/main/scala/RubySerializer.scala
================================================
package org.apache.spark.api.ruby
import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.ruby.marshal._
/* =================================================================================================
* object RubySerializer
* =================================================================================================
*/
object RubySerializer { }
================================================
FILE: ext/spark/src/main/scala/RubyTab.scala
================================================
package org.apache.spark.ui.ruby
import scala.collection.mutable.HashMap
import org.apache.spark.ui._
// class RubyTab(parent: SparkUI, rbConfig: HashMap[String, String]) extends SparkUITab(parent, "ruby"){
// attachPage(new RubyPage(this, rbConfig.toArray))
// }
class RubyTab {}
================================================
FILE: ext/spark/src/main/scala/RubyUtils.scala
================================================
package org.apache.spark.api.ruby
import org.apache.spark.util._
import org.apache.spark.{SparkConf, Logging}
object RubyUtils extends Logging {
def loadPropertiesFile(conf: SparkConf, path: String): String = {
Utils.getPropertiesFromFile(path).foreach {
case (key, value) => conf.set(key, value)
}
path
}
}
================================================
FILE: ext/spark/src/main/scala/RubyWorker.scala
================================================
package org.apache.spark.api.ruby
import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}
import java.net.{InetAddress, ServerSocket, Socket, SocketException}
import java.nio.file.Paths
import scala.collection.mutable
import scala.collection.JavaConversions._
import org.apache.spark._
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.util.Utils
import org.apache.spark.util.RedirectThread
/* =================================================================================================
* Object RubyWorker
* =================================================================================================
*
* Create and store server for creating workers.
*/
object RubyWorker extends Logging {
val PROCESS_WAIT_TIMEOUT = 10000
private var serverSocket: ServerSocket = null
private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
private var serverPort: Int = 0
private var master: ExecutedFileCommand = null
private var masterSocket: Socket = null
private var masterOutputStream: DataOutputStream = null
private var masterInputStream: DataInputStream = null
private var workers = new mutable.WeakHashMap[Socket, Long]()
/* ----------------------------------------------------------------------------------------------
* Create a new worker, but first check whether the SocketServer and master process exist.
* If not, they will be created. Worker creation gets 2 chances.
*/
def create(env: SparkEnv): (Socket, Long) = {
synchronized {
// Create the server if it hasn't been started
createServer(env)
// Attempt to connect, restart and retry once if it fails
try {
createWorker
} catch {
case exc: SocketException =>
logWarning("Worker unexpectedly quit, attempting to restart")
createWorker
}
}
}
/* ----------------------------------------------------------------------------------------------
* Create a worker through the master process. Returns the new socket and id.
* According to spark.ruby.worker.type the id will be:
* process: PID
* thread: thread object id
*/
def createWorker: (Socket, Long) = {
synchronized {
masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)
var socket = serverSocket.accept()
var id = new DataInputStream(socket.getInputStream).readLong()
workers.put(socket, id)
(socket, id)
}
}
/* ----------------------------------------------------------------------------------------------
* Create the SocketServer and bind it to localhost. The max number of queued connections
* is left at the default. If the server is created without an exception -> create the master.
*/
private def createServer(env: SparkEnv){
synchronized {
// Already running?
if(serverSocket != null && masterSocket != null) {
return
}
try {
// Start the SocketServer for communication
serverSocket = new ServerSocket(0, 0, serverHost)
serverPort = serverSocket.getLocalPort
// Create a master for worker creations
createMaster(env)
} catch {
case e: Exception =>
throw new SparkException("There was a problem with creating a server", e)
}
}
}
/* ----------------------------------------------------------------------------------------------
* At this point the SocketServer must already be created. The master process creates and kills workers.
* Creating workers from Java can be an expensive operation because a new process can
* get a copy of the address space.
*/
private def createMaster(env: SparkEnv){
synchronized {
val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER
val executorOptions = env.conf.get("spark.ruby.executor.options", "")
val commandTemplate = env.conf.get("spark.ruby.executor.command")
val workerType = env.conf.get("spark.ruby.worker.type")
// Where the root of ruby-spark is
var executorLocation = ""
if(isDriver){
// Use worker from current active gem location
executorLocation = env.conf.get("spark.ruby.driver_home")
}
else{
// Use gem installed on the system
try {
val homeCommand = (new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))).run
executorLocation = homeCommand.readLine
} catch {
case e: Exception =>
throw new SparkException("Ruby-spark gem is not installed.", e)
}
}
// Master and worker are saved in GEM_ROOT/lib/spark/worker
executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString
// Create master command
// -C: change worker dir before execution
val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort"
val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))
// Start master
master = masterCommand.run
// Redirect master stdout and stderr
redirectStreamsToStderr(master.getInputStream, master.getErrorStream)
// Wait for it to connect to our socket
serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)
try {
// Use the socket for communication. Keep stdout and stderr for logs
masterSocket = serverSocket.accept()
masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)
masterInputStream = new DataInputStream(masterSocket.getInputStream)
PythonRDD.writeUTF(executorOptions, masterOutputStream)
} catch {
case e: Exception =>
throw new SparkException("Ruby master did not connect back in time", e)
}
}
}
/* ----------------------------------------------------------------------------------------------
* Get all environment variables for the executor
*/
def getEnvVars(env: SparkEnv): Map[String, String] = {
val prefix = "spark.ruby.executor.env."
env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}
.map{case (k, v) => (k.substring(prefix.length), v)}
.toMap
}
/* ------------------------------------------------------------------------------------------- */
def kill(workerId: Long){
masterOutputStream.writeInt(RubyConstant.KILL_WORKER)
masterOutputStream.writeLong(workerId)
}
/* ------------------------------------------------------------------------------------------- */
def killAndWait(workerId: Long){
masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)
masterOutputStream.writeLong(workerId)
// Wait for answer
masterInputStream.readInt() match {
case RubyConstant.SUCCESSFULLY_KILLED =>
logInfo(s"Worker $workerId was successfully killed")
case RubyConstant.UNSUCCESSFUL_KILLING =>
logInfo(s"Worker $workerId cannot be killed (maybe is already killed)")
}
}
/* ----------------------------------------------------------------------------------------------
* The workers HashMap is weak, but it avoids a long list of workers which cannot be killed (killAndWait)
*/
def remove(worker: Socket, workerId: Long){
try {
workers.remove(worker)
} catch {
case e: Exception => logWarning(s"Worker $workerId does not exist (maybe it was already removed)")
}
}
/* ------------------------------------------------------------------------------------------- */
def stopServer{
synchronized {
// Kill workers
workers.foreach { case (socket, id) => killAndWait(id) }
// Kill master
master.destroy
// Stop SocketServer
serverSocket.close()
// Clean variables
serverSocket = null
serverPort = 0
master = null
masterSocket = null
masterOutputStream = null
masterInputStream = null
}
}
/* ------------------------------------------------------------------------------------------- */
private def redirectStreamsToStderr(streams: InputStream*) {
try {
for(stream <- streams) {
new RedirectThread(stream, System.err, "stream reader").start()
}
} catch {
case e: Exception =>
logError("Exception in redirecting streams", e)
}
}
/* ------------------------------------------------------------------------------------------- */
}
================================================
FILE: ext/spark/src/test/scala/MarshalSpec.scala
================================================
package org.apache.spark.api.ruby.marshal
import org.scalatest._
import org.apache.spark.api.ruby.marshal._
class MarshalSpec extends FunSpec with Matchers {
// ====================================================================================
// Load
describe("Marshal.load"){
describe("single value"){
it("int"){
val data = 1
val serialized = Array[Byte](4, 8, 105, 6)
Marshal.load(serialized) should equal(data)
}
it("double"){
val data = 1.2
val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)
Marshal.load(serialized) should equal(data)
}
}
describe("array"){
it("ints"){
val data = Array(1, 2, 3, 4, 5)
val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
Marshal.load(serialized) should equal(data)
}
it("doubles"){
val data = Array(1.1, 2.2, 3.3)
val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
Marshal.load(serialized) should equal(data)
}
}
}
// ====================================================================================
// Dump
describe("Marshal.dump"){
describe("single value"){
it("int"){
val data = 1
val serialized = Array(4, 8, 105, 6)
Marshal.dump(data) should equal(serialized)
}
it("double"){
val data = 1.2
val serialized = Array(4, 8, 102, 8, 49, 46, 50)
Marshal.dump(data) should equal(serialized)
}
}
describe("array"){
it("ints"){
val data = Array(1, 2, 3, 4, 5)
val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
Marshal.dump(data) should equal(serialized)
}
it("doubles"){
val data = Array(1.1, 2.2, 3.3)
val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
Marshal.dump(data) should equal(serialized)
}
}
}
}
================================================
FILE: lib/ruby-spark.rb
================================================
require_relative 'spark'
================================================
FILE: lib/spark/accumulator.rb
================================================
module Spark
##
# A shared variable that can be accumulated, i.e., has a commutative and associative "add"
# operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=`
# operator, but only the driver program is allowed to access its value, using value.
# Updates from the workers get propagated automatically to the driver program.
#
# == Arguments:
# value::
# Initial value for the accumulator. This value is stored only on the driver process
#
# accum_param::
# How to merge 2 values on the worker or driver process.
# Symbol or Proc (or String)
#
# zero_value::
# Initial value for worker process
#
#
# == Examples:
#
# accum1 = $sc.accumulator(1)
# accum2 = $sc.accumulator(2, :*, 1)
# accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max})
#
# accum1 += 1
#
# accum2.add(2)
# accum2.add(2)
# accum2.add(2)
#
# accum3.add(9)
# accum3.add(6)
# accum3.add(7)
#
# accum1.value # => 2
# accum2.value # => 16
# accum3.value # => 9
#
# func = Proc.new do |_, index|
# accum1.add(1)
# accum2.add(2)
# accum3.add(index * 10)
# end
#
# rdd = $sc.parallelize(0..4, 4)
# rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
# rdd = rdd.map_partitions_with_index(func)
# rdd.collect
#
# accum1.value # => 6
# accum2.value # => 256
# accum3.value # => 30
#
class Accumulator
attr_reader :id, :value, :accum_param, :zero_value
@@instances = {}
@@changed = []
SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**]
# =========================================================================
# Creating and selecting Spark::Accumulator
def initialize(value, accum_param=:+, zero_value=0)
@id = object_id
@value = value
@accum_param = accum_param
@zero_value = zero_value
@driver = true
valid_accum_param
@@instances[@id] = self
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{ ID: #{@id}\n}
result << %{ Zero: #{@zero_value.to_s[0, 10]}\n}
result << %{Value: #{@value.to_s[0, 10]}>}
result
end
def self.changed
@@changed
end
def self.instances
@@instances
end
def valid_accum_param
if @accum_param.is_a?(Symbol)
raise Spark::AccumulatorError, "Unsupported symbol #{@accum_param}" unless SUPPORTED_SYMBOLS.include?(@accum_param)
@serialized_accum_param = @accum_param
return
end
if @accum_param.is_a?(Proc)
begin
@serialized_accum_param = @accum_param.to_source
return
rescue
raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.'
end
end
if @accum_param.is_a?(String)
@serialized_accum_param = @accum_param
@accum_param = eval(@accum_param)
unless @accum_param.is_a?(Proc)
raise Spark::SerializeError, 'Your param is not a Proc.'
end
return
end
raise Spark::AccumulatorError, 'Unsupported param. Use Symbol, Proc or String.'
end
# Driver process or worker
def driver?
@driver
end
# =========================================================================
# Operations
def add(term)
if !driver? && !@@changed.include?(self)
@@changed << self
end
if @accum_param.is_a?(Proc)
@value = @accum_param.call(@value, term)
else
add_by_symbol(term)
end
end
def +(term)
add(term)
self
end
def add_by_symbol(term)
case @accum_param
when :+
@value += term
when :-
@value -= term
when :*
@value *= term
when :/
@value /= term
when :**
@value **= term
end
end
# =========================================================================
# Dump and load
def marshal_dump
[@id, @zero_value, @serialized_accum_param]
end
def marshal_load(array)
@id, @zero_value, @serialized_accum_param = array
@value = @zero_value
@driver = false
load_accum_param
end
def load_accum_param
if @serialized_accum_param.is_a?(String)
@accum_param = eval(@serialized_accum_param)
else
@accum_param = @serialized_accum_param
end
end
end
end
# =============================================================================
# Server for handling Accumulator updates
#
module Spark
class Accumulator
class Server
attr_reader :server, :host, :port
def self.start
@instance ||= Spark::Accumulator::Server.new
end
def self.stop
@instance && @instance.stop
end
def self.host
start
@instance.host
end
def self.port
start
@instance.port
end
def initialize
@server = TCPServer.new(0)
@host = @server.hostname
@port = @server.port
@threads = []
handle_accept
end
def stop
@threads.each(&:kill)
rescue
nil
end
def handle_accept
@threads << Thread.new do
loop {
handle_connection(@server.accept)
}
end
end
def handle_connection(socket)
@threads << Thread.new do
until socket.closed?
count = socket.read_int
count.times do
data = socket.read_data
accum = Spark::Accumulator.instances[data[0]]
if accum
accum.add(data[1])
else
Spark.logger.warn("Accumulator with id #{data[0]} does not exist.")
end
end
# http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
# socket.write_int(Spark::Constant::ACCUMULATOR_ACK)
end
end
end
end
end
end
================================================
FILE: lib/spark/broadcast.rb
================================================
module Spark
##
# Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
# object for reading it in distributed functions. The variable will
# be sent to each cluster only once.
#
# == Example:
#
# broadcast1 = $sc.broadcast('a')
# broadcast2 = $sc.broadcast('b')
# broadcast3 = $sc.broadcast([1,2,3])
#
# func = Proc.new do |part, index|
# [
# broadcast1.value * index,
# broadcast2.value * index,
# broadcast3.value.reduce(:+)
# ]
# end
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3)
# rdd = rdd.map_partitions_with_index(func)
# rdd.collect
# # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6]
#
class Broadcast
LOADED = 0 # id, value, path
NOT_LOADED = 1 # id, path
WITHOUT_PATH = 2 # id
attr_reader :id, :state, :path, :jbroadcast
@@registered = {}
# =========================================================================
# Creating broadcast for SparkContext
# Create new Broadcast and dump value to the disk
#
# b = $sc.broadcast('a')
#
# b.value # => 'a'
# b.path
# b.jbroadcast
#
def initialize(sc, value)
@id = object_id
@value = value
@state = LOADED
file = Tempfile.create('broadcast', sc.temp_dir)
file.binmode
file.write(Marshal.dump(value))
file.close
@path = file.path
@jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id))
ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{ ID: #{@id}\n}
result << %{Value: #{@value.to_s[0, 10]}>}
result
end
def self.register(id, path)
@@registered[id] = path
end
def value
case state
when LOADED
@value
when NOT_LOADED
@value = Marshal.load(File.read(@path))
@state = LOADED
@value
when WITHOUT_PATH
@path = @@registered[id]
if @path
@state = NOT_LOADED
value
else
raise Spark::BroadcastError, "Broadcast #{@id} does not have a registered path."
end
end
end
def marshal_dump
@id
end
def marshal_load(id)
@id = id
@state = WITHOUT_PATH
end
end
end
================================================
FILE: lib/spark/build.rb
================================================
module Spark
module Build
DEFAULT_SCALA_VERSION = '2.10.4'
DEFAULT_CORE_VERSION = '2.10'
DEFAULT_SPARK_VERSION = '1.6.0'
DEFAULT_HADOOP_VERSION = '1.0.4'
SBT = 'sbt/sbt'
SBT_DEPS = 'assemblyPackageDependency'
SBT_EXT = 'package'
SBT_CLEAN = 'clean'
def self.build(options={})
scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION
spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION
spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION
hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION
target = options[:target] || Spark.target_dir
only_ext = options[:only_ext] || false
env = {
'SCALA_VERSION' => scala_version,
'SPARK_VERSION' => spark_version,
'SPARK_CORE_VERSION' => spark_core_version,
'HADOOP_VERSION' => hadoop_version,
'TARGET_DIR' => target
}
cmd = [SBT]
cmd << SBT_EXT
cmd << SBT_DEPS unless only_ext
cmd << SBT_CLEAN unless $DEBUG
Dir.chdir(Spark.spark_ext_dir) do
unless Kernel.system(env, cmd.join(' '))
raise Spark::BuildError, 'Spark cannot be assembled.'
end
end
end
end
end
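A minimal sketch of calling the builder directly (it is normally reached through the `ruby-spark build` CLI command); the option keys map onto the environment variables passed to sbt, and the values shown are only illustrative:

require 'ruby-spark'

Spark::Build.build(
  spark_version:  '1.6.0',
  hadoop_version: '1.0.4',
  target:         'target',   # directory where ruby-spark.jar ends up
  only_ext:       false       # false => also run assemblyPackageDependency
)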
================================================
FILE: lib/spark/cli.rb
================================================
require 'commander'
module Commander
module UI
# Disable paging
# for 'classic' help
def self.enable_paging
end
end
end
module Spark
class CLI
include Commander::Methods
# IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
# IRB_HISTORY_SIZE = 100
def run
program :name, 'RubySpark'
program :version, Spark::VERSION
program :description, 'Ruby wrapper for Spark'
global_option('-d', '--debug', 'Log messages to stdout'){ $DEBUG = true }
default_command :help
# Build ---------------------------------------------------------------
command :build do |c|
c.syntax = 'build [options]'
c.description = 'Build spark and gem extensions'
c.option '--hadoop-version STRING', String, 'Version of Hadoop which will be assembled with Spark'
c.option '--spark-core-version STRING', String, 'Version of Spark core'
c.option '--spark-version STRING', String, 'Version of Spark'
c.option '--scala-version STRING', String, 'Version of Scala'
c.option '--target STRING', String, 'Directory where Spark will be stored'
c.option '--only-ext', 'Build only extension for RubySpark'
c.action do |args, options|
Spark::Build.build(options.__hash__)
puts
puts 'Everything is OK'
end
end
alias_command :install, :build
# Shell -----------------------------------------------------------------
command :shell do |c|
c.syntax = 'shell [options]'
c.description = 'Start a Ruby shell for Spark'
c.option '--target STRING', String, 'Directory where Spark is stored'
c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
c.option '--[no-]start', 'Start Spark immediately'
c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
c.option '--auto-reload', 'Autoreload changed files'
c.action do |args, options|
options.default start: true, logger: true
Spark.load_lib(options.target)
Spark.logger.disable unless options.logger
Spark.config do
set_app_name 'RubySpark'
end
Spark.config.from_file(options.properties_file)
if options.auto_reload
require 'listen'
listener = Listen.to(File.join(Spark.root, 'lib')) do |modified, added, removed|
(modified+added).each do |file|
silence_warnings { load(file) }
end
end
listener.start
end
if options.start
# Load Java and Spark
Spark.start
$sc = Spark.context
Spark.print_logo('Spark context is loaded as $sc')
else
Spark.print_logo('You can start Spark with Spark.start')
end
# Load Pry
require 'pry'
Pry.start
end
end
# # IRB -------------------------------------------------------------------
# command :irb do |c|
# c.syntax = 'irb [options]'
# c.description = 'Start ruby shell for spark'
# c.option '--spark-home STRING', String, 'Directory where Spark is stored'
# c.option '--[no-]start', 'Start Spark immediately'
# c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
#
# c.action do |args, options|
# options.default start: true, logger: true
#
# Spark.load_lib(options.spark_home)
# Spark::Logger.disable unless options.logger
#
# Spark.config do
# set_app_name 'Pry RubySpark'
# end
#
# if options.start
# # Load Java and Spark
# Spark.start
# $sc = Spark.context
#
# Spark.print_logo('Spark context is loaded as $sc')
# else
# Spark.print_logo('You can start Spark with Spark.start')
# end
#
# # Load IRB
# require 'irb'
# require 'irb/completion'
# require 'irb/ext/save-history'
#
# begin
# file = File.expand_path(IRB_HISTORY_FILE)
# if File.exists?(file)
# lines = IO.readlines(file).collect { |line| line.chomp }
# Readline::HISTORY.push(*lines)
# end
# Kernel.at_exit do
# lines = Readline::HISTORY.to_a.reverse.uniq.reverse
# lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
# File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
# end
# rescue
# end
#
# ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
# ARGV.concat ['--readline', '--prompt-mode', 'simple']
# IRB.start
# end
# end
# Home ------------------------------------------------------------------
command :home do |c|
c.action do |args, options|
puts Spark.home
exit(0)
end
end
# Ruby spark jar --------------------------------------------------------
command :ruby_spark_jar do |c|
c.action do |args, options|
puts Spark.ruby_spark_jar
exit(0)
end
end
run!
end
end
end
================================================
FILE: lib/spark/command/base.rb
================================================
##
# Spark::Command::Base
#
# Parent for all commands (Map, FlatMap, Sort, ...)
#
class Spark::Command::Base
DEFAULT_VARIABLE_OPTIONS = {
type: Hash,
function: true
}
def initialize(*args)
settings.variables.each do |name, options|
instance_variable_set("@#{name}", args.shift)
end
end
def to_s
self.class.name.split('::').last
end
def self.error(message)
raise Spark::CommandError, message
end
def error(message)
self.class.error(message)
end
def log(message=nil)
$stdout.puts %{==> #{Time.now.strftime("%H:%M:%S")} [#{self.class.name}] #{message}}
$stdout.flush
end
# ===============================================================================================
# Methods called during class loading
# This is not the nicest way, but these methods set/get class variables for the child classes
# Settings for command (variables)
def self.settings
init_settings
class_variable_get(:@@settings)
end
def settings
self.class.settings
end
# Init empty settings
def self.init_settings
if !class_variable_defined?(:@@settings)
struct = Struct.new(:variables)
class_variable_set(:@@settings, struct.new)
settings.variables = {}
end
end
# New variable for command
#
# == Example:
#
# class Map < Spark::Command::Base
# variable :map_function
# end
#
# command = Map.new(1)
#
# command.instance_variables
# # => [:@map_function]
# command.instance_variable_get(:@map_function)
# # => 1
#
def self.variable(name, options={})
if settings.variables.has_key?(name)
error "Function #{name} already exist."
end
settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options)
end
# ===============================================================================================
# Executing methods
# Execute command for data and split index
def execute(iterator, split_index)
# Implemented on Base but can be overridden
before_run
# Run has to be implemented on the child class
if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run)
return lazy_run(iterator, split_index)
end
iterator = iterator.to_a
run(iterator, split_index)
end
def prepared?
!!@prepared
end
# This is called before execution. Execution will be stopped if
# some command contains an error (e.g. a badly serialized lambda).
#
# == What does it do?
# * evaluates lambdas
# * evaluates methods
# * makes new lambdas
#
def prepare
return if prepared?
to_function = settings.variables.select {|_, options| options[:function]}
to_function.each do |name, options|
name = "@#{name}"
data = instance_variable_get(name)
case data[:type]
when 'proc'
result = eval(data[:content])
when 'symbol'
result = lambda(&data[:content])
when 'method'
# Method must be added to the instance, not the Class
instance_eval(data[:content])
# Method will be available as Proc
result = lambda(&method(data[:name]))
end
instance_variable_set(name, result)
end
@prepared = true
end
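# == Example (a hedged sketch of what prepare does on the worker):
#
# A function arrives as a plain Hash produced by
# Spark::CommandBuilder#serialize_function and is turned back into
# something callable:
#
#   command = Spark::Command::Map.new({type: 'proc', content: 'lambda{|x| x * 2}'})
#   command.prepare
#   command.instance_variable_get(:@map_function).call(3)
#   # => 6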
# This method is called before every execution.
def before_run
end
# ===============================================================================================
# Bound objects
attr_accessor :__objects__
def method_missing(method, *args, &block)
if __objects__ && __objects__.has_key?(method)
return __objects__[method]
end
super
end
end
================================================
FILE: lib/spark/command/basic.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Map
class Spark::Command::Map < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |item|
@map_function.call(item)
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
@map_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# FlatMap
class Spark::Command::FlatMap < Spark::Command::Map
def run(iterator, *)
iterator = super
iterator.flatten!(1)
iterator
end
def lazy_run(iterator, *)
iterator.flat_map do |item|
@map_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# MapPartitionsWithIndex
class Spark::Command::MapPartitionsWithIndex < _Base
variable :partition_function
def run(iterator, index)
iterator = @partition_function.call(iterator, index)
iterator
end
# The user should control whether there is an Enumerator or not
# alias_method :lazy_run, :run
end
# -------------------------------------------------------------------------------------------------
# MapPartitions
class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex
def run(iterator, *)
# Do not use `super` because `@partition_function` can be method with 1 argument
iterator = @partition_function.call(iterator)
iterator
end
# alias_method :lazy_run, :run
end
# -------------------------------------------------------------------------------------------------
# Filter
class Spark::Command::Filter < _Base
variable :filter_function
def run(iterator, *)
iterator.select! do |item|
@filter_function.call(item)
end
iterator
end
def lazy_run(iterator, *)
iterator.select do |item|
@filter_function.call(item)
end
end
end
# -------------------------------------------------------------------------------------------------
# Compact
class Spark::Command::Compact < _Base
def run(iterator, *)
iterator.compact!
iterator
end
def lazy_run(iterator, *)
iterator.select do |item|
!item.nil?
end
end
end
# -------------------------------------------------------------------------------------------------
# Glom
class Spark::Command::Glom < _Base
def run(iterator, *)
[iterator]
end
def lazy_run(iterator, *)
run(iterator.to_a)
end
end
# -------------------------------------------------------------------------------------------------
# Shuffle
class Spark::Command::Shuffle < _Base
variable :seed, function: false, type: Integer
def run(iterator, *)
iterator.shuffle!(random: rng)
iterator
end
def rng
Random.new(@seed)
end
end
# -------------------------------------------------------------------------------------------------
# PartitionBy
class Spark::Command::PartitionBy
class Base < Spark::Command::Base
include Spark::Helper::Serialize
def prepare
super
# Default. Keep it after super because Sorting has its own key_function.
@key_function ||= lambda{|x| x[0]}
end
def run(iterator, *)
iterator.map! do |item|
make_partition_item(item)
end
iterator.flatten!(1)
iterator
end
def lazy_run(iterator, *)
iterator.flat_map do |item|
make_partition_item(item)
end
end
private
def make_partition_item(item)
[
pack_long(@partition_func.call(@key_function[item])),
item
]
end
end
class Basic < Base
variable :partition_func
end
class Sorting < Base
variable :key_function
variable :bounds, function: false, type: Array
variable :ascending, function: false, type: [TrueClass, FalseClass]
variable :num_partitions, function: false, type: Numeric
def prepare
super
# Index by the bisect algorithm
@partition_func ||= Proc.new do |key|
count = 0
@bounds.each{|i|
break if i >= key
count += 1
}
if @ascending
count
else
@num_partitions - 1 - count
end
end
end
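# == Example (a hedged sketch of the default @partition_func above):
#
#   With @bounds = [3, 7], @ascending = true and @num_partitions = 3,
#   keys <= 3 go to partition 0, keys 4..7 to partition 1 and
#   keys > 7 to partition 2. With @ascending = false the partition
#   indexes are reversed.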
end # Sorting
end # PartitionBy
# -------------------------------------------------------------------------------------------------
# Aggregate
class Spark::Command::Aggregate < _Base
variable :reduce_func
variable :zero_value, function: false, type: Object
def run(iterator, *)
[iterator.reduce(@zero_value, &@reduce_func)]
end
def lazy_run(iterator, *)
run(iterator)
end
end
# -------------------------------------------------------------------------------------------------
# Reduce
class Spark::Command::Reduce < Spark::Command::Aggregate
def run(iterator, *)
[iterator.reduce(&@reduce_func)]
end
end
# -------------------------------------------------------------------------------------------------
# Foreach
class Spark::Command::Foreach < _Base
variable :each_function
def run(iterator, *)
iterator.each do |item|
@each_function.call(item)
end
nil
end
end
# -------------------------------------------------------------------------------------------------
# ForeachPartition
class Spark::Command::ForeachPartition < _Base
variable :partition_function
def run(iterator, *)
@partition_function.call(iterator)
nil
end
end
# -------------------------------------------------------------------------------------------------
# KeyBy
class Spark::Command::KeyBy < _Base
variable :key_function
def run(iterator, *)
iterator.map! do |item|
[@key_function.call(item), item]
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
[@key_function.call(item), item]
end
end
end
# -------------------------------------------------------------------------------------------------
# Take
class Spark::Command::Take < _Base
variable :total, function: false, type: Numeric
variable :last_part, function: false, type: Numeric
def run(iterator, index)
if index == @last_part && iterator.size > @total
return iterator.slice!(0, @total)
end
iterator
end
end
# -------------------------------------------------------------------------------------------------
# Pipe
class Spark::Command::Pipe < _Base
variable :cmds, function: false, type: Array
def before_run
require 'open3'
@in, @out, @threads = Open3.pipeline_rw(*@cmds)
end
def run(iterator, *)
create_writing_thread(iterator)
new_iterator = []
# Read full input
begin
loop {
new_iterator << @out.readline.rstrip
}
rescue EOFError
end
new_iterator
end
def lazy_run(iterator, *)
create_writing_thread(iterator)
Enumerator::Lazy.new([nil]) do |yielder, _|
begin
loop {
yielder << @out.readline.rstrip
}
rescue EOFError
end
end
end
private
def create_writing_thread(iterator)
@writing_thread = Thread.new do
# Send complete iterator to the pipe
iterator.each do |item|
@in.puts(item.to_s.rstrip)
end
# Input must be closed for EOFError
@in.close
end
end
end
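# == Example (a hedged sketch; assumes RDD#pipe builds this command):
#
#   $sc.parallelize(['a', 'b', 'c']).pipe('cat', "tr 'a-z' 'A-Z'").collect
#   # => ["A", "B", "C"]
#
# Each item is written to the first process, the processes are chained
# with Open3.pipeline_rw and every output line becomes one item.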
================================================
FILE: lib/spark/command/pair.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# CombineByKey
class Spark::Command::CombineByKey
# ---------------
class Base < Spark::Command::Base
def run(iterator, *)
_run(iterator).to_a
end
def lazy_run(iterator, *)
_run(iterator).lazy
end
end
# ---------------
class Combine < Base
variable :create_combiner
variable :merge_value
def _run(iterator)
# Do not use combiners[key] ||= ...
# because it tests for nil, not has_key?
combiners = {}
iterator.each do |key, value|
if combiners.has_key?(key)
combiners[key] = @merge_value.call(combiners[key], value)
else
combiners[key] = @create_combiner.call(value)
end
end
combiners
end
end
# ---------------
class Merge < Base
variable :merge_combiners
def _run(iterator, *)
combiners = {}
iterator.each do |key, value|
if combiners.has_key?(key)
combiners[key] = @merge_combiners.call(combiners[key], value)
else
combiners[key] = value
end
end
combiners
end
end
# ---------------
class CombineWithZero < Base
variable :zero_value, function: false, type: Object
variable :merge_value
def _run(iterator)
# Do not use combiners[key] ||= ...
# because it tests for nil, not has_key?
combiners = {}
iterator.each do |key, value|
unless combiners.has_key?(key)
combiners[key] = @zero_value
end
combiners[key] = @merge_value.call(combiners[key], value)
end
combiners
end
end
# ---------------
end
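# == Example (a hedged sketch of the combiner semantics above):
#
#   create_combiner = lambda{|value| [value]}
#   merge_value     = lambda{|combiner, value| combiner << value}
#
#   Applying Combine to [['a', 1], ['b', 2], ['a', 3]] within one partition
#   builds {'a' => [1, 3], 'b' => [2]}; Merge then joins the per-partition
#   hashes with merge_combiners on the reduce side.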
# -------------------------------------------------------------------------------------------------
# MapValues
class Spark::Command::MapValues < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |item|
item[1] = @map_function.call(item[1])
item
end
iterator
end
def lazy_run(iterator, *)
iterator.map do |item|
item[1] = @map_function.call(item[1])
item
end
end
end
# -------------------------------------------------------------------------------------------------
# FlatMapValues
class Spark::Command::FlatMapValues < _Base
variable :map_function
def run(iterator, *)
iterator.map! do |(key, values)|
values = @map_function.call(values)
values.flatten!(1)
values.map! do |value|
[key, value]
end
end
iterator.flatten!(1)
iterator
end
end
================================================
FILE: lib/spark/command/sort.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Sort
class Spark::Command::SortByKey < _Base
variable :key_function
variable :ascending, function: false, type: [TrueClass, FalseClass]
variable :spilling, function: false, type: [TrueClass, FalseClass]
variable :memory, function: false, type: [Numeric, NilClass]
variable :serializer, function: false, type: Spark::Serializer::Base
# Currently disabled
def before_run
@spilling = false
end
def run(iterator, _)
if @spilling
iterator = run_with_spilling(iterator.each)
else
run_without_spilling(iterator)
end
iterator
end
def run_with_enum(iterator, _)
if @spilling
iterator = run_with_spilling(iterator)
else
iterator = iterator.to_a
run_without_spilling(iterator)
end
iterator
end
private
def run_with_spilling(iterator)
sorter = Spark::ExternalSorter.new(@memory, @serializer)
sorter.sort_by(iterator, @ascending, @key_function)
end
def run_without_spilling(iterator)
iterator.sort_by!(&@key_function)
iterator.reverse! unless @ascending
end
end
================================================
FILE: lib/spark/command/statistic.rb
================================================
_Base = Spark::Command::Base
# -------------------------------------------------------------------------------------------------
# Sample
class Spark::Command::Sample < _Base
variable :with_replacement, function: false, type: [TrueClass, FalseClass]
variable :fraction, function: false, type: Numeric
variable :seed, function: false, type: [NilClass, Numeric]
def run(iterator, _)
sampler.sample(iterator)
end
def lazy_run(iterator, _)
sampler.lazy_sample(iterator)
end
def sampler
@sampler ||= _sampler
end
def _sampler
if @with_replacement
sampler = Spark::Sampler::Poisson
else
sampler = Spark::Sampler::Uniform
end
sampler = sampler.new(@fraction, @seed)
end
end
# -------------------------------------------------------------------------------------------------
# Stats
class Spark::Command::Stats < _Base
def run(iterator, *)
[Spark::StatCounter.new(iterator)]
end
def lazy_run(iterator, *)
run(iterator)
end
end
# -------------------------------------------------------------------------------------------------
# Histogram
class Spark::Command::Histogram < _Base
include Spark::Helper::Statistic
variable :even, function: false, type: [TrueClass, FalseClass]
variable :buckets, function: false, type: Array
def run(iterator, *)
counters = Array.new(counter_size) { 0 }
iterator.each do |item|
if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min
next
end
x = bucket_function.call(item)
if x.nil?
# next
else
counters[x] += 1
end
end
[counters]
end
def lazy_run(iterator, *)
run(iterator)
end
private
def min
@buckets.first
end
def max
@buckets.last
end
def counter_size
@buckets.size-1
end
def increment
@buckets[1]-@buckets[0]
end
# Decide which bucket function to pass. We decide here rather than having
# a general function so that the decision need only be made once.
def bucket_function
@bucket_function ||= _bucket_function
end
def _bucket_function
if @even
fast_bucket_function
else
basic_bucket_function
end
end
# Determine the bucket function in constant time.
# Requires that buckets are evenly spaced
def fast_bucket_function
Proc.new do |item|
if item.is_a?(Float) && item.nan?
nil
else
bucket_number = (item - min)/increment
if bucket_number > counter_size || bucket_number < 0
nil
else
[bucket_number.to_i, counter_size-1].min
end
end
end
end
# Basic bucket function. Same as right bisect.
def basic_bucket_function
Proc.new do |item|
bucket_number = bisect_right(@buckets, item) - 1
# Counters is @buckets.size - 1
# [bucket_number, counter_size-1].min
if bucket_number > counter_size-1
counter_size-1
else
bucket_number
end
end
end
end
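# == Example (a hedged sketch):
#
#   With @buckets = [0, 5, 10] and @even = true, counter_size is 2 and
#   increment is 5. For the items [1, 6, 6, 10, 11] the result is [[1, 3]]:
#   1 falls into the first bucket, 6, 6 and 10 into the second (the last
#   bucket is right-inclusive) and 11 is skipped because it is greater
#   than max.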
================================================
FILE: lib/spark/command.rb
================================================
module Spark
##
# Container which includes all commands and other things for the worker.
# Every RDD has its own copy of Command.
#
class Command
attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects
def initialize
@serializer = nil
@deserializer = nil
@commands = []
@libraries = []
@bound_objects = {}
end
def execute(iterator, split_index)
# Require necessary libraries
libraries.each{|lib| require lib}
# Prepare bound objects
@commands.each do |command|
command.__objects__ = bound_objects
end
# Prepare for running
@commands.each(&:prepare)
# Run all tasks
@commands.each do |command|
iterator = command.execute(iterator, split_index)
end
# Return the changed iterator. This is not necessary for tasks
# that change the iterator in place, but some tasks can return
# only one value (for example reduce).
iterator
end
def last
@commands.last
end
def bound_objects
# Objects bound by the user.
# Return them directly if they are already initialized on the worker.
return @bound_objects if @bound_objects
if @serialized_bound_objects
# Still serialized
@bound_objects = Marshal.load(@serialized_bound_objects)
else
# Something else
@bound_objects = {}
end
end
# Bound objects can depend on a library which is loaded during execute.
# In that case the worker would raise "undefined class/module" if they were
# deserialized too early, so they stay serialized until bound_objects is called.
def marshal_dump
[@serializer, @deserializer, @commands, @libraries, serialized_bound_objects]
end
def marshal_load(array)
@serializer = array.shift
@deserializer = array.shift
@commands = array.shift
@libraries = array.shift
@serialized_bound_objects = array.shift
end
private
def serialized_bound_objects
@serialized_bound_objects ||= Marshal.dump(@bound_objects)
end
end
end
require 'spark/command/base'
require 'spark/command/basic'
require 'spark/command/pair'
require 'spark/command/statistic'
require 'spark/command/sort'
================================================
FILE: lib/spark/command_builder.rb
================================================
require 'spark/command_validator'
module Spark
##
# Builder for building correct {Spark::Command}
#
class CommandBuilder
extend Forwardable
include Spark::Helper::Serialize
include Spark::Helper::System
include Spark::CommandValidator
attr_reader :command
def_delegators :@command, :serializer, :serializer=, :deserializer, :deserializer=, :commands,
:commands=, :libraries, :libraries=, :bound_objects, :bound_objects=
def initialize(serializer, deserializer=nil)
create_command
self.serializer = serializer
self.deserializer = deserializer || serializer.dup
end
def create_command
@command = Spark::Command.new
end
# Do not use Marshal.load(Marshal.dump(self)) because some variables
# have marshal_dump prepared for the worker.
def deep_copy
copy = self.dup
copy.create_command
copy.serializer = self.serializer.deep_copy
copy.deserializer = self.deserializer.deep_copy
copy.commands = self.commands.dup
copy.libraries = self.libraries.dup
copy.bound_objects = self.bound_objects.dup
copy
end
# Serialize the Command class for the worker.
# Java uses signed numbers.
def build
unpack_chars(Marshal.dump(@command))
end
def add_command(klass, *args)
variables = klass.settings.variables
validate_size(variables, args)
built_args = []
variables.values.zip(args) do |var, arg|
if var[:function]
arg = serialize_function(arg)
end
validate(arg, var)
built_args << arg
end
comm = klass.new(*built_args)
@command.commands << comm
self
end
def add_library(*libraries)
@command.libraries += libraries
end
def bind(objects)
objects.symbolize_keys!
@command.bound_objects.merge!(objects)
end
private
# A function to be serialized can be given in several forms:
#
# === Func
# * *string:* already serialized proc
# * *proc:* proc
# * *symbol:* name of method
# * *method:* Method class
#
def serialize_function(func)
case func
when String
serialize_function_from_string(func)
when Symbol
serialize_function_from_symbol(func)
when Proc
serialize_function_from_proc(func)
when Method
serialize_function_from_method(func)
else
raise Spark::CommandError, 'You must enter String, Symbol, Proc or Method.'
end
end
def serialize_function_from_string(string)
{type: 'proc', content: string}
end
def serialize_function_from_symbol(symbol)
{type: 'symbol', content: symbol}
end
# Serialize Proc as String
#
# lambda{|x| x*x}.to_source
# # => "proc { |x| (x * x) }"
#
def serialize_function_from_proc(proc)
serialize_function_from_string(proc.to_source)
rescue
raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.'
end
# Serialize method as string
#
# def test(x)
# x*x
# end
# serialize_function_from_method(method(:test))
#
# # => "def test(x)\n x*x\nend\n"
#
def serialize_function_from_method(meth)
if pry?
meth = Pry::Method.new(meth)
end
{type: 'method', name: meth.name, content: meth.source}
rescue
raise Spark::SerializeError, 'Method can not be serialized. Use full path or Proc.'
end
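# == Example (a hedged sketch; the serializer, library and names below are
# only illustrative):
#
#   serializer = Spark::Serializer.build { __batched__(__marshal__) }
#   builder    = Spark::CommandBuilder.new(serializer)
#
#   builder.add_library('bigdecimal')
#   builder.add_command(Spark::Command::Map, 'lambda{|x| x * multiplier}')
#   builder.bind(multiplier: 2)
#
#   bytes = builder.build   # Array of signed chars for the JVM side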
end
end
================================================
FILE: lib/spark/command_validator.rb
================================================
module Spark
module CommandValidator
def validate(value, options)
validate_type(value, options[:type])
end
def valid?(value, options)
begin
validate(value, options)
return true
rescue
return false
end
end
def validate_type(value, types)
types = [types] if !types.is_a?(Array)
types.each do |type|
return if value.is_a?(type)
end
error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
end
def validate_size(array1, array2)
if array1.size != array2.size
error "Wrong number of arguments (#{array1.size} for #{array2.size})"
end
end
end
end
================================================
FILE: lib/spark/config.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
# Common configuration for RubySpark and Spark
class Config
include Spark::Helper::System
TYPES = {
'spark.shuffle.spill' => :boolean,
'spark.ruby.serializer.compress' => :boolean
}
# Initialize java SparkConf and load default configuration.
def initialize
@spark_conf = SparkConf.new(true)
set_default
from_file(Spark::DEFAULT_CONFIG_FILE)
end
def from_file(file)
check_read_only
if file && File.exist?(file)
file = File.expand_path(file)
RubyUtils.loadPropertiesFile(spark_conf, file)
end
end
def [](key)
get(key)
end
def []=(key, value)
set(key, value)
end
def spark_conf
if Spark.started?
# Get latest configuration
Spark.context.jcontext.conf
else
@spark_conf
end
end
def valid!
errors = []
if !contains?('spark.app.name')
errors << 'An application name must be set in your configuration.'
end
if !contains?('spark.master')
errors << 'A master URL must be set in your configuration.'
end
if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
errors << 'Unknown serializer.'
end
scanned = get('spark.ruby.executor.command').scan('%s')
if scanned.size == 0
errors << "Executor command must contain '%s'."
end
if scanned.size > 1
errors << "Executor command can contain only one '%s'."
end
if errors.any?
errors.map!{|error| "- #{error}"}
raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
end
end
def read_only?
Spark.started?
end
# Rescue from NoSuchElementException
def get(key)
value = spark_conf.get(key.to_s)
case TYPES[key]
when :boolean
parse_boolean(value)
when :integer
parse_integer(value)
else
value
end
rescue
nil
end
def get_all
Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
end
def contains?(key)
spark_conf.contains(key.to_s)
end
def set(key, value)
check_read_only
spark_conf.set(key.to_s, value.to_s)
end
def set_app_name(name)
set('spark.app.name', name)
end
def set_master(master)
set('spark.master', master)
end
def parse_boolean(value)
case value
when 'true'
true
when 'false'
false
end
end
def parse_integer(value)
value.to_i
end
# =============================================================================
# Defaults
def set_default
set_app_name('RubySpark')
set_master('local[*]')
set('spark.ruby.driver_home', Spark.home)
set('spark.ruby.serializer', default_serializer)
set('spark.ruby.serializer.compress', default_serializer_compress)
set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
set('spark.ruby.executor.command', default_executor_command)
set('spark.ruby.executor.options', default_executor_options)
set('spark.ruby.worker.type', default_worker_type)
load_executor_envs
# set('spark.ruby.executor.install', default_executor_install)
end
def default_serializer
ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
end
def default_serializer_compress
ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
end
def default_serializer_batch_size
ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
end
# Command template which is applied when Scala wants to create a Ruby
# process (e.g. master, home request). The command is represented by '%s'.
#
# == Example:
# bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
#
def default_executor_command
ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
end
# Options for every worker.
#
# == Example:
# -J-Xmx512m
#
def default_executor_options
ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
end
# # Install command which is triggered before on start.
# # This command using executor command template.
# #
# # == Example:
# # gem install ruby-spark -v 1.2.0
# #
# def default_executor_install
# ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''
# end
# Type of worker.
#
# == Options:
# process:: (default)
# thread:: (experimental)
#
def default_worker_type
ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
end
# Load environment variables for executor from ENV.
#
# == Examples:
# SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
# SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
#
def load_executor_envs
prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
envs = ENV.select{|key, _| key.start_with?(prefix)}
envs.each do |key, value|
key = key.dup # ENV keys are frozen
key.slice!(0, prefix.size)
set("spark.ruby.executor.env.#{key}", value)
end
end
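# == Example (a hedged sketch; the app name and values are only illustrative):
#
#   Spark.config do
#     set_app_name 'MyApp'
#     set_master   'local[2]'
#     set 'spark.ruby.serializer.batch_size', 512
#   end
#
#   Spark.config['spark.app.name']   # => "MyApp"
#   Spark.start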
# Aliases
alias_method :getAll, :get_all
alias_method :setAppName, :set_app_name
alias_method :setMaster, :set_master
private
def check_read_only
if read_only?
raise Spark::ConfigurationError, 'Configuration is read only'
end
end
end
end
================================================
FILE: lib/spark/constant.rb
================================================
module Spark
# Common constants for Ruby and Spark
module Constant
DATA_EOF = -2
WORKER_ERROR = -1
WORKER_DONE = 0
CREATE_WORKER = 1
KILL_WORKER = 2
KILL_WORKER_AND_WAIT = 3
SUCCESSFULLY_KILLED = 4
UNSUCCESSFUL_KILLING = 5
ACCUMULATOR_ACK = 6
end
end
================================================
FILE: lib/spark/context.rb
================================================
# Necessary libraries
Spark.load_lib
module Spark
##
# Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
# cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
#
class Context
include Spark::Helper::System
include Spark::Helper::Parser
include Spark::Helper::Logger
attr_reader :jcontext, :jaccumulator, :temp_dir
# Constructor for the Ruby context. Configuration is automatically taken
# from Spark. Config will be set to defaults if the user starts the
# context first.
#
def initialize
Spark.config.valid!
@jcontext = JavaSparkContext.new(Spark.config.spark_conf)
@jcontext.addJar(Spark.ruby_spark_jar)
# Does not work on 1.2
# ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
spark_local_dir = JUtils.getLocalDir(sc.conf)
@temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
accum_server = Spark::Accumulator::Server
accum_server.start
@jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
log_info("Ruby accumulator server is running on port #{accum_server.port}")
set_call_site('Ruby') # description of stage
end
def inspect
result = %{#<#{self.class.name}:0x#{object_id}\n}
result << %{Tempdir: "#{temp_dir}">}
result
end
def stop
Spark::Accumulator::Server.stop
log_info('Ruby accumulator server was stopped')
@jcontext.stop
end
def sc
@jcontext.sc
end
def ui
sc.ui
end
# Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD)
#
def default_parallelism
sc.defaultParallelism
end
# Default serializer
#
# Batch -> Compress -> Basic
#
def default_serializer
# Basic
serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
# Compress
if config('spark.ruby.serializer.compress')
serializer = Spark::Serializer.compressed(serializer)
end
# Batching
batch_size = default_batch_size
if batch_size == 'auto'
serializer = Spark::Serializer.auto_batched(serializer)
else
serializer = Spark::Serializer.batched(serializer, batch_size)
end
# Finally, "container" contains serializers
serializer
end
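# == Example (a hedged sketch of the chain built above, assuming 'marshal'
# is a registered serializer name):
#
#   basic      = Spark::Serializer.find!('marshal').new
#   compressed = Spark::Serializer.compressed(basic)
#   serializer = Spark::Serializer.batched(compressed, 1024)
#
# With an 'auto' batch size the last step is Spark::Serializer.auto_batched(compressed).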
def default_batch_size
size = config('spark.ruby.serializer.batch_size').to_i
if size >= 1
size
else
'auto'
end
end
# Set a local property that affects jobs submitted from this thread, such as the
# Spark fair scheduler pool.
#
def set_local_property(key, value)
jcontext.setLocalProperty(key, value)
end
# Get a local property set in this thread, or nil if it is missing
#
def get_local_property(key)
jcontext.getLocalProperty(key)
end
# Support function for API backtraces.
#
def set_call_site(site)
jcontext.setCallSite(site)
end
def clear_call_site
jcontext.clearCallSite
end
# Return a copy of this SparkContext's configuration. The configuration *cannot*
# be changed at runtime.
#
def config(key=nil)
if key
Spark.config.get(key)
else
Spark.config
end
end
# Add a file to be downloaded with this Spark job on every node.
# The path of file passed can be either a local file, a file in HDFS
# (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
#
# To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
# filename to find its download location.
#
# == Example:
# `echo 10 > test.txt`
#
# $sc.add_file('test.txt')
# $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
# # => [0, 10, 20, 30, 40, 50]
#
def add_file(*files)
files.each do |file|
sc.addFile(file)
end
end
# Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
# object for reading it in distributed functions. The variable will
# be sent to each node only once.
#
# == Example:
# broadcast1 = $sc.broadcast('a')
# broadcast2 = $sc.broadcast('b')
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
# rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
# rdd.collect
# # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
#
def broadcast(value)
Spark::Broadcast.new(self, value)
end
# Create an Accumulator with the given initial value, using a given
# accum_param helper object to define how to add values of the
# data type if provided.
#
# == Example:
# accum = $sc.accumulator(7)
#
# rdd = $sc.parallelize(0..5, 4)
# rdd = rdd.bind(accum: accum)
# rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
# rdd = rdd.collect
#
# accum.value
# # => 11
#
def accumulator(value, accum_param=:+, zero_value=0)
Spark::Accumulator.new(value, accum_param, zero_value)
end
# Distribute a local Ruby collection to form an RDD.
# Be careful, this method can be slow and it updates the data in place.
#
# == Parameters:
# data:: Range or Array
# num_slices:: number of slice
# serializer:: custom serializer (default: serializer based on configuration)
#
# == Examples:
# $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
# #=> [1, 2, 3]
#
# $sc.parallelize(1..3).map(:to_s).collect
# #=> ["1", "2", "3"]
#
def parallelize(data, num_slices=nil, serializer=nil)
num_slices ||= default_parallelism
serializer ||= default_serializer
serializer.check_each(data)
# Through file
file = Tempfile.new('to_parallelize', temp_dir)
serializer.dump_to_io(data, file)
file.close # not unlink
jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
Spark::RDD.new(jrdd, self, serializer)
ensure
file && file.unlink
end
# Read a text file from HDFS, a local file system (available on all nodes), or any
# Hadoop-supported file system URI, and return it as an RDD of Strings.
#
# == Example:
# f = Tempfile.new("test")
# f.puts("1")
# f.puts("2")
# f.close
#
# $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
# # => [1, 2]
#
def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
min_partitions ||= default_parallelism
serializer ||= default_serializer
deserializer = Spark::Serializer.build { __text__(encoding) }
Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
end
# Read a directory of text files from HDFS, a local file system (available on all nodes), or any
# Hadoop-supported file system URI. Each file is read as a single record and returned in a
# key-value pair, where the key is the path of each file, the value is the content of each file.
#
# == Example:
# dir = Dir.mktmpdir
# f1 = Tempfile.new("test1", dir)
# f2 = Tempfile.new("test2", dir)
# f1.puts("1"); f1.puts("2");
# f2.puts("3"); f2.puts("4");
# f1.close
# f2.close
#
# $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
# # => ["1", "2", "3", "4"]
#
def whole_text_files(path, min_partitions=nil, serializer=nil)
min_partitions ||= default_parallelism
serializer ||= default_serializer
deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
end
# Executes the given partition function f on the specified set of partitions,
# returning the result as an array of elements.
#
# If partitions is not specified, this will run over all partitions.
#
# == Example:
# rdd = $sc.parallelize(0..10, 5)
# $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
# # => ["[0, 1]", "[4, 5]"]
#
def run_job(rdd, f, partitions=nil, allow_local=false)
run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
end
# Execute the given command on specific set of partitions.
#
def run_job_with_command(rdd, partitions, allow_local, command, *args)
if !partitions.nil? && !partitions.is_a?(Array)
raise Spark::ContextError, 'Partitions must be nil or Array'
end
partitions_size = rdd.partitions_size
# Execute all parts
if partitions.nil?
partitions = (0...partitions_size).to_a
end
# Can happen when you use coalesce
partitions.delete_if {|part| part >= partitions_size}
# Rjb represents Fixnum as Integer but JRuby as Long
partitions = to_java_array_list(convert_to_java_int(partitions))
# File for result
file = Tempfile.new('collect', temp_dir)
mapped = rdd.new_rdd_from_command(command, *args)
RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
mapped.collect_from_file(file)
end
# Aliases
alias_method :textFile, :text_file
alias_method :wholeTextFiles, :whole_text_files
alias_method :defaultParallelism, :default_parallelism
alias_method :setLocalProperty, :set_local_property
alias_method :getLocalProperty, :get_local_property
alias_method :setCallSite, :set_call_site
alias_method :clearCallSite, :clear_call_site
alias_method :runJob, :run_job
alias_method :runJobWithCommand, :run_job_with_command
alias_method :addFile, :add_file
end
end
================================================
FILE: lib/spark/error.rb
================================================
module Spark
# Extension cannot be built
class BuildError < StandardError
end
# Proc.to_source
# Java object cannot be converted
class SerializeError < StandardError
end
# Serializer method
# Non-existing serializer
class NotImplemented < StandardError
end
# Missing app_name or master
class ConfigurationError < StandardError
end
# Wrong parameters
class RDDError < StandardError
end
# Validations
class CommandError < StandardError
end
# Parser helper
# SQL DataType
class ParseError < StandardError
end
# Validation in context
class ContextError < StandardError
end
# Broadcasts
# Missing path
class BroadcastError < StandardError
end
# Accumulators
# Existing keys
# Wrong ID
class AccumulatorError < StandardError
end
# Wrong instances
class MllibError < StandardError
end
# Wrong datatype
class SQLError < StandardError
end
# Missing Java class
class JavaBridgeError < StandardError
end
end
================================================
FILE: lib/spark/ext/hash.rb
================================================
module Spark
module CoreExtension
module Hash
module ClassMethods
end
module InstanceMethods
# Destructively convert all keys to strings.
def stringify_keys_with_spark!
transform_keys!{ |key| key.to_s }
end
# Destructively convert all keys to symbols, as long as they respond to to_sym.
def symbolize_keys_with_spark!
transform_keys!{ |key| key.to_sym rescue key }
end
# Destructively convert all keys using the block operations.
# Same as transform_keys but modifies +self+.
def transform_keys_with_spark!
keys.each do |key|
self[yield(key)] = delete(key)
end
self
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :stringify_keys!, :spark
patch_unless_exist :symbolize_keys!, :spark
patch_unless_exist :transform_keys!, :spark
end
end
end
end
end
Hash.__send__(:include, Spark::CoreExtension::Hash)
================================================
FILE: lib/spark/ext/integer.rb
================================================
module Spark
module CoreExtension
module Integer
module ClassMethods
end
module InstanceMethods
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
# Largest Fixnum: (1 << 62) - 1 on 64-bit MRI (parentheses matter, << binds more loosely than -)
const_set :MAX_WITH_SPARK, (1 << (1.size * 8 - 2)) - 1
const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1
path_const_unless_exist :MAX, :SPARK
path_const_unless_exist :MIN, :SPARK
end
end
end
end
end
Integer.__send__(:include, Spark::CoreExtension::Integer)
================================================
FILE: lib/spark/ext/io.rb
================================================
module Spark
module CoreExtension
module IO
module ClassMethods
end
module InstanceMethods
# Reading
def read_int
unpack_int(read(4))
end
def read_int_or_eof
bytes = read(4)
return Spark::Constant::DATA_EOF if bytes.nil?
unpack_int(bytes)
end
def read_long
unpack_long(read(8))
end
def read_string
read(read_int)
end
def read_data
Marshal.load(read_string)
end
# Writing
def write_int(data)
write(pack_int(data))
end
def write_long(data)
write(pack_long(data))
end
# Size and data can have different encodings
# Marshal: both ASCII
# Oj: ASCII and UTF-8
def write_string(data)
write_int(data.bytesize)
write(data)
end
def write_data(data)
write_string(Marshal.dump(data))
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, Spark::Helper::Serialize)
base.send(:include, InstanceMethods)
end
end
end
end
IO.__send__(:include, Spark::CoreExtension::IO)
StringIO.__send__(:include, Spark::CoreExtension::IO)
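# == Example (a hedged round-trip sketch using the methods above):
#
#   io = StringIO.new
#   io.write_int(42)
#   io.write_data([1, 2, 3])
#   io.rewind
#
#   io.read_int    # => 42
#   io.read_data   # => [1, 2, 3]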
================================================
FILE: lib/spark/ext/ip_socket.rb
================================================
module Spark
module CoreExtension
module IPSocket
module ClassMethods
end
module InstanceMethods
def port
addr[1]
end
def hostname
addr(true)[2]
end
def numeric_address
addr[3]
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
end
end
end
end
IPSocket.__send__(:include, Spark::CoreExtension::IPSocket)
================================================
FILE: lib/spark/ext/module.rb
================================================
module Spark
module CoreExtension
module Module
# Patch a method into a class unless it already exists
#
# == Example:
#
# class Hash
# def a
# 1
# end
# end
#
# module HashExtension
# module InstanceMethods
# def a_with_spark
# 2
# end
#
# def b_with_spark
# 1
# end
# end
#
# def self.included(base)
# base.send(:include, InstanceMethods)
# base.class_eval do
# patch_unless_exist :a, :spark
# patch_unless_exist :b, :spark
# end
# end
# end
#
# Hash.include(HashExtension)
#
# Hash.new.a # => 1
# Hash.new.b # => 1
#
def patch_unless_exist(target, suffix)
unless method_defined?(target)
aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1
alias_method target, "#{aliased_target}_with_#{suffix}#{punctuation}"
end
end
def path_const_unless_exist(target, suffix)
unless const_defined?(target)
const_set(target, const_get("#{target}_WITH_#{suffix}"))
end
end
end
end
end
Module.__send__(:include, Spark::CoreExtension::Module)
================================================
FILE: lib/spark/ext/object.rb
================================================
module Spark
module CoreExtension
module Object
module ClassMethods
end
module InstanceMethods
def deep_copy_with_spark
Marshal.load(Marshal.dump(self))
end
def silence_warnings
old_verbose, $VERBOSE = $VERBOSE, nil
yield
ensure
$VERBOSE = old_verbose
end
def cattr_reader_with_spark(*syms)
syms.each do |sym|
raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
@@#{sym} = nil unless defined? @@#{sym}
def self.#{sym}
@@#{sym}
end
EOS
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
def #{sym}
@@#{sym}
end
EOS
end
end
def cattr_writer_with_spark(*syms)
syms.each do |sym|
raise NameError.new("Invalid attribute name: #{sym}") unless sym =~ /^[_A-Za-z]\w*$/
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
@@#{sym} = nil unless defined? @@#{sym}
def self.#{sym}=(obj)
@@#{sym} = obj
end
EOS
class_eval(<<-EOS, __FILE__, __LINE__ + 1)
def #{sym}=(obj)
@@#{sym} = obj
end
EOS
end
end
def cattr_accessor_with_spark(*syms)
cattr_reader_with_spark(*syms)
cattr_writer_with_spark(*syms)
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :deep_copy, :spark
patch_unless_exist :silence_warnings, :spark
patch_unless_exist :cattr_accessor, :spark
end
end
end
end
end
Object.__send__(:include, Spark::CoreExtension::Object)
================================================
FILE: lib/spark/ext/string.rb
================================================
module Spark
module CoreExtension
module String
module ClassMethods
end
module InstanceMethods
def camelize_with_spark
self.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
end
end
def self.included(base)
base.extend(ClassMethods)
base.send(:include, InstanceMethods)
base.class_eval do
patch_unless_exist :camelize, :spark
end
end
end
end
end
String.__send__(:include, Spark::CoreExtension::String)
================================================
FILE: lib/spark/helper/logger.rb
================================================
module Spark
module Helper
module Logger
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def log_info(message)
Spark.logger.info(message)
end
def log_debug(message)
Spark.logger.debug(message)
end
def log_trace(message)
Spark.logger.trace(message)
end
def log_warning(message)
Spark.logger.warning(message)
end
def log_error(message)
Spark.logger.error(message)
end
alias_method :logInfo, :log_info
alias_method :logDebug, :log_debug
alias_method :logTrace, :log_trace
alias_method :logWarning, :log_warning
alias_method :logError, :log_error
end # Methods
end # Logger
end # Helper
end # Spark
================================================
FILE: lib/spark/helper/parser.rb
================================================
module Spark
module Helper
module Parser
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def to_java_hash(hash)
hash_map = HashMap.new
hash.each_pair do |key, value|
begin
# RJB raises "Object is NULL" (but the new record is put correctly)
hash_map.put(key, value)
rescue RuntimeError
end
end
hash_map
end
def convert_to_java_int(data)
if data.is_a?(Array)
data.map{|x| JInteger.new(x)}
else
JInteger.new(data)
end
end
def to_java_array_list(array)
array_list = ArrayList.new
array.each do |item|
array_list.add(item)
end
array_list
end
# Parse and convert a memory size. Shifting would be better but Float doesn't support it.
#
# == Examples:
# to_memory_size("512mb")
# # => 524288
#
# to_memory_size("512 MB")
# # => 524288
#
# to_memory_size("512mb", "GB")
# # => 0.5
#
def to_memory_size(memory, result_unit="KB")
match = memory.match(/([\d]+)[\s]*([\w]*)/)
if match.nil?
raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'"
end
size = match[1].to_f
unit = match[2]
size *= memory_multiplier_based_kb(unit)
size /= memory_multiplier_based_kb(result_unit)
size.round(2)
end
# Multiplier relative to KB
def memory_multiplier_based_kb(type)
case type.to_s.upcase
when "G", "GB"
1048576
when "M", "MB"
1024
when "K", "KB"
1
else
raise Spark::ParseError, "Unsupported type #{type}"
end
end
end # Methods
end # Parser
end # Helper
end # Spark
================================================
FILE: lib/spark/helper/serialize.rb
================================================
module Spark
module Helper
module Serialize
DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'
DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*'
DIRECTIVE_LONG_BIG_ENDIAN = 'q>'
DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*'
DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G'
DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*'
DIRECTIVE_UNSIGNED_CHARS = 'C*'
DIRECTIVE_CHARS = 'c*'
# Packing
def pack_int(data)
[data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN)
end
def pack_long(data)
[data].pack(DIRECTIVE_LONG_BIG_ENDIAN)
end
def pack_double(data)
[data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN)
end
def pack_unsigned_chars(data)
data.pack(DIRECTIVE_UNSIGNED_CHARS)
end
def pack_ints(data)
__check_array(data)
data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN)
end
def pack_longs(data)
__check_array(data)
data.pack(DIRECTIVE_LONGS_BIG_ENDIAN)
end
def pack_doubles(data)
__check_array(data)
data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN)
end
# Unpacking
def unpack_int(data)
data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0]
end
def unpack_long(data)
data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0]
end
def unpack_chars(data)
data.unpack(DIRECTIVE_CHARS)
end
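# == Example (a hedged sketch):
#
#   pack_int(1)
#   # => "\x00\x00\x00\x01"
#
#   unpack_long(pack_long(2**40))
#   # => 1099511627776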
private
def __check_array(data)
unless data.is_a?(Array)
raise ArgumentError, 'Data must be an Array.'
end
end
end
end
end
================================================
FILE: lib/spark/helper/statistic.rb
================================================
module Spark
module Helper
module Statistic
# Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
#
# == How the sampling rate is determined:
# Let p = num / total, where num is the sample size and total is the total number of
# datapoints in the RDD. We're trying to compute q > p such that
# * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q),
# where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total),
# i.e. the failure rate of not having a sufficiently large sample < 0.0001.
# Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for
# num > 12, but we need a slightly larger q (9 empirically determined).
# * when sampling without replacement, we're drawing each datapoint with prob_i
# ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success
# rate, where success rate is defined the same as in sampling with replacement.
#
def compute_fraction(lower_bound, total, with_replacement)
lower_bound = lower_bound.to_f
if with_replacement
upper_poisson_bound(lower_bound) / total
else
fraction = lower_bound / total
upper_binomial_bound(0.00001, total, fraction)
end
end
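# == Example (a hedged sketch):
#
#   compute_fraction(100, 1000, true)
#   # => 0.16   ((100 + 6 * Math.sqrt(100)) / 1000)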
def upper_poisson_bound(bound)
num_std = if bound < 6
12
elsif bound < 16
9
else
6
end.to_f
[bound + num_std * Math.sqrt(bound), 1e-10].max
end
def upper_binomial_bound(delta, total, fraction)
gamma = -Math.log(delta) / total
[1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min
end
# Bisect right
#
# == Examples:
# data = [1,5,6,8,96,120,133]
#
# bisect_right(data, 0) # => 0
# bisect_right(data, 1) # => 1
# bisect_right(data, 5) # => 2
# bisect_right(data, 9) # => 4
# bisect_right(data, 150) # => 7
#
def bisect_right(data, value, low=0, high=data.size)
if low < 0
raise ArgumentError, 'Low must be >= 0.'
end
while low < high
mid = (low + high) / 2
if value < data[mid]
high = mid
else
low = mid + 1
end
end
low
end
# Determine bound of partitioning
#
# == Example:
# data = [0,1,2,3,4,5,6,7,8,9,10]
# determine_bounds(data, 3)
# # => [3, 7]
#
def determine_bounds(data, num_partitions)
if num_partitions > data.size
return data
end
bounds = []
count = data.size
(0...(num_partitions-1)).each do |index|
bounds << data[count * (index+1) / num_partitions]
end
bounds
end
end
end
end
================================================
FILE: lib/spark/helper/system.rb
================================================
module Spark
module Helper
module System
def self.included(base)
base.send :extend, Methods
base.send :include, Methods
end
module Methods
def windows?
RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
end
def mri?
RbConfig::CONFIG['ruby_install_name'] == 'ruby'
end
def jruby?
RbConfig::CONFIG['ruby_install_name'] == 'jruby'
end
def pry?
!!Thread.current[:__pry__]
end
# Memory usage in KB
def memory_usage
if jruby?
runtime = java.lang.Runtime.getRuntime
(runtime.totalMemory - runtime.freeMemory) >> 10
elsif windows?
# not yet
else
`ps -o rss= -p #{Process.pid}`.to_i
end
end
end # Methods
end # System
end # Helper
end # Spark
================================================
FILE: lib/spark/helper.rb
================================================
module Spark
module Helper
autoload :System, "spark/helper/system"
autoload :Logger, "spark/helper/logger"
autoload :Statistic, "spark/helper/statistic"
autoload :Serialize, "spark/helper/serialize"
autoload :Partition, "spark/helper/partition"
autoload :Parser, "spark/helper/parser"
end
end
================================================
FILE: lib/spark/java_bridge/base.rb
================================================
##
# Spark::JavaBridge::Base
#
# Parent for all adapters (Ruby - Java)
#
module Spark
module JavaBridge
class Base
include Spark::Helper::System
JAVA_OBJECTS = [
'java.util.ArrayList',
'scala.collection.mutable.HashMap',
'org.apache.spark.SparkConf',
'org.apache.spark.api.java.JavaSparkContext',
'org.apache.spark.api.ruby.RubyRDD',
'org.apache.spark.api.ruby.RubyUtils',
'org.apache.spark.api.ruby.RubyWorker',
'org.apache.spark.api.ruby.PairwiseRDD',
'org.apache.spark.api.ruby.RubyAccumulatorParam',
'org.apache.spark.api.ruby.RubySerializer',
'org.apache.spark.api.python.PythonRDD',
'org.apache.spark.api.python.PythonPartitioner',
'org.apache.spark.api.python.PythonUtils',
'org.apache.spark.ui.ruby.RubyTab',
'org.apache.spark.mllib.api.ruby.RubyMLLibAPI',
:JInteger => 'java.lang.Integer',
:JLong => 'java.lang.Long',
:JLogger => 'org.apache.log4j.Logger',
:JLevel => 'org.apache.log4j.Level',
:JPriority => 'org.apache.log4j.Priority',
:JUtils => 'org.apache.spark.util.Utils',
:JDataType => 'org.apache.spark.sql.types.DataType',
:JSQLContext => 'org.apache.spark.sql.SQLContext',
:JDenseVector => 'org.apache.spark.mllib.linalg.DenseVector',
:JDenseMatrix => 'org.apache.spark.mllib.linalg.DenseMatrix',
:JStorageLevel => 'org.apache.spark.storage.StorageLevel',
:JSQLFunctions => 'org.apache.spark.sql.functions'
]
JAVA_TEST_OBJECTS = [
'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI'
]
RUBY_TO_JAVA_SKIP = [Fixnum, Integer]
def initialize(target)
@target = target
end
# Import all important classes as constants on Object
def import_all
return if @imported
java_objects.each do |name, klass|
import(name, klass)
end
@imported = true
nil
end
# Import classes for testing
def import_all_test
return if @imported_test
java_test_objects.each do |name, klass|
import(name, klass)
end
@imported_test = true
nil
end
# Call java object
def call(klass, method, *args)
# To java
args.map!{|item| to_java(item)}
# Call java
result = klass.__send__(method, *args)
# To ruby
to_ruby(result)
end
def to_array_list(array)
array_list = ArrayList.new
array.each do |item|
array_list.add(to_java(item))
end
array_list
end
def to_seq(array)
PythonUtils.toSeq(to_array_list(array))
end
def to_long(number)
return nil if number.nil?
JLong.new(number)
end
def to_java(object)
if RUBY_TO_JAVA_SKIP.include?(object.class)
# Some objects are converted automatically.
# This prevents errors.
# For example: JRuby stores an Integer as a Long, so 1.to_java is a Long.
object
elsif object.respond_to?(:to_java)
object.to_java
elsif object.is_a?(Array)
to_array_list(object)
else
object
end
end
# Array problem:
# Rjb: object.toArray -> Array
# Jruby: object.toArray -> java.lang.Object
#
def to_ruby(object)
# Java object
if java_object?(object)
class_name = object.getClass.getSimpleName
case class_name
when 'ArraySeq'
result = []
iterator = object.iterator
while iterator.hasNext
result << to_ruby(iterator.next)
end
result
when 'Map2', 'Map3', 'Map4', 'HashTrieMap'
Hash[
object.toSeq.array.to_a.map!{|item| [item._1, item._2]}
]
when 'SeqWrapper'; object.toArray.to_a.map!{|item| to_ruby(item)}
when 'ofRef'; object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef
when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object)
when 'DenseVector'; Spark::Mllib::DenseVector.from_java(object)
when 'KMeansModel'; Spark::Mllib::KMeansModel.from_java(object)
when 'DenseMatrix'; Spark::Mllib::DenseMatrix.from_java(object)
when 'GenericRowWithSchema'; Spark::SQL::Row.from_java(object, true)
else
# Some RDD
if class_name != 'JavaRDD' && class_name.end_with?('RDD')
object = object.toJavaRDD
class_name = 'JavaRDD'
end
# JavaRDD
if class_name == 'JavaRDD'
jrdd = RubyRDD.toRuby(object)
serializer = Spark::Serializer.build { __batched__(__marshal__) }
deserializer = Spark::Serializer.build { __batched__(__marshal__, 2) }
return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
end
# Unknown object
Spark.logger.warn("Java object '#{object.getClass.name}' was not converted.")
object
end
# An Array is transferred automatically, but its content is not
elsif object.is_a?(Array)
object.map! do |item|
to_ruby(item)
end
object
# Already transferred
else
object
end
end
alias_method :java_to_ruby, :to_ruby
alias_method :ruby_to_java, :to_java
private
def jars
result = Dir.glob(File.join(@target, '*.jar'))
result.flatten!
result
end
def objects_with_names(objects)
hash = {}
objects.each do |object|
if object.is_a?(Hash)
hash.merge!(object)
else
key = object.split('.').last.to_sym
hash[key] = object
end
end
hash
end
def java_objects
objects_with_names(JAVA_OBJECTS)
end
def java_test_objects
objects_with_names(JAVA_TEST_OBJECTS)
end
def raise_missing_class(klass)
raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark are assembled."
end
end
end
end
================================================
FILE: lib/spark/java_bridge/jruby.rb
================================================
require 'java'
module Spark
module JavaBridge
class JRuby < Base
def initialize(*args)
super
jars.each {|jar| require jar}
end
def import(name, klass)
klass = "Java::#{klass}"
Object.const_set(name, eval(klass))
rescue NameError
raise_missing_class(klass)
end
def java_object?(object)
object.is_a?(JavaProxy)
end
end
end
end
================================================
FILE: lib/spark/java_bridge/rjb.rb
================================================
if !ENV.has_key?('JAVA_HOME')
raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set'
end
require 'rjb'
module Spark
module JavaBridge
class RJB < Base
def initialize(*args)
super
Rjb.load(jars)
Rjb.primitive_conversion = true
end
def import(name, klass)
Object.const_set(name, silence_warnings { Rjb.import(klass) })
rescue NoClassDefFoundError
raise_missing_class(klass)
end
def java_object?(object)
object.is_a?(Rjb::Rjb_JavaProxy)
end
private
def jars
separator = windows? ? ';' : ':'
super.join(separator)
end
end
end
end
================================================
FILE: lib/spark/java_bridge.rb
================================================
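##
# Spark::JavaBridge
#
# Chooses the Ruby-Java adapter: the native JRuby bridge when running on
# JRuby, otherwise RJB. Illustrative sketch (the jar directory is hypothetical):
#
#   bridge = Spark::JavaBridge.init('/path/with/assembled/spark/jars')
#   bridge.import_all
#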
module Spark
module JavaBridge
autoload :Base, 'spark/java_bridge/base'
autoload :JRuby, 'spark/java_bridge/jruby'
autoload :RJB, 'spark/java_bridge/rjb'
include Spark::Helper::System
def self.init(*args)
if jruby?
klass = JRuby
else
klass = RJB
end
klass.new(*args)
end
end
end
================================================
FILE: lib/spark/library.rb
================================================
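##
# Spark::Library
#
# Extend a module with Library to get an autoload that also remembers which
# constants should later be copied into another namespace via #import.
# Illustrative sketch (MyLib, Something and Internal are hypothetical names):
#
#   module MyLib
#     extend Spark::Library
#     autoload :Something, 'my_lib/something'
#     autoload_without_import :Internal, 'my_lib/internal'
#   end
#
#   MyLib.import   # defines Object::Something, but not Object::Internal
#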
module Spark
module Library
def autoload(klass, location, import=true)
if import
@for_importing ||= []
@for_importing << klass
end
super(klass, location)
end
def autoload_without_import(klass, location)
autoload(klass, location, false)
end
def import(to=Object)
@for_importing.each do |klass|
to.const_set(klass, const_get(klass))
end
nil
end
end
end
================================================
FILE: lib/spark/logger.rb
================================================
# Necessary libraries
Spark.load_lib
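##
# Spark::Logger
#
# Thin wrapper around the log4j logger used by Spark. Illustrative usage
# (assumes a started context, where the instance is reachable as Spark.logger):
#
#   Spark.logger.disable           # silence Spark/Akka output
#   Spark.logger.warn('slow job')  # forwarded to log4j only when WARN is enabled
#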
module Spark
class Logger
attr_reader :jlogger
def initialize
@jlogger = JLogger.getLogger('Ruby')
end
def level_off
JLevel.toLevel('OFF')
end
# Disable all Spark logging
def disable
jlogger.setLevel(level_off)
JLogger.getLogger('org').setLevel(level_off)
JLogger.getLogger('akka').setLevel(level_off)
JLogger.getRootLogger.setLevel(level_off)
end
def enabled?
!disabled?
end
def info(message)
jlogger.info(message) if info?
end
def debug(message)
jlogger.debug(message) if debug?
end
def trace(message)
jlogger.trace(message) if trace?
end
def warning(message)
jlogger.warn(message) if warning?
end
def error(message)
jlogger.error(message) if error?
end
def info?
level_enabled?('info')
end
def debug?
level_enabled?('debug')
end
def trace?
level_enabled?('trace')
end
def warning?
level_enabled?('warn')
end
def error?
level_enabled?('error')
end
def level_enabled?(type)
jlogger.isEnabledFor(JPriority.toPriority(type.upcase))
end
alias_method :warn, :warning
end
end
================================================
FILE: lib/spark/mllib/classification/common.rb
================================================
module Spark
module Mllib
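##
# ClassificationModel
#
# Common base for binary classification models: a weight vector plus an
# intercept, with an optional decision threshold. Illustrative sketch
# (model stands for any subclass instance, e.g. an SVMModel):
#
#   model.threshold = 0.5    # predict compares the score against 0.5
#   model.clear_threshold    # predict returns the raw score instead
#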
class ClassificationModel
attr_reader :weights, :intercept, :threshold
def initialize(weights, intercept)
@weights = Spark::Mllib::Vectors.to_vector(weights)
@intercept = intercept.to_f
@threshold = nil
end
def threshold=(value)
@threshold = value.to_f
end
def clear_threshold
@threshold = nil
end
end
end
end
module Spark
module Mllib
class ClassificationMethodBase < RegressionMethodBase
end
end
end
================================================
FILE: lib/spark/mllib/classification/logistic_regression.rb
================================================
module Spark
module Mllib
##
# LogisticRegressionModel
#
# A linear binary classification model derived from logistic regression.
#
# == Examples:
#
# Spark::Mllib.import
#
# # Dense vectors
# data = [
# LabeledPoint.new(0.0, [0.0, 1.0]),
# LabeledPoint.new(1.0, [1.0, 0.0]),
# ]
# lrm = LogisticRegressionWithSGD.train($sc.parallelize(data))
#
================================================
SYMBOL INDEX (1072 symbols across 108 files)
================================================
FILE: benchmark/bisect.rb
function bisect_left1 (line 3) | def bisect_left1(a, x, opts={})
function bisect_left2 (line 22) | def bisect_left2(list, item)
function bisect_left3 (line 31) | def bisect_left3(list, item, lo = 0, hi = list.size)
FILE: benchmark/comparison/python.py
function log (line 12) | def log(*values):
function is_prime (line 56) | def is_prime(x):
function multiplication_func (line 94) | def multiplication_func(matrix):
function pi_func (line 123) | def pi_func(size):
FILE: benchmark/comparison/ruby.rb
function log (line 14) | def log(*values)
FILE: benchmark/custom_marshal.rb
function pack_int (line 4) | def pack_int(data)
function pack_long (line 8) | def pack_long(data)
function pack_doubles (line 12) | def pack_doubles(data)
type Standard (line 16) | module Standard
class LabeledPoint (line 17) | class LabeledPoint
method initialize (line 18) | def initialize(label, features)
method marshal_dump (line 23) | def marshal_dump
method marshal_load (line 27) | def marshal_load(*)
class Vector (line 31) | class Vector
method initialize (line 32) | def initialize(array)
method marshal_dump (line 36) | def marshal_dump
method marshal_load (line 40) | def marshal_load(*)
type Custom (line 45) | module Custom
class LabeledPoint (line 46) | class LabeledPoint
method initialize (line 47) | def initialize(label, features)
method _dump (line 52) | def _dump(*)
method _load (line 56) | def self._load(*)
class Vector (line 60) | class Vector
method initialize (line 61) | def initialize(array)
method _dump (line 65) | def _dump(*)
method _load (line 72) | def self._load(*)
FILE: benchmark/digest.rb
function java? (line 4) | def java?
function murmur3_32_rotl (line 27) | def murmur3_32_rotl(x, r)
function murmur3_32_fmix (line 31) | def murmur3_32_fmix(h)
function murmur3_32__mmix (line 40) | def murmur3_32__mmix(k1)
function murmur3_32_str_hash (line 46) | def murmur3_32_str_hash(str, seed=0)
FILE: benchmark/enumerator.rb
class Enumerator (line 3) | class Enumerator
method defer (line 4) | def defer(&blk)
function type_yield (line 15) | def type_yield
function yield_map_x2 (line 23) | def yield_map_x2(enum)
function type_enumerator_new (line 31) | def type_enumerator_new
function enumerator_new_map_x2 (line 39) | def enumerator_new_map_x2(enum)
function enumerator_defer_x2 (line 47) | def enumerator_defer_x2(enum)
FILE: benchmark/sort.rb
function <=> (line 29) | def <=>(other)
FILE: benchmark/sort2.rb
function words (line 16) | def words
function word (line 20) | def word
function sort1 (line 31) | def sort1(data)
function sort1_2 (line 64) | def sort1_2(data)
function sort2 (line 113) | def sort2(data)
FILE: example/pi.rb
function map (line 14) | def map(_)
FILE: example/website_search.rb
function parse_sitemap (line 48) | def parse_sitemap(doc)
FILE: ext/ruby_c/murmur.c
function MurmurHash64A (line 18) | uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
function MurmurHash64B (line 63) | uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
function VALUE (line 122) | VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
function VALUE (line 137) | VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
function VALUE (line 151) | VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
FILE: ext/ruby_c/ruby-spark.c
function Init_ruby_spark_ext (line 10) | void Init_ruby_spark_ext()
FILE: ext/ruby_java/Digest.java
class Digest (line 12) | @JRubyModule(name="Spark::Digest")
method Digest (line 18) | public Digest(final Ruby ruby, RubyClass rubyClass) {
method portable_hash (line 22) | @JRubyMethod(module=true)
FILE: ext/ruby_java/Murmur2.java
class Murmur2 (line 21) | @JRubyClass(name="Spark::Digest::Murmur2")
method Murmur2 (line 24) | public Murmur2(final Ruby ruby, RubyClass rubyClass) {
method digest (line 28) | @JRubyMethod(required=1, optional=1, module=true)
method hash64 (line 57) | public static long hash64(final byte[] data, int length, long seed) {
FILE: ext/ruby_java/RubySparkExtService.java
class RubySparkExtService (line 8) | public class RubySparkExtService implements BasicLibraryService
method basicLoad (line 10) | public boolean basicLoad(final Ruby ruby) throws java.io.IOException {
method allocate (line 23) | public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) {
FILE: lib/spark.rb
type Spark (line 11) | module Spark
function print_logo (line 43) | def self.print_logo(message=nil)
function config (line 71) | def self.config(&block)
function clear_config (line 83) | def self.clear_config
function context (line 88) | def self.context
function sql_context (line 93) | def self.sql_context
function start (line 100) | def self.start
function start_sql (line 104) | def self.start_sql
function stop (line 108) | def self.stop
function started? (line 120) | def self.started?
function load_defaults (line 131) | def self.load_defaults
function load_defaults_from (line 140) | def self.load_defaults_from(file_path)
function save_defaults_to (line 158) | def self.save_defaults_to(file_path)
function logger (line 181) | def self.logger
function root (line 186) | def self.root
function target_dir (line 191) | def self.target_dir
function worker_dir (line 196) | def self.worker_dir
function ruby_spark_jar (line 200) | def self.ruby_spark_jar
function spark_ext_dir (line 204) | def self.spark_ext_dir
function load_lib (line 219) | def self.load_lib(target=nil)
function java_bridge (line 229) | def self.java_bridge
FILE: lib/spark/accumulator.rb
type Spark (line 1) | module Spark
class Accumulator (line 55) | class Accumulator
method initialize (line 68) | def initialize(value, accum_param=:+, zero_value=0)
method inspect (line 80) | def inspect
method changed (line 88) | def self.changed
method instances (line 92) | def self.instances
method valid_accum_param (line 96) | def valid_accum_param
method driver? (line 127) | def driver?
method add (line 135) | def add(term)
method + (line 147) | def +(term)
method add_by_symbol (line 152) | def add_by_symbol(term)
method marshal_dump (line 171) | def marshal_dump
method marshal_load (line 175) | def marshal_load(array)
method load_accum_param (line 183) | def load_accum_param
class Server (line 199) | class Server
method start (line 203) | def self.start
method stop (line 207) | def self.stop
method host (line 211) | def self.host
method port (line 216) | def self.port
method initialize (line 221) | def initialize
method stop (line 230) | def stop
method handle_accept (line 236) | def handle_accept
method handle_connection (line 245) | def handle_connection(socket)
FILE: lib/spark/broadcast.rb
type Spark (line 1) | module Spark
class Broadcast (line 27) | class Broadcast
method initialize (line 48) | def initialize(sc, value)
method inspect (line 64) | def inspect
method register (line 71) | def self.register(id, path)
method value (line 75) | def value
method marshal_dump (line 95) | def marshal_dump
method marshal_load (line 99) | def marshal_load(id)
FILE: lib/spark/build.rb
type Spark (line 1) | module Spark
type Build (line 2) | module Build
function build (line 14) | def self.build(options={})
FILE: lib/spark/cli.rb
type Commander (line 3) | module Commander
type UI (line 4) | module UI
function enable_paging (line 7) | def self.enable_paging
type Spark (line 12) | module Spark
class CLI (line 13) | class CLI
method run (line 19) | def run
FILE: lib/spark/command.rb
type Spark (line 1) | module Spark
class Command (line 6) | class Command
method initialize (line 10) | def initialize
method execute (line 18) | def execute(iterator, split_index)
method last (line 41) | def last
method bound_objects (line 45) | def bound_objects
method marshal_dump (line 61) | def marshal_dump
method marshal_load (line 65) | def marshal_load(array)
method serialized_bound_objects (line 75) | def serialized_bound_objects
FILE: lib/spark/command/base.rb
class Spark::Command::Base (line 6) | class Spark::Command::Base
method initialize (line 13) | def initialize(*args)
method to_s (line 19) | def to_s
method error (line 23) | def self.error(message)
method error (line 27) | def error(message)
method log (line 31) | def log(message=nil)
method settings (line 42) | def self.settings
method settings (line 47) | def settings
method init_settings (line 52) | def self.init_settings
method variable (line 76) | def self.variable(name, options={})
method execute (line 89) | def execute(iterator, split_index)
method prepared? (line 102) | def prepared?
method prepare (line 114) | def prepare
method before_run (line 141) | def before_run
method method_missing (line 150) | def method_missing(method, *args, &block)
FILE: lib/spark/command/basic.rb
class Spark::Command::Map (line 6) | class Spark::Command::Map < _Base
method run (line 9) | def run(iterator, *)
method lazy_run (line 16) | def lazy_run(iterator, *)
class Spark::Command::FlatMap (line 26) | class Spark::Command::FlatMap < Spark::Command::Map
method run (line 27) | def run(iterator, *)
method lazy_run (line 33) | def lazy_run(iterator, *)
class Spark::Command::MapPartitionsWithIndex (line 43) | class Spark::Command::MapPartitionsWithIndex < _Base
method run (line 46) | def run(iterator, index)
class Spark::Command::MapPartitions (line 58) | class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithI...
method run (line 59) | def run(iterator, *)
class Spark::Command::Filter (line 70) | class Spark::Command::Filter < _Base
method run (line 73) | def run(iterator, *)
method lazy_run (line 80) | def lazy_run(iterator, *)
class Spark::Command::Compact (line 90) | class Spark::Command::Compact < _Base
method run (line 91) | def run(iterator, *)
method lazy_run (line 96) | def lazy_run(iterator, *)
class Spark::Command::Glom (line 106) | class Spark::Command::Glom < _Base
method run (line 107) | def run(iterator, *)
method lazy_run (line 111) | def lazy_run(iterator, *)
class Spark::Command::Shuffle (line 119) | class Spark::Command::Shuffle < _Base
method run (line 122) | def run(iterator, *)
method rng (line 127) | def rng
class Spark::Command::PartitionBy (line 135) | class Spark::Command::PartitionBy
class Base (line 137) | class Base < Spark::Command::Base
method prepare (line 140) | def prepare
method run (line 147) | def run(iterator, *)
method lazy_run (line 155) | def lazy_run(iterator, *)
method make_partition_item (line 163) | def make_partition_item(item)
class Basic (line 171) | class Basic < Base
class Sorting (line 175) | class Sorting < Base
method prepare (line 181) | def prepare
class Spark::Command::Aggregate (line 206) | class Spark::Command::Aggregate < _Base
method run (line 210) | def run(iterator, *)
method lazy_run (line 214) | def lazy_run(iterator, *)
class Spark::Command::Reduce (line 222) | class Spark::Command::Reduce < Spark::Command::Aggregate
method run (line 223) | def run(iterator, *)
class Spark::Command::Foreach (line 231) | class Spark::Command::Foreach < _Base
method run (line 234) | def run(iterator, *)
class Spark::Command::ForeachPartition (line 245) | class Spark::Command::ForeachPartition < _Base
method run (line 248) | def run(iterator, *)
class Spark::Command::KeyBy (line 257) | class Spark::Command::KeyBy < _Base
method run (line 260) | def run(iterator, *)
method lazy_run (line 267) | def lazy_run(iterator, *)
class Spark::Command::Take (line 277) | class Spark::Command::Take < _Base
method run (line 281) | def run(iterator, index)
class Spark::Command::Pipe (line 293) | class Spark::Command::Pipe < _Base
method before_run (line 296) | def before_run
method run (line 302) | def run(iterator, *)
method lazy_run (line 318) | def lazy_run(iterator, *)
method create_writing_thread (line 333) | def create_writing_thread(iterator)
FILE: lib/spark/command/pair.rb
class Spark::Command::CombineByKey (line 6) | class Spark::Command::CombineByKey
class Base (line 10) | class Base < Spark::Command::Base
method run (line 11) | def run(iterator, *)
method lazy_run (line 15) | def lazy_run(iterator, *)
class Combine (line 22) | class Combine < Base
method _run (line 26) | def _run(iterator)
class Merge (line 43) | class Merge < Base
method _run (line 46) | def _run(iterator, *)
class CombineWithZero (line 61) | class CombineWithZero < Base
method _run (line 65) | def _run(iterator)
class Spark::Command::MapValues (line 88) | class Spark::Command::MapValues < _Base
method run (line 91) | def run(iterator, *)
method lazy_run (line 99) | def lazy_run(iterator, *)
class Spark::Command::FlatMapValues (line 110) | class Spark::Command::FlatMapValues < _Base
method run (line 113) | def run(iterator, *)
FILE: lib/spark/command/sort.rb
class Spark::Command::SortByKey (line 6) | class Spark::Command::SortByKey < _Base
method before_run (line 14) | def before_run
method run (line 18) | def run(iterator, _)
method run_with_enum (line 28) | def run_with_enum(iterator, _)
method run_with_spilling (line 41) | def run_with_spilling(iterator)
method run_without_spilling (line 46) | def run_without_spilling(iterator)
FILE: lib/spark/command/statistic.rb
class Spark::Command::Sample (line 6) | class Spark::Command::Sample < _Base
method run (line 11) | def run(iterator, _)
method lazy_run (line 15) | def lazy_run(iterator, _)
method sampler (line 19) | def sampler
method _sampler (line 23) | def _sampler
class Spark::Command::Stats (line 37) | class Spark::Command::Stats < _Base
method run (line 39) | def run(iterator, *)
method lazy_run (line 43) | def lazy_run(iterator, *)
class Spark::Command::Histogram (line 52) | class Spark::Command::Histogram < _Base
method run (line 58) | def run(iterator, *)
method lazy_run (line 75) | def lazy_run(iterator, *)
method min (line 81) | def min
method max (line 85) | def max
method counter_size (line 89) | def counter_size
method increment (line 93) | def increment
method bucket_function (line 99) | def bucket_function
method _bucket_function (line 103) | def _bucket_function
method fast_bucket_function (line 113) | def fast_bucket_function
method basic_bucket_function (line 129) | def basic_bucket_function
FILE: lib/spark/command_builder.rb
type Spark (line 3) | module Spark
class CommandBuilder (line 7) | class CommandBuilder
method initialize (line 20) | def initialize(serializer, deserializer=nil)
method create_command (line 26) | def create_command
method deep_copy (line 32) | def deep_copy
method build (line 45) | def build
method add_command (line 49) | def add_command(klass, *args)
method add_library (line 68) | def add_library(*libraries)
method bind (line 72) | def bind(objects)
method serialize_function (line 87) | def serialize_function(func)
method serialize_function_from_string (line 102) | def serialize_function_from_string(string)
method serialize_function_from_symbol (line 106) | def serialize_function_from_symbol(symbol)
method serialize_function_from_proc (line 115) | def serialize_function_from_proc(proc)
method serialize_function_from_method (line 130) | def serialize_function_from_method(meth)
FILE: lib/spark/command_validator.rb
type Spark (line 1) | module Spark
type CommandValidator (line 2) | module CommandValidator
function validate (line 4) | def validate(value, options)
function valid? (line 8) | def valid?(value, options)
function validate_type (line 17) | def validate_type(value, types)
function validate_size (line 27) | def validate_size(array1, array2)
FILE: lib/spark/config.rb
type Spark (line 4) | module Spark
class Config (line 6) | class Config
method initialize (line 16) | def initialize
method from_file (line 22) | def from_file(file)
method [] (line 31) | def [](key)
method []= (line 35) | def []=(key, value)
method spark_conf (line 39) | def spark_conf
method valid! (line 48) | def valid!
method read_only? (line 80) | def read_only?
method get (line 85) | def get(key)
method get_all (line 100) | def get_all
method contains? (line 104) | def contains?(key)
method set (line 108) | def set(key, value)
method set_app_name (line 113) | def set_app_name(name)
method set_master (line 117) | def set_master(master)
method parse_boolean (line 121) | def parse_boolean(value)
method parse_integer (line 130) | def parse_integer(value)
method set_default (line 137) | def set_default
method default_serializer (line 151) | def default_serializer
method default_serializer_compress (line 155) | def default_serializer_compress
method default_serializer_batch_size (line 159) | def default_serializer_batch_size
method default_executor_command (line 169) | def default_executor_command
method default_executor_options (line 178) | def default_executor_options
method default_worker_type (line 198) | def default_worker_type
method load_executor_envs (line 208) | def load_executor_envs
method check_read_only (line 227) | def check_read_only
FILE: lib/spark/constant.rb
type Spark (line 1) | module Spark
type Constant (line 3) | module Constant
FILE: lib/spark/context.rb
type Spark (line 4) | module Spark
class Context (line 9) | class Context
method initialize (line 21) | def initialize
method inspect (line 41) | def inspect
method stop (line 47) | def stop
method sc (line 53) | def sc
method ui (line 57) | def ui
method default_parallelism (line 63) | def default_parallelism
method default_serializer (line 71) | def default_serializer
method default_batch_size (line 92) | def default_batch_size
method set_local_property (line 104) | def set_local_property(key, value)
method get_local_property (line 110) | def get_local_property(key)
method set_call_site (line 116) | def set_call_site(site)
method clear_call_site (line 120) | def clear_call_site
method config (line 127) | def config(key=nil)
method add_file (line 149) | def add_file(*files)
method broadcast (line 169) | def broadcast(value)
method accumulator (line 188) | def accumulator(value, accum_param=:+, zero_value=0)
method parallelize (line 207) | def parallelize(data, num_slices=nil, serializer=nil)
method text_file (line 236) | def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, se...
method whole_text_files (line 260) | def whole_text_files(path, min_partitions=nil, serializer=nil)
method run_job (line 278) | def run_job(rdd, f, partitions=nil, allow_local=false)
method run_job_with_command (line 284) | def run_job_with_command(rdd, partitions, allow_local, command, *args)
FILE: lib/spark/error.rb
type Spark (line 1) | module Spark
class BuildError (line 3) | class BuildError < StandardError
class SerializeError (line 8) | class SerializeError < StandardError
class NotImplemented (line 13) | class NotImplemented < StandardError
class ConfigurationError (line 17) | class ConfigurationError < StandardError
class RDDError (line 21) | class RDDError < StandardError
class CommandError (line 25) | class CommandError < StandardError
class ParseError (line 30) | class ParseError < StandardError
class ContextError (line 34) | class ContextError < StandardError
class BroadcastError (line 39) | class BroadcastError < StandardError
class AccumulatorError (line 45) | class AccumulatorError < StandardError
class MllibError (line 49) | class MllibError < StandardError
class SQLError (line 53) | class SQLError < StandardError
class JavaBridgeError (line 57) | class JavaBridgeError < StandardError
FILE: lib/spark/ext/hash.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Hash (line 3) | module Hash
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function stringify_keys_with_spark! (line 9) | def stringify_keys_with_spark!
function symbolize_keys_with_spark! (line 14) | def symbolize_keys_with_spark!
function transform_keys_with_spark! (line 20) | def transform_keys_with_spark!
function included (line 28) | def self.included(base)
FILE: lib/spark/ext/integer.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Integer (line 3) | module Integer
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function included (line 10) | def self.included(base)
FILE: lib/spark/ext/io.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type IO (line 3) | module IO
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function read_int (line 11) | def read_int
function read_int_or_eof (line 15) | def read_int_or_eof
function read_long (line 21) | def read_long
function read_string (line 25) | def read_string
function read_data (line 29) | def read_data
function write_int (line 36) | def write_int(data)
function write_long (line 40) | def write_long(data)
function write_string (line 47) | def write_string(data)
function write_data (line 52) | def write_data(data)
function included (line 57) | def self.included(base)
FILE: lib/spark/ext/ip_socket.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type IPSocket (line 3) | module IPSocket
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function port (line 8) | def port
function hostname (line 12) | def hostname
function numeric_address (line 16) | def numeric_address
function included (line 21) | def self.included(base)
FILE: lib/spark/ext/module.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Module (line 3) | module Module
function patch_unless_exist (line 40) | def patch_unless_exist(target, suffix)
function path_const_unless_exist (line 48) | def path_const_unless_exist(target, suffix)
FILE: lib/spark/ext/object.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type Object (line 3) | module Object
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function deep_copy_with_spark (line 8) | def deep_copy_with_spark
function silence_warnings (line 12) | def silence_warnings
function cattr_reader_with_spark (line 19) | def cattr_reader_with_spark(*syms)
function cattr_writer_with_spark (line 38) | def cattr_writer_with_spark(*syms)
function cattr_accessor_with_spark (line 57) | def cattr_accessor_with_spark(*syms)
function included (line 63) | def self.included(base)
FILE: lib/spark/ext/string.rb
type Spark (line 1) | module Spark
type CoreExtension (line 2) | module CoreExtension
type String (line 3) | module String
type ClassMethods (line 4) | module ClassMethods
type InstanceMethods (line 7) | module InstanceMethods
function camelize_with_spark (line 8) | def camelize_with_spark
function included (line 13) | def self.included(base)
FILE: lib/spark/helper.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
FILE: lib/spark/helper/logger.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Logger (line 3) | module Logger
function included (line 5) | def self.included(base)
type Methods (line 10) | module Methods
function log_info (line 11) | def log_info(message)
function log_debug (line 15) | def log_debug(message)
function log_trace (line 19) | def log_trace(message)
function log_warning (line 23) | def log_warning(message)
function log_error (line 27) | def log_error(message)
FILE: lib/spark/helper/parser.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Parser (line 3) | module Parser
function included (line 5) | def self.included(base)
type Methods (line 10) | module Methods
function to_java_hash (line 11) | def to_java_hash(hash)
function convert_to_java_int (line 23) | def convert_to_java_int(data)
function to_java_array_list (line 31) | def to_java_array_list(array)
function to_memory_size (line 51) | def to_memory_size(memory, result_unit="KB")
function memory_multiplier_based_kb (line 66) | def memory_multiplier_based_kb(type)
FILE: lib/spark/helper/serialize.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Serialize (line 3) | module Serialize
function pack_int (line 16) | def pack_int(data)
function pack_long (line 20) | def pack_long(data)
function pack_double (line 24) | def pack_double(data)
function pack_unsigned_chars (line 28) | def pack_unsigned_chars(data)
function pack_ints (line 32) | def pack_ints(data)
function pack_longs (line 37) | def pack_longs(data)
function pack_doubles (line 42) | def pack_doubles(data)
function unpack_int (line 49) | def unpack_int(data)
function unpack_long (line 53) | def unpack_long(data)
function unpack_chars (line 57) | def unpack_chars(data)
function __check_array (line 63) | def __check_array(data)
FILE: lib/spark/helper/statistic.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type Statistic (line 3) | module Statistic
function compute_fraction (line 19) | def compute_fraction(lower_bound, total, with_replacement)
function upper_poisson_bound (line 30) | def upper_poisson_bound(bound)
function upper_binomial_bound (line 42) | def upper_binomial_bound(delta, total, fraction)
function bisect_right (line 58) | def bisect_right(data, value, low=0, high=data.size)
function determine_bounds (line 82) | def determine_bounds(data, num_partitions)
FILE: lib/spark/helper/system.rb
type Spark (line 1) | module Spark
type Helper (line 2) | module Helper
type System (line 3) | module System
function included (line 5) | def self.included(base)
type Methods (line 10) | module Methods
function windows? (line 11) | def windows?
function mri? (line 15) | def mri?
function jruby? (line 19) | def jruby?
function pry? (line 23) | def pry?
function memory_usage (line 28) | def memory_usage
FILE: lib/spark/java_bridge.rb
type Spark (line 1) | module Spark
type JavaBridge (line 2) | module JavaBridge
function init (line 10) | def self.init(*args)
FILE: lib/spark/java_bridge/base.rb
type Spark (line 6) | module Spark
type JavaBridge (line 7) | module JavaBridge
class Base (line 8) | class Base
method initialize (line 48) | def initialize(target)
method import_all (line 53) | def import_all
method import_all_test (line 65) | def import_all_test
method call (line 77) | def call(klass, method, *args)
method to_array_list (line 88) | def to_array_list(array)
method to_seq (line 96) | def to_seq(array)
method to_long (line 100) | def to_long(number)
method to_java (line 105) | def to_java(object)
method to_ruby (line 124) | def to_ruby(object)
method jars (line 187) | def jars
method objects_with_names (line 193) | def objects_with_names(objects)
method java_objects (line 206) | def java_objects
method java_test_objects (line 210) | def java_test_objects
method raise_missing_class (line 214) | def raise_missing_class(klass)
FILE: lib/spark/java_bridge/jruby.rb
type Spark (line 3) | module Spark
type JavaBridge (line 4) | module JavaBridge
class JRuby (line 5) | class JRuby < Base
method initialize (line 7) | def initialize(*args)
method import (line 12) | def import(name, klass)
method java_object? (line 19) | def java_object?(object)
FILE: lib/spark/java_bridge/rjb.rb
type Spark (line 7) | module Spark
type JavaBridge (line 8) | module JavaBridge
class RJB (line 9) | class RJB < Base
method initialize (line 11) | def initialize(*args)
method import (line 17) | def import(name, klass)
method java_object? (line 23) | def java_object?(object)
method jars (line 29) | def jars
FILE: lib/spark/library.rb
type Spark (line 1) | module Spark
type Library (line 2) | module Library
function autoload (line 4) | def autoload(klass, location, import=true)
function autoload_without_import (line 13) | def autoload_without_import(klass, location)
function import (line 17) | def import(to=Object)
FILE: lib/spark/logger.rb
type Spark (line 4) | module Spark
class Logger (line 5) | class Logger
method initialize (line 9) | def initialize
method level_off (line 13) | def level_off
method disable (line 18) | def disable
method enabled? (line 25) | def enabled?
method info (line 29) | def info(message)
method debug (line 33) | def debug(message)
method trace (line 37) | def trace(message)
method warning (line 41) | def warning(message)
method error (line 45) | def error(message)
method info? (line 49) | def info?
method debug? (line 53) | def debug?
method trace? (line 57) | def trace?
method warning? (line 61) | def warning?
method error? (line 65) | def error?
method level_enabled? (line 69) | def level_enabled?(type)
FILE: lib/spark/mllib.rb
type Spark (line 1) | module Spark
type Mllib (line 5) | module Mllib
function prepare (line 50) | def self.prepare
function narray? (line 71) | def self.narray?
function mdarray? (line 75) | def self.mdarray?
FILE: lib/spark/mllib/classification/common.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class ClassificationModel (line 3) | class ClassificationModel
method initialize (line 7) | def initialize(weights, intercept)
method threshold= (line 13) | def threshold=(value)
method clear_threshold (line 17) | def clear_threshold
class ClassificationMethodBase (line 27) | class ClassificationMethodBase < RegressionMethodBase
FILE: lib/spark/mllib/classification/logistic_regression.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class LogisticRegressionModel (line 60) | class LogisticRegressionModel < ClassificationModel
method initialize (line 62) | def initialize(*args)
method predict (line 69) | def predict(vector)
class LogisticRegressionWithSGD (line 91) | class LogisticRegressionWithSGD < ClassificationMethodBase
method train (line 150) | def self.train(rdd, options={})
class LogisticRegressionWithLBFGS (line 173) | class LogisticRegressionWithLBFGS < ClassificationMethodBase
method train (line 220) | def self.train(rdd, options={})
FILE: lib/spark/mllib/classification/naive_bayes.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class NaiveBayesModel (line 43) | class NaiveBayesModel
method initialize (line 47) | def initialize(labels, pi, theta)
method predict (line 55) | def predict(vector)
class NaiveBayes (line 69) | class NaiveBayes
method train (line 82) | def self.train(rdd, lambda=1.0)
FILE: lib/spark/mllib/classification/svm.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class SVMModel (line 42) | class SVMModel < ClassificationModel
method initialize (line 44) | def initialize(*args)
method predict (line 51) | def predict(vector)
class SVMWithSGD (line 72) | class SVMWithSGD < ClassificationMethodBase
method train (line 130) | def self.train(rdd, options={})
FILE: lib/spark/mllib/clustering/gaussian_mixture.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class GaussianMixtureModel (line 25) | class GaussianMixtureModel
method initialize (line 29) | def initialize(weights, gaussians)
method predict (line 37) | def predict(rdd)
method predict_soft (line 46) | def predict_soft(rdd)
method means (line 50) | def means
method sigmas (line 54) | def sigmas
class GaussianMixture (line 64) | class GaussianMixture
method train (line 66) | def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100...
FILE: lib/spark/mllib/clustering/kmeans.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class KMeansModel (line 47) | class KMeansModel
method initialize (line 51) | def initialize(centers)
method predict (line 56) | def predict(vector)
method from_java (line 72) | def self.from_java(object)
class KMeans (line 87) | class KMeans
method train (line 113) | def self.train(rdd, k, max_iterations: 100, runs: 1, initializatio...
FILE: lib/spark/mllib/matrix.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
type Matrices (line 3) | module Matrices
function dense (line 5) | def self.dense(*args)
function sparse (line 9) | def self.sparse(*args)
function to_matrix (line 13) | def self.to_matrix(data)
class MatrixBase (line 28) | class MatrixBase < MatrixAdapter
class DenseMatrix (line 41) | class DenseMatrix < MatrixBase
method initialize (line 43) | def initialize(rows, cols, values)
method to_java (line 47) | def to_java
method from_java (line 51) | def self.from_java(object)
class SparseMatrix (line 95) | class SparseMatrix < MatrixBase
method initialize (line 99) | def initialize(rows, cols, col_pointers, row_indices, values)
FILE: lib/spark/mllib/regression/common.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class RegressionModel (line 8) | class RegressionModel
method initialize (line 12) | def initialize(weights, intercept)
method predict (line 29) | def predict(data)
class RegressionMethodBase (line 46) | class RegressionMethodBase
method train (line 48) | def self.train(rdd, options)
FILE: lib/spark/mllib/regression/labeled_point.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
class LabeledPoint (line 15) | class LabeledPoint
method initialize (line 19) | def initialize(label, features)
method from_java (line 24) | def self.from_java(object)
method marshal_dump (line 31) | def marshal_dump
method marshal_load (line 35) | def marshal_load(array)
FILE: lib/spark/mllib/regression/lasso.rb
class Spark::Mllib::LassoModel (line 49) | class Spark::Mllib::LassoModel < Spark::Mllib::RegressionModel
type Spark (line 52) | module Spark
type Mllib (line 53) | module Mllib
class LassoWithSGD (line 54) | class LassoWithSGD < RegressionMethodBase
method train (line 104) | def self.train(rdd, options={})
FILE: lib/spark/mllib/regression/linear.rb
class Spark::Mllib::LinearRegressionModel (line 55) | class Spark::Mllib::LinearRegressionModel < Spark::Mllib::RegressionModel
type Spark (line 58) | module Spark
type Mllib (line 59) | module Mllib
class LinearRegressionWithSGD (line 60) | class LinearRegressionWithSGD < RegressionMethodBase
method train (line 119) | def self.train(rdd, options={})
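A sketch of the SGD regression flow using only what the listings for regression/common.rb, regression/labeled_point.rb and this file expose; the `weights`/`intercept` readers and array-to-vector coercion are assumptions, and `sc` is again an initialised Spark::Context.

    data = sc.parallelize([
      Spark::Mllib::LabeledPoint.new(0.0, [0.0]),
      Spark::Mllib::LabeledPoint.new(1.0, [1.0]),
      Spark::Mllib::LabeledPoint.new(2.0, [2.0]),
      Spark::Mllib::LabeledPoint.new(3.0, [3.0])
    ])

    model = Spark::Mllib::LinearRegressionWithSGD.train(data)   # the options hash is optional

    model.weights        # assumed reader for RegressionModel#initialize(weights, intercept)
    model.intercept      # assumed reader
    model.predict([4.0]) # RegressionModel#predict(data)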
FILE: lib/spark/mllib/regression/ridge.rb
class Spark::Mllib::RidgeRegressionModel (line 46) | class Spark::Mllib::RidgeRegressionModel < Spark::Mllib::RegressionModel
type Spark (line 49) | module Spark
type Mllib (line 50) | module Mllib
class RidgeRegressionWithSGD (line 51) | class RidgeRegressionWithSGD < RegressionMethodBase
method train (line 101) | def self.train(rdd, options={})
FILE: lib/spark/mllib/ruby_matrix/matrix_adapter.rb
type Spark (line 3) | module Spark
type Mllib (line 4) | module Mllib
class MatrixAdapter (line 5) | class MatrixAdapter < ::Matrix
method new (line 7) | def self.new(*args)
method initialize (line 23) | def initialize(type, rows, cols, values=nil)
method shape (line 43) | def shape
method values (line 47) | def values
FILE: lib/spark/mllib/ruby_matrix/vector_adapter.rb
class Vector (line 5) | class Vector
method elements (line 6) | def self.elements(array, copy=true)
type Spark (line 11) | module Spark
type Mllib (line 12) | module Mllib
class VectorAdapter (line 13) | class VectorAdapter < ::Vector
method new (line 15) | def self.new(*args)
method initialize (line 21) | def initialize(*args)
method []= (line 34) | def []=(index, value)
method dot (line 38) | def dot(other)
method squared_distance (line 46) | def squared_distance(other)
method values (line 51) | def values
FILE: lib/spark/mllib/vector.rb
type Spark (line 1) | module Spark
type Mllib (line 2) | module Mllib
type Vectors (line 3) | module Vectors
function dense (line 5) | def self.dense(*args)
function sparse (line 9) | def self.sparse(*args)
function parse (line 13) | def self.parse(data)
function to_vector (line 23) | def self.to_vector(data)
class VectorBase (line 38) | class VectorBase < VectorAdapter
class DenseVector (line 57) | class DenseVector < VectorBase
method initialize (line 59) | def initialize(values)
method parse (line 67) | def self.parse(data)
method to_s (line 86) | def to_s
method to_java (line 90) | def to_java
method from_java (line 94) | def self.from_java(object)
method marshal_dump (line 98) | def marshal_dump
method marshal_load (line 102) | def marshal_load(array)
class SparseVector (line 127) | class SparseVector < VectorBase
method initialize (line 131) | def initialize(arg1, arg2=nil, arg3=nil)
method parse (line 151) | def self.parse(data)
method to_s (line 171) | def to_s
method marshal_dump (line 175) | def marshal_dump
method marshal_load (line 179) | def marshal_load(array)
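A short sketch of the vector classes; the three-argument SparseVector form (size, indices, values) is an assumption inferred from initialize(arg1, arg2=nil, arg3=nil).

    dense  = Spark::Mllib::DenseVector.new([1.0, 0.0, 3.0])
    sparse = Spark::Mllib::SparseVector.new(3, [0, 2], [1.0, 3.0])  # assumed argument order

    other = Spark::Mllib::DenseVector.new([2.0, 1.0, 0.0])
    dense.dot(other)               # VectorAdapter#dot
    dense.squared_distance(other)  # VectorAdapter#squared_distance
    dense.values                   # plain Ruby array of the elements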
FILE: lib/spark/rdd.rb
type Spark (line 1) | module Spark
class RDD (line 7) | class RDD
method initialize (line 27) | def initialize(jrdd, context, serializer, deserializer=nil)
method inspect (line 37) | def inspect
method + (line 54) | def +(other)
method add_command (line 62) | def add_command(klass, *args)
method add_library (line 72) | def add_library(*libraries)
method bind (line 89) | def bind(objects)
method new_rdd_from_command (line 98) | def new_rdd_from_command(klass, *args)
method config (line 107) | def config
method default_reduce_partitions (line 111) | def default_reduce_partitions
method partitions_size (line 116) | def partitions_size
method id (line 121) | def id
method cache (line 126) | def cache
method persist (line 136) | def persist(new_level)
method unpersist (line 147) | def unpersist(blocking=true)
method cached? (line 153) | def cached?
method checkpointed? (line 157) | def checkpointed?
method name (line 163) | def name
method set_name (line 170) | def set_name(value)
method name= (line 175) | def name=(value)
method to_java (line 179) | def to_java
method collect (line 199) | def collect(as_enum=false)
method collect_from_file (line 212) | def collect_from_file(file, as_enum=false)
method collect_as_hash (line 232) | def collect_as_hash
method take (line 247) | def take(count)
method first (line 290) | def first
method reduce (line 301) | def reduce(f)
method fold (line 318) | def fold(zero_value, f)
method aggregate (line 342) | def aggregate(zero_value, seq_op, comb_op)
method max (line 353) | def max
method min (line 364) | def min
method sum (line 375) | def sum
method count (line 386) | def count
method stats (line 394) | def stats
method mean (line 404) | def mean
method variance (line 414) | def variance
method stdev (line 424) | def stdev
method sample_stdev (line 436) | def sample_stdev
method sample_variance (line 447) | def sample_variance
method histogram (line 476) | def histogram(buckets)
method foreach (line 576) | def foreach(f, options={})
method foreach_partition (line 588) | def foreach_partition(f, options={})
method map (line 604) | def map(f)
method flat_map (line 616) | def flat_map(f)
method map_partitions (line 627) | def map_partitions(f)
method map_partitions_with_index (line 639) | def map_partitions_with_index(f, options={})
method filter (line 650) | def filter(f)
method compact (line 661) | def compact
method glom (line 672) | def glom
method coalesce (line 683) | def coalesce(num_partitions)
method cartesian (line 705) | def cartesian(other)
method distinct (line 720) | def distinct
method shuffle (line 733) | def shuffle(seed=nil)
method union (line 747) | def union(other)
method reserialize (line 765) | def reserialize(new_serializer)
method intersection (line 785) | def intersection(other)
method partition_by (line 802) | def partition_by(num_partitions, partition_func=nil)
method sample (line 822) | def sample(with_replacement, fraction, seed=nil)
method take_sample (line 837) | def take_sample(with_replacement, num, seed=nil)
method pipe (line 913) | def pipe(*cmds)
method reduce_by_key (line 931) | def reduce_by_key(f, num_partitions=nil)
method combine_by_key (line 959) | def combine_by_key(create_combiner, merge_value, merge_combiners, nu...
method group_by (line 974) | def group_by(f, num_partitions=nil)
method group_by_key (line 989) | def group_by_key(num_partitions=nil)
method fold_by_key (line 1007) | def fold_by_key(zero_value, f, num_partitions=nil)
method aggregate_by_key (line 1026) | def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions...
method group_with (line 1043) | def group_with(other, num_partitions=nil)
method cogroup (line 1057) | def cogroup(*others)
method subtract_by_key (line 1075) | def subtract_by_key(other, num_partitions=nil)
method subtract (line 1094) | def subtract(other, num_partitions=nil)
method sort_by_key (line 1109) | def sort_by_key(ascending=true, num_partitions=nil)
method sort_by_value (line 1120) | def sort_by_value(ascending=true, num_partitions=nil)
method sort_by (line 1139) | def sort_by(key_function=nil, ascending=true, num_partitions=nil)
method key_by (line 1190) | def key_by(f)
method map_values (line 1204) | def map_values(f)
method flat_map_values (line 1218) | def flat_map_values(f)
method keys (line 1229) | def keys
method values (line 1240) | def values
method lookup (line 1258) | def lookup(key)
method _reduce (line 1301) | def _reduce(klass, seq_op, comb_op, zero_value=nil)
method _partition_by (line 1320) | def _partition_by(num_partitions, klass, *args)
method _combine_by_key (line 1341) | def _combine_by_key(combine, merge, num_partitions)
class PipelinedRDD (line 1365) | class PipelinedRDD < RDD
method initialize (line 1369) | def initialize(prev, command)
method pipelinable? (line 1386) | def pipelinable?
method jrdd (line 1391) | def jrdd
method _jrdd (line 1397) | def _jrdd
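The RDD listing above is the core of the gem's public API. A hedged end-to-end sketch follows; the Spark.start / Spark.sc entry points and Spark::Context#parallelize are assumptions taken from the project README rather than from this listing, and functions are passed as arguments (lambdas) as the map(f) / filter(f) signatures suggest.

    require 'ruby-spark'

    Spark.start          # assumed bootstrap from lib/spark.rb
    sc = Spark.sc        # assumed accessor for the Spark::Context

    rdd = sc.parallelize(1..10)
    rdd.map(lambda { |x| x * 2 })
       .filter(lambda { |x| x > 5 })
       .collect

    # Pair-RDD helpers use the same "pass the function as an argument" style
    pairs = sc.parallelize([['a', 1], ['b', 2], ['a', 3]])
    pairs.reduce_by_key(lambda { |a, b| a + b }).collect_as_hash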
FILE: lib/spark/sampler.rb
type Spark (line 4) | module Spark
type RandomGenerator (line 5) | module RandomGenerator
class Poisson (line 6) | class Poisson
method initialize (line 8) | def initialize(mean, seed)
method rand (line 13) | def rand
type Sampler (line 32) | module Sampler
class Base (line 34) | class Base
method initialize (line 37) | def initialize(fraction, seed=nil)
class Poisson (line 45) | class Poisson < Base
method sample (line 47) | def sample(iterator)
method lazy_sample (line 57) | def lazy_sample(iterator)
method rng (line 64) | def rng
class Uniform (line 72) | class Uniform < Base
method sample (line 74) | def sample(iterator)
method lazy_sample (line 79) | def lazy_sample(iterator)
method rng (line 85) | def rng
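A small sketch of the samplers; passing a plain Array as the iterator and the exact meaning of the fraction argument are assumptions.

    uniform = Spark::Sampler::Uniform.new(0.5, 42)   # fraction, seed
    uniform.sample((1..100).to_a)                    # keeps each item with probability ~0.5

    poisson = Spark::Sampler::Poisson.new(2.0, 42)   # assumed mean for with-replacement sampling
    poisson.sample((1..100).to_a)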
FILE: lib/spark/serializer.rb
type Spark (line 1) | module Spark
type Serializer (line 5) | module Serializer
function register (line 31) | def self.register(*args)
function find (line 40) | def self.find(name)
function find! (line 44) | def self.find!(name)
function build (line 54) | def self.build(text=nil, &block)
FILE: lib/spark/serializer/auto_batched.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class AutoBatched (line 8) | class AutoBatched < Batched
method initialize (line 12) | def initialize(serializer, best_size=65536)
method batched? (line 19) | def batched?
method unbatch! (line 23) | def unbatch!
method name (line 26) | def name
method dump_to_io (line 30) | def dump_to_io(data, io)
FILE: lib/spark/serializer/base.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Base (line 4) | class Base
method load_from_io (line 6) | def load_from_io(io)
method load_from_file (line 17) | def load_from_file(file, *args)
method == (line 28) | def ==(other)
method batched? (line 32) | def batched?
method unbatch! (line 36) | def unbatch!
method check_each (line 39) | def check_each(data)
method error (line 45) | def error(message)
method name (line 49) | def name
method to_s (line 53) | def to_s
method inspect (line 57) | def inspect
FILE: lib/spark/serializer/batched.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Batched (line 3) | class Batched < Base
method initialize (line 7) | def initialize(serializer, batch_size=nil)
method batched? (line 17) | def batched?
method unbatch! (line 21) | def unbatch!
method load (line 25) | def load(data)
method dump (line 29) | def dump(data)
method name (line 33) | def name
method to_s (line 37) | def to_s
method dump_to_io (line 44) | def dump_to_io(data, io)
method load_from_io (line 62) | def load_from_io(io)
FILE: lib/spark/serializer/cartesian.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Cartesian (line 3) | class Cartesian < Pair
method aggregate (line 5) | def aggregate(item1, item2)
FILE: lib/spark/serializer/compressed.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Compressed (line 3) | class Compressed < Base
method initialize (line 5) | def initialize(serializer)
method dump (line 9) | def dump(data)
method load (line 13) | def load(data)
FILE: lib/spark/serializer/marshal.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Marshal (line 3) | class Marshal < Base
method dump (line 5) | def dump(data)
method load (line 9) | def load(data)
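The serializers compose by wrapping one another, as the constructors above suggest; a hedged round-trip sketch:

    marshal    = Spark::Serializer::Marshal.new
    batched    = Spark::Serializer::Batched.new(marshal, 1024)   # serializer, batch_size
    compressed = Spark::Serializer::Compressed.new(marshal)

    payload = compressed.dump([1, 2, 3])
    compressed.load(payload)   # assumed to round-trip back to [1, 2, 3]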
FILE: lib/spark/serializer/message_pack.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class MessagePack (line 3) | class MessagePack < Base
method dump (line 5) | def dump(data)
method load (line 9) | def load(data)
FILE: lib/spark/serializer/oj.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Oj (line 3) | class Oj < Base
method dump (line 5) | def dump(data)
method load (line 9) | def load(data)
FILE: lib/spark/serializer/pair.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Pair (line 3) | class Pair < Base
method initialize (line 5) | def initialize(serializer1, serializer2)
method to_s (line 10) | def to_s
method aggregate (line 14) | def aggregate(item1, item2)
method load_from_io (line 18) | def load_from_io(io)
FILE: lib/spark/serializer/text.rb
type Spark (line 1) | module Spark
type Serializer (line 2) | module Serializer
class Text (line 3) | class Text < Base
method initialize (line 7) | def initialize(encoding=Encoding::UTF_8)
method load (line 13) | def load(data)
method to_s (line 17) | def to_s
FILE: lib/spark/sort.rb
type Spark (line 1) | module Spark
type InternalSorter (line 2) | module InternalSorter
class Base (line 3) | class Base
method initialize (line 4) | def initialize(key_function)
class Ascending (line 9) | class Ascending < Base
method sort (line 10) | def sort(data)
class Descending (line 15) | class Descending < Ascending
method sort (line 16) | def sort(data)
function get (line 22) | def self.get(ascending, key_function)
class ExternalSorter (line 36) | class ExternalSorter
method initialize (line 61) | def initialize(total_memory, serializer)
method add_memory! (line 68) | def add_memory!
method sort_by (line 72) | def sort_by(iterator, ascending=true, key_function=KEY_FUNCTION)
method create_temp_folder (line 134) | def create_temp_folder
method destroy_temp_folder (line 138) | def destroy_temp_folder
method make_parts (line 144) | def make_parts(iterator, internal_sorter)
FILE: lib/spark/sql.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
FILE: lib/spark/sql/column.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class Column (line 3) | class Column
method to_java (line 8) | def self.to_java(col)
method from_literal (line 16) | def self.from_literal(literal)
method from_name (line 20) | def self.from_name(name)
method when (line 42) | def self.when(condition, value)
method initialize (line 52) | def initialize(jcolumn)
method get_item (line 134) | def get_item(key)
method get_field (line 155) | def get_field(name)
method substr (line 169) | def substr(start, length)
method isin (line 191) | def isin(*cols)
method alias (line 211) | def alias(name)
method cast (line 224) | def cast(data_type)
method between (line 250) | def between(lower, upper)
method when (line 270) | def when(condition, value)
method otherwise (line 295) | def otherwise(value)
method over (line 304) | def over(*)
method method_missing (line 308) | def method_missing(method, item)
method to_s (line 312) | def to_s
method inspect (line 316) | def inspect
method func_op (line 327) | def func_op(name)
method bin_op (line 332) | def bin_op(name, item)
method unary_op (line 343) | def unary_op(name)
FILE: lib/spark/sql/context.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class Context (line 3) | class Context
method initialize (line 7) | def initialize(spark_context)
method read (line 12) | def read
FILE: lib/spark/sql/data_frame.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class DataFrame (line 8) | class DataFrame
method initialize (line 12) | def initialize(jdf, sql_context)
method [] (line 32) | def [](item)
method columns (line 55) | def columns
method schema (line 60) | def schema
method show_string (line 70) | def show_string(n=20, truncate=true)
method show (line 80) | def show(n=20, truncate=true)
method print_schema (line 92) | def print_schema
method explain (line 96) | def explain(extended=false)
method print_explain (line 120) | def print_explain(extended=false)
method dtypes (line 130) | def dtypes
method inspect (line 136) | def inspect
method method_missing (line 145) | def method_missing(method, *args, &block)
method collect (line 164) | def collect
method collect_as_hash (line 168) | def collect_as_hash
method values (line 174) | def values
method count (line 183) | def count
method take (line 188) | def take(num)
method first (line 193) | def first
method select (line 219) | def select(*cols)
method filter (line 243) | def filter(condition)
method limit (line 257) | def limit(num)
FILE: lib/spark/sql/data_frame_reader.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class DataFrameReader (line 3) | class DataFrameReader
method initialize (line 7) | def initialize(sql_context)
method df (line 12) | def df(jdf)
method format (line 18) | def format(source)
method option (line 24) | def option(key, value)
method options (line 30) | def options(options)
method load (line 45) | def load(path=nil, new_format=nil, new_schema=nil, new_options=nil)
method schema (line 65) | def schema(new_schema)
method json (line 89) | def json(path, new_schema=nil)
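A hedged sketch of the SQL entry points; the JSON path is illustrative, `sc` is an initialised Spark::Context, and passing a column name as a plain string to select is an assumption (Column.from_name suggests it is supported).

    sql = Spark::SQL::Context.new(sc)     # SQL::Context#initialize(spark_context)
    df  = sql.read.json('people.json')    # Context#read -> DataFrameReader#json

    df.columns
    df.select('age').show(5)
    df.first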
FILE: lib/spark/sql/data_type.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class DataType (line 6) | class DataType
method parse (line 14) | def self.parse(data)
method class_name (line 33) | def self.class_name
method type_name (line 37) | def self.type_name
method complex (line 41) | def self.complex
method atomic (line 45) | def self.atomic
method == (line 49) | def ==(other)
method type_name (line 53) | def type_name
method simple_string (line 57) | def simple_string
method json_value (line 61) | def json_value
method json (line 65) | def json
method to_s (line 69) | def to_s
method inspect (line 73) | def inspect
class StructType (line 91) | class StructType < DataType
method from_json (line 96) | def self.from_json(json)
method initialize (line 104) | def initialize(fields=[])
method json_value (line 109) | def json_value
method to_s (line 116) | def to_s
class StructField (line 125) | class StructField < DataType
method from_json (line 129) | def self.from_json(json)
method initialize (line 147) | def initialize(name, data_type, nullable=true, metadata={})
method json_value (line 154) | def json_value
method to_s (line 163) | def to_s
class AtomicType (line 174) | class AtomicType < DataType
class BooleanType (line 182) | class BooleanType < AtomicType
class NumericType (line 191) | class NumericType < AtomicType
class IntegralType (line 200) | class IntegralType < NumericType
class StringType (line 209) | class StringType < AtomicType
class LongType (line 222) | class LongType < IntegralType
FILE: lib/spark/sql/row.rb
type Spark (line 1) | module Spark
type SQL (line 2) | module SQL
class Row (line 6) | class Row
method from_java (line 9) | def self.from_java(object, with_schema=true)
method initialize (line 35) | def initialize(data={})
method [] (line 39) | def [](item)
method to_h (line 43) | def to_h
method inspect (line 47) | def inspect
FILE: lib/spark/stat_counter.rb
type Spark (line 1) | module Spark
class StatCounter (line 2) | class StatCounter
method initialize (line 10) | def initialize(iterator)
method merge (line 20) | def merge(other)
method sum (line 32) | def sum
method variance (line 37) | def variance
method sample_variance (line 47) | def sample_variance
method stdev (line 56) | def stdev
method sample_stdev (line 62) | def sample_stdev
method to_s (line 66) | def to_s
method merge_stat_counter (line 79) | def merge_stat_counter(other)
method merge_array (line 109) | def merge_array(array)
method merge_value (line 115) | def merge_value(value)
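StatCounter is presumably the object returned by RDD#stats; a sketch assuming a plain Array is acceptable as the iterator argument.

    stats = Spark::StatCounter.new([1, 2, 3, 4, 5])
    stats.sum
    stats.stdev
    stats.merge(Spark::StatCounter.new([6, 7, 8]))   # merge also accepts another StatCounter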
FILE: lib/spark/storage_level.rb
type Spark (line 4) | module Spark
class StorageLevel (line 5) | class StorageLevel
method reload (line 7) | def self.reload
method reload! (line 13) | def self.reload!
method java_get (line 28) | def self.java_get(arg)
FILE: lib/spark/version.rb
type Spark (line 1) | module Spark
FILE: lib/spark/worker/master.rb
type Master (line 18) | module Master
function create (line 20) | def self.create
class Base (line 29) | class Base
method initialize (line 32) | def initialize
method run (line 38) | def run
method receive_message (line 47) | def receive_message
method kill_worker_and_wait (line 60) | def kill_worker_and_wait
class Process (line 72) | class Process < Base
method create_worker (line 74) | def create_worker
method kill_worker (line 87) | def kill_worker
method fork? (line 94) | def fork?
method _fork? (line 98) | def _fork?
class Thread (line 113) | class Thread < Base
method initialize (line 115) | def initialize
method create_worker (line 125) | def create_worker
method kill_worker (line 131) | def kill_worker
FILE: lib/spark/worker/spark_files.rb
class SparkFiles (line 1) | class SparkFiles
method get (line 7) | def self.get(file_name)
method get_content (line 11) | def self.get_content(file_name)
FILE: lib/spark/worker/worker.rb
type Worker (line 18) | module Worker
class Base (line 19) | class Base
method initialize (line 27) | def initialize(port)
method run (line 35) | def run
method before_start (line 47) | def before_start
method before_end (line 51) | def before_end
method compute (line 57) | def compute
method send_error (line 88) | def send_error(e)
method successful_finish (line 114) | def successful_finish
method log (line 131) | def log(message=nil)
class Process (line 143) | class Process < Base
method id (line 145) | def id
method before_start (line 151) | def before_start
method kill_worker (line 155) | def kill_worker
class Thread (line 164) | class Thread < Base
method id (line 166) | def id
method load_command (line 172) | def load_command
method load_iterator (line 178) | def load_iterator
method kill_worker (line 189) | def kill_worker
FILE: spec/generator.rb
class Generator (line 1) | class Generator
method numbers (line 2) | def self.numbers(size=1000)
method numbers_with_zero (line 6) | def self.numbers_with_zero(size=1000)
method words (line 10) | def self.words(size=1000)
method word (line 14) | def self.word(size=10)
method lines (line 18) | def self.lines(size=1000, letters=3)
method hash (line 26) | def self.hash(size=1000)
method hash_with_values (line 32) | def self.hash_with_values(size=1000, values_count=10)
FILE: spec/lib/command_spec.rb
function to_s_method (line 3) | def to_s_method(x)
FILE: spec/lib/filter_spec.rb
function func4 (line 3) | def func4(item)
function rdd_numbers (line 48) | def rdd_numbers(workers)
function rdd_words (line 52) | def rdd_words(workers)
function rdd_numbers (line 68) | def rdd_numbers(workers)
function rdd_words (line 72) | def rdd_words(workers)
FILE: spec/lib/flat_map_spec.rb
function rdd (line 62) | def rdd(workers)
function rdd (line 77) | def rdd(workers)
function rdd (line 93) | def rdd(workers)
FILE: spec/lib/group_spec.rb
function make_result (line 39) | def make_result(*hashes)
function rdd_result (line 54) | def rdd_result(workers)
function rdd_1 (line 73) | def rdd_1(workers)
function rdd_2 (line 77) | def rdd_2(workers)
function rdd_3 (line 81) | def rdd_3(workers)
function rdd_numbers (line 98) | def rdd_numbers(workers)
function rdd_words (line 102) | def rdd_words(workers)
FILE: spec/lib/key_spec.rb
function rdd_numbers (line 28) | def rdd_numbers(workers)
function rdd_words (line 32) | def rdd_words(workers)
FILE: spec/lib/map_partitions_spec.rb
function func3 (line 3) | def func3(x)
function func4_with_index (line 7) | def func4_with_index(data, index)
function rdd (line 66) | def rdd(workers)
function rdd (line 80) | def rdd(workers)
FILE: spec/lib/map_spec.rb
function rdd (line 55) | def rdd(workers)
function rdd (line 68) | def rdd(workers)
function rdd (line 84) | def rdd(workers)
FILE: spec/lib/reduce_by_key_spec.rb
function flat_map (line 3) | def flat_map(line)
function map (line 7) | def map(item)
function reduce (line 11) | def reduce(x,y)
function rdd (line 65) | def rdd(workers)
function rdd (line 78) | def rdd(workers)
function fold_by_key (line 104) | def fold_by_key(num_partitions=nil)
FILE: spec/lib/reduce_spec.rb
function longest_words (line 3) | def longest_words(memo, word)
function rdd_numbers (line 99) | def rdd_numbers(workers)
function rdd_lines (line 103) | def rdd_lines(workers)
function rdd_numbers (line 120) | def rdd_numbers(workers)
function rdd_lines (line 124) | def rdd_lines(workers)
FILE: spec/lib/sample_spec.rb
function rdd (line 39) | def rdd(workers)
FILE: spec/lib/sort_spec.rb
function rdd (line 34) | def rdd(workers)
function rdd (line 50) | def rdd(workers)
FILE: spec/lib/whole_text_files_spec.rb
function rdd (line 26) | def rdd(workers)
FILE: spec/spec_helper.rb
function spark_start (line 14) | def spark_start
function windows? (line 23) | def windows?
Condensed preview — 191 files, each showing path, character count, and a content snippet (482K chars of full structured content).
[
{
"path": ".gitignore",
"chars": 421,
"preview": "/.gemtags\n/.tags\n/java/spark.jar\n.jbundler\ntarget/*\n*.class\n*.jar\npom.xml\nvendor/*\n*.gem\n*.rbc\n.bundle\n.config\n.yardoc\nG"
},
{
"path": ".travis.yml",
"chars": 208,
"preview": "language: ruby\n\nrvm:\n - 2.2.0\n\nbefore_script:\n - bundle exec rake compile\n - bundle exec ruby bin/ruby-spark build\n\nc"
},
{
"path": "CHANGELOG.md",
"chars": 323,
"preview": "## Unreleased\n\n## 1.3.0\n\n - new method on RDD (lookup)\n - fix sbt url\n - Spark 1.5.0\n\n## 1.2.0 (15.06.2015)\n\n - targ"
},
{
"path": "Gemfile",
"chars": 956,
"preview": "source 'https://rubygems.org'\n\ngemspec\n\ngem 'sourcify', '0.6.0.rc4'\ngem 'method_source'\ngem 'commander'\ngem 'pry'\ngem 'n"
},
{
"path": "Guardfile",
"chars": 176,
"preview": "guard :rspec, cmd: 'rspec' do\n watch(%r{^spec/.+_spec\\.rb$})\n watch(%r{^lib/(.+)\\.rb$}) { |m| \"spec/lib/#{m[1]}_sp"
},
{
"path": "LICENSE.txt",
"chars": 1072,
"preview": "Copyright (c) 2014 Ondřej Moravčík\n\nMIT License\n\nPermission is hereby granted, free of charge, to any person obtaining\na"
},
{
"path": "README.md",
"chars": 9474,
"preview": "# Ruby-Spark [](https://travis-ci.org/ondra-m"
},
{
"path": "Rakefile",
"chars": 623,
"preview": "#-*- mode: ruby -*-\n\nrequire \"bundler/gem_tasks\"\nrequire \"rspec/core/rake_task\"\n\nRSpec::Core::RakeTask.new\n\ntask default"
},
{
"path": "TODO.md",
"chars": 356,
"preview": "- refactor JavaBridge\n - to_java, from_java\n - every type should have class\n - automatic registration\n- add Streaming"
},
{
"path": "benchmark/aggregate.rb",
"chars": 550,
"preview": "require 'benchmark'\nrequire 'benchmark/ips'\n\ndata = 0..1_000_000\nzero_value = rand(100_000)\nfunction = Proc.new{|sum, n|"
},
{
"path": "benchmark/bisect.rb",
"chars": 1541,
"preview": "require \"benchmark\"\n\ndef bisect_left1(a, x, opts={})\n return nil if a.nil?\n return 0 if a.empty?\n\n lo = (opts[:lo] ||"
},
{
"path": "benchmark/comparison/prepare.sh",
"chars": 422,
"preview": "#!/usr/bin/env bash\n\n# Current dir\ncd \"$(dirname \"$0\")\"\n\n# Exit immediately if a pipeline returns a non-zero status.\nset"
},
{
"path": "benchmark/comparison/python.py",
"chars": 3233,
"preview": "import os\nimport math\nfrom time import time\nfrom random import random\nfrom operator import add\nfrom pyspark import Spark"
},
{
"path": "benchmark/comparison/r.r",
"chars": 1429,
"preview": "library(SparkR)\nsc <- sparkR.init(master=\"local[*]\")\n\nlogFile <- file(Sys.getenv(\"R_LOG\"), \"w\")\n\nlogInfo <- function(..."
},
{
"path": "benchmark/comparison/ruby.rb",
"chars": 3150,
"preview": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../../lib')\n$LOAD_PATH.unshift(lib) if File.direc"
},
{
"path": "benchmark/comparison/run-all.sh",
"chars": 2841,
"preview": "#!/usr/bin/env bash\n\n# Current dir\ncd \"$(dirname \"$0\")\"\n\n# Exit immediately if a pipeline returns a non-zero status.\nset"
},
{
"path": "benchmark/comparison/scala.scala",
"chars": 4419,
"preview": "import java.io._\nimport scala.math\nimport scala.io.Source\nimport org.apache.spark._\n\nobject Scala {\n\n val logFile = new"
},
{
"path": "benchmark/custom_marshal.rb",
"chars": 1519,
"preview": "require 'benchmark'\nrequire 'benchmark/ips'\n\ndef pack_int(data)\n [data].pack('l>')\nend\n\ndef pack_long(data)\n [data].pa"
},
{
"path": "benchmark/digest.rb",
"chars": 3210,
"preview": "lib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PAT"
},
{
"path": "benchmark/enumerator.rb",
"chars": 1424,
"preview": "require \"benchmark\"\n\nclass Enumerator\n def defer(&blk)\n self.class.new do |y|\n each do |*input|\n blk.cal"
},
{
"path": "benchmark/serializer.rb",
"chars": 1978,
"preview": "require \"benchmark\"\nrequire \"yaml\"\nrequire \"msgpack\"\nrequire \"oj\"\n# require \"thrift\"\n \nputs \"Simple\"\n\ndata = (0..100000)"
},
{
"path": "benchmark/sort.rb",
"chars": 1356,
"preview": "require \"benchmark\"\n\narray = []\n1000.times { \n array << {:bar => rand(1000)} \n}\n\nn = 500\nBenchmark.bm(20) do |x|\n x.re"
},
{
"path": "benchmark/sort2.rb",
"chars": 3542,
"preview": "require \"benchmark\"\nrequire \"algorithms\"\n\nNUMBER_OF_SORTING = 1\nNUMBER_OF_ARRAY = 10\nWORDS_IN_ARRAY = 100000\nMAX_WO"
},
{
"path": "benchmark/take.rb",
"chars": 445,
"preview": "require \"benchmark\"\n\nSIZE = 100_000_000\n\n@array1 = (0..SIZE).to_a;\n@array2 = (0..SIZE).to_a;\n@array3 = (0..SIZE).to_a;\n\n"
},
{
"path": "bin/ruby-spark",
"chars": 199,
"preview": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.director"
},
{
"path": "example/pi.rb",
"chars": 482,
"preview": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.director"
},
{
"path": "example/website_search.rb",
"chars": 1655,
"preview": "#!/usr/bin/env ruby\n\n# Parse sitemap and search word on every page\n\nrequire 'optparse'\nrequire 'open-uri'\nrequire 'nokog"
},
{
"path": "ext/ruby_c/extconf.rb",
"chars": 50,
"preview": "require 'mkmf'\n\ncreate_makefile(\"ruby_spark_ext\")\n"
},
{
"path": "ext/ruby_c/murmur.c",
"chars": 3441,
"preview": "#include \"murmur.h\"\n\n#if defined(_MSC_VER)\n#define BIG_CONSTANT(x) (x)\n#else\n#define BIG_CONSTANT(x) (x##LLU)\n#endif\n\n/*"
},
{
"path": "ext/ruby_c/murmur.h",
"chars": 205,
"preview": "#ifndef MURMUR_INCLUDED\n#define MURMUR_INCLUDED\n\n#include \"ruby.h\"\n\nVALUE method_portable_hash(int argc, VALUE *argv, VA"
},
{
"path": "ext/ruby_c/ruby-spark.c",
"chars": 556,
"preview": "#include \"ruby.h\"\n#include \"murmur.h\"\n\n\nVALUE SparkModule;\nVALUE SparkDigestModule;\nVALUE SparkDigestMurmur2Class;\n\n\nvoi"
},
{
"path": "ext/ruby_java/Digest.java",
"chars": 990,
"preview": "import org.jruby.Ruby;\nimport org.jruby.RubyModule;\nimport org.jruby.RubyObject;\nimport org.jruby.RubyClass;\nimport org."
},
{
"path": "ext/ruby_java/Murmur2.java",
"chars": 2700,
"preview": "import org.jruby.Ruby;\nimport org.jruby.RubyClass;\nimport org.jruby.RubyObject;\nimport org.jruby.RubyString;\nimport org."
},
{
"path": "ext/ruby_java/RubySparkExtService.java",
"chars": 988,
"preview": "import org.jruby.Ruby;\nimport org.jruby.RubyClass;\nimport org.jruby.RubyModule;\nimport org.jruby.runtime.ObjectAllocator"
},
{
"path": "ext/ruby_java/extconf.rb",
"chars": 50,
"preview": "require 'mkmf'\n\ncreate_makefile(\"ruby_spark_ext\")\n"
},
{
"path": "ext/spark/build.sbt",
"chars": 3127,
"preview": "import AssemblyKeys._\n\nassemblySettings\n\n// Default values\nval defaultScalaVersion = \"2.10.4\"\nval defaultSparkVersio"
},
{
"path": "ext/spark/project/plugins.sbt",
"chars": 408,
"preview": "resolvers += Resolver.url(\"artifactory\", url(\"http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases\"))(Reso"
},
{
"path": "ext/spark/sbt/sbt",
"chars": 1377,
"preview": "#!/bin/bash\n\n# This script launches sbt for this project. If present it uses the system\n# version of sbt. If there is no"
},
{
"path": "ext/spark/src/main/scala/Exec.scala",
"chars": 2412,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io.{File, FileOutputStream, InputStreamReader, BufferedReader}\n\nimport sc"
},
{
"path": "ext/spark/src/main/scala/MLLibAPI.scala",
"chars": 124,
"preview": "package org.apache.spark.mllib.api.python\n\n// PythonMLLibAPI is private for python\nclass MLLibAPI extends PythonMLLibAPI"
},
{
"path": "ext/spark/src/main/scala/Marshal.scala",
"chars": 1452,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, Byte"
},
{
"path": "ext/spark/src/main/scala/MarshalDump.scala",
"chars": 2560,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, Byte"
},
{
"path": "ext/spark/src/main/scala/MarshalLoad.scala",
"chars": 5419,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, Byte"
},
{
"path": "ext/spark/src/main/scala/RubyAccumulatorParam.scala",
"chars": 2284,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io._\nimport java.net._\nimport java.util.{List, ArrayList}\n\nimport scala.c"
},
{
"path": "ext/spark/src/main/scala/RubyBroadcast.scala",
"chars": 433,
"preview": "package org.apache.spark.api.ruby\n\nimport org.apache.spark.api.python.PythonBroadcast\n\n/**\n * An Wrapper for Ruby Broadc"
},
{
"path": "ext/spark/src/main/scala/RubyConstant.scala",
"chars": 289,
"preview": "package org.apache.spark.api.ruby\n\nobject RubyConstant {\n val DATA_EOF = -2\n val WORKER_ERROR = -1\n val WORKER_DONE ="
},
{
"path": "ext/spark/src/main/scala/RubyMLLibAPI.scala",
"chars": 1799,
"preview": "package org.apache.spark.mllib.api.ruby\n\nimport java.util.ArrayList\n\nimport scala.collection.JavaConverters._\n\nimport or"
},
{
"path": "ext/spark/src/main/scala/RubyMLLibUtilAPI.scala",
"chars": 560,
"preview": "package org.apache.spark.mllib.api.ruby\n\nimport java.util.ArrayList\n\nimport org.apache.spark.mllib.util.LinearDataGenera"
},
{
"path": "ext/spark/src/main/scala/RubyPage.scala",
"chars": 861,
"preview": "package org.apache.spark.ui.ruby\n\n// import javax.servlet.http.HttpServletRequest\n\n// import scala.xml.Node\n\n// import o"
},
{
"path": "ext/spark/src/main/scala/RubyRDD.scala",
"chars": 13384,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io._\nimport java.net._\nimport java.util.{List, ArrayList, Collections}\n\ni"
},
{
"path": "ext/spark/src/main/scala/RubySerializer.scala",
"chars": 462,
"preview": "package org.apache.spark.api.ruby\n\nimport scala.collection.JavaConverters._\nimport scala.reflect.{ClassTag, classTag}\n\ni"
},
{
"path": "ext/spark/src/main/scala/RubyTab.scala",
"chars": 287,
"preview": "package org.apache.spark.ui.ruby\n\nimport scala.collection.mutable.HashMap\n\nimport org.apache.spark.ui._\n\n// class RubyTa"
},
{
"path": "ext/spark/src/main/scala/RubyUtils.scala",
"chars": 334,
"preview": "package org.apache.spark.api.ruby\n\nimport org.apache.spark.util._\nimport org.apache.spark.{SparkConf, Logging}\n\nobject R"
},
{
"path": "ext/spark/src/main/scala/RubyWorker.scala",
"chars": 8385,
"preview": "package org.apache.spark.api.ruby\n\nimport java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStrea"
},
{
"path": "ext/spark/src/test/scala/MarshalSpec.scala",
"chars": 2065,
"preview": "package org.apache.spark.api.ruby.marshal\n\nimport org.scalatest._\n\n\nimport org.apache.spark.api.ruby.marshal._\n\nclass Ma"
},
{
"path": "lib/ruby-spark.rb",
"chars": 25,
"preview": "require_relative 'spark'\n"
},
{
"path": "lib/spark/accumulator.rb",
"chars": 6087,
"preview": "module Spark\n ##\n # A shared variable that can be accumulated, i.e., has a commutative and associative \"add\"\n # opera"
},
{
"path": "lib/spark/broadcast.rb",
"chars": 2505,
"preview": "module Spark\n ##\n # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast\n # object for reading"
},
{
"path": "lib/spark/build.rb",
"chars": 1339,
"preview": "module Spark\n module Build\n\n DEFAULT_SCALA_VERSION = '2.10.4'\n DEFAULT_CORE_VERSION = '2.10'\n DEFAULT_SPARK"
},
{
"path": "lib/spark/cli.rb",
"chars": 5488,
"preview": "require 'commander'\n\nmodule Commander\n module UI\n # Disable paging\n # for 'classic' help\n def self.enable_pagi"
},
{
"path": "lib/spark/command/base.rb",
"chars": 3561,
"preview": "##\n# Spark::Command::Base\n#\n# Parent for all commands (Map, FlatMap, Sort, ...)\n#\nclass Spark::Command::Base\n\n DEFAULT_"
},
{
"path": "lib/spark/command/basic.rb",
"chars": 7381,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command/pair.rb",
"chars": 2602,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command/sort.rb",
"chars": 1230,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command/statistic.rb",
"chars": 3154,
"preview": "_Base = Spark::Command::Base\n\n# ----------------------------------------------------------------------------------------"
},
{
"path": "lib/spark/command.rb",
"chars": 2147,
"preview": "module Spark\n ##\n # Container which includes all commands and other things for worker\n # Every RDD have own copy of C"
},
{
"path": "lib/spark/command_builder.rb",
"chars": 3742,
"preview": "require 'spark/command_validator'\n\nmodule Spark\n ##\n # Builder for building correct {Spark::Command}\n #\n class Comma"
},
{
"path": "lib/spark/command_validator.rb",
"chars": 707,
"preview": "module Spark\n module CommandValidator\n\n def validate(value, options)\n validate_type(value, options[:type])\n "
},
{
"path": "lib/spark/config.rb",
"chars": 5537,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n # Common configuration for RubySpark and Spark\n class Config\n\n "
},
{
"path": "lib/spark/constant.rb",
"chars": 293,
"preview": "module Spark\n # Commond constant for Ruby and Spark\n module Constant\n DATA_EOF = -2\n WORKER_ERROR = -1\n WORKE"
},
{
"path": "lib/spark/context.rb",
"chars": 10104,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n ##\n # Main entry point for Spark functionality. A SparkContext rep"
},
{
"path": "lib/spark/error.rb",
"chars": 1000,
"preview": "module Spark\n # Extension cannot be built\n class BuildError < StandardError\n end\n\n # Proc.to_source\n # Java object "
},
{
"path": "lib/spark/ext/hash.rb",
"chars": 1130,
"preview": "module Spark\n module CoreExtension\n module Hash\n module ClassMethods\n end\n\n module InstanceMethods\n "
},
{
"path": "lib/spark/ext/integer.rb",
"chars": 594,
"preview": "module Spark\n module CoreExtension\n module Integer\n module ClassMethods\n end\n\n module InstanceMethods"
},
{
"path": "lib/spark/ext/io.rb",
"chars": 1321,
"preview": "module Spark\n module CoreExtension\n module IO\n module ClassMethods\n end\n\n module InstanceMethods\n\n "
},
{
"path": "lib/spark/ext/ip_socket.rb",
"chars": 495,
"preview": "module Spark\n module CoreExtension\n module IPSocket\n module ClassMethods\n end\n\n module InstanceMethod"
},
{
"path": "lib/spark/ext/module.rb",
"chars": 1362,
"preview": "module Spark\n module CoreExtension\n module Module\n\n # Patch method to class unless already exist\n #\n "
},
{
"path": "lib/spark/ext/object.rb",
"chars": 1996,
"preview": "module Spark\n module CoreExtension\n module Object\n module ClassMethods\n end\n\n module InstanceMethods\n"
},
{
"path": "lib/spark/ext/string.rb",
"chars": 539,
"preview": "module Spark\n module CoreExtension\n module String\n module ClassMethods\n end\n\n module InstanceMethods\n"
},
{
"path": "lib/spark/helper/logger.rb",
"chars": 885,
"preview": "module Spark\n module Helper\n module Logger\n\n def self.included(base)\n base.send :extend, Methods\n "
},
{
"path": "lib/spark/helper/parser.rb",
"chars": 2061,
"preview": "module Spark\n module Helper\n module Parser\n \n def self.included(base)\n base.send :extend, Methods\n"
},
{
"path": "lib/spark/helper/serialize.rb",
"chars": 1536,
"preview": "module Spark\n module Helper\n module Serialize\n\n DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'\n DIRECTIVE_INTEGERS_BI"
},
{
"path": "lib/spark/helper/statistic.rb",
"chars": 3005,
"preview": "module Spark\n module Helper\n module Statistic\n\n # Returns a sampling rate that guarantees a sample of size >= s"
},
{
"path": "lib/spark/helper/system.rb",
"chars": 909,
"preview": "module Spark\n module Helper\n module System\n\n def self.included(base)\n base.send :extend, Methods\n "
},
{
"path": "lib/spark/helper.rb",
"chars": 330,
"preview": "module Spark\n module Helper\n autoload :System, \"spark/helper/system\"\n autoload :Logger, \"spark/helper/logge"
},
{
"path": "lib/spark/java_bridge/base.rb",
"chars": 6399,
"preview": "##\n# Spark::JavaBridge::Base\n#\n# Parent for all adapter (ruby - java)\n#\nmodule Spark\n module JavaBridge\n class Base\n"
},
{
"path": "lib/spark/java_bridge/jruby.rb",
"chars": 432,
"preview": "require 'java'\n\nmodule Spark\n module JavaBridge\n class JRuby < Base\n\n def initialize(*args)\n super\n "
},
{
"path": "lib/spark/java_bridge/rjb.rb",
"chars": 702,
"preview": "if !ENV.has_key?('JAVA_HOME')\n raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set'\nend\n\nrequir"
},
{
"path": "lib/spark/java_bridge.rb",
"chars": 355,
"preview": "module Spark\n module JavaBridge\n\n autoload :Base, 'spark/java_bridge/base'\n autoload :JRuby, 'spark/java_bridge/"
},
{
"path": "lib/spark/library.rb",
"chars": 450,
"preview": "module Spark\n module Library\n\n def autoload(klass, location, import=true)\n if import\n @for_importing ||="
},
{
"path": "lib/spark/logger.rb",
"chars": 1268,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n class Logger\n\n attr_reader :jlogger\n\n def initialize\n @j"
},
{
"path": "lib/spark/mllib/classification/common.rb",
"chars": 541,
"preview": "module Spark\n module Mllib\n class ClassificationModel\n\n attr_reader :weights, :intercept, :threshold\n\n def"
},
{
"path": "lib/spark/mllib/classification/logistic_regression.rb",
"chars": 6916,
"preview": "module Spark\n module Mllib\n ##\n # LogisticRegressionModel\n #\n # A linear binary classification model derive"
},
{
"path": "lib/spark/mllib/classification/naive_bayes.rb",
"chars": 2777,
"preview": "module Spark\n module Mllib\n ##\n # NaiveBayesModel\n #\n # Model for Naive Bayes classifiers.\n #\n # Cont"
},
{
"path": "lib/spark/mllib/classification/svm.rb",
"chars": 4150,
"preview": "module Spark\n module Mllib\n ##\n # SVMModel\n #\n # A support vector machine.\n #\n # == Examples:\n #\n "
},
{
"path": "lib/spark/mllib/clustering/gaussian_mixture.rb",
"chars": 2205,
"preview": "module Spark\n module Mllib\n ##\n # GaussianMixtureModel\n #\n # A clustering model derived from the Gaussian M"
},
{
"path": "lib/spark/mllib/clustering/kmeans.rb",
"chars": 3394,
"preview": "module Spark\n module Mllib\n ##\n # KMeansModel\n #\n # A clustering model derived from the k-means method.\n "
},
{
"path": "lib/spark/mllib/matrix.rb",
"chars": 2422,
"preview": "module Spark\n module Mllib\n module Matrices\n\n def self.dense(*args)\n DenseMatrix.new(*args)\n end\n\n "
},
{
"path": "lib/spark/mllib/regression/common.rb",
"chars": 1769,
"preview": "module Spark\n module Mllib\n ##\n # RegressionModel\n #\n # A linear model that has a vector of coefficients an"
},
{
"path": "lib/spark/mllib/regression/labeled_point.rb",
"chars": 767,
"preview": "module Spark\n module Mllib\n ##\n # LabeledPoint\n #\n # The features and labels of a data point.\n #\n # ="
},
{
"path": "lib/spark/mllib/regression/lasso.rb",
"chars": 3706,
"preview": "##\n# LassoModel\n#\n# Train a regression model with L1-regularization using Stochastic Gradient Descent.\n# This solves the"
},
{
"path": "lib/spark/mllib/regression/linear.rb",
"chars": 4268,
"preview": "##\n# LinearRegressionModel\n#\n# Train a linear regression model with no regularization using Stochastic Gradient Descent."
},
{
"path": "lib/spark/mllib/regression/ridge.rb",
"chars": 3716,
"preview": "##\n# RidgeRegressionModel\n#\n# Train a regression model with L2-regularization using Stochastic Gradient Descent.\n# This "
},
{
"path": "lib/spark/mllib/ruby_matrix/matrix_adapter.rb",
"chars": 1139,
"preview": "require 'matrix'\n\nmodule Spark\n module Mllib\n class MatrixAdapter < ::Matrix\n\n def self.new(*args)\n obje"
},
{
"path": "lib/spark/mllib/ruby_matrix/vector_adapter.rb",
"chars": 1027,
"preview": "require 'matrix'\n\n# Based on ruby 2.1\n\nclass Vector\n def self.elements(array, copy=true)\n DenseVector.new(convert_to"
},
{
"path": "lib/spark/mllib/stat/distribution.rb",
"chars": 460,
"preview": "##\n# MultivariateGaussian\n#\n# This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution."
},
{
"path": "lib/spark/mllib/vector.rb",
"chars": 4035,
"preview": "module Spark\n module Mllib\n module Vectors\n\n def self.dense(*args)\n DenseVector.new(*args)\n end\n\n "
},
{
"path": "lib/spark/mllib.rb",
"chars": 3300,
"preview": "module Spark\n # MLlib is Spark’s scalable machine learning library consisting of common learning algorithms and utiliti"
},
{
"path": "lib/spark/rdd.rb",
"chars": 42715,
"preview": "module Spark\n ##\n # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,\n "
},
{
"path": "lib/spark/sampler.rb",
"chars": 1761,
"preview": "require 'distribution'\n\n# Random Generators\nmodule Spark\n module RandomGenerator\n class Poisson\n\n def initializ"
},
{
"path": "lib/spark/serializer/auto_batched.rb",
"chars": 1314,
"preview": "module Spark\n module Serializer\n ##\n # AutoBatched serializator\n #\n # Batch size is computed automatically."
},
{
"path": "lib/spark/serializer/base.rb",
"chars": 1144,
"preview": "module Spark\n module Serializer\n # @abstract Parent for all serializers\n class Base\n\n def load_from_io(io)\n "
},
{
"path": "lib/spark/serializer/batched.rb",
"chars": 1651,
"preview": "module Spark\n module Serializer\n class Batched < Base\n\n attr_writer :serializer\n\n def initialize(serialize"
},
{
"path": "lib/spark/serializer/cartesian.rb",
"chars": 224,
"preview": "module Spark\n module Serializer\n class Cartesian < Pair\n\n def aggregate(item1, item2)\n item1.product(ite"
},
{
"path": "lib/spark/serializer/compressed.rb",
"chars": 503,
"preview": "module Spark\n module Serializer\n class Compressed < Base\n\n def initialize(serializer)\n @serializer = ser"
},
{
"path": "lib/spark/serializer/marshal.rb",
"chars": 266,
"preview": "module Spark\n module Serializer\n class Marshal < Base\n\n def dump(data)\n ::Marshal.dump(data)\n end\n\n"
},
{
"path": "lib/spark/serializer/message_pack.rb",
"chars": 417,
"preview": "module Spark\n module Serializer\n class MessagePack < Base\n\n def dump(data)\n ::MessagePack.dump(data)\n "
},
{
"path": "lib/spark/serializer/oj.rb",
"chars": 328,
"preview": "module Spark\n module Serializer\n class Oj < Base\n\n def dump(data)\n ::Oj.dump(data)\n end\n\n def "
},
{
"path": "lib/spark/serializer/pair.rb",
"chars": 928,
"preview": "module Spark\n module Serializer\n class Pair < Base\n\n def initialize(serializer1, serializer2)\n @serializ"
},
{
"path": "lib/spark/serializer/text.rb",
"chars": 483,
"preview": "module Spark\n module Serializer\n class Text < Base\n\n attr_reader :encoding\n\n def initialize(encoding=Encod"
},
{
"path": "lib/spark/serializer.rb",
"chars": 1773,
"preview": "module Spark\n ##\n # Serializer\n #\n module Serializer\n\n DEFAULT_COMPRESS = false\n DEFAULT_BATCH_SIZE = 1024\n "
},
{
"path": "lib/spark/sort.rb",
"chars": 4802,
"preview": "module Spark\n module InternalSorter\n class Base\n def initialize(key_function)\n @key_function = key_funct"
},
{
"path": "lib/spark/sql/column.rb",
"chars": 9904,
"preview": "module Spark\n module SQL\n class Column\n\n # ===================================================================="
},
{
"path": "lib/spark/sql/context.rb",
"chars": 316,
"preview": "module Spark\n module SQL\n class Context\n\n attr_reader :spark_context, :jsql_context\n\n def initialize(spark"
},
{
"path": "lib/spark/sql/data_frame.rb",
"chars": 6695,
"preview": "module Spark\n module SQL\n ##\n # Spark::SQL::DataFrame\n #\n # All example are base on people.json\n #\n c"
},
{
"path": "lib/spark/sql/data_frame_reader.rb",
"chars": 2943,
"preview": "module Spark\n module SQL\n class DataFrameReader\n\n attr_reader :sql_context, :jreader\n\n def initialize(sql_"
},
{
"path": "lib/spark/sql/data_type.rb",
"chars": 4685,
"preview": "module Spark\n module SQL\n ##\n # Spark::SQL::DataType\n #\n class DataType\n\n cattr_accessor :atomic_types"
},
{
"path": "lib/spark/sql/row.rb",
"chars": 1097,
"preview": "module Spark\n module SQL\n ##\n # Spark::SQL::Row\n #\n class Row\n attr_reader :data\n\n def self.from_"
},
{
"path": "lib/spark/sql.rb",
"chars": 821,
"preview": "module Spark\n module SQL\n extend Spark::Library\n\n autoload_without_import :Context, 'spark/sql/context'\n "
},
{
"path": "lib/spark/stat_counter.rb",
"chars": 2853,
"preview": "module Spark\n class StatCounter\n\n attr_reader :n # count of our values\n attr_reader :mu # mean of our values\n "
},
{
"path": "lib/spark/storage_level.rb",
"chars": 1237,
"preview": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n class StorageLevel\n\n def self.reload\n return if @reloaded\n "
},
{
"path": "lib/spark/version.rb",
"chars": 37,
"preview": "module Spark\n VERSION = '1.2.1'\nend\n"
},
{
"path": "lib/spark/worker/master.rb",
"chars": 2871,
"preview": "#!/usr/bin/env ruby\n\n$PROGRAM_NAME = 'RubySparkMaster'\n\nrequire 'socket'\nrequire 'io/wait'\nrequire 'nio'\n\nrequire_relati"
},
{
"path": "lib/spark/worker/spark_files.rb",
"chars": 224,
"preview": "class SparkFiles\n\n class << self\n attr_accessor :root_directory\n end\n\n def self.get(file_name)\n File.join(root_"
},
{
"path": "lib/spark/worker/worker.rb",
"chars": 4264,
"preview": "#!/usr/bin/env ruby\n\n# Load root of the gem\nlib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__))\n$LOAD_"
},
{
"path": "lib/spark.rb",
"chars": 6566,
"preview": "# Gems and libraries\nrequire 'method_source'\nrequire 'securerandom'\nrequire 'forwardable'\nrequire 'sourcify'\nrequire 'so"
},
{
"path": "ruby-spark.gemspec",
"chars": 1328,
"preview": "# coding: utf-8\n\nlib = File.expand_path('../lib', __FILE__)\n$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)\n\nreq"
},
{
"path": "spec/generator.rb",
"chars": 758,
"preview": "class Generator\n def self.numbers(size=1000)\n Array.new(size){ rand(1..1000) }\n end\n\n def self.numbers_with_zero(s"
},
{
"path": "spec/inputs/lorem_300.txt",
"chars": 30820,
"preview": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ligula neque, ultricies et lorem\nvel, accumsan cursus fe"
},
{
"path": "spec/inputs/numbers/1.txt",
"chars": 141,
"preview": "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n"
},
{
"path": "spec/inputs/numbers/10.txt",
"chars": 200,
"preview": "451\n452\n453\n454\n455\n456\n457\n458\n459\n460\n461\n462\n463\n464\n465\n466\n467\n468\n469\n470\n471\n472\n473\n474\n475\n476\n477\n478\n479\n480\n"
},
{
"path": "spec/inputs/numbers/11.txt",
"chars": 200,
"preview": "501\n502\n503\n504\n505\n506\n507\n508\n509\n510\n511\n512\n513\n514\n515\n516\n517\n518\n519\n520\n521\n522\n523\n524\n525\n526\n527\n528\n529\n530\n"
},
{
"path": "spec/inputs/numbers/12.txt",
"chars": 200,
"preview": "551\n552\n553\n554\n555\n556\n557\n558\n559\n560\n561\n562\n563\n564\n565\n566\n567\n568\n569\n570\n571\n572\n573\n574\n575\n576\n577\n578\n579\n580\n"
},
{
"path": "spec/inputs/numbers/13.txt",
"chars": 200,
"preview": "601\n602\n603\n604\n605\n606\n607\n608\n609\n610\n611\n612\n613\n614\n615\n616\n617\n618\n619\n620\n621\n622\n623\n624\n625\n626\n627\n628\n629\n630\n"
},
{
"path": "spec/inputs/numbers/14.txt",
"chars": 200,
"preview": "651\n652\n653\n654\n655\n656\n657\n658\n659\n660\n661\n662\n663\n664\n665\n666\n667\n668\n669\n670\n671\n672\n673\n674\n675\n676\n677\n678\n679\n680\n"
},
{
"path": "spec/inputs/numbers/15.txt",
"chars": 200,
"preview": "701\n702\n703\n704\n705\n706\n707\n708\n709\n710\n711\n712\n713\n714\n715\n716\n717\n718\n719\n720\n721\n722\n723\n724\n725\n726\n727\n728\n729\n730\n"
},
{
"path": "spec/inputs/numbers/16.txt",
"chars": 200,
"preview": "751\n752\n753\n754\n755\n756\n757\n758\n759\n760\n761\n762\n763\n764\n765\n766\n767\n768\n769\n770\n771\n772\n773\n774\n775\n776\n777\n778\n779\n780\n"
},
{
"path": "spec/inputs/numbers/17.txt",
"chars": 200,
"preview": "801\n802\n803\n804\n805\n806\n807\n808\n809\n810\n811\n812\n813\n814\n815\n816\n817\n818\n819\n820\n821\n822\n823\n824\n825\n826\n827\n828\n829\n830\n"
},
{
"path": "spec/inputs/numbers/18.txt",
"chars": 200,
"preview": "851\n852\n853\n854\n855\n856\n857\n858\n859\n860\n861\n862\n863\n864\n865\n866\n867\n868\n869\n870\n871\n872\n873\n874\n875\n876\n877\n878\n879\n880\n"
},
{
"path": "spec/inputs/numbers/19.txt",
"chars": 200,
"preview": "901\n902\n903\n904\n905\n906\n907\n908\n909\n910\n911\n912\n913\n914\n915\n916\n917\n918\n919\n920\n921\n922\n923\n924\n925\n926\n927\n928\n929\n930\n"
},
{
"path": "spec/inputs/numbers/2.txt",
"chars": 151,
"preview": "51\n52\n53\n54\n55\n56\n57\n58\n59\n60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n90\n"
},
{
"path": "spec/inputs/numbers/20.txt",
"chars": 201,
"preview": "951\n952\n953\n954\n955\n956\n957\n958\n959\n960\n961\n962\n963\n964\n965\n966\n967\n968\n969\n970\n971\n972\n973\n974\n975\n976\n977\n978\n979\n980\n"
},
{
"path": "spec/inputs/numbers/3.txt",
"chars": 200,
"preview": "101\n102\n103\n104\n105\n106\n107\n108\n109\n110\n111\n112\n113\n114\n115\n116\n117\n118\n119\n120\n121\n122\n123\n124\n125\n126\n127\n128\n129\n130\n"
},
{
"path": "spec/inputs/numbers/4.txt",
"chars": 200,
"preview": "151\n152\n153\n154\n155\n156\n157\n158\n159\n160\n161\n162\n163\n164\n165\n166\n167\n168\n169\n170\n171\n172\n173\n174\n175\n176\n177\n178\n179\n180\n"
},
{
"path": "spec/inputs/numbers/5.txt",
"chars": 200,
"preview": "201\n202\n203\n204\n205\n206\n207\n208\n209\n210\n211\n212\n213\n214\n215\n216\n217\n218\n219\n220\n221\n222\n223\n224\n225\n226\n227\n228\n229\n230\n"
},
{
"path": "spec/inputs/numbers/6.txt",
"chars": 200,
"preview": "251\n252\n253\n254\n255\n256\n257\n258\n259\n260\n261\n262\n263\n264\n265\n266\n267\n268\n269\n270\n271\n272\n273\n274\n275\n276\n277\n278\n279\n280\n"
},
{
"path": "spec/inputs/numbers/7.txt",
"chars": 200,
"preview": "301\n302\n303\n304\n305\n306\n307\n308\n309\n310\n311\n312\n313\n314\n315\n316\n317\n318\n319\n320\n321\n322\n323\n324\n325\n326\n327\n328\n329\n330\n"
},
{
"path": "spec/inputs/numbers/8.txt",
"chars": 200,
"preview": "351\n352\n353\n354\n355\n356\n357\n358\n359\n360\n361\n362\n363\n364\n365\n366\n367\n368\n369\n370\n371\n372\n373\n374\n375\n376\n377\n378\n379\n380\n"
},
{
"path": "spec/inputs/numbers/9.txt",
"chars": 200,
"preview": "401\n402\n403\n404\n405\n406\n407\n408\n409\n410\n411\n412\n413\n414\n415\n416\n417\n418\n419\n420\n421\n422\n423\n424\n425\n426\n427\n428\n429\n430\n"
},
{
"path": "spec/inputs/numbers_0_100.txt",
"chars": 293,
"preview": "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n4"
},
{
"path": "spec/inputs/numbers_1_100.txt",
"chars": 291,
"preview": "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n"
},
{
"path": "spec/inputs/people.json",
"chars": 12535,
"preview": "{\"id\":1,\"name\":\"Matthew Fuller\",\"age\":49,\"email\":\"mfuller0@blogger.com\",\"active\":false}\n{\"id\":2,\"name\":\"Pamela Thomas\",\""
},
{
"path": "spec/lib/collect_spec.rb",
"chars": 859,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::RDD do\n\n let(:mapping) { lambda{|x| [x, 1]} }\n let(:numbers) { Generator."
},
{
"path": "spec/lib/command_spec.rb",
"chars": 1495,
"preview": "require 'spec_helper'\n\ndef to_s_method(x)\n x.to_s\nend\n\nRSpec::describe Spark::CommandBuilder do\n let(:numbers) { Gener"
},
{
"path": "spec/lib/config_spec.rb",
"chars": 1159,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::Config do\n\n before(:context) do\n Spark.stop\n end\n\n after(:context) do"
},
{
"path": "spec/lib/context_spec.rb",
"chars": 4617,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::Context do\n\n it '.run_job' do\n workers = 5\n numbers = (0...100).to_a"
},
{
"path": "spec/lib/ext_spec.rb",
"chars": 1031,
"preview": "require 'spec_helper'\n\nRSpec.describe Array do\n\n it '.deep_copy' do\n data = ['a', 'b', 'c']\n new_data = data.dup\n"
},
{
"path": "spec/lib/external_apps_spec.rb",
"chars": 863,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::RDD do\n\n context '.pipe' do\n let(:words) { Generator.words }\n let("
},
{
"path": "spec/lib/filter_spec.rb",
"chars": 2079,
"preview": "require 'spec_helper'\n\ndef func4(item)\n item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106\nend\n\nRSpec.sha"
},
{
"path": "spec/lib/flat_map_spec.rb",
"chars": 2793,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a flat mapping' do |workers|\n it \"with #{workers || 'default'} worker\" do"
},
{
"path": "spec/lib/group_spec.rb",
"chars": 2769,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a groupping by key' do |workers|\n it \"with #{workers || 'default'} worker"
},
{
"path": "spec/lib/helper_spec.rb",
"chars": 2616,
"preview": "require 'spec_helper'\n\nRSpec.configure do |c|\n c.include Spark::Helper::Parser\n c.include Spark::Helper::Statistic\nend"
},
{
"path": "spec/lib/key_spec.rb",
"chars": 1393,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a keying by' do |workers|\n it \"with #{workers || 'default'} worker\" do\n "
},
{
"path": "spec/lib/manipulation_spec.rb",
"chars": 3261,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::RDD' do\n let(:numbers) { 1..100 }\n let(:rand_numbers) { Generator.number"
},
{
"path": "spec/lib/map_partitions_spec.rb",
"chars": 2254,
"preview": "require 'spec_helper'\n\ndef func3(x)\n x.map(&:to_i).reduce(:+)\nend\n\ndef func4_with_index(data, index)\n [{\n index => "
},
{
"path": "spec/lib/map_spec.rb",
"chars": 2468,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a mapping' do |workers|\n it \"with #{workers || 'default'} worker\" do\n "
},
{
"path": "spec/lib/mllib/classification_spec.rb",
"chars": 1280,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib classification' do\n\n let(:data1) do\n [\n LabeledPoint.new(0."
},
{
"path": "spec/lib/mllib/clustering_spec.rb",
"chars": 1005,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib clustering' do\n context 'kmeans' do\n it 'test' do\n data = ["
},
{
"path": "spec/lib/mllib/matrix_spec.rb",
"chars": 854,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib::Matrix' do\n context 'dense' do\n it 'construct' do\n values "
},
{
"path": "spec/lib/mllib/regression_spec.rb",
"chars": 3223,
"preview": "require 'spec_helper'\n\n# Mllib functions are tested on Spark\n# This just test if ruby call proper methods\n\nRSpec.describ"
},
{
"path": "spec/lib/mllib/vector_spec.rb",
"chars": 2236,
"preview": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib::Vector' do\n\n context 'parsing' do\n it 'dense vector' do\n d"
},
{
"path": "spec/lib/reduce_by_key_spec.rb",
"chars": 2864,
"preview": "require 'spec_helper'\n\ndef flat_map(line)\n line.split\nend\n\ndef map(item)\n [item, 1]\nend\n\ndef reduce(x,y)\n x+y\nend\n\nRS"
},
{
"path": "spec/lib/reduce_spec.rb",
"chars": 3346,
"preview": "require 'spec_helper'\n\ndef longest_words(memo, word)\n memo.length > word.length ? memo : word\nend\n\nRSpec.shared_example"
},
{
"path": "spec/lib/sample_spec.rb",
"chars": 1164,
"preview": "require 'spec_helper'\n\n# Sample method can not be tested because of random generator\n# Just test it for raising error\n\nR"
},
{
"path": "spec/lib/serializer_spec.rb",
"chars": 4027,
"preview": "require 'spec_helper'\nrequire 'zlib'\n\nRSpec.describe Spark::Serializer do\n let(:data) { [1, 'test', 2.0, [3], {key: 'va"
},
{
"path": "spec/lib/sort_spec.rb",
"chars": 1479,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a sorting' do |workers|\n it \"with #{workers || 'default'} worker\" do\n "
},
{
"path": "spec/lib/sql/column_spec.rb",
"chars": 3459,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'binary comparison' do |op|\n it \"#{op}\" do\n to_test = 20\n\n result = "
},
{
"path": "spec/lib/sql/data_frame_spec.rb",
"chars": 2501,
"preview": "require 'spec_helper'\n\nRSpec.describe Spark::SQL::DataFrame do\n\n let(:file) { File.join('spec', 'inputs', 'people.json'"
},
{
"path": "spec/lib/statistic_spec.rb",
"chars": 5828,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a stats' do |workers|\n let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }\n\n"
},
{
"path": "spec/lib/whole_text_files_spec.rb",
"chars": 904,
"preview": "require 'spec_helper'\n\nRSpec.shared_examples 'a whole_text_files' do |workers|\n it \"with #{workers || 'default'} worker"
},
{
"path": "spec/spec_helper.rb",
"chars": 694,
"preview": "require 'simplecov'\nSimpleCov.start\n\n$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'\nrequire 'ruby-spark'\nrequire "
}
]
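Each entry in the listing above is a JSON object with three fields: path, chars (the file's size in characters), and preview (a truncated snippet of its content). As a minimal sketch — assuming the array has been saved locally under the hypothetical name files.json — the index can be read with plain Ruby to answer simple questions such as which files dominate the repository:

require 'json'

# Load the condensed-preview index; 'files.json' is an assumed local filename.
entries = JSON.parse(File.read('files.json'))

# Print the ten largest files by character count.
entries
  .sort_by { |entry| -entry['chars'] }
  .first(10)
  .each { |entry| puts format('%-50s %8d chars', entry['path'], entry['chars']) }

With the data shown above, lib/spark/rdd.rb (42,715 characters) would top the list, followed by spec/inputs/lorem_300.txt and ext/spark/src/main/scala/RubyRDD.scala.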