[
  {
    "path": ".gitignore",
    "content": "/.gemtags\n/.tags\n/java/spark.jar\n.jbundler\ntarget/*\n*.class\n*.jar\npom.xml\nvendor/*\n*.gem\n*.rbc\n.bundle\n.config\n.yardoc\nGemfile.lock\nInstalledFiles\n_yardoc\ncoverage\ndoc/\nlib/bundler/man\npkg\nrdoc\nspec/reports\ntest/tmp\ntest/version_tmp\ntmp\n*.bundle\n*.so\n*.o\n*.a\nmkmf.log\next/spark/target/*\next/spark/project/target/*\next/spark/project/project/target/*\nwiki\n/benchmark/performance/spark/*\n/benchmark/performance/rspark/*\n/_*\n"
  },
  {
    "path": ".travis.yml",
    "content": "language: ruby\n\nrvm:\n  - 2.2.0\n\nbefore_script:\n  - bundle exec rake compile\n  - bundle exec ruby bin/ruby-spark build\n\ncache:\n  bundler: true\n  directories:\n    - $HOME/.m2\n    - $HOME/.ivy2\n    - $HOME/.sbt\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "## Unreleased\n\n## 1.3.0\n\n  - new method on RDD (lookup)\n  - fix sbt url\n  - Spark 1.5.0\n\n## 1.2.0 (15.06.2015)\n\n  - target folder is now located at HOME\n  - better serializators\n  - error when java class does not exist\n  - default setting at ~/.ruby-spark.conf\n  - compatible with Spark 1.4.0\n  - added calling site to RDD\n"
  },
  {
    "path": "Gemfile",
    "content": "source 'https://rubygems.org'\n\ngemspec\n\ngem 'sourcify', '0.6.0.rc4'\ngem 'method_source'\ngem 'commander'\ngem 'pry'\ngem 'nio4r'\ngem 'distribution'\n\nplatform :mri do\n  gem 'rjb'\n  gem 'msgpack'\n  gem 'oj'\n  gem 'narray'\nend\n\nplatform :jruby do\n  gem 'msgpack-jruby', require: 'msgpack'\n\n  # NameError: no constructorfor arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime\n  # gem 'mdarray'\nend\n\ngroup :stats do\n  # gem 'nmatrix'\n  # gem 'statsample'\n  # gem 'statsample-glm'\n  # gem 'statsample-timeseries'\n  # gem 'statistics2'\n  # gem 'statsample-optimization' # libgsl0-dev\n  # gem 'narray'\n  # gem 'gsl-nmatrix'\nend\n\ngroup :development do\n  gem 'benchmark-ips'\n  gem 'rspec'\n  gem 'rake-compiler'\n  gem 'guard'\n  gem 'guard-rspec'\n  gem 'listen'\nend\n\ngroup :test do\n  gem 'simplecov', require: false\nend\n"
  },
  {
    "path": "Guardfile",
    "content": "guard :rspec, cmd: 'rspec' do\n  watch(%r{^spec/.+_spec\\.rb$})\n  watch(%r{^lib/(.+)\\.rb$})     { |m| \"spec/lib/#{m[1]}_spec.rb\" }\n  watch('spec/spec_helper.rb')  { \"spec\" }\nend\n"
  },
  {
    "path": "LICENSE.txt",
    "content": "Copyright (c) 2014 Ondřej Moravčík\n\nMIT License\n\nPermission is hereby granted, free of charge, to any person obtaining\na copy of this software and associated documentation files (the\n\"Software\"), to deal in the Software without restriction, including\nwithout limitation the rights to use, copy, modify, merge, publish,\ndistribute, sublicense, and/or sell copies of the Software, and to\npermit persons to whom the Software is furnished to do so, subject to\nthe following conditions:\n\nThe above copyright notice and this permission notice shall be\nincluded in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\nEXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\nNONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE\nLIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION\nOF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION\nWITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Ruby-Spark [![Build Status](https://travis-ci.org/ondra-m/ruby-spark.svg?branch=master)](https://travis-ci.org/ondra-m/ruby-spark)\n\nApache Spark™ is a fast and general engine for large-scale data processing.\n\nThis Gem allows the use Spark functionality on Ruby.\n\n> Word count in Spark's Ruby API\n\n```ruby\nfile = spark.text_file(\"hdfs://...\")\n\nfile.flat_map(:split)\n    .map(lambda{|word| [word, 1]})\n    .reduce_by_key(lambda{|a, b| a+b})\n```\n\n- [Apache Spark](http://spark.apache.org)\n- [Wiki](https://github.com/ondra-m/ruby-spark/wiki)\n- [Rubydoc](http://www.rubydoc.info/gems/ruby-spark)\n\n## Installation\n\n### Requirments\n\n- Java 7+\n- Ruby 2+\n- wget or curl\n- MRI or JRuby\n\nAdd this line to your application's Gemfile:\n\n```ruby\ngem 'ruby-spark'\n```\n\nAnd then execute:\n\n```\n$ bundle\n```\n\nOr install it yourself as:\n\n```\n$ gem install ruby-spark\n```\n\nRun `rake compile` if you are using gem from local filesystem.\n\n### Build Apache Spark\n\nThis command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars will be stored at you HOME directory.\n\n```\n$ ruby-spark build\n```\n\n\n## Usage\n\nYou can use Ruby Spark via interactive shell (Pry is used)\n\n```\n$ ruby-spark shell\n```\n\nOr on existing project.\n\nIf you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.\n\n```ruby\nrequire 'ruby-spark'\n\n# Configuration\nSpark.config do\n   set_app_name \"RubySpark\"\n   set 'spark.ruby.serializer', 'oj'\n   set 'spark.ruby.serializer.batch_size', 100\nend\n\n# Start Apache Spark\nSpark.start\n\n# Context reference\nSpark.sc\n```\n\nFinally, to stop the cluster. On the shell is Spark stopped automatically when environment exit.\n\n```ruby\nSpark.stop\n```\nAfter first use, global configuration is created at **~/.ruby-spark.conf**. There can be specified properties for Spark and RubySpark.\n\n\n\n## Creating RDD (a new collection)\n\nSingle text file:\n\n```ruby\nrdd = sc.text_file(FILE, workers_num, serializer=nil)\n```\n\nAll files on directory:\n\n```ruby\nrdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)\n```\n\nDirect uploading structures from ruby:\n\n```ruby\nrdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)\nrdd = sc.parallelize(1..5, workers_num, serializer=nil)\n```\n\nThere is 2 conditions:\n1. choosen serializer must be able to serialize it\n2. data must be iterable\n\nIf you do not specified serializer -> default is used (defined from spark.ruby.serializer.* options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want create custom serializer.\n\n## Operations\n\nAll operations can be divided into 2 groups:\n\n- **Transformations:** append new operation to current RDD and return new\n- **Actions:** add operation and start calculations\n\nMore informations:\n\n- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)\n- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)\n- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)\n\nYou can also check official Spark documentation. 
## Operations\n\nAll operations can be divided into two groups:\n\n- **Transformations:** append a new operation to the current RDD and return a new RDD\n- **Actions:** add an operation and start the calculation\n\nMore information:\n\n- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)\n- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)\n- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)\n\nYou can also check the official Spark documentation; first make sure the method is implemented here.\n\n- [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)\n- [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)\n\n
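Transformations are lazy: they only build up the operation chain, and nothing is computed until an action is called. A minimal sketch:\n\n```ruby\nrdd = sc.parallelize(0..10, 2)\ndoubled = rdd.map(lambda{|x| x*2}) # transformation: nothing is computed yet\ndoubled.collect                    # action: the computation runs now\n# => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]\n```\n\n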
#### Transformations\n\n<dl>\n  <dt><code>rdd.map(function)</code></dt>\n  <dd>Return a new RDD by applying a function to all elements of this RDD.</dd>\n\n  <dt><code>rdd.flat_map(function)</code></dt>\n  <dd>Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.</dd>\n\n  <dt><code>rdd.map_partitions(function)</code></dt>\n  <dd>Return a new RDD by applying a function to each partition of this RDD.</dd>\n\n  <dt><code>rdd.filter(function)</code></dt>\n  <dd>Return a new RDD containing only the elements that satisfy a predicate.</dd>\n\n  <dt><code>rdd.cartesian(other)</code></dt>\n  <dd>Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.</dd>\n\n  <dt><code>rdd.intersection(other)</code></dt>\n  <dd>Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.</dd>\n\n  <dt><code>rdd.sample(with_replacement, fraction, seed)</code></dt>\n  <dd>Return a sampled subset of this RDD. Operations are based on Poisson and uniform distributions.</dd>\n\n  <dt><code>rdd.group_by_key(num_partitions)</code></dt>\n  <dd>Group the values for each key in the RDD into a single sequence.</dd>\n\n  <dt><a href=\"http://www.rubydoc.info/gems/ruby-spark/Spark/RDD\" target=\"_blank\"><code>...many more...</code></a></dt>\n  <dd></dd>\n</dl>\n\n\n#### Actions\n\n<dl>\n  <dt><code>rdd.take(count)</code></dt>\n  <dd>Take the first <code>count</code> elements of the RDD.</dd>\n\n  <dt><code>rdd.reduce(function)</code></dt>\n  <dd>Reduces the elements of this RDD using the specified lambda or method.</dd>\n\n  <dt><code>rdd.aggregate(zero_value, seq_op, comb_op)</code></dt>\n  <dd>Aggregate the elements of each partition, and then the results for all the partitions, using the given combine functions and a neutral “zero value”.</dd>\n\n  <dt><code>rdd.histogram(buckets)</code></dt>\n  <dd>Compute a histogram using the provided buckets.</dd>\n\n  <dt><code>rdd.collect</code></dt>\n  <dd>Return an array that contains all of the elements in this RDD.</dd>\n\n  <dt><a href=\"http://www.rubydoc.info/gems/ruby-spark/Spark/RDD\" target=\"_blank\"><code>...many more...</code></a></dt>\n  <dd></dd>\n</dl>\n\n\n## Examples\n\n##### Basic methods\n\n```ruby\n# Every batch will be serialized by Marshal and will have size 10\nser = Spark::Serializer.build('batched(marshal, 10)')\n\n# Range 0..100, 2 workers, custom serializer\nrdd = Spark.sc.parallelize(0..100, 2, ser)\n\n\n# Take the first 5 items\nrdd.take(5)\n# => [0, 1, 2, 3, 4]\n\n\n# Reducing numbers\nrdd.reduce(lambda{|sum, x| sum+x})\nrdd.reduce(:+)\nrdd.sum\n# => 5050\n\n\n# Aggregating with a zero value\nseq = lambda{|x,y| x+y}\ncom = lambda{|x,y| x*y}\nrdd.aggregate(1, seq, com)\n# 1. Every worker adds its numbers\n#    => [1226, 3826]\n# 2. Results are multiplied\n#    => 4690676\n\n\n# Statistics method\nrdd.stats\n# => StatCounter: (count, mean, max, min, variance,\n#                  sample_variance, stdev, sample_stdev)\n\n\n# Compute a histogram with the given number of buckets\nrdd.histogram(2)\n# => [[0.0, 50.0, 100], [50, 51]]\n\n\n# Mapping\nrdd.map(lambda {|x| x*2}).collect\n# => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...]\nrdd.map(:to_f).collect\n# => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...]\n\n\n# Mapping over whole partitions\nrdd.map_partitions(lambda{|part| part.reduce(:+)}).collect\n# => [1225, 3825]\n\n\n# Filtering\nrdd.filter(lambda{|x| x.even?}).collect\n# => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...]\n\n\n# Sampling\nrdd.sample(true, 10).collect\n# => [3, 36, 40, 54, 58, 82, 86, 95, 98]\n\n\n# Sampling a fixed number of items\nrdd.take_sample(true, 10)\n# => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32]\n\n\n# Using an external process\nrdd.pipe('cat', \"awk '{print $1*10}'\")\n# => [\"0\", \"10\", \"20\", \"30\", \"40\", \"50\", ...]\n```\n\n##### Word count\n\n```ruby\n# Content:\n# \"first line\"\n# \"second line\"\nrdd = sc.text_file(PATH)\n\n# [\"first\", \"line\", \"second\", \"line\"]\nrdd = rdd.flat_map(lambda{|line| line.split})\n\n# [[\"first\", 1], [\"line\", 1], [\"second\", 1], [\"line\", 1]]\nrdd = rdd.map(lambda{|word| [word, 1]})\n\n# [[\"first\", 1], [\"line\", 2], [\"second\", 1]]\nrdd = rdd.reduce_by_key(lambda{|a, b| a+b})\n\n# {\"first\"=>1, \"line\"=>2, \"second\"=>1}\nrdd.collect_as_hash\n```\n\n##### Estimating Pi with a custom serializer\n\n```ruby\nslices = 3\nn = 100000 * slices\n\ndef map(_)\n  x = rand * 2 - 1\n  y = rand * 2 - 1\n\n  if x**2 + y**2 < 1\n    return 1\n  else\n    return 0\n  end\nend\n\nrdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')\nrdd = rdd.map(method(:map))\n\nputs 'Pi is roughly %f' % (4.0 * rdd.sum / n)\n```\n\n##### Estimating Pi\n\n```ruby\nrdd = sc.parallelize([10_000], 1)\nrdd = rdd.add_library('bigdecimal/math')\nrdd = rdd.map(lambda{|x| BigMath.PI(x)})\nrdd.collect # => [#<BigDecimal, '0.31415926...'>]\n```\n\n### Mllib (Machine Learning Library)\n\nMllib functions use Spark's Machine Learning Library. Ruby objects are serialized and deserialized in Java, so you cannot use custom classes. Only primitive types such as strings or integers are supported.\n\nAll supported methods/models:\n\n- [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib)\n- [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib)\n\n##### Linear regression\n\n```ruby\n# Import Mllib classes into Object\n# Otherwise they are accessible via Spark::Mllib::LinearRegressionWithSGD\nSpark::Mllib.import(Object)\n\n# Training data\ndata = [\n  LabeledPoint.new(0.0, [0.0]),\n  LabeledPoint.new(1.0, [1.0]),\n  LabeledPoint.new(3.0, [2.0]),\n  LabeledPoint.new(2.0, [3.0])\n]\n\n# Train a model\nlrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])\n\nlrm.predict([0.0])\n```\n\n##### K-Means\n\n```ruby\nSpark::Mllib.import\n\n# Dense vectors\ndata = [\n  DenseVector.new([0.0,0.0]),\n  DenseVector.new([1.0,1.0]),\n  DenseVector.new([9.0,8.0]),\n  DenseVector.new([8.0,9.0])\n]\n\nmodel = KMeans.train(sc.parallelize(data), 2)\n\nmodel.predict([0.0, 0.0]) == model.predict([1.0, 1.0])\n# => true\nmodel.predict([8.0, 9.0]) == model.predict([9.0, 8.0])\n# => true\n```\n\n## Benchmarks\n\n"
  },
  {
    "path": "Rakefile",
    "content": "#-*- mode: ruby -*-\n\nrequire \"bundler/gem_tasks\"\nrequire \"rspec/core/rake_task\"\n\nRSpec::Core::RakeTask.new\n\ntask default: :spec\ntask test:    :spec\n\ndef java?\n  RUBY_PLATFORM =~ /java/\nend\n\nif java?\n  require \"rake/javaextensiontask\"\n  Rake::JavaExtensionTask.new(\"ruby_java\") do |ext|\n    ext.name = \"ruby_spark_ext\"\n  end\nelse\n  require \"rake/extensiontask\"\n  Rake::ExtensionTask.new(\"ruby_c\") do |ext|\n    ext.name = \"ruby_spark_ext\"\n  end\nend\n\n\ntask :clean do\n  Dir['lib/*.{jar,o,so}'].each do |path|\n    puts \"Deleting #{path} ...\"\n    File.delete(path)\n  end\n  FileUtils.rm_rf('./pkg')\n  FileUtils.rm_rf('./tmp')\nend\n"
  },
  {
    "path": "TODO.md",
    "content": "- refactor JavaBridge\n  - to_java, from_java\n  - every type should have class\n  - automatic registration\n- add Streaming\n- worker informations (time, memory, ...)\n- killing zombie workers\n- add_rb, add_inline_rb to Spark::{Context, RDD}\n- fix broadcast for cluster\n- dump to disk if there is memory limit\n- Add Partitioner to RDD\n- add NonExist serializer\n"
  },
  {
    "path": "benchmark/aggregate.rb",
    "content": "require 'benchmark'\nrequire 'benchmark/ips'\n\ndata = 0..1_000_000\nzero_value = rand(100_000)\nfunction = Proc.new{|sum, n| sum+n}\n\nBenchmark.ips do |r|  \n  r.report('each') do\n    sum = zero_value\n    data.each do |n|\n      sum += n\n    end\n  end\n\n  r.report('reduce') do\n    data.reduce(zero_value){|sum, n| sum+n}\n  end\n\n  r.report('each with function') do\n    sum = zero_value\n    data.each do |n|\n      sum = function.call(sum, n)\n    end\n  end\n\n  r.report('reduce with function') do\n    data.reduce(zero_value, &function)\n  end\n\n  r.compare!\nend\n\n"
  },
  {
    "path": "benchmark/bisect.rb",
    "content": "require \"benchmark\"\n\ndef bisect_left1(a, x, opts={})\n  return nil if a.nil?\n  return 0 if a.empty?\n\n  lo = (opts[:lo] || opts[:low]).to_i\n  hi = opts[:hi] || opts[:high] || a.length\n\n  while lo < hi\n    mid = (lo + hi) / 2\n    v = a[mid]\n    if v < x\n      lo = mid + 1\n    else\n      hi = mid\n    end\n  end\n  return lo\nend\n\ndef bisect_left2(list, item)\n  count = 0\n  list.each{|i|\n    return count if i >= item\n    count += 1\n  }\n  nil\nend\n\ndef bisect_left3(list, item, lo = 0, hi = list.size)\n  while lo < hi\n    i = (lo + hi - 1) >> 1\n\n    if 0 <= (list[i] <=> item)\n      hi = i\n    else\n      lo = i + 1\n    end\n  end\n  return hi\nend\n\narray = Array.new(1000000) { rand(0..1000000) };\nto_find = Array.new(500) { rand(0..10000) };\n\nBenchmark.bm(20) do |x|\n  x.report(\"bisect_left1\") do\n    to_find.each do |item|\n      bisect_left1(array, item)\n    end\n  end\n\n  x.report(\"bisect_left2\") do\n    to_find.each do |item|\n      bisect_left2(array, item)\n    end\n  end\n\n  x.report(\"bisect_left3\") do\n    to_find.each do |item|\n      bisect_left3(array, item)\n    end\n  end\nend\n\narray = Array.new(100000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join };\nto_find = Array.new(500) { (97+rand(26)).chr };\n\nBenchmark.bm(20) do |x|\n  x.report(\"bisect_left1\") do\n    to_find.each do |item|\n      bisect_left1(array, item)\n    end\n  end\n\n  x.report(\"bisect_left2\") do\n    to_find.each do |item|\n      bisect_left2(array, item)\n    end\n  end\n\n  x.report(\"bisect_left3\") do\n    to_find.each do |item|\n      bisect_left3(array, item)\n    end\n  end\nend\n"
  },
  {
    "path": "benchmark/comparison/prepare.sh",
    "content": "#!/usr/bin/env bash\n\n# Current dir\ncd \"$(dirname \"$0\")\"\n\n# Exit immediately if a pipeline returns a non-zero status.\nset -e\n\n# Spark\nwget \"http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz\" -O spark.tgz\ntar xvzf spark.tgz\nmv spark-1.3.0-bin-hadoop2.4 spark\nrm spark.tgz\n\n# RSpark (only for 1.3.0)\ngit clone git@github.com:amplab-extras/SparkR-pkg.git rspark\ncd rspark\nSPARK_VERSION=1.3.0 ./install-dev.sh\n"
  },
  {
    "path": "benchmark/comparison/python.py",
    "content": "import os\nimport math\nfrom time import time\nfrom random import random\nfrom operator import add\nfrom pyspark import SparkContext\n\nsc = SparkContext(appName=\"Python\", master=\"local[*]\")\n\nlog_file = open(os.environ.get('PYTHON_LOG'), 'w')\n\ndef log(*values):\n  values = map(lambda x: str(x), values)\n  log_file.write(';'.join(values))\n  log_file.write('\\n')\n\nworkers = int(os.environ.get('WORKERS'))\nnumbers_count = int(os.environ.get('NUMBERS_COUNT'))\ntext_file = os.environ.get('TEXT_FILE')\n\nnumbers = range(numbers_count)\nfloats = [float(i) for i in numbers]\nwith open(text_file) as t:\n  strings = t.read().split(\"\\n\")\n\n\n# =============================================================================\n# Serialization\n# =============================================================================\n\nt = time()\nrdd_numbers = sc.parallelize(numbers, workers)\nt = time() - t\nlog('NumbersSerialization', t)\n\n\nt = time()\nrdd_floats = sc.parallelize(floats, workers)\nt = time() - t\nlog('FloatsSerialization', t)\n\n\nt = time()\nrdd_strings = sc.parallelize(strings, workers)\nt = time() - t\nlog('StringsSerialization', t)\n\n\n# =============================================================================\n# Computing\n# =============================================================================\n\n\n# --- Is prime? ---------------------------------------------------------------\n\ndef is_prime(x):\n  if x < 2:\n    return [x, False]\n  elif x == 2:\n    return [x, True]\n  elif x % 2 == 0:\n    return [x, False]\n  else:\n    upper = int(math.sqrt(float(x)))\n    result = True\n\n    i = 3\n    while i <= upper:\n      if x % i == 0:\n        result = False\n        break\n\n      i += 2\n\n    return [x, result]\n\nt = time()\nrdd_numbers.map(is_prime).collect()\nt = time() - t\n\nlog('IsPrime', t)\n\n\n# --- Matrix multiplication ---------------------------------------------------\n\nmatrix_size = int(os.environ.get('MATRIX_SIZE'))\n\nmatrix = []\nfor row in range(matrix_size):\n  matrix.append([])\n  for col in range(matrix_size):\n    matrix[row].append(row+col)\n\ndef multiplication_func(matrix):\n  matrix = list(matrix)\n  size = len(matrix)\n\n  new_matrix = []\n  for row in range(size):\n    new_matrix.append([])\n    for col in range(size):\n\n      result = 0\n      for i in range(size):\n        result += matrix[row][i] * matrix[col][i]\n      new_matrix[row].append(result)\n\n  return new_matrix\n\nt = time()\nrdd = sc.parallelize(matrix, 1)\nrdd.mapPartitions(multiplication_func).collect()\nt = time() - t\n\nlog('MatrixMultiplication', t)\n\n\n# --- Pi digits ---------------------------------------------------------------\n# http://rosettacode.org/wiki/Pi#Python\n\npi_digit = int(os.environ.get('PI_DIGIT'))\n\ndef pi_func(size):\n  size = size.next()\n  result = ''\n\n  q, r, t, k, n, l = 1, 0, 1, 1, 3, 3\n  while size > 0:\n    if 4*q+r-t < n*t:\n      result += str(n)\n      size -= 1\n      nr = 10*(r-n*t)\n      n  = ((10*(3*q+r))//t)-10*n\n      q  *= 10\n      r  = nr\n    else:\n      nr = (2*q+r)*l\n      nn = (q*(7*k)+2+(r*l))//(t*l)\n      q  *= k\n      t  *= l\n      l  += 2\n      k += 1\n      n  = nn\n      r  = nr\n\n  return [result]\n\nt = time()\nrdd = sc.parallelize([pi_digit], 1)\nrdd.mapPartitions(pi_func).collect()\nt = time() - t\n\nlog('PiDigit', t)\n\n\nlog_file.close()\n"
  },
  {
    "path": "benchmark/comparison/r.r",
    "content": "library(SparkR)\nsc <- sparkR.init(master=\"local[*]\")\n\nlogFile <- file(Sys.getenv(\"R_LOG\"), \"w\")\n\nlogInfo <- function(...){\n  args <- list(...)\n  line <- paste(args, collapse = \";\")\n  writeLines(line, logFile)\n}\n\nworkers <- as.integer(Sys.getenv('WORKERS'))\nnumbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT'))\ntextFile <- Sys.getenv('TEXT_FILE')\n\n\n# =============================================================================\n# Serialization\n# =============================================================================\n\ntime <- proc.time()\nrddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers)\ntime <- as.double(proc.time()-time)[3]\n\nlogInfo('NumbersSerialization', time)\n\n\n# =============================================================================\n# Computing\n# =============================================================================\n\nisPrime = function(x) {\n  if(x < 2){\n    c(x, FALSE)\n  }\n  else if(x == 2){\n    c(x, TRUE)\n  }\n  else if(x %% 2 == 0){\n    c(x, FALSE)\n  }\n  else{\n    upper <- as.numeric(sqrt(as.double(x)))\n    result <- TRUE\n\n    i <- 3\n    while(i <= upper){\n      if(x %% i == 0){\n        result = FALSE\n        break\n      }\n\n      i <- i+2\n    }\n\n    c(x, result)\n  }\n}\n\ntime <- proc.time()\nrdd <- map(rddNumbers, isPrime)\ncapture.output(collect(rdd), file='/dev/null')\ntime <- as.double(proc.time()-time)[3]\n\nlogInfo('IsPrime', time)\n\n\nclose(logFile)\nsparkR.stop()\n"
  },
  {
    "path": "benchmark/comparison/ruby.rb",
    "content": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../../lib')\n$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)\n\nrequire 'ruby-spark'\nrequire 'benchmark'\n\nSpark.start\nsc = Spark.context\n\n$log_file = File.open(ENV['RUBY_LOG'], 'w')\n\ndef log(*values)\n  $log_file.puts(values.join(';'))\nend\n\nworkers = ENV['WORKERS'].to_i\nnumbers_count = ENV['NUMBERS_COUNT'].to_i\ntext_file = ENV['TEXT_FILE']\n\nnumbers = (0...numbers_count).to_a\nfloats = numbers.map(&:to_f)\nstrings = File.read(text_file).split(\"\\n\")\n\n\n# =============================================================================\n# Serialization\n# =============================================================================\n\ntime = Benchmark.realtime do\n  @rdd_numbers = sc.parallelize(numbers, workers)\nend\n\nlog('NumbersSerialization', time)\n\n\ntime = Benchmark.realtime do\n  @rdd_floats = sc.parallelize(floats, workers)\nend\n\nlog('FloatsSerialization', time)\n\n\ntime = Benchmark.realtime do\n  @rdd_strings = sc.parallelize(strings, workers)\nend\n\nlog('StringsSerialization', time)\n\n\n# =============================================================================\n# Computing\n# =============================================================================\n\n\n# --- Is prime? ---------------------------------------------------------------\n\nis_prime = Proc.new do |x|\n  case\n  when x < 2\n    [x, false]\n  when x == 2\n    [x, true]\n  when x % 2 == 0\n    [x, false]\n  else\n    upper = Math.sqrt(x.to_f).to_i\n    result = true\n\n    i = 3\n    while i <= upper\n      if x % i == 0\n        result = false\n        break\n      end\n\n      i += 2\n    end\n\n    [x, result]\n  end\nend\n\ntime = Benchmark.realtime do\n  @rdd_numbers.map(is_prime).collect\nend\n\nlog('IsPrime', time)\n\n\n# --- Matrix multiplication ---------------------------------------------------\n\nmatrix_size = ENV['MATRIX_SIZE'].to_i\n\nmatrix = Array.new(matrix_size) do |row|\n  Array.new(matrix_size) do |col|\n    row+col\n  end\nend;\n\nmultiplication_func = Proc.new do |matrix|\n  size = matrix.size\n\n  Array.new(size) do |row|\n    Array.new(size) do |col|\n      matrix[row]\n\n      result = 0\n      size.times do |i|\n        result += matrix[row][i] * matrix[col][i]\n      end\n      result\n    end\n  end\nend\n\ntime = Benchmark.realtime do\n  rdd = sc.parallelize(matrix, 1)\n  rdd.map_partitions(multiplication_func).collect\nend\n\nlog('MatrixMultiplication', time)\n\n\n# --- Pi digits ---------------------------------------------------------------\n# http://rosettacode.org/wiki/Pi#Ruby\n\npi_digit = ENV['PI_DIGIT'].to_i\n\npi_func = Proc.new do |size|\n  size = size.first\n  result = ''\n\n  q, r, t, k, n, l = 1, 0, 1, 1, 3, 3\n  while size > 0\n    if 4*q+r-t < n*t\n      result << n.to_s\n      size -= 1\n      nr = 10*(r-n*t)\n      n = ((10*(3*q+r)) / t) - 10*n\n      q *= 10\n      r = nr\n    else\n      nr = (2*q+r) * l\n      nn = (q*(7*k+2)+r*l) / (t*l)\n      q *= k\n      t *= l\n      l += 2\n      k += 1\n      n = nn\n      r = nr\n    end\n  end\n\n  [result]\nend\n\ntime = Benchmark.realtime do\n  rdd = sc.parallelize([pi_digit], 1)\n  rdd.map_partitions(pi_func).collect\nend\n\nlog('PiDigit', time)\n\n\n$log_file.close\n"
  },
  {
    "path": "benchmark/comparison/run-all.sh",
    "content": "#!/usr/bin/env bash\n\n# Current dir\ncd \"$(dirname \"$0\")\"\n\n# Exit immediately if a pipeline returns a non-zero status.\nset -e\n\n# Settings\nexport WORKERS=2\nexport MATRIX_SIZE=100\nexport NUMBERS_COUNT=1000000\nexport TEXT_FILE=$(mktemp)\nexport PI_DIGIT=1000\nexport RUBY_BATCH_SIZE=2048\n\ntext_file_rows=10\ntext_file_per_line=10\ntext_file_duplicates=50\n\nmx=\"4096m\"\nms=\"4096m\"\n\n\n# Parse arguments\nwhile (( \"$#\" )); do\n  case $1 in\n    --workers)\n      WORKERS=\"$2\"\n      shift\n      ;;\n    --matrix-size)\n      MATRIX_SIZE=\"$2\"\n      shift\n      ;;\n    --numbers-count)\n      NUMBERS_COUNT=\"$2\"\n      shift\n      ;;\n    --random-file-rows)\n      text_file_rows=\"$2\"\n      shift\n      ;;\n    --text-file-per-line)\n      text_file_per_line=\"$2\"\n      shift\n      ;;\n    --text-file-duplicates)\n      text_file_duplicates=\"$2\"\n      shift\n      ;;\n    --pi-digit)\n      PI_DIGIT=\"$2\"\n      shift\n      ;;\n    --ruby-batch-size)\n      RUBY_BATCH_SIZE=\"$2\"\n      shift\n      ;;\n    --mx)\n      mx=\"$2\"\n      shift\n      ;;\n    --ms)\n      ms=\"$2\"\n      shift\n      ;;\n    *)\n      break\n      ;;\n  esac\n  shift\ndone\n\n\n# Generating\nfile=$(mktemp)\n\nfor (( i=0; i<$text_file_rows; i++ ))\ndo\n  shuf -n $text_file_per_line /usr/share/dict/words | tr '\\n' ' ' >> $file\n  echo >> $file\ndone\n\nfor (( i=0; i<$text_file_duplicates; i++ ))\ndo\n  cat $file >> $TEXT_FILE\ndone\n\n\n# Before run\nif [[ -z \"$SPARK_HOME\" ]]; then\n  export SPARK_HOME=$(pwd)/spark\nfi\n\nif [[ -z \"$RSPARK_HOME\" ]]; then\n  export RSPARK_HOME=$(pwd)/rspark\nfi\n\nexport SPARK_RUBY_BATCH_SIZE=\"$RUBY_BATCH_SIZE\"\nSPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null)\n\nexport _JAVA_OPTIONS=\"$_JAVA_OPTIONS -Xms$ms -Xmx$mx\"\n\n\n# Log files\nexport RUBY_MARSHAL_LOG=$(mktemp)\nexport RUBY_OJ_LOG=$(mktemp)\nexport PYTHON_LOG=$(mktemp)\nexport SCALA_LOG=$(mktemp)\nexport R_LOG=$(mktemp)\n\n\n# Run:\necho \"Workers: $WORKERS\"\necho \"Matrix size: $MATRIX_SIZE\"\necho \"Numbers count: $NUMBERS_COUNT\"\necho \"Pi digits: $PI_DIGIT\"\necho \"File: rows = $(($text_file_rows * $text_file_duplicates))\"\necho \"      per line = $text_file_per_line\"\n\n# --- Ruby\nexport SPARK_RUBY_SERIALIZER='marshal'\nexport RUBY_LOG=\"$RUBY_MARSHAL_LOG\"\n/usr/bin/env ruby ruby.rb &>/dev/null\n\nexport SPARK_RUBY_SERIALIZER='oj'\nexport RUBY_LOG=\"$RUBY_OJ_LOG\"\n/usr/bin/env ruby ruby.rb &>/dev/null\n\n# # --- Python\n\"$SPARK_HOME\"/bin/spark-submit --master \"local[*]\" $(pwd)/python.py &>/dev/null\n\n# # --- Scala\n/usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null\n\"$SPARK_HOME\"/bin/spark-submit --master \"local[*]\" $(pwd)/scala.jar &>/dev/null\n\n# --- R\n# \"$RSPARK_HOME\"/sparkR r.r #&>/dev/null\n\n\n# Parse results\necho \"# Ruby (Marshal)\"\ncat $RUBY_MARSHAL_LOG\necho \"\"\n\necho \"# Ruby (Oj)\"\ncat $RUBY_OJ_LOG\necho \"\"\n\necho \"# Python\"\ncat $PYTHON_LOG\necho \"\"\n\necho \"# Scala\"\ncat $SCALA_LOG\necho \"\"\n\necho \"# R\"\ncat $R_LOG\n"
  },
  {
    "path": "benchmark/comparison/scala.scala",
    "content": "import java.io._\nimport scala.math\nimport scala.io.Source\nimport org.apache.spark._\n\nobject Scala {\n\n  val logFile = new PrintWriter(new File(System.getenv(\"SCALA_LOG\")))\n\n  def log(args: Any*) {\n    logFile.write(args.mkString(\";\"))\n    logFile.write(\"\\n\")\n  }\n\n  def main(args: Array[String]) {\n    val conf = new SparkConf().setAppName(\"Scala\")\n    val sc = new SparkContext(conf)\n\n    val workers = System.getenv(\"WORKERS\").toInt\n    val numbersCount = System.getenv(\"NUMBERS_COUNT\").toInt\n    val textFile = System.getenv(\"TEXT_FILE\")\n\n    val numbers = 0 until numbersCount\n    val floats = numbers.map(_.toDouble)\n    val strings = Source.fromFile(textFile).mkString.split(\"\\n\")\n\n\n    // =============================================================================\n    // Serialization\n    // =============================================================================\n\n    var time: Long = 0\n\n    time = System.currentTimeMillis\n    val rddNumbers = sc.parallelize(numbers, workers)\n    time = System.currentTimeMillis - time\n\n    log(\"NumbersSerialization\", time/1000.0)\n\n\n    time = System.currentTimeMillis\n    val rddFloats = sc.parallelize(floats, workers)\n    time = System.currentTimeMillis - time\n\n    log(\"FloatsSerialization\", time/1000.0)\n\n\n    time = System.currentTimeMillis\n    val rddStrings = sc.parallelize(strings, workers)\n    time = System.currentTimeMillis - time\n\n    log(\"StringsSerialization\", time/1000.0)\n\n\n    // =============================================================================\n    // Computing\n    // =============================================================================\n\n    // --- Is prime? ---------------------------------------------------------------\n\n    time = System.currentTimeMillis\n    val primes = rddNumbers.map{ x =>\n      if(x < 2){\n        (x, false)\n      }\n      else if(x == 2){\n        (x, true)\n      }\n      else if(x % 2 == 0){\n        (x, false)\n      }\n      else{\n        val upper = math.sqrt(x.toDouble).toInt\n        var result = true\n\n        var i = 3\n        while(i <= upper && result == true){\n          if(x % i == 0){\n            result = false\n          }\n          else{\n            i += 2\n          }\n        }\n\n        (x, result)\n      }\n    }\n    primes.collect()\n    time = System.currentTimeMillis - time\n\n    log(\"IsPrime\", time/1000.0)\n\n\n    // --- Matrix multiplication ---------------------------------------------------\n\n    val matrixSize = System.getenv(\"MATRIX_SIZE\").toInt\n\n    val matrix = new Array[Array[Long]](matrixSize)\n\n    for( row <- 0 until matrixSize ) {\n      matrix(row) = new Array[Long](matrixSize)\n      for( col <- 0 until matrixSize ) {\n        matrix(row)(col) = row + col\n      }\n    }\n\n    time = System.currentTimeMillis\n    val rdd = sc.parallelize(matrix, 1)\n    rdd.mapPartitions { it =>\n      val matrix = it.toArray\n      val size = matrix.size\n\n      val newMatrix = new Array[Array[Long]](size)\n\n      for( row <- 0 until size ) {\n        newMatrix(row) = new Array[Long](size)\n        for( col <- 0 until size ) {\n\n          var result: Long = 0\n          for( i <- 0 until size ) {\n            result += matrix(row)(i) * matrix(col)(i)\n          }\n          newMatrix(row)(col) = result\n        }\n      }\n\n      newMatrix.toIterator\n    }\n    time = System.currentTimeMillis - time\n\n    log(\"MatrixMultiplication\", 
time/1000.0)\n\n\n    // --- Pi digits ---------------------------------------------------------------\n    // http://rosettacode.org/wiki/Pi#Scala\n\n    val piDigit = System.getenv(\"PI_DIGIT\").toInt\n\n    time = System.currentTimeMillis\n    val piDigits = sc.parallelize(Array(piDigit), 1)\n    piDigits.mapPartitions { it =>\n      var size = it.toArray.asInstanceOf[Array[Int]](0)\n      var result = \"\"\n\n      var r: BigInt = 0\n      var q, t, k: BigInt = 1\n      var n, l: BigInt = 3\n      var nr, nn: BigInt = 0\n\n      while(size > 0){\n        while((4*q+r-t) >= (n*t)){\n          nr = (2*q+r)*l\n          nn = (q*(7*k)+2+(r*l))/(t*l)\n          q = q * k\n          t = t * l\n          l = l + 2\n          k = k + 1\n          n  = nn\n          r  = nr\n        }\n\n        result += n.toString\n        size -= 1\n        nr = 10*(r-n*t)\n        n  = ((10*(3*q+r))/t)-(10*n)\n        q  = q * 10\n        r  = nr\n      }\n\n      Iterator(result)\n    }\n    time = System.currentTimeMillis - time\n\n    log(\"PiDigit\", time/1000.0)\n\n\n    sc.stop()\n    logFile.close()\n  }\n}\n"
  },
  {
    "path": "benchmark/custom_marshal.rb",
    "content": "require 'benchmark'\nrequire 'benchmark/ips'\n\ndef pack_int(data)\n  [data].pack('l>')\nend\n\ndef pack_long(data)\n  [data].pack('q>')\nend\n\ndef pack_doubles(data)\n  data.pack('G*')\nend\n\nmodule Standard\n  class LabeledPoint\n    def initialize(label, features)\n      @label = label\n      @features = Standard::Vector.new(features)\n    end\n\n    def marshal_dump\n      [@label, @features]\n    end\n\n    def marshal_load(*)\n    end\n  end\n\n  class Vector\n    def initialize(array)\n      @values = array\n    end\n\n    def marshal_dump\n      [@values]\n    end\n\n    def marshal_load(*)\n    end\n  end\nend\n\nmodule Custom\n  class LabeledPoint\n    def initialize(label, features)\n      @label = label\n      @features = Custom::Vector.new(features)\n    end\n\n    def _dump(*)\n      pack_long(@label) + @features._dump\n    end\n\n    def self._load(*)\n    end\n  end\n\n  class Vector\n    def initialize(array)\n      @values = array\n    end\n\n    def _dump(*)\n      result = 'v'\n      result << pack_int(@values.size)\n      result << pack_doubles(@values)\n      result.encode(Encoding::ASCII_8BIT)\n    end\n\n    def self._load(*)\n    end\n  end\nend\n\ndata_size = 10_000\nvector_size = 1_000\nvalues = Array.new(vector_size) { |x| rand(10_000..100_000) }\n\n@data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)}\n@data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)}\n\nBenchmark.ips do |r|\n  r.report('standard') do\n    Marshal.dump(@data1)\n  end\n\n  r.report('custom') do\n    Marshal.dump(@data2)\n  end\n\n  r.compare!\nend\n"
  },
  {
    "path": "benchmark/digest.rb",
    "content": "lib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)\n\ndef java?\n  RUBY_PLATFORM =~ /java/\nend\n\nunless java?\n  require 'murmurhash3'\nend\n\nrequire 'digest'\nrequire 'benchmark'\nrequire 'ruby-spark'\n\nTEST = 5_000_000\nWORDS = [\"wefwefwef\", \"rgwefiwefwe\", \"a\", \"rujfwgrethrzjrhgawf\", \"irncrnuggo\"]\n\nputs \"TEST COUNT = #{TEST*WORDS.size}\"\n\n# =================================================================================================\n# Pure ruby mumrumur\n# funny-falcon/murmurhash3-ruby\n\nMASK32 = 0xffffffff\n\ndef murmur3_32_rotl(x, r)\n  ((x << r) | (x >> (32 - r))) & MASK32\nend\n\ndef murmur3_32_fmix(h)\n  h &= MASK32\n  h ^= h >> 16\n  h = (h * 0x85ebca6b) & MASK32\n  h ^= h >> 13\n  h = (h * 0xc2b2ae35) & MASK32\n  h ^ (h >> 16)\nend\n\ndef murmur3_32__mmix(k1)\n  k1 = (k1 * 0xcc9e2d51) & MASK32\n  k1 = murmur3_32_rotl(k1, 15)\n  (k1 * 0x1b873593) & MASK32\nend\n\ndef murmur3_32_str_hash(str, seed=0)\n  h1 = seed\n  numbers = str.unpack('V*C*')\n  tailn = str.bytesize % 4\n  tail = numbers.slice!(numbers.size - tailn, tailn)\n  for k1 in numbers\n    h1 ^= murmur3_32__mmix(k1)\n    h1 = murmur3_32_rotl(h1, 13)\n    h1 = (h1*5 + 0xe6546b64) & MASK32\n  end\n\n  unless tail.empty?\n    k1 = 0\n    tail.reverse_each do |c1|\n      k1 = (k1 << 8) | c1\n    end\n    h1 ^= murmur3_32__mmix(k1)\n  end\n\n  h1 ^= str.bytesize\n  murmur3_32_fmix(h1)\nend\n\n\n# =================================================================================================\n# Benchmark\n\nBenchmark.bm(18) do |x|\n\n  x.report(\"ruby hash\"){\n    TEST.times{\n      WORDS.each{ |word|\n        word.hash\n      }\n    }    \n  }\n\n  x.report(\"ext portable\"){\n    TEST.times{\n      WORDS.each{ |word|\n        Spark::Digest.portable_hash(word)\n      }\n    }    \n  }\n\n  x.report(\"murmur3 32\"){\n    TEST.times{\n      WORDS.each{ |word|\n        # MurmurHash3::V128.str_hash(word)\n        # [MurmurHash3::V128.str_hash(word).join.to_i].pack(\"q>\")\n        # MurmurHash3::V128.str_hash(word)\n        # a = MurmurHash3::V32.str_hash(word).to_s\n        # a.slice!(0,8)\n\n        MurmurHash3::V32.str_hash(word)\n      }\n    }    \n  } unless java?\n\n  # Too slow\n  # x.report(\"murmur3 32 (ruby)\"){\n  #   TEST.times{\n  #     WORDS.each{ |word|\n  #       # MurmurHash3::V128.str_hash(word)\n  #       # [MurmurHash3::V128.str_hash(word).join.to_i].pack(\"q>\")\n  #       # MurmurHash3::V128.str_hash(word)\n  #       # a = murmur3_32_str_hash(word).to_s\n  #       # a.slice!(0,8)\n\n  #       murmur3_32_str_hash(word)\n  #     }\n  #   }    \n  # }\n\n  x.report(\"murmur3 128\"){\n    TEST.times{\n      WORDS.each{ |word|\n        # MurmurHash3::V128.str_hash(word)\n        # [MurmurHash3::V128.str_hash(word).join.to_i].pack(\"q>\")\n        # a = MurmurHash3::V128.str_hash(word).to_s\n        # a.slice!(0,8)\n\n        MurmurHash3::V128.str_hash(word)\n      }\n    }    \n  } unless java?\n\n  # x.report(\"sha256\"){\n  #   TEST.times{\n  #     WORDS.each{ |word|\n  #       a = Digest::SHA256.digest(word)\n  #       # a.slice!(0,8)\n  #     }\n  #   }    \n  # }\n\n  # x.report(\"md5\"){\n  #   TEST.times{\n  #     WORDS.each{ |word|\n  #       a = Digest::MD5.digest(word)\n  #       # a.slice!(0,8)\n  #     }\n  #   }    \n  # }\nend\n"
  },
  {
    "path": "benchmark/enumerator.rb",
    "content": "require \"benchmark\"\n\nclass Enumerator\n  def defer(&blk)\n    self.class.new do |y|\n      each do |*input|\n        blk.call(y, *input)\n      end\n    end\n  end\nend\n\nARRAY_SIZE = 50_000_000\n\ndef type_yield\n  return to_enum(__callee__) unless block_given?\n\n  ARRAY_SIZE.times { |i|\n    yield i\n  }\nend\n\ndef yield_map_x2(enum)\n  return to_enum(__callee__, enum) unless block_given?\n  \n  enum.each do |item|\n    yield item*2\n  end\nend\n\ndef type_enumerator_new\n  Enumerator.new do |e|\n    ARRAY_SIZE.times { |i|\n      e << i\n    }\n  end\nend\n\ndef enumerator_new_map_x2(enum)\n  Enumerator.new do |e|\n    enum.each do |item|\n      e << item*2\n    end\n  end\nend\n\ndef enumerator_defer_x2(enum)\n  enum.defer do |out, inp|\n    out << inp*2\n  end\nend\n\nBenchmark.bm(26) do |x|\n  x.report(\"yield max\") do\n    type_yield.max\n  end\n\n  x.report(\"yield sum\") do\n    type_yield.reduce(:+)\n  end\n\n  x.report(\"yield map x*2 sum\") do\n    yield_map_x2(type_yield).reduce(:+)\n  end\n\n  x.report(\"yield defer map x*2 sum\") do\n    enumerator_defer_x2(type_yield).reduce(:+)\n  end\n\n  x.report(\"-----\"){}\n\n  x.report(\"Enum.new max\") do\n    type_enumerator_new.max\n  end\n\n  x.report(\"Enum.new sum\") do\n    type_enumerator_new.reduce(:+)\n  end\n\n  x.report(\"Enum.new map x*2 sum\") do\n    enumerator_new_map_x2(type_enumerator_new).reduce(:+)\n  end\n\n  x.report(\"Enum.new defer map x*2 sum\") do\n    enumerator_defer_x2(type_enumerator_new).reduce(:+)\n  end\n\nend\n"
  },
  {
    "path": "benchmark/serializer.rb",
    "content": "require \"benchmark\"\nrequire \"yaml\"\nrequire \"msgpack\"\nrequire \"oj\"\n# require \"thrift\"\n \nputs \"Simple\"\n\ndata = (0..100000).to_a\n\nBenchmark.bmbm do |x|\n  x.report(\"YAML\") do\n    serialized = YAML.dump(data)\n    deserialized = YAML.load(serialized)\n    puts \"Size: #{serialized.size}, Equal: #{deserialized == data}\"\n  end\n\n  x.report(\"Marshal\") do\n    serialized = Marshal.dump(data)\n    deserialized = Marshal.load(serialized)\n    puts \"Size: #{serialized.size}, Equal: #{deserialized == data}\"\n  end\n\n  x.report(\"MessagePack\") do\n    serialized = MessagePack.dump(data)\n    deserialized = MessagePack.load(serialized)\n    puts \"Size: #{serialized.size}, Equal: #{deserialized == data}\"\n  end\n\n  x.report(\"Oj\") do\n    serialized = Oj.dump(data)\n    deserialized = Oj.load(serialized)\n    puts \"Size: #{serialized.size}, Equal: #{deserialized == data}\"\n  end\n\n  # x.report(\"Thrift\") do\n  #   serializer = Thrift::Serializer.new\n  #   deserializer = Thrift::Deserializer.new\n\n  #   serialized = serializer.serialize(data)\n  # end\nend\n\nputs \"\"\nputs \"More complex\"\n\ndata = Array.new(10000000) { \n  [rand(97..122).chr, rand(10000000)]\n}\n\nBenchmark.bm do |x|\n  # Take too long\n  # x.report(\"YAML\") do\n  #   serialized = YAML.dump(data)\n  #   YAML.load(serialized)\n  # end\n\n  x.report(\"Marshal\") do\n    serialized = Marshal.dump(data)\n    deserialized = Marshal.load(serialized)\n    puts \" Size: #{serialized.size}, Equal: #{deserialized == data}\"\n  end\n\n  x.report(\"MessagePack\") do\n    serialized = MessagePack.dump(data)\n    deserialized = MessagePack.load(serialized)\n    puts \" Size: #{serialized.size}, Equal: #{deserialized == data}\"\n  end\n\n  x.report(\"Oj\") do\n    serialized = Oj.dump(data)\n    deserialized = Oj.load(serialized)\n    puts \" Size: #{serialized.size}, Equal: #{deserialized == data}\"\n  end\n\n  # x.report(\"Thrift\") do\n  #   serializer = Thrift::Serializer.new\n  #   deserializer = Thrift::Deserializer.new\n\n  #   serialized = serializer.serialize(data)\n  # end\nend\n"
  },
  {
    "path": "benchmark/sort.rb",
    "content": "require \"benchmark\"\n\narray = []\n1000.times { \n  array << {:bar => rand(1000)} \n}\n\nn = 500\nBenchmark.bm(20) do |x|\n  x.report(\"sort\")               { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } }\n  x.report(\"sort reverse\")       { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } }\n  x.report(\"sort_by -a[:bar]\")   { n.times { array.sort_by{ |a| -a[:bar] } } }\n  x.report(\"sort_by a[:bar]*-1\") { n.times { array.sort_by{ |a| a[:bar]*-1 } } }\n  x.report(\"sort_by.reverse!\")   { n.times { array.sort_by{ |a| a[:bar] }.reverse } }\nend\n\n\narray = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }\n\nBenchmark.bm(20) do |x|\n  x.report(\"sort asc\")         { n.times { array.sort } }\n  x.report(\"sort asc block\")   { n.times { array.sort{|a,b| a <=> b} } }\n  x.report(\"sort desc\")        { n.times { array.sort{|a,b| b <=> a} } }\n  x.report(\"sort asc reverse\") { n.times { array.sort.reverse } }\nend\n\n\nkey_value = Struct.new(:key, :value) do\n  def <=>(other)\n    key <=> other.key\n  end\nend\n\ncount = 10000\nitem_range = 1000000\narray1 = Array.new(count) { [rand(item_range), rand(item_range)] }\narray2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) }\n\nBenchmark.bm(20) do |x|\n  x.report(\"sort_by\")       { n.times { array1.sort_by {|a| a[0]} } }\n  x.report(\"sort struct\")   { n.times { array2.sort } }\nend\n\n"
  },
  {
    "path": "benchmark/sort2.rb",
    "content": "require \"benchmark\"\nrequire \"algorithms\"\n\nNUMBER_OF_SORTING = 1\nNUMBER_OF_ARRAY   = 10\nWORDS_IN_ARRAY    = 100000\nMAX_WORD_SIZE     = 10\nEVAL_N_VALUES     = 10\n\nputs \"NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}\"\nputs \"NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}\"\nputs \"WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}\"\nputs \"MAX_WORD_SIZE: #{MAX_WORD_SIZE}\"\nputs \"EVAL_N_VALUES: #{EVAL_N_VALUES}\"\n\ndef words\n  Array.new(WORDS_IN_ARRAY) { word }\nend\n\ndef word\n  Array.new(rand(1..MAX_WORD_SIZE)){(97+rand(26)).chr}.join\nend\n\n@array = Array.new(NUMBER_OF_ARRAY) { words.sort }\n\n\n# =================================================================================================\n# Sort1\n\n# Vrátí nový (nevyhodnocený) enumerator\ndef sort1(data)\n  return to_enum(__callee__, data) unless block_given?\n\n  heap = []\n\n  # Inicializuji heap s prvními položkami\n  # připojím samotné enumeratory pro volání .next\n  data.each do |a|\n    heap << [a.next, a]\n  end\n\n  while data.any?\n    begin\n      # Seřadím pole podle hodnot\n      heap.sort_by!{|(item,_)| item}\n      # Uložím si hodnotu a enumerator\n      item, enum = heap.shift\n      # Hodnota půjde do výsledku\n      yield item\n      # Místo odstraněné položky nahradí další ze stejného seznamu\n      heap << [enum.next, enum]\n    rescue StopIteration\n      # Enumerator je prázdný\n      data.delete(enum)\n    end\n  end\nend\n\n\n# =================================================================================================\n# Sort1_2\n\n# Vrátí nový (nevyhodnocený) enumerator\ndef sort1_2(data)\n  return to_enum(__callee__, data) unless block_given?\n\n  heap = []\n  enums = []\n\n  # Inicializuji heap s prvními položkami\n  # připojím samotné enumeratory pro volání .next\n  data.each do |a|\n    EVAL_N_VALUES.times {\n      begin\n        heap << [a.next, a]\n      rescue StopIteration\n      end\n    }\n  end\n\n  while data.any? 
|| heap.any?\n      # Seřadím pole podle hodnot\n      heap.sort_by!{|(item,_)| item}\n\n      # Minimálně můžu vzít EVAL_N_VALUES\n      EVAL_N_VALUES.times {\n        break if heap.empty?\n\n        # Uložím si hodnotu a enumerator\n        item, enum = heap.shift\n        # Hodnota půjde do výsledku\n        yield item\n\n        enums << enum\n      }\n\n    while (enum = enums.shift)\n      begin\n        heap << [enum.next, enum]\n      rescue StopIteration\n        data.delete(enum)\n        enums.delete(enum)\n      end\n    end\n\n  end\nend\n\n\n# =================================================================================================\n# Sort 2\n\ndef sort2(data)\n  return to_enum(__callee__, data) unless block_given?\n\n  heap = Containers::Heap.new\n\n  data.each do |enum|\n    item = enum.next\n    heap.push(item, [item, enum])\n  end\n\n  while data.any?\n    begin\n      item, enum = heap.pop\n      yield item\n\n      item = enum.next\n      heap.push(item, [item, enum])\n    rescue StopIteration\n      data.delete(enum)\n    end\n  end\nend\n\n\n# =================================================================================================\n# Benchmark\n\nBenchmark.bm(10) do |x|\n  x.report(\"sort\") do\n    NUMBER_OF_SORTING.times {\n      @result = @array.flatten.sort\n    }\n  end\n\n  x.report(\"sort 1\") do\n    NUMBER_OF_SORTING.times { \n      raise \"Bad sorting\" if @result != sort1(@array.map(&:each)).to_a\n    }\n  end\n\n  x.report(\"sort 1_2\") do\n    NUMBER_OF_SORTING.times { \n      raise \"Bad sorting\" if @result != sort1_2(@array.map(&:each)).to_a\n    }\n  end\n\n  # x.report(\"sort 2\") do\n  #   NUMBER_OF_SORTING.times {\n  #     raise \"Bad sorting\" if @result != sort2(@array.map(&:each)).to_a\n  #   }\n  # end\nend\n"
  },
  {
    "path": "benchmark/take.rb",
    "content": "require \"benchmark\"\n\nSIZE = 100_000_000\n\n@array1 = (0..SIZE).to_a;\n@array2 = (0..SIZE).to_a;\n@array3 = (0..SIZE).to_a;\n\nTAKE = 100_000\n\nBenchmark.bm(15) do |x|\n  # Fastest\n  x.report(\"take\"){\n    a=@array1.take(TAKE)\n  }\n\n  # Slowest and take most memory\n  x.report(\"reverse drop\"){\n    @array2.reverse!\n    @array2.drop(@array2.size - TAKE)\n    @array2.reverse!\n  }\n\n  # Least memory\n  x.report(\"splice\"){\n    a=@array2.slice!(0, TAKE)\n  }\nend\n"
  },
  {
    "path": "bin/ruby-spark",
    "content": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)\n\nrequire 'ruby-spark'\n\nSpark::CLI.new.run\n"
  },
  {
    "path": "example/pi.rb",
    "content": "#!/usr/bin/env ruby\n\nlib = File.expand_path(File.dirname(__FILE__) + '/../lib')\n$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)\n\nrequire 'ruby-spark'\n\nSpark.logger.disable\nSpark.start\n\nslices = 3\nn = 100000 * slices\n\ndef map(_)\n  x = rand * 2 - 1\n  y = rand * 2 - 1\n\n  if x**2 + y**2 < 1\n    return 1\n  else\n    return 0\n  end\nend\n\nrdd = Spark.context.parallelize(1..n, slices)\nrdd = rdd.map(method(:map))\n\nputs 'Pi is roughly %f' % (4.0 * rdd.sum / n)\n"
  },
  {
    "path": "example/website_search.rb",
    "content": "#!/usr/bin/env ruby\n\n# Parse sitemap and search word on every page\n\nrequire 'optparse'\nrequire 'open-uri'\nrequire 'nokogiri'\nrequire 'ruby-spark'\n\noptions = {\n  sitemap: 'http://fit.cvut.cz/sitemap.xml',\n  query: 'cvut',\n  workers: 2\n}\n\nopt_parser = OptionParser.new do |opts|\n  opts.banner = 'Usage: website_search.rb [options]'\n\n  opts.separator ''\n  opts.separator 'Specific options:'\n\n  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|\n    options[:sitemap] = sitemap\n  end\n\n  opts.on('-q', '--query QUERY', 'Query to search') do |query|\n    options[:query] = query\n  end\n\n  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|\n    options[:workers] = workers\n  end\n\n  opts.on('--quite', 'Run quitely') do |v|\n    Spark.logger.disabled\n  end\n\n  opts.on_tail('-h', '--help', 'Show this message') do\n    puts opts\n    exit\n  end\nend\n\nopt_parser.parse!\n\n@links = []\n\ndef parse_sitemap(doc)\n  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|\n    next_doc = Nokogiri::HTML(open(loc.text))\n    parse_sitemap(next_doc)\n  end\n\n  doc.xpath('//url/loc').each do |loc|\n    @links << loc.text\n  end\nend\n\ndoc = Nokogiri::HTML(open(options[:sitemap]))\nparse_sitemap(doc)\n\n# Map function\nfunc = Proc.new do |url|\n  begin\n    open(url) {|f|\n      [url, f.read.scan(query).count]\n    }\n  rescue\n    [url, 0]\n  end\nend\n\nSpark.start\n\nrdd = Spark.sc.parallelize(@links, options[:workers])\n              .add_library('open-uri')\n              .bind(query: options[:query])\n              .map(func)\n              .sort_by(lambda{|(_, value)| value}, false)\n\nrdd.collect.each do |(url, count)|\n  puts \"#{url} => #{count}\"\nend\n"
  },
  {
    "path": "ext/ruby_c/extconf.rb",
    "content": "require 'mkmf'\n\ncreate_makefile(\"ruby_spark_ext\")\n"
  },
  {
    "path": "ext/ruby_c/murmur.c",
    "content": "#include \"murmur.h\"\n\n#if defined(_MSC_VER)\n#define BIG_CONSTANT(x) (x)\n#else\n#define BIG_CONSTANT(x) (x##LLU)\n#endif\n\n/*-----------------------------------------------------------------------------\n// MurmurHash2, 64-bit versions, by Austin Appleby\n//\n// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment \n// and endian-ness issues if used across multiple platforms.\n//\n// 64-bit hash for 64-bit platforms\n*/\n\nuint64_t MurmurHash64A(const void * key, int len, uint64_t seed)\n{\n  const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);\n  const int r = 47;\n\n  uint64_t h = seed ^ (len * m);\n\n  const uint64_t * data = (const uint64_t *)key;\n  const uint64_t * end = data + (len/8);\n\n  while(data != end)\n  {\n    uint64_t k = *data++;\n\n    k *= m; \n    k ^= k >> r; \n    k *= m; \n    \n    h ^= k;\n    h *= m; \n  }\n\n  const unsigned char * data2 = (const unsigned char*)data;\n\n  switch(len & 7)\n  {\n  case 7: h ^= ((uint64_t) data2[6]) << 48;\n  case 6: h ^= ((uint64_t) data2[5]) << 40;\n  case 5: h ^= ((uint64_t) data2[4]) << 32;\n  case 4: h ^= ((uint64_t) data2[3]) << 24;\n  case 3: h ^= ((uint64_t) data2[2]) << 16;\n  case 2: h ^= ((uint64_t) data2[1]) << 8;\n  case 1: h ^= ((uint64_t) data2[0]);\n          h *= m;\n  };\n \n  h ^= h >> r;\n  h *= m;\n  h ^= h >> r;\n\n  return h;\n} \n\n/* 64-bit hash for 32-bit platforms */\n\nuint64_t MurmurHash64B(const void * key, int len, uint64_t seed)\n{\n  const uint32_t m = 0x5bd1e995;\n  const int r = 24;\n\n  uint32_t h1 = ((uint32_t) seed) ^ len;\n  uint32_t h2 = ((uint32_t) (seed >> 32));\n\n  const uint32_t * data = (const uint32_t *)key;\n\n  while(len >= 8)\n  {\n    uint32_t k1 = *data++;\n    k1 *= m; k1 ^= k1 >> r; k1 *= m;\n    h1 *= m; h1 ^= k1;\n    len -= 4;\n\n    uint32_t k2 = *data++;\n    k2 *= m; k2 ^= k2 >> r; k2 *= m;\n    h2 *= m; h2 ^= k2;\n    len -= 4;\n  }\n\n  if(len >= 4)\n  {\n    uint32_t k1 = *data++;\n    k1 *= m; k1 ^= k1 >> r; k1 *= m;\n    h1 *= m; h1 ^= k1;\n    len -= 4;\n  }\n\n  switch(len)\n  {\n  case 3: h2 ^= ((unsigned char*)data)[2] << 16;\n  case 2: h2 ^= ((unsigned char*)data)[1] << 8;\n  case 1: h2 ^= ((unsigned char*)data)[0];\n      h2 *= m;\n  };\n\n  h1 ^= h2 >> 18; h1 *= m;\n  h2 ^= h1 >> 22; h2 *= m;\n  h1 ^= h2 >> 17; h1 *= m;\n  h2 ^= h1 >> 19; h2 *= m;\n\n  uint64_t h = h1;\n\n  h = (h << 32) | h2;\n\n  return h;\n}\n\n\n\n// ================================================================================================\n// Ruby methods\n\n#define PORTABLE_HASH_SEED 16154832\n\n\nVALUE murmur2_digest(VALUE rb_str, uint64_t seed)\n{\n  StringValue(rb_str);\n\n  void * key = RSTRING_PTR(rb_str);\n  long   len = RSTRING_LEN(rb_str);\n\n  uint64_t result = MurmurHash64A(key, len, seed);\n\n  return LONG2FIX(result);\n}\n\n// ------------------------------------------------------------------------------------------------\n// Spark::Digest::Murmur2.digest\n\nVALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)\n{\n  if(argc == 0 || argc > 2){\n    rb_raise(rb_eArgError, \"wrong number of arguments (%d for 1..2)\", argc);\n  }\n\n  uint64_t seed = (argc == 1 ? 
0 : NUM2UINT(argv[1]));\n\n  return murmur2_digest(argv[0], seed);\n}\n\n// ------------------------------------------------------------------------------------------------\n// Spark::Digest.portable_hash\n\nVALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)\n{\n  if(argc != 1){\n    rb_raise(rb_eArgError, \"wrong number of arguments (%d for 1)\", argc);\n  }\n\n  return murmur2_digest(argv[0], PORTABLE_HASH_SEED);\n}\n"
  },
  {
    "path": "ext/ruby_c/murmur.h",
    "content": "#ifndef MURMUR_INCLUDED\n#define MURMUR_INCLUDED\n\n#include \"ruby.h\"\n\nVALUE method_portable_hash(int argc, VALUE *argv, VALUE klass);\nVALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass);\n\n#endif\n"
  },
  {
    "path": "ext/ruby_c/ruby-spark.c",
    "content": "#include \"ruby.h\"\n#include \"murmur.h\"\n\n\nVALUE SparkModule;\nVALUE SparkDigestModule;\nVALUE SparkDigestMurmur2Class;\n\n\nvoid Init_ruby_spark_ext()\n{\n  SparkModule             = rb_define_module(\"Spark\");\n  SparkDigestModule       = rb_define_module_under(SparkModule, \"Digest\");\n  SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, \"Murmur2\", rb_cObject);\n\n  rb_define_singleton_method(SparkDigestModule, \"portable_hash\", method_portable_hash, -1);\n  rb_define_singleton_method(SparkDigestMurmur2Class, \"digest\", method_murmur2_digest, -1);\n}\n"
  },
  {
    "path": "ext/ruby_java/Digest.java",
    "content": "import org.jruby.Ruby;\nimport org.jruby.RubyModule;\nimport org.jruby.RubyObject;\nimport org.jruby.RubyClass;\nimport org.jruby.RubyString;\nimport org.jruby.RubyFixnum;\nimport org.jruby.anno.JRubyModule;\nimport org.jruby.anno.JRubyMethod;\nimport org.jruby.runtime.ThreadContext;\nimport org.jruby.runtime.builtin.IRubyObject;\n\n@JRubyModule(name=\"Spark::Digest\")\npublic class Digest extends RubyObject{\n\n  // Have to be the same as in C extension\n  final static long PORTABLE_HASH_SEED = 16154832;\n\n  public Digest(final Ruby ruby, RubyClass rubyClass) {\n    super(ruby, rubyClass);\n  }\n\n  @JRubyMethod(module=true)\n  public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) {\n    Ruby ruby = self.getRuntime();\n\n    RubyString keyString = (RubyString)arg;\n\n    long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED);\n\n    RubyFixnum result = new RubyFixnum(ruby, hash);\n\n    return result;\n  }\n\n}\n\n"
  },
  {
    "path": "ext/ruby_java/Murmur2.java",
    "content": "import org.jruby.Ruby;\nimport org.jruby.RubyClass;\nimport org.jruby.RubyObject;\nimport org.jruby.RubyString;\nimport org.jruby.RubyFixnum;\nimport org.jruby.anno.JRubyClass;\nimport org.jruby.anno.JRubyMethod;\nimport org.jruby.runtime.ThreadContext;\nimport org.jruby.runtime.builtin.IRubyObject;\n\n/** Murmur hash 2.0.\n * \n * The murmur hash is a relative fast hash function from\n * http://murmurhash.googlepages.com/ for platforms with efficient\n * multiplication.\n *\n * http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/\n *\n */\n\n@JRubyClass(name=\"Spark::Digest::Murmur2\")\npublic class Murmur2 extends RubyObject {\n\n  public Murmur2(final Ruby ruby, RubyClass rubyClass) {\n    super(ruby, rubyClass);\n  }\n\n  @JRubyMethod(required=1, optional=1, module=true)\n  public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) {\n    Ruby ruby = context.getRuntime();\n\n    RubyString keyString = (RubyString)args[0];\n    long seed;\n\n    if(args.length > 1){\n      RubyFixnum rb_seed = (RubyFixnum)args[1];\n      seed = rb_seed.getLongValue();\n    }\n    else{\n      seed = 0;\n    }\n\n    long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed);\n\n    RubyFixnum result = new RubyFixnum(ruby, hash);\n    return result;\n  }\n\n\n  /** Generates 64 bit hash from byte array of the given length and seed.\n   * \n   * @param data byte array to hash\n   * @param length length of the array to hash\n   * @param seed initial seed value\n   * @return 64 bit hash of the given array\n   */\n  public static long hash64(final byte[] data, int length, long seed) {\n    final long m = 0xc6a4a7935bd1e995L;\n    final int r = 47;\n\n    long h = (seed&0xffffffffl)^(length*m);\n\n    int length8 = length/8;\n\n    for (int i=0; i<length8; i++) {\n      final int i8 = i*8;\n      long k =  ((long)data[i8+0]&0xff)      +(((long)data[i8+1]&0xff)<<8)\n          +(((long)data[i8+2]&0xff)<<16) +(((long)data[i8+3]&0xff)<<24)\n          +(((long)data[i8+4]&0xff)<<32) +(((long)data[i8+5]&0xff)<<40)\n          +(((long)data[i8+6]&0xff)<<48) +(((long)data[i8+7]&0xff)<<56);\n      \n      k *= m;\n      k ^= k >>> r;\n      k *= m;\n      \n      h ^= k;\n      h *= m; \n    }\n    \n    switch (length%8) {\n    case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48;\n    case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40;\n    case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32;\n    case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24;\n    case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16;\n    case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8;\n    case 1: h ^= (long)(data[length&~7]&0xff);\n            h *= m;\n    };\n   \n    h ^= h >>> r;\n    h *= m;\n    h ^= h >>> r;\n\n    return h;\n  }\n\n}\n"
  },
  {
    "path": "ext/ruby_java/RubySparkExtService.java",
    "content": "import org.jruby.Ruby;\nimport org.jruby.RubyClass;\nimport org.jruby.RubyModule;\nimport org.jruby.runtime.ObjectAllocator;\nimport org.jruby.runtime.builtin.IRubyObject;\nimport org.jruby.runtime.load.BasicLibraryService;\n\npublic class RubySparkExtService implements BasicLibraryService\n{\n  public boolean basicLoad(final Ruby ruby) throws java.io.IOException {\n\n    RubyModule sparkModule = ruby.defineModule(\"Spark\");\n    RubyModule sparkDigestModule = sparkModule.defineModuleUnder(\"Digest\");\n    RubyClass  sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder(\"Murmur2\", ruby.getObject(), sparkDigestMurmur2Allocator);\n\n    sparkDigestModule.defineAnnotatedMethods(Digest.class);\n    sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class);\n\n    return true;\n  }\n\n  public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() {\n    public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) {\n      return new Murmur2(ruby, rubyClass);\n    }\n  };\n\n}\n"
  },
  {
    "path": "ext/ruby_java/extconf.rb",
    "content": "require 'mkmf'\n\ncreate_makefile(\"ruby_spark_ext\")\n"
  },
  {
    "path": "ext/spark/build.sbt",
    "content": "import AssemblyKeys._\n\nassemblySettings\n\n// Default values\nval defaultScalaVersion     = \"2.10.4\"\nval defaultSparkVersion     = \"1.6.0\"\nval defaultSparkCoreVersion = \"2.10\"\nval defaultTargetDir        = \"target\"\nval defaultHadoopVersion    = \"1.0.4\"\n\n// Values\nval _hadoopVersion    = scala.util.Properties.envOrElse(\"HADOOP_VERSION\", defaultHadoopVersion)\nval _scalaVersion     = scala.util.Properties.envOrElse(\"SCALA_VERSION\", defaultScalaVersion)\nval _sparkVersion     = scala.util.Properties.envOrElse(\"SPARK_VERSION\", defaultSparkVersion)\nval _sparkCoreVersion = scala.util.Properties.envOrElse(\"SPARK_CORE_VERSION\", defaultSparkCoreVersion)\nval _targetDir        = scala.util.Properties.envOrElse(\"TARGET_DIR\", defaultTargetDir)\n\n// Project settings\nname := \"ruby-spark\"\n\nversion := \"1.0.0\"\n\nscalaVersion := _scalaVersion\n\njavacOptions ++= Seq(\"-source\", \"1.7\", \"-target\", \"1.7\")\n\n// Jar target folder\nartifactPath in Compile in packageBin := file(s\"${_targetDir}/ruby-spark.jar\")\noutputPath in packageDependency := file(s\"${_targetDir}/ruby-spark-deps.jar\")\n\n// Protocol buffer support\nseq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*)\n\n// Additional libraries\nlibraryDependencies ++= Seq(\n  \"org.apache.spark\"  %% \"spark-core\"    % _sparkVersion excludeAll(ExclusionRule(organization = \"org.apache.hadoop\")),\n  \"org.apache.spark\"  %% \"spark-graphx\"  % _sparkVersion,\n  \"org.apache.spark\"  %% \"spark-mllib\"   % _sparkVersion,\n  \"org.apache.spark\"  %% \"spark-sql\"     % _sparkVersion,\n  \"org.apache.hadoop\" %  \"hadoop-client\" % _hadoopVersion,\n  \"com.github.fommil.netlib\" % \"all\" % \"1.1.2\",\n  \"org.scalatest\" % \"scalatest_2.10\" % \"2.2.1\" % \"test\"\n)\n\n// Repositories\nresolvers ++= Seq(\n  \"JBoss Repository\"     at \"http://repository.jboss.org/nexus/content/repositories/releases/\",\n  \"Spray Repository\"     at \"http://repo.spray.io/\",\n  \"Cloudera Repository\"  at \"https://repository.cloudera.com/artifactory/cloudera-repos/\",\n  \"Akka Repository\"      at \"http://repo.akka.io/releases/\",\n  \"Twitter4J Repository\" at \"http://twitter4j.org/maven2/\",\n  \"Apache HBase\"         at \"https://repository.apache.org/content/repositories/releases\",\n  \"Twitter Maven Repo\"   at \"http://maven.twttr.com/\",\n  \"scala-tools\"          at \"https://oss.sonatype.org/content/groups/scala-tools\",\n  \"Typesafe repository\"  at \"http://repo.typesafe.com/typesafe/releases/\",\n  \"Second Typesafe repo\" at \"http://repo.typesafe.com/typesafe/maven-releases/\",\n  \"Mesosphere Public Repository\" at \"http://downloads.mesosphere.io/maven\",\n  Resolver.sonatypeRepo(\"public\")\n)\n\n// Merge strategy\nmergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>\n  {\n    case m if m.toLowerCase.endsWith(\"manifest.mf\") => MergeStrategy.discard\n    case m if m.startsWith(\"META-INF\") => MergeStrategy.discard\n    case PathList(\"javax\", \"servlet\", xs @ _*) => MergeStrategy.first\n    case PathList(\"org\", \"apache\", xs @ _*) => MergeStrategy.first\n    case PathList(\"org\", \"jboss\", xs @ _*) => MergeStrategy.first\n    case \"about.html\"  => MergeStrategy.rename\n    case \"reference.conf\" => MergeStrategy.concat\n    case _ => MergeStrategy.first\n  }\n}\n"
  },
  {
    "path": "ext/spark/project/plugins.sbt",
    "content": "resolvers += Resolver.url(\"artifactory\", url(\"http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases\"))(Resolver.ivyStylePatterns)\n\nresolvers += \"Typesafe Repository\" at \"http://repo.typesafe.com/typesafe/releases/\"\n\nresolvers += \"Spray Repository\" at \"http://repo.spray.io/\"\n\naddSbtPlugin(\"com.eed3si9n\" % \"sbt-assembly\" % \"0.10.2\")\n\naddSbtPlugin(\"com.github.gseitz\" % \"sbt-protobuf\" % \"0.3.3\")\n"
  },
  {
    "path": "ext/spark/sbt/sbt",
    "content": "#!/bin/bash\n\n# This script launches sbt for this project. If present it uses the system\n# version of sbt. If there is no system version of sbt it attempts to download\n# sbt locally.\nSBT_VERSION=0.13.9\nURL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar\nURL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar\nJAR=sbt/sbt-launch-${SBT_VERSION}.jar\n\n# Download sbt launch jar if it hasn't been downloaded yet\nif [ ! -f ${JAR} ]; then\n  # Download\n  printf \"Attempting to fetch sbt\\n\"\n  JAR_DL=${JAR}.part\n  if hash wget 2>/dev/null; then\n    (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}\n  elif hash curl 2>/dev/null; then\n    (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}\n  else\n    printf \"You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\\n\"\n    exit -1\n  fi\nfi\nif [ ! -f ${JAR} ]; then\n  # We failed to download\n  printf \"Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\\n\"\n  exit -1\nfi\nprintf \"Launching sbt from ${JAR}\\n\"\njava \\\n  -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \\\n  -jar ${JAR} \\\n  \"$@\"\n"
  },
  {
    "path": "ext/spark/src/main/scala/Exec.scala",
    "content": "package org.apache.spark.api.ruby\n\nimport java.io.{File, FileOutputStream, InputStreamReader, BufferedReader}\n\nimport scala.collection.JavaConversions._\n\nimport org.apache.spark.{SparkEnv, Logging}\nimport org.apache.spark.util._\n\n\n/* =================================================================================================\n * class FileCommand\n * =================================================================================================\n *\n * Save command to file and than execute him because from Scala you cannot simply run\n * something like \"bash --norc -i -c 'source .zshrc; ruby master.rb'\"\n */\n\nclass FileCommand(command: String) extends Logging {\n\n  var pb: ProcessBuilder = null\n  var file: File = null\n\n  // Command is complete.\n  def this(command: String, env: SparkEnv) = {\n    this(command)\n    create(env)\n  }\n\n  // Template must contains %s which will be replaced for command\n  def this(template: String, command: String, env: SparkEnv, envVars: Map[String, String]) = {\n    this(template.format(command), env)\n    setEnvVars(envVars)\n  }\n\n  private def create(env: SparkEnv) {\n    val dir = new File(env.sparkFilesDir)\n    val ext = if(Utils.isWindows) \".cmd\" else \".sh\"\n    val shell = if(Utils.isWindows) \"cmd\" else \"bash\"\n\n    file = File.createTempFile(\"command\", ext, dir)\n\n    val out = new FileOutputStream(file)\n    out.write(command.getBytes)\n    out.close\n\n    logInfo(s\"New FileCommand at ${file.getAbsolutePath}\")\n\n    pb = new ProcessBuilder(shell, file.getAbsolutePath)\n  }\n\n  def setEnvVars(vars: Map[String, String]) {\n    pb.environment().putAll(vars)\n  }\n\n  def run = {\n    new ExecutedFileCommand(pb.start)\n  }\n}\n\n\n/* =================================================================================================\n * class ExecutedFileCommand\n * =================================================================================================\n *\n * Represent process executed from file.\n */\n\nclass ExecutedFileCommand(process: Process) {\n\n  var reader: BufferedReader = null\n\n  def readLine = {\n    openInput\n    reader.readLine.toString.trim\n  }\n\n  def openInput {\n    if(reader != null){\n      return\n    }\n\n    val input = new InputStreamReader(process.getInputStream)\n    reader = new BufferedReader(input)\n  }\n\n  // Delegation\n  def destroy = process.destroy\n  def getInputStream = process.getInputStream\n  def getErrorStream = process.getErrorStream\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/MLLibAPI.scala",
    "content": "package org.apache.spark.mllib.api.python\n\n// PythonMLLibAPI is private for python\nclass MLLibAPI extends PythonMLLibAPI {}\n"
  },
  {
    "path": "ext/spark/src/main/scala/Marshal.scala",
    "content": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}\n\nimport scala.collection.mutable.ArrayBuffer\nimport scala.collection.JavaConverters._\n\n\n/* =================================================================================================\n * object Marshal\n * =================================================================================================\n */\nobject Marshal {\n  def load(bytes: Array[Byte]) = {\n    val is = new DataInputStream(new ByteArrayInputStream(bytes))\n\n    val majorVersion = is.readUnsignedByte // 4\n    val minorVersion = is.readUnsignedByte // 8\n\n    (new MarshalLoad(is)).load\n  }\n\n  def dump(data: Any) = {\n    val aos = new ByteArrayOutputStream\n    val os = new DataOutputStream(aos)\n\n    os.writeByte(4)\n    os.writeByte(8)\n\n    (new MarshalDump(os)).dump(data)\n    aos.toByteArray\n  }\n}\n\n\n/* =================================================================================================\n * class IterableMarshaller\n * =================================================================================================\n */\nclass IterableMarshaller(iter: Iterator[Any]) extends Iterator[Array[Byte]] {\n  private val buffer = new ArrayBuffer[Any]\n\n  override def hasNext: Boolean = iter.hasNext\n\n  override def next(): Array[Byte] = {\n    while (iter.hasNext) {\n      buffer += iter.next()\n    }\n\n    Marshal.dump(buffer)\n  }\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/MarshalDump.scala",
    "content": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}\n\nimport scala.collection.mutable.ArrayBuffer\nimport scala.collection.JavaConverters._\nimport scala.reflect.{ClassTag, classTag}\n\nimport org.apache.spark.mllib.regression.LabeledPoint\nimport org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}\n\n\n/* =================================================================================================\n * class MarshalDump\n * =================================================================================================\n */\nclass MarshalDump(os: DataOutputStream) {\n\n  val NAN_BYTELIST               = \"nan\".getBytes\n  val NEGATIVE_INFINITY_BYTELIST = \"-inf\".getBytes\n  val INFINITY_BYTELIST          = \"inf\".getBytes\n\n  def dump(data: Any) {\n    data match {\n      case null =>\n        os.writeByte('0')\n\n      case item: Boolean =>\n        val char = if(item) 'T' else 'F'\n        os.writeByte(char)\n\n      case item: Int =>\n        os.writeByte('i')\n        dumpInt(item)\n\n      case item: Array[_] =>\n        os.writeByte('[')\n        dumpArray(item)\n\n      case item: Double =>\n        os.writeByte('f')\n        dumpFloat(item)\n\n      case item: ArrayBuffer[Any] => dump(item.toArray)\n    }\n  }\n\n  def dumpInt(data: Int) {\n    if(data == 0){\n      os.writeByte(0)\n    }\n    else if (0 < data && data < 123) {\n      os.writeByte(data + 5)\n    }\n    else if (-124 < data && data < 0) {\n      os.writeByte((data - 5) & 0xff)\n    }\n    else {\n      val buffer = new Array[Byte](4)\n      var value = data\n\n      var i = 0\n      while(i != 4 && value != 0 && value != -1){\n        buffer(i) = (value & 0xff).toByte\n        value = value >> 8\n\n        i += 1\n      }\n      val lenght = i + 1\n      if(value < 0){\n        os.writeByte(-lenght)\n      }\n      else{\n        os.writeByte(lenght)\n      }\n      os.write(buffer, 0, lenght)\n    }\n  }\n\n  def dumpArray(array: Array[_]) {\n    dumpInt(array.size)\n\n    for(item <- array) {\n      dump(item)\n    }\n  }\n\n  def dumpFloat(value: Double) {\n    if(value.isPosInfinity){\n      dumpString(NEGATIVE_INFINITY_BYTELIST)\n    }\n    else if(value.isNegInfinity){\n      dumpString(INFINITY_BYTELIST)\n    }\n    else if(value.isNaN){\n      dumpString(NAN_BYTELIST)\n    }\n    else{\n      // dumpString(\"%.17g\".format(value))\n      dumpString(value.toString)\n    }\n  }\n\n  def dumpString(data: String) {\n    dumpString(data.getBytes)\n  }\n\n  def dumpString(data: Array[Byte]) {\n    dumpInt(data.size)\n    os.write(data)\n  }\n\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/MarshalLoad.scala",
    "content": "package org.apache.spark.api.ruby.marshal\n\nimport java.io.{DataInputStream, DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream}\n\nimport scala.collection.mutable.ArrayBuffer\nimport scala.collection.JavaConverters._\nimport scala.reflect.{ClassTag, classTag}\n\nimport org.apache.spark.mllib.regression.LabeledPoint\nimport org.apache.spark.mllib.linalg.{Vector, DenseVector, SparseVector}\n\n\n/* =================================================================================================\n * class MarshalLoad\n * =================================================================================================\n */\nclass MarshalLoad(is: DataInputStream) {\n\n  case class WaitForObject()\n\n  val registeredSymbols = ArrayBuffer[String]()\n  val registeredLinks = ArrayBuffer[Any]()\n\n  def load: Any = {\n    load(is.readUnsignedByte.toChar)\n  }\n\n  def load(dataType: Char): Any = {\n    dataType match {\n      case '0' => null\n      case 'T' => true\n      case 'F' => false\n      case 'i' => loadInt\n      case 'f' => loadAndRegisterFloat\n      case ':' => loadAndRegisterSymbol\n      case '[' => loadAndRegisterArray\n      case 'U' => loadAndRegisterUserObject\n      case _ =>\n        throw new IllegalArgumentException(s\"Format is not supported: $dataType.\")\n    }\n  }\n\n\n  // ----------------------------------------------------------------------------------------------\n  // Load by type\n\n  def loadInt: Int = {\n    var c = is.readByte.toInt\n\n    if (c == 0) {\n      return 0\n    } else if (4 < c && c < 128) {\n      return c - 5\n    } else if (-129 < c && c < -4) {\n      return c + 5\n    }\n\n    var result: Long = 0\n\n    if (c > 0) {\n      result = 0\n      for( i <- 0 until c ) {\n        result |= (is.readUnsignedByte << (8 * i)).toLong\n      }\n    } else {\n      c = -c\n      result = -1\n      for( i <- 0 until c ) {\n        result &= ~((0xff << (8 * i)).toLong)\n        result |= (is.readUnsignedByte << (8 * i)).toLong\n      }\n    }\n\n    result.toInt\n  }\n\n  def loadAndRegisterFloat: Double = {\n    val result = loadFloat\n    registeredLinks += result\n    result\n  }\n\n  def loadFloat: Double = {\n    val string = loadString\n    string match {\n      case \"nan\"  => Double.NaN\n      case \"inf\"  => Double.PositiveInfinity\n      case \"-inf\" => Double.NegativeInfinity\n      case _ => string.toDouble\n    }\n  }\n\n  def loadString: String = {\n    new String(loadStringBytes)\n  }\n\n  def loadStringBytes: Array[Byte] = {\n    val size = loadInt\n    val buffer = new Array[Byte](size)\n\n    var readSize = 0\n    while(readSize < size){\n      val read = is.read(buffer, readSize, size-readSize)\n\n      if(read == -1){\n        throw new IllegalArgumentException(\"Marshal too short.\")\n      }\n\n      readSize += read\n    }\n\n    buffer\n  }\n\n  def loadAndRegisterSymbol: String = {\n    val result = loadString\n    registeredSymbols += result\n    result\n  }\n\n  def loadAndRegisterArray: Array[Any] = {\n    val size = loadInt\n    val array = new Array[Any](size)\n\n    registeredLinks += array\n\n    for( i <- 0 until size ) {\n      array(i) = loadNextObject\n    }\n\n    array\n  }\n\n  def loadAndRegisterUserObject: Any = {\n    val klass = loadNextObject.asInstanceOf[String]\n\n    // Register future class before load the next object\n    registeredLinks += WaitForObject()\n    val index = registeredLinks.size - 1\n\n    val data = loadNextObject\n\n    val result = klass match {\n      case 
\"Spark::Mllib::LabeledPoint\" => createLabeledPoint(data)\n      case \"Spark::Mllib::DenseVector\" => createDenseVector(data)\n      case \"Spark::Mllib::SparseVector\" => createSparseVector(data)\n      case other =>\n        throw new IllegalArgumentException(s\"Object $other is not supported.\")\n    }\n\n    registeredLinks(index) = result\n\n    result\n  }\n\n\n  // ----------------------------------------------------------------------------------------------\n  // Other loads\n\n  def loadNextObject: Any = {\n    val dataType = is.readUnsignedByte.toChar\n\n    if(isLinkType(dataType)){\n      readLink(dataType)\n    }\n    else{\n      load(dataType)\n    }\n  }\n\n\n  // ----------------------------------------------------------------------------------------------\n  // To java objects\n\n  def createLabeledPoint(data: Any): LabeledPoint = {\n    val array = data.asInstanceOf[Array[_]]\n    new LabeledPoint(array(0).asInstanceOf[Double], array(1).asInstanceOf[Vector])\n  }\n\n  def createDenseVector(data: Any): DenseVector = {\n    new DenseVector(data.asInstanceOf[Array[_]].map(toDouble(_)))\n  }\n\n  def createSparseVector(data: Any): SparseVector = {\n    val array = data.asInstanceOf[Array[_]]\n    val size = array(0).asInstanceOf[Int]\n    val indices = array(1).asInstanceOf[Array[_]].map(_.asInstanceOf[Int])\n    val values = array(2).asInstanceOf[Array[_]].map(toDouble(_))\n\n    new SparseVector(size, indices, values)\n  }\n\n\n  // ----------------------------------------------------------------------------------------------\n  // Helpers\n\n  def toDouble(data: Any): Double = data match {\n    case x: Int => x.toDouble\n    case x: Double => x\n    case _ => 0.0\n  }\n\n\n  // ----------------------------------------------------------------------------------------------\n  // Cache\n\n  def readLink(dataType: Char): Any = {\n    val index = loadInt\n\n    dataType match {\n      case '@' => registeredLinks(index)\n      case ';' => registeredSymbols(index)\n    }\n  }\n\n  def isLinkType(dataType: Char): Boolean = {\n    dataType == ';' || dataType == '@'\n  }\n\n}\n"
  },
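  {
    "path": "_examples/marshal_roundtrip_sketch.rb",
    "content": "# A cross-check sketch (not part of the gem) for the Marshal implementation in\n# ext/spark/src/main/scala: the bytes it reads and writes are plain Ruby\n# Marshal, format 4.8, so they can be verified from Ruby itself. The expected\n# arrays below are the same ones asserted in MarshalSpec.scala.\np Marshal.dump(1).bytes        # => [4, 8, 105, 6]                  'i' + small int (1 + 5)\np Marshal.dump(1.2).bytes      # => [4, 8, 102, 8, 49, 46, 50]      'f' + \"1.2\"\np Marshal.dump([1, 2]).bytes   # => [4, 8, 91, 7, 105, 6, 105, 7]   '[' + size + items\n\n# And the reverse direction:\np Marshal.load([4, 8, 105, 6].pack('C*'))   # => 1\n"
  },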
  {
    "path": "ext/spark/src/main/scala/RubyAccumulatorParam.scala",
    "content": "package org.apache.spark.api.ruby\n\nimport java.io._\nimport java.net._\nimport java.util.{List, ArrayList}\n\nimport scala.collection.JavaConversions._\nimport scala.collection.immutable._\n\nimport org.apache.spark._\nimport org.apache.spark.util.Utils\n\n/**\n * Internal class that acts as an `AccumulatorParam` for Ruby accumulators. Inside, it\n * collects a list of pickled strings that we pass to Ruby through a socket.\n */\nprivate class RubyAccumulatorParam(serverHost: String, serverPort: Int)\n  extends AccumulatorParam[List[Array[Byte]]] {\n\n  // Utils.checkHost(serverHost, \"Expected hostname\")\n\n  val bufferSize = SparkEnv.get.conf.getInt(\"spark.buffer.size\", 65536)\n\n  // Socket shoudl not be serialized\n  // Otherwise: SparkException: Task not serializable\n  @transient var socket: Socket = null\n  @transient var socketOutputStream: DataOutputStream = null\n  @transient var socketInputStream:  DataInputStream = null\n\n  def openSocket(){\n    synchronized {\n      if (socket == null || socket.isClosed) {\n        socket = new Socket(serverHost, serverPort)\n\n        socketInputStream  = new DataInputStream(new BufferedInputStream(socket.getInputStream, bufferSize))\n        socketOutputStream = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize))\n      }\n    }\n  }\n\n  override def zero(value: List[Array[Byte]]): List[Array[Byte]] = new ArrayList\n\n  override def addInPlace(val1: List[Array[Byte]], val2: List[Array[Byte]]) : List[Array[Byte]] = synchronized {\n    if (serverHost == null) {\n      // This happens on the worker node, where we just want to remember all the updates\n      val1.addAll(val2)\n      val1\n    } else {\n      // This happens on the master, where we pass the updates to Ruby through a socket\n      openSocket()\n\n      socketOutputStream.writeInt(val2.size)\n      for (array <- val2) {\n        socketOutputStream.writeInt(array.length)\n        socketOutputStream.write(array)\n      }\n      socketOutputStream.flush()\n\n      // Wait for acknowledgement\n      // http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock\n      //\n      // if(in.readInt() != RubyConstant.ACCUMULATOR_ACK){\n      //   throw new SparkException(\"Accumulator was not acknowledged\")\n      // }\n\n      new ArrayList\n    }\n  }\n}\n"
  },
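  {
    "path": "_examples/accumulator_protocol_sketch.rb",
    "content": "# A plain-Ruby sketch (not part of the gem) of the accumulator wire format\n# written by RubyAccumulatorParam.addInPlace: a 32-bit big-endian count,\n# followed by one length-prefixed marshaled update per accumulator. The gem's\n# own server (lib/spark/accumulator.rb) uses its IO helpers instead.\nrequire 'socket'\n\ndef read_accumulator_updates(socket)\n  # DataOutputStream#writeInt is big-endian, so 'N' is the matching directive\n  count = socket.read(4).unpack('N').first\n  Array.new(count) do\n    length = socket.read(4).unpack('N').first\n    Marshal.load(socket.read(length))   # e.g. [accumulator_id, value]\n  end\nend\n"
  },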
  {
    "path": "ext/spark/src/main/scala/RubyBroadcast.scala",
    "content": "package org.apache.spark.api.ruby\n\nimport org.apache.spark.api.python.PythonBroadcast\n\n/**\n * An Wrapper for Ruby Broadcast, which is written into disk by Ruby. It also will\n * write the data into disk after deserialization, then Ruby can read it from disks.\n *\n * Class use Python logic - only for semantic\n */\nclass RubyBroadcast(@transient var _path: String, @transient var id: java.lang.Long) extends PythonBroadcast(_path) {\n\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubyConstant.scala",
    "content": "package org.apache.spark.api.ruby\n\nobject RubyConstant {\n  val DATA_EOF = -2\n  val WORKER_ERROR = -1\n  val WORKER_DONE = 0\n  val CREATE_WORKER = 1\n  val KILL_WORKER = 2\n  val KILL_WORKER_AND_WAIT = 3\n  val SUCCESSFULLY_KILLED = 4\n  val UNSUCCESSFUL_KILLING = 5\n  val ACCUMULATOR_ACK = 6\n}\n"
  },
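  {
    "path": "_examples/constants_mirror_sketch.rb",
    "content": "# A sketch of the Ruby-side mirror of RubyConstant.scala; the gem keeps these\n# codes in Spark::Constant (see the ACCUMULATOR_ACK reference in\n# lib/spark/accumulator.rb) and the values must stay identical on both sides\n# of the socket protocol.\nmodule Spark\n  module Constant\n    DATA_EOF             = -2\n    WORKER_ERROR         = -1\n    WORKER_DONE          = 0\n    CREATE_WORKER        = 1\n    KILL_WORKER          = 2\n    KILL_WORKER_AND_WAIT = 3\n    SUCCESSFULLY_KILLED  = 4\n    UNSUCCESSFUL_KILLING = 5\n    ACCUMULATOR_ACK      = 6\n  end\nend\n"
  },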
  {
    "path": "ext/spark/src/main/scala/RubyMLLibAPI.scala",
    "content": "package org.apache.spark.mllib.api.ruby\n\nimport java.util.ArrayList\n\nimport scala.collection.JavaConverters._\n\nimport org.apache.spark.rdd.RDD\nimport org.apache.spark.api.java.JavaRDD\nimport org.apache.spark.mllib.linalg._\nimport org.apache.spark.mllib.regression.LabeledPoint\nimport org.apache.spark.mllib.classification.NaiveBayes\nimport org.apache.spark.mllib.clustering.GaussianMixtureModel\nimport org.apache.spark.mllib.stat.distribution.MultivariateGaussian\nimport org.apache.spark.mllib.api.python.MLLibAPI\n\n\nclass RubyMLLibAPI extends MLLibAPI {\n  // trainLinearRegressionModelWithSGD\n  // trainLassoModelWithSGD\n  // trainRidgeModelWithSGD\n  // trainLogisticRegressionModelWithSGD\n  // trainLogisticRegressionModelWithLBFGS\n  // trainSVMModelWithSGD\n  // trainKMeansModel\n  // trainGaussianMixtureModel\n\n  // Rjb have a problem with theta: Array[Array[Double]]\n  override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {\n    val model = NaiveBayes.train(data.rdd, lambda)\n\n    List(\n      Vectors.dense(model.labels),\n      Vectors.dense(model.pi),\n      model.theta.toSeq\n    ).map(_.asInstanceOf[Object]).asJava\n  }\n\n  // On python is wt just Object\n  def predictSoftGMM(\n      data: JavaRDD[Vector],\n      wt: ArrayList[Object],\n      mu: ArrayList[Object],\n      si: ArrayList[Object]): RDD[Array[Double]] = {\n\n      // val weight = wt.asInstanceOf[Array[Double]]\n      val weight = wt.toArray.map(_.asInstanceOf[Double])\n      val mean = mu.toArray.map(_.asInstanceOf[DenseVector])\n      val sigma = si.toArray.map(_.asInstanceOf[DenseMatrix])\n      val gaussians = Array.tabulate(weight.length){\n        i => new MultivariateGaussian(mean(i), sigma(i))\n      }\n      val model = new GaussianMixtureModel(weight, gaussians)\n      model.predictSoft(data)\n  }\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubyMLLibUtilAPI.scala",
    "content": "package org.apache.spark.mllib.api.ruby\n\nimport java.util.ArrayList\n\nimport org.apache.spark.mllib.util.LinearDataGenerator\nimport org.apache.spark.mllib.regression.LabeledPoint\n\nobject RubyMLLibUtilAPI {\n\n  // Ruby does have a problem with creating Array[Double]\n  def generateLinearInput(\n      intercept: Double,\n      weights: ArrayList[String],\n      nPoints: Int,\n      seed: Int,\n      eps: Double = 0.1): Seq[LabeledPoint] = {\n\n    LinearDataGenerator.generateLinearInput(intercept, weights.toArray.map(_.toString.toDouble), nPoints, seed, eps)\n  }\n\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubyPage.scala",
    "content": "package org.apache.spark.ui.ruby\n\n// import javax.servlet.http.HttpServletRequest\n\n// import scala.xml.Node\n\n// import org.apache.spark.ui.{WebUIPage, UIUtils}\n// import org.apache.spark.util.Utils\n\n// private[ui] class RubyPage(parent: RubyTab, rbConfig: Array[Tuple2[String, String]]) extends WebUIPage(\"\") {\n\n//   def render(request: HttpServletRequest): Seq[Node] = {\n//     val content = UIUtils.listingTable(header, row, rbConfig)\n//     UIUtils.headerSparkPage(\"Ruby Config\", content, parent)\n//   }\n\n//   private def header = Seq(\n//     \"Number\"\n//   )\n\n//   private def row(keyValue: (String, String)): Seq[Node] = {\n//     // scalastyle:off\n//     keyValue match {\n//       case (key, value) =>\n//         <tr>\n//           <td>{key}</td>\n//           <td>{value}</td>\n//         </tr>\n//     }\n//     // scalastyle:on\n//   }\n// }\n\nclass RubyPage {}\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubyRDD.scala",
    "content": "package org.apache.spark.api.ruby\n\nimport java.io._\nimport java.net._\nimport java.util.{List, ArrayList, Collections}\n\nimport scala.util.Try\nimport scala.reflect.ClassTag\nimport scala.collection.JavaConversions._\n\nimport org.apache.spark._\nimport org.apache.spark.{SparkEnv, Partition, SparkException, TaskContext}\nimport org.apache.spark.api.ruby._\nimport org.apache.spark.api.ruby.marshal._\nimport org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD}\nimport org.apache.spark.api.python.PythonRDD\nimport org.apache.spark.broadcast.Broadcast\nimport org.apache.spark.rdd.RDD\nimport org.apache.spark.util.Utils\nimport org.apache.spark.InterruptibleIterator\n\n\n/* =================================================================================================\n * Class RubyRDD\n * =================================================================================================\n */\n\nclass RubyRDD(\n    @transient parent: RDD[_],\n    command: Array[Byte],\n    broadcastVars: ArrayList[Broadcast[RubyBroadcast]],\n    accumulator: Accumulator[List[Array[Byte]]])\n  extends RDD[Array[Byte]](parent){\n\n    val bufferSize = conf.getInt(\"spark.buffer.size\", 65536)\n\n    val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this)\n\n    override def getPartitions: Array[Partition] = firstParent.partitions\n\n    override val partitioner = None\n\n    /* ------------------------------------------------------------------------------------------ */\n\n    override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {\n\n      val env = SparkEnv.get\n\n      // Get worker and id\n      val (worker, workerId) = RubyWorker.create(env)\n\n      // Start a thread to feed the process input from our parent's iterator\n      val writerThread = new WriterThread(env, worker, split, context)\n\n      context.addTaskCompletionListener { context =>\n        writerThread.shutdownOnTaskCompletion()\n        writerThread.join()\n\n        // Cleanup the worker socket. 
This will also cause the worker to exit.\n        try {\n          RubyWorker.remove(worker, workerId)\n          worker.close()\n        } catch {\n          case e: Exception => logWarning(\"Failed to close worker socket\", e)\n        }\n      }\n\n      val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize))\n\n      // Send data\n      writerThread.start()\n\n      // For violent termination of worker\n      new MonitorThread(workerId, worker, context).start()\n\n      // Return an iterator that read lines from the process's stdout\n      val stdoutIterator = new StreamReader(stream, writerThread, context)\n\n      // An iterator that wraps around an existing iterator to provide task killing functionality.\n      new InterruptibleIterator(context, stdoutIterator)\n\n    } // end compute\n\n    /* ------------------------------------------------------------------------------------------ */\n\n    class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext)\n      extends Thread(\"stdout writer for worker\") {\n\n      @volatile private var _exception: Exception = null\n\n      setDaemon(true)\n\n      // Contains the exception thrown while writing the parent iterator to the process.\n      def exception: Option[Exception] = Option(_exception)\n\n      // Terminates the writer thread, ignoring any exceptions that may occur due to cleanup.\n      def shutdownOnTaskCompletion() {\n        assert(context.isCompleted)\n        this.interrupt()\n      }\n\n      // -------------------------------------------------------------------------------------------\n      // Send the necessary data for worker\n      //   - split index\n      //   - command\n      //   - iterator\n\n      override def run(): Unit = Utils.logUncaughtExceptions {\n        try {\n          SparkEnv.set(env)\n          val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize)\n          val dataOut = new DataOutputStream(stream)\n\n          // Partition index\n          dataOut.writeInt(split.index)\n\n          // Spark files\n          PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut)\n\n          // Broadcast variables\n          dataOut.writeInt(broadcastVars.length)\n          for (broadcast <- broadcastVars) {\n            dataOut.writeLong(broadcast.value.id)\n            PythonRDD.writeUTF(broadcast.value.path, dataOut)\n          }\n\n          // Serialized command\n          dataOut.writeInt(command.length)\n          dataOut.write(command)\n\n          // Send it\n          dataOut.flush()\n\n          // Data\n          PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut)\n          dataOut.writeInt(RubyConstant.DATA_EOF)\n          dataOut.flush()\n        } catch {\n          case e: Exception if context.isCompleted || context.isInterrupted =>\n            logDebug(\"Exception thrown after task completion (likely due to cleanup)\", e)\n\n          case e: Exception =>\n            // We must avoid throwing exceptions here, because the thread uncaught exception handler\n            // will kill the whole executor (see org.apache.spark.executor.Executor).\n            _exception = e\n        } finally {\n          Try(worker.shutdownOutput()) // kill worker process\n        }\n      }\n    } // end WriterThread\n\n\n    /* ------------------------------------------------------------------------------------------ */\n\n    class StreamReader(stream: DataInputStream, writerThread: WriterThread, 
context: TaskContext) extends Iterator[Array[Byte]] {\n\n      def hasNext = _nextObj != null\n      var _nextObj = read()\n\n      // -------------------------------------------------------------------------------------------\n\n      def next(): Array[Byte] = {\n        val obj = _nextObj\n        if (hasNext) {\n          _nextObj = read()\n        }\n        obj\n      }\n\n      // -------------------------------------------------------------------------------------------\n\n      private def read(): Array[Byte] = {\n        if (writerThread.exception.isDefined) {\n          throw writerThread.exception.get\n        }\n        try {\n          stream.readInt() match {\n            case length if length > 0 =>\n              val obj = new Array[Byte](length)\n              stream.readFully(obj)\n              obj\n            case RubyConstant.WORKER_DONE =>\n              val numAccumulatorUpdates = stream.readInt()\n              (1 to numAccumulatorUpdates).foreach { _ =>\n                val updateLen = stream.readInt()\n                val update = new Array[Byte](updateLen)\n                stream.readFully(update)\n                accumulator += Collections.singletonList(update)\n              }\n              null\n            case RubyConstant.WORKER_ERROR =>\n              // Exception from worker\n\n              // message\n              val length = stream.readInt()\n              val obj = new Array[Byte](length)\n              stream.readFully(obj)\n\n              // stackTrace\n              val stackTraceLen = stream.readInt()\n              val stackTrace = new Array[String](stackTraceLen)\n              (0 until stackTraceLen).foreach { i =>\n                val length = stream.readInt()\n                val obj = new Array[Byte](length)\n                stream.readFully(obj)\n\n                stackTrace(i) = new String(obj, \"utf-8\")\n              }\n\n              // Worker will be killed\n              stream.close\n\n              // exception\n              val exception = new RubyException(new String(obj, \"utf-8\"), writerThread.exception.getOrElse(null))\n              exception.appendToStackTrace(stackTrace)\n\n              throw exception\n          }\n        } catch {\n\n          case e: Exception if context.isInterrupted =>\n            logDebug(\"Exception thrown after task interruption\", e)\n            throw new TaskKilledException\n\n          case e: Exception if writerThread.exception.isDefined =>\n            logError(\"Worker exited unexpectedly (crashed)\", e)\n            throw writerThread.exception.get\n\n          case eof: EOFException =>\n            throw new SparkException(\"Worker exited unexpectedly (crashed)\", eof)\n        }\n      }\n    } // end StreamReader\n\n    /* ---------------------------------------------------------------------------------------------\n     * Monitor thread for controll worker. 
Kill worker if task is interrupted.\n     */\n\n    class MonitorThread(workerId: Long, worker: Socket, context: TaskContext)\n      extends Thread(\"Worker Monitor for worker\") {\n\n      setDaemon(true)\n\n      override def run() {\n        // Kill the worker if it is interrupted, checking until task completion.\n        while (!context.isInterrupted && !context.isCompleted) {\n          Thread.sleep(2000)\n        }\n        if (!context.isCompleted) {\n          try {\n            logWarning(\"Incomplete task interrupted: Attempting to kill Worker \"+workerId.toString())\n            RubyWorker.kill(workerId)\n          } catch {\n            case e: Exception =>\n              logError(\"Exception when trying to kill worker \"+workerId.toString(), e)\n          }\n        }\n      }\n    } // end MonitorThread\n  } // end RubyRDD\n\n\n\n/* =================================================================================================\n * Class PairwiseRDD\n * =================================================================================================\n *\n * Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Ruby.\n * This is used by PySpark's shuffle operations.\n * Borrowed from Python Package -> need new deserializeLongValue ->\n *   Marshal will add the same 4b header\n */\n\nclass PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) {\n  override def getPartitions = prev.partitions\n  override def compute(split: Partition, context: TaskContext) =\n    prev.iterator(split, context).grouped(2).map {\n      case Seq(a, b) => (Utils.deserializeLongValue(a.reverse), b)\n      case x => throw new SparkException(\"PairwiseRDD: unexpected value: \" + x)\n    }\n  val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this)\n}\n\n\n\n/* =================================================================================================\n * Object RubyRDD\n * =================================================================================================\n */\n\nobject RubyRDD extends Logging {\n\n  def runJob(\n      sc: SparkContext,\n      rdd: JavaRDD[Array[Byte]],\n      partitions: ArrayList[Int],\n      allowLocal: Boolean,\n      filename: String): String = {\n    type ByteArray = Array[Byte]\n    type UnrolledPartition = Array[ByteArray]\n    val allPartitions: Array[UnrolledPartition] =\n      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)\n    val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)\n    writeRDDToFile(flattenedPartition.iterator, filename)\n  }\n\n  def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {\n    val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))\n    val objs = new collection.mutable.ArrayBuffer[Array[Byte]]\n    try {\n      while (true) {\n        val length = file.readInt()\n        val obj = new Array[Byte](length)\n        file.readFully(obj)\n        objs.append(obj)\n      }\n    } catch {\n      case eof: EOFException => {}\n    }\n    JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))\n  }\n\n  def writeRDDToFile[T](items: Iterator[T], filename: String): String = {\n    val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))\n\n    try {\n      PythonRDD.writeIteratorToStream(items, file)\n    } finally {\n      file.close()\n    }\n\n    filename\n  }\n\n  def writeRDDToFile[T](rdd: RDD[T], filename: 
String): String = {\n    writeRDDToFile(rdd.collect.iterator, filename)\n  }\n\n  def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {\n    sc.broadcast(new RubyBroadcast(path, id))\n  }\n\n  /**\n   * Convert an RDD of serialized Ruby objects to an RDD of objects that is usable in Java.\n   */\n  def toJava(rbRDD: JavaRDD[Array[Byte]], batched: Boolean): JavaRDD[Any] = {\n    rbRDD.rdd.mapPartitions { iter =>\n      iter.flatMap { item =>\n        val obj = Marshal.load(item)\n        if(batched){\n          obj.asInstanceOf[Array[_]]\n        }\n        else{\n          // Not batched -> each item deserializes to exactly one object\n          Seq(obj)\n        }\n      }\n    }.toJavaRDD()\n  }\n\n  /**\n   * Convert an RDD of Java objects to an RDD of serialized Ruby objects that is usable by Ruby.\n   */\n  def toRuby(jRDD: JavaRDD[_]): JavaRDD[Array[Byte]] = {\n    jRDD.rdd.mapPartitions { iter => new IterableMarshaller(iter) }\n  }\n\n}\n\n\n\n/* =================================================================================================\n * Class RubyException\n * =================================================================================================\n */\n\nclass RubyException(msg: String, cause: Exception) extends RuntimeException(msg, cause) {\n  def appendToStackTrace(toAdded: Array[String]) {\n    val newStackTrace = getStackTrace.toBuffer\n\n    val regexpMatch = \"(.*):([0-9]+):in `([a-z]+)'\".r\n\n    for(item <- toAdded) {\n      item match {\n        case regexpMatch(fileName, lineNumber, methodName) =>\n          newStackTrace += new StackTraceElement(\"RubyWorker\", methodName, fileName, lineNumber.toInt)\n        case _ => null\n      }\n    }\n\n    setStackTrace(newStackTrace.toArray)\n  }\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubySerializer.scala",
    "content": "package org.apache.spark.api.ruby\n\nimport scala.collection.JavaConverters._\nimport scala.reflect.{ClassTag, classTag}\n\nimport org.apache.spark.api.java.JavaRDD\nimport org.apache.spark.api.ruby.marshal._\n\n\n/* =================================================================================================\n * object RubySerializer\n * =================================================================================================\n */\nobject RubySerializer { }\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubyTab.scala",
    "content": "package org.apache.spark.ui.ruby\n\nimport scala.collection.mutable.HashMap\n\nimport org.apache.spark.ui._\n\n// class RubyTab(parent: SparkUI, rbConfig: HashMap[String, String]) extends SparkUITab(parent, \"ruby\"){\n//   attachPage(new RubyPage(this, rbConfig.toArray))\n// }\n\nclass RubyTab {}\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubyUtils.scala",
    "content": "package org.apache.spark.api.ruby\n\nimport org.apache.spark.util._\nimport org.apache.spark.{SparkConf, Logging}\n\nobject RubyUtils extends Logging {\n\n  def loadPropertiesFile(conf: SparkConf, path: String): String = {\n    Utils.getPropertiesFromFile(path).foreach {\n      case (key, value) => conf.set(key, value)\n    }\n    path\n  }\n\n}\n"
  },
  {
    "path": "ext/spark/src/main/scala/RubyWorker.scala",
    "content": "package org.apache.spark.api.ruby\n\nimport java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}\nimport java.net.{InetAddress, ServerSocket, Socket, SocketException}\nimport java.nio.file.Paths\n\nimport scala.collection.mutable\nimport scala.collection.JavaConversions._\n\nimport org.apache.spark._\nimport org.apache.spark.api.python.PythonRDD\nimport org.apache.spark.util.Utils\nimport org.apache.spark.util.RedirectThread\n\n\n/* =================================================================================================\n * Object RubyWorker\n * =================================================================================================\n *\n * Create and store server for creating workers.\n */\n\nobject RubyWorker extends Logging {\n\n  val PROCESS_WAIT_TIMEOUT = 10000\n\n  private var serverSocket: ServerSocket = null\n  private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))\n  private var serverPort: Int = 0\n\n  private var master: ExecutedFileCommand = null\n  private var masterSocket: Socket = null\n  private var masterOutputStream: DataOutputStream = null\n  private var masterInputStream: DataInputStream = null\n\n  private var workers = new mutable.WeakHashMap[Socket, Long]()\n\n\n  /* ----------------------------------------------------------------------------------------------\n   * Create new worker but first check if exist SocketServer and master process.\n   * If not it will create them. Worker have 2 chance to create.\n   */\n\n  def create(env: SparkEnv): (Socket, Long) = {\n    synchronized {\n      // Create the server if it hasn't been started\n      createServer(env)\n\n      // Attempt to connect, restart and retry once if it fails\n      try {\n        createWorker\n      } catch {\n        case exc: SocketException =>\n          logWarning(\"Worker unexpectedly quit, attempting to restart\")\n          createWorker\n      }\n    }\n  }\n\n  /* ----------------------------------------------------------------------------------------------\n   * Create a worker throught master process. Return new socket and id.\n   * According spark.ruby.worker.type id will be:\n   *   process: PID\n   *   thread: thread object id\n   */\n\n  def createWorker: (Socket, Long) = {\n    synchronized {\n      masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)\n      var socket = serverSocket.accept()\n\n      var id = new DataInputStream(socket.getInputStream).readLong()\n      workers.put(socket, id)\n\n      (socket, id)\n    }\n  }\n\n  /* ----------------------------------------------------------------------------------------------\n   * Create SocketServer and bind it to the localhost. Max numbers of connection on queue\n   * is set to default. 
If server is created withou exception -> create master.\n   */\n\n  private def createServer(env: SparkEnv){\n    synchronized {\n      // Already running?\n      if(serverSocket != null && masterSocket != null) {\n        return\n      }\n\n      try {\n        // Start Socket Server for comunication\n        serverSocket = new ServerSocket(0, 0, serverHost)\n        serverPort = serverSocket.getLocalPort\n\n        // Create a master for worker creations\n        createMaster(env)\n      } catch {\n        case e: Exception =>\n          throw new SparkException(\"There was a problem with creating a server\", e)\n      }\n    }\n  }\n\n  /* ----------------------------------------------------------------------------------------------\n   * In this point SocketServer must be created. Master process create and kill workers.\n   * Creating workers from Java can be an expensive operation because new process can\n   * get copy of address space.\n   */\n\n  private def createMaster(env: SparkEnv){\n    synchronized {\n      val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER\n      val executorOptions = env.conf.get(\"spark.ruby.executor.options\", \"\")\n      val commandTemplate = env.conf.get(\"spark.ruby.executor.command\")\n      val workerType = env.conf.get(\"spark.ruby.worker.type\")\n\n      // Where is root of ruby-spark\n      var executorLocation = \"\"\n\n      if(isDriver){\n        // Use worker from current active gem location\n        executorLocation = env.conf.get(\"spark.ruby.driver_home\")\n      }\n      else{\n        // Use gem installed on the system\n        try {\n          val homeCommand = (new FileCommand(commandTemplate, \"ruby-spark home\", env, getEnvVars(env))).run\n          executorLocation = homeCommand.readLine\n        } catch {\n          case e: Exception =>\n            throw new SparkException(\"Ruby-spark gem is not installed.\", e)\n        }\n      }\n\n      // Master and worker are saved in GEM_ROOT/lib/spark/worker\n      executorLocation = Paths.get(executorLocation, \"lib\", \"spark\", \"worker\").toString\n\n      // Create master command\n      // -C: change worker dir before execution\n      val masterRb = s\"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort\"\n      val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))\n\n      // Start master\n      master = masterCommand.run\n\n      // Redirect master stdout and stderr\n      redirectStreamsToStderr(master.getInputStream, master.getErrorStream)\n\n      // Wait for it to connect to our socket\n      serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)\n      try {\n        // Use socket for comunication. 
Keep stdout and stdin for log\n        masterSocket = serverSocket.accept()\n        masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)\n        masterInputStream  = new DataInputStream(masterSocket.getInputStream)\n\n        PythonRDD.writeUTF(executorOptions, masterOutputStream)\n      } catch {\n        case e: Exception =>\n          throw new SparkException(\"Ruby master did not connect back in time\", e)\n      }\n    }\n  }\n\n  /* ----------------------------------------------------------------------------------------------\n   * Gel all environment variables for executor\n   */\n\n  def getEnvVars(env: SparkEnv): Map[String, String] = {\n    val prefix = \"spark.ruby.executor.env.\"\n    env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}\n                   .map{case (k, v) => (k.substring(prefix.length), v)}\n                   .toMap\n  }\n\n  /* ------------------------------------------------------------------------------------------- */\n\n  def kill(workerId: Long){\n    masterOutputStream.writeInt(RubyConstant.KILL_WORKER)\n    masterOutputStream.writeLong(workerId)\n  }\n\n  /* ------------------------------------------------------------------------------------------- */\n\n  def killAndWait(workerId: Long){\n    masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)\n    masterOutputStream.writeLong(workerId)\n\n    // Wait for answer\n    masterInputStream.readInt() match {\n      case RubyConstant.SUCCESSFULLY_KILLED =>\n        logInfo(s\"Worker $workerId was successfully killed\")\n      case RubyConstant.UNSUCCESSFUL_KILLING =>\n        logInfo(s\"Worker $workerId cannot be killed (maybe is already killed)\")\n    }\n  }\n\n  /* ----------------------------------------------------------------------------------------------\n   * workers HashMap is week but it avoid long list of workers which cannot be killed (killAndWait)\n   */\n\n  def remove(worker: Socket, workerId: Long){\n    try {\n      workers.remove(worker)\n    } catch {\n      case e: Exception => logWarning(s\"Worker $workerId does not exist (maybe is already removed)\")\n    }\n  }\n\n  /* ------------------------------------------------------------------------------------------- */\n\n  def stopServer{\n    synchronized {\n      // Kill workers\n      workers.foreach { case (socket, id) => killAndWait(id) }\n\n      // Kill master\n      master.destroy\n\n      // Stop SocketServer\n      serverSocket.close()\n\n      // Clean variables\n      serverSocket = null\n      serverPort = 0\n      master = null\n      masterSocket = null\n      masterOutputStream = null\n      masterInputStream = null\n    }\n  }\n\n  /* ------------------------------------------------------------------------------------------- */\n\n  private def redirectStreamsToStderr(streams: InputStream*) {\n    try {\n      for(stream <- streams) {\n        new RedirectThread(stream, System.err, \"stream reader\").start()\n      }\n    } catch {\n      case e: Exception =>\n        logError(\"Exception in redirecting streams\", e)\n    }\n  }\n\n  /* ------------------------------------------------------------------------------------------- */\n}\n"
  },
  {
    "path": "ext/spark/src/test/scala/MarshalSpec.scala",
    "content": "package org.apache.spark.api.ruby.marshal\n\nimport org.scalatest._\n\n\nimport org.apache.spark.api.ruby.marshal._\n\nclass MarshalSpec extends FunSpec with Matchers {\n\n  // ====================================================================================\n  // Load\n\n  describe(\"Marshal.load\"){\n    describe(\"single value\"){\n      it(\"int\"){\n        val data = 1\n        val serialized = Array[Byte](4, 8, 105, 6)\n\n        Marshal.load(serialized) should equal(data)\n      }\n\n      it(\"double\"){\n        val data = 1.2\n        val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)\n\n        Marshal.load(serialized) should equal(data)\n      }\n    }\n\n    describe(\"array\"){\n      it(\"ints\"){\n        val data = Array(1, 2, 3, 4, 5)\n        val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)\n\n        Marshal.load(serialized) should equal(data)\n      }\n\n      it(\"doubles\"){\n        val data = Array(1.1, 2.2, 3.3)\n        val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)\n\n        Marshal.load(serialized) should equal(data)\n      }\n    }\n  }\n\n  // ====================================================================================\n  // Dump\n\n  describe(\"Marshal.dump\"){\n    describe(\"single value\"){\n      it(\"int\"){\n        val data = 1\n        val serialized = Array(4, 8, 105, 6)\n\n        Marshal.dump(data) should equal(serialized)\n      }\n\n      it(\"double\"){\n        val data = 1.2\n        val serialized = Array(4, 8, 102, 8, 49, 46, 50)\n\n        Marshal.dump(data) should equal(serialized)\n      }\n    }\n\n    describe(\"array\"){\n      it(\"ints\"){\n        val data = Array(1, 2, 3, 4, 5)\n        val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)\n\n        Marshal.dump(data) should equal(serialized)\n      }\n\n      it(\"doubles\"){\n        val data = Array(1.1, 2.2, 3.3)\n        val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)\n\n        Marshal.dump(data) should equal(serialized)\n      }\n    }\n  }\n\n}\n"
  },
  {
    "path": "lib/ruby-spark.rb",
    "content": "require_relative 'spark'\n"
  },
  {
    "path": "lib/spark/accumulator.rb",
    "content": "module Spark\n  ##\n  # A shared variable that can be accumulated, i.e., has a commutative and associative \"add\"\n  # operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=`\n  # operator, but only the driver program is allowed to access its value, using value.\n  # Updates from the workers get propagated automatically to the driver program.\n  #\n  # == Arguments:\n  # value::\n  #   Initial value for accumulator. This values is stored only on driver process\n  #\n  # accum_param::\n  #   How merge 2 value on worker or driver process.\n  #   Symbol or Proc (or String)\n  #\n  # zero_value::\n  #   Initial value for worker process\n  #\n  #\n  # == Examples:\n  #\n  #   accum1 = $sc.accumulator(1)\n  #   accum2 = $sc.accumulator(2, :*, 1)\n  #   accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max})\n  #\n  #   accum1 += 1\n  #\n  #   accum2.add(2)\n  #   accum2.add(2)\n  #   accum2.add(2)\n  #\n  #   accum3.add(9)\n  #   accum3.add(6)\n  #   accum3.add(7)\n  #\n  #   accum1.value # => 2\n  #   accum2.value # => 16\n  #   accum3.value # => 9\n  #\n  #   func = Proc.new do |_, index|\n  #     accum1.add(1)\n  #     accum2.add(2)\n  #     accum3.add(index * 10)\n  #   end\n  #\n  #   rdd = $sc.parallelize(0..4, 4)\n  #   rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)\n  #   rdd = rdd.map_partitions_with_index(func)\n  #   rdd.collect\n  #\n  #   accum1.value # => 6\n  #   accum2.value # => 256\n  #   accum3.value # => 30\n  #\n  class Accumulator\n\n    attr_reader :id, :value, :accum_param, :zero_value\n\n    @@instances = {}\n    @@changed = []\n\n    SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**]\n\n\n    # =========================================================================\n    # Creating and selecting Spark::Accumulator\n\n    def initialize(value, accum_param=:+, zero_value=0)\n      @id = object_id\n      @value = value\n      @accum_param = accum_param\n      @zero_value = zero_value\n      @driver = true\n\n      valid_accum_param\n\n      @@instances[@id] = self\n    end\n\n    def inspect\n      result  = %{#<#{self.class.name}:0x#{object_id}\\n}\n      result << %{   ID: #{@id}\\n}\n      result << %{ Zero: #{@zero_value.to_s[0, 10]}\\n}\n      result << %{Value: #{@value.to_s[0, 10]}>}\n      result\n    end\n\n    def self.changed\n      @@changed\n    end\n\n    def self.instances\n      @@instances\n    end\n\n    def valid_accum_param\n      if @accum_param.is_a?(Symbol)\n        raise Spark::AccumulatorError, \"Unsupported symbol #{@accum_param}\" unless SUPPORTED_SYMBOLS.include?(@accum_param)\n        @serialized_accum_param = @accum_param\n        return\n      end\n\n      if @accum_param.is_a?(Proc)\n        begin\n          @serialized_accum_param = @accum_param.to_source\n          return\n        rescue\n          raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.'\n        end\n      end\n\n      if @accum_param.is_a?(String)\n        @serialized_accum_param = @accum_param\n        @accum_param = eval(@accum_param)\n\n        unless @accum_param.is_a?(Proc)\n          raise Spark::SerializeError, 'Yours param is not a Proc.'\n        end\n\n        return\n      end\n\n      raise Spark::AccumulatorError, 'Unsupported param. 
Use Symbol, Proc or String.'\n    end\n\n    # Driver process or worker\n    def driver?\n      @driver\n    end\n\n\n    # =========================================================================\n    # Operations\n\n    def add(term)\n      if !driver? && !@@changed.include?(self)\n        @@changed << self\n      end\n\n      if @accum_param.is_a?(Proc)\n        @value = @accum_param.call(@value, term)\n      else\n        add_by_symbol(term)\n      end\n    end\n\n    def +(term)\n      add(term)\n      self\n    end\n\n    def add_by_symbol(term)\n      case @accum_param\n      when :+\n        @value += term\n      when :-\n        @value -= term\n      when :*\n        @value *= term\n      when :/\n        @value /= term\n      when :**\n        @value **= term\n      end\n    end\n\n\n    # =========================================================================\n    # Dump and load\n\n    def marshal_dump\n      [@id, @zero_value, @serialized_accum_param]\n    end\n\n    def marshal_load(array)\n      @id, @zero_value, @serialized_accum_param = array\n\n      @value = @zero_value\n      @driver = false\n      load_accum_param\n    end\n\n    def load_accum_param\n      if @serialized_accum_param.is_a?(String)\n        @accum_param = eval(@serialized_accum_param)\n      else\n        @accum_param = @serialized_accum_param\n      end\n    end\n\n  end\nend\n\n# =============================================================================\n# Server for handeling Accumulator update\n#\nmodule Spark\n  class Accumulator\n    class Server\n\n      attr_reader :server, :host, :port\n\n      def self.start\n        @instance ||= Spark::Accumulator::Server.new\n      end\n\n      def self.stop\n        @instance && @instance.stop\n      end\n\n      def self.host\n        start\n        @instance.host\n      end\n\n      def self.port\n        start\n        @instance.port\n      end\n\n      def initialize\n        @server = TCPServer.new(0)\n        @host = @server.hostname\n        @port = @server.port\n\n        @threads = []\n        handle_accept\n      end\n\n      def stop\n        @threads.each(&:kill)\n      rescue\n        nil\n      end\n\n      def handle_accept\n        @threads << Thread.new do\n          loop {\n            handle_connection(@server.accept)\n          }\n        end\n\n      end\n\n      def handle_connection(socket)\n        @threads << Thread.new do\n          until socket.closed?\n            count = socket.read_int\n            count.times do\n              data = socket.read_data\n              accum = Spark::Accumulator.instances[data[0]]\n              if accum\n                accum.add(data[1])\n              else\n                Spark.logger.warn(\"Accumulator with id #{data[0]} does not exist.\")\n              end\n            end\n\n            # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock\n            # socket.write_int(Spark::Constant::ACCUMULATOR_ACK)\n          end\n\n        end\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/broadcast.rb",
    "content": "module Spark\n  ##\n  # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast\n  # object for reading it in distributed functions. The variable will\n  # be sent to each cluster only once.\n  #\n  # == Example:\n  #\n  #   broadcast1 = $sc.broadcast('a')\n  #   broadcast2 = $sc.broadcast('b')\n  #   broadcast3 = $sc.broadcast([1,2,3])\n  #\n  #   func = Proc.new do |part, index|\n  #     [\n  #       broadcast1.value * index,\n  #       broadcast2.value * index,\n  #       broadcast3.value.reduce(:+)\n  #     ]\n  #   end\n  #\n  #   rdd = $sc.parallelize(0..5, 4)\n  #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3)\n  #   rdd = rdd.map_partitions_with_index(func)\n  #   rdd.collect\n  #   # => [\"\", \"\", 6, \"a\", \"b\", 6, \"aa\", \"bb\", 6, \"aaa\", \"bbb\", 6]\n  #\n  class Broadcast\n\n    LOADED       = 0 # id, value, path\n    NOT_LOADED   = 1 # id, path\n    WITHOUT_PATH = 2 # id\n\n    attr_reader :id, :state, :path, :jbroadcast\n\n    @@registered = {}\n\n    # =========================================================================\n    # Creating broadcast for SparkContext\n\n    # Create new Broadcast and dump value to the disk\n    #\n    #   b = $sc.broadcast('a')\n    #\n    #   b.value # => 'a'\n    #   b.path\n    #   b.jbroadcast\n    #\n    def initialize(sc, value)\n      @id = object_id\n      @value = value\n      @state = LOADED\n\n      file = Tempfile.create('broadcast', sc.temp_dir)\n      file.binmode\n      file.write(Marshal.dump(value))\n      file.close\n\n      @path = file.path\n      @jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id))\n\n      ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })\n    end\n\n    def inspect\n      result  = %{#<#{self.class.name}:0x#{object_id}\\n}\n      result << %{   ID: #{@id}\\n}\n      result << %{Value: #{@value.to_s[0, 10]}>}\n      result\n    end\n\n    def self.register(id, path)\n      @@registered[id] = path\n    end\n\n    def value\n      case state\n      when LOADED\n        @value\n      when NOT_LOADED\n        @value = Marshal.load(File.read(@path))\n        @state = LOADED\n        @value\n      when WITHOUT_PATH\n        @path = @@registered[id]\n\n        if @path\n          @state = NOT_LOADED\n          value\n        else\n          raise Spark::BroadcastError, \"Broadcast #{@id} do not have registered path.\"\n        end\n      end\n    end\n\n    def marshal_dump\n      @id\n    end\n\n    def marshal_load(id)\n      @id = id\n      @state = WITHOUT_PATH\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/build.rb",
    "content": "module Spark\n  module Build\n\n    DEFAULT_SCALA_VERSION  = '2.10.4'\n    DEFAULT_CORE_VERSION   = '2.10'\n    DEFAULT_SPARK_VERSION  = '1.6.0'\n    DEFAULT_HADOOP_VERSION = '1.0.4'\n\n    SBT       = 'sbt/sbt'\n    SBT_DEPS  = 'assemblyPackageDependency'\n    SBT_EXT   = 'package'\n    SBT_CLEAN = 'clean'\n\n    def self.build(options={})\n      scala_version      = options[:scala_version]      || DEFAULT_SCALA_VERSION\n      spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION\n      spark_version      = options[:spark_version]      || DEFAULT_SPARK_VERSION\n      hadoop_version     = options[:hadoop_version]     || DEFAULT_HADOOP_VERSION\n      target             = options[:target]             || Spark.target_dir\n      only_ext           = options[:only_ext]           || false\n\n      env = {\n        'SCALA_VERSION' => scala_version,\n        'SPARK_VERSION' => spark_version,\n        'SPARK_CORE_VERSION' => spark_core_version,\n        'HADOOP_VERSION' => hadoop_version,\n        'TARGET_DIR' => target\n      }\n\n      cmd = [SBT]\n      cmd << SBT_EXT\n      cmd << SBT_DEPS unless only_ext\n      cmd << SBT_CLEAN unless $DEBUG\n\n      Dir.chdir(Spark.spark_ext_dir) do\n        unless Kernel.system(env, cmd.join(' '))\n          raise Spark::BuildError, 'Spark cannot be assembled.'\n        end\n      end\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/cli.rb",
    "content": "require 'commander'\n\nmodule Commander\n  module UI\n    # Disable paging\n    # for 'classic' help\n    def self.enable_paging\n    end\n  end\nend\n\nmodule Spark\n  class CLI\n    include Commander::Methods\n\n    # IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')\n    # IRB_HISTORY_SIZE = 100\n\n    def run\n      program :name, 'RubySpark'\n      program :version, Spark::VERSION\n      program :description, 'Ruby wrapper for Spark'\n\n      global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }\n      default_command :help\n\n\n      # Build ---------------------------------------------------------------\n      command :build do |c|\n        c.syntax = 'build [options]'\n        c.description = 'Build spark and gem extensions'\n        c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark'\n        c.option '--spark-core-version STRING', String, 'Version of Spark core'\n        c.option '--spark-version STRING', String, 'Version of Spark'\n        c.option '--scala-version STRING', String, 'Version of Scala'\n        c.option '--target STRING', String, 'Directory where Spark will be stored'\n        c.option '--only-ext', 'Build only extension for RubySpark'\n\n        c.action do |args, options|\n          Spark::Build.build(options.__hash__)\n          puts\n          puts 'Everything is OK'\n        end\n      end\n      alias_command :install, :build\n\n\n      # Shell -----------------------------------------------------------------\n      command :shell do |c|\n        c.syntax = 'shell [options]'\n        c.description = 'Start ruby shell for spark'\n        c.option '--target STRING', String, 'Directory where Spark is stored'\n        c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'\n        c.option '--[no-]start', 'Start Spark immediately'\n        c.option '--[no-]logger', 'Enable/disable logger (default: enable)'\n        c.option '--auto-reload', 'Autoreload changed files'\n\n        c.action do |args, options|\n          options.default start: true, logger: true\n\n          Spark.load_lib(options.target)\n          Spark.logger.disable unless options.logger\n\n          Spark.config do\n            set_app_name 'RubySpark'\n          end\n\n          Spark.config.from_file(options.properties_file)\n\n          if options.auto_reload\n            require 'listen'\n            listener = Listen.to(File.join(Spark.root, 'lib')) do |modified, added, removed|\n              (modified+added).each do |file|\n                silence_warnings { load(file) }\n              end\n            end\n            listener.start\n          end\n\n          if options.start\n            # Load Java and Spark\n            Spark.start\n            $sc = Spark.context\n\n            Spark.print_logo('Spark context is loaded as $sc')\n          else\n            Spark.print_logo('You can start Spark with Spark.start')\n          end\n\n          # Load Pry\n          require 'pry'\n          Pry.start\n        end\n      end\n\n\n      # # IRB -------------------------------------------------------------------\n      # command :irb do |c|\n      #   c.syntax = 'irb [options]'\n      #   c.description = 'Start ruby shell for spark'\n      #   c.option '--spark-home STRING', String, 'Directory where Spark is stored'\n      #   c.option '--[no-]start', 'Start Spark immediately'\n      #   c.option '--[no-]logger', 'Enable/disable logger (default: 
enable)'\n      #\n      #   c.action do |args, options|\n      #     options.default start: true, logger: true\n      #\n      #     Spark.load_lib(options.spark_home)\n      #     Spark::Logger.disable unless options.logger\n      #\n      #     Spark.config do\n      #       set_app_name 'Pry RubySpark'\n      #     end\n      #\n      #     if options.start\n      #       # Load Java and Spark\n      #       Spark.start\n      #       $sc = Spark.context\n      #\n      #       Spark.print_logo('Spark context is loaded as $sc')\n      #     else\n      #       Spark.print_logo('You can start Spark with Spark.start')\n      #     end\n      #\n      #     # Load IRB\n      #     require 'irb'\n      #     require 'irb/completion'\n      #     require 'irb/ext/save-history'\n      #\n      #     begin\n      #       file = File.expand_path(IRB_HISTORY_FILE)\n      #       if File.exists?(file)\n      #         lines = IO.readlines(file).collect { |line| line.chomp }\n      #         Readline::HISTORY.push(*lines)\n      #       end\n      #       Kernel.at_exit do\n      #         lines = Readline::HISTORY.to_a.reverse.uniq.reverse\n      #         lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE\n      #         File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join(\"\\n\") }\n      #       end\n      #     rescue\n      #     end\n      #\n      #     ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it\n      #     ARGV.concat ['--readline', '--prompt-mode', 'simple']\n      #     IRB.start\n      #   end\n      # end\n\n\n      # Home ------------------------------------------------------------------\n      command :home do |c|\n        c.action do |args, options|\n          puts Spark.home\n          exit(0)\n        end\n      end\n\n\n      # Ruby spark jar --------------------------------------------------------\n      command :ruby_spark_jar do |c|\n        c.action do |args, options|\n          puts Spark.ruby_spark_jar\n          exit(0)\n        end\n      end\n\n      run!\n    end\n\n  end\nend\n"
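\n# == Usage (illustrative, based on the commands defined above):\n#\n#   $ ruby-spark build --spark-version 1.6.0\n#   $ ruby-spark shell --no-logger\n#   $ ruby-spark home\n"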
  },
  {
    "path": "lib/spark/command/base.rb",
    "content": "##\n# Spark::Command::Base\n#\n# Parent for all commands (Map, FlatMap, Sort, ...)\n#\nclass Spark::Command::Base\n\n  DEFAULT_VARIABLE_OPTIONS = {\n    type: Hash,\n    function: true\n  }\n\n  def initialize(*args)\n    settings.variables.each do |name, options|\n      instance_variable_set(\"@#{name}\", args.shift)\n    end\n  end\n\n  def to_s\n    self.class.name.split('::').last\n  end\n\n  def self.error(message)\n    raise Spark::CommandError, message\n  end\n\n  def error(message)\n    self.class.error(message)\n  end\n\n  def log(message=nil)\n    $stdout.puts %{==> #{Time.now.strftime(\"%H:%M:%S\")} [#{self.class.name}] #{message}}\n    $stdout.flush\n  end\n\n\n  # ===============================================================================================\n  # Methods called during class loading\n  # This is not nicer way but these methods set/get classes variables for child\n\n  # Settings for command (variables)\n  def self.settings\n    init_settings\n    class_variable_get(:@@settings)\n  end\n\n  def settings\n    self.class.settings\n  end\n\n  # Init empty settings\n  def self.init_settings\n    if !class_variable_defined?(:@@settings)\n      struct = Struct.new(:variables)\n\n      class_variable_set(:@@settings, struct.new)\n      settings.variables = {}\n    end\n  end\n\n  # New variable for command\n  #\n  # == Example:\n  #\n  #   class Map < Spark::Command::Base\n  #     variable :map_function\n  #   end\n  #\n  #   command = Map.new(1)\n  #\n  #   command.instance_variables\n  #   # => [:@map_function]\n  #   command.instance_variable_get(:@map_function)\n  #   # => 1\n  #\n  def self.variable(name, options={})\n    if settings.variables.has_key?(name)\n      error \"Function #{name} already exist.\"\n    end\n\n    settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options)\n  end\n\n\n  # ===============================================================================================\n  # Executing methods\n\n  # Execute command for data and split index\n  def execute(iterator, split_index)\n    # Implemented on Base but can be override\n    before_run\n\n    # Run has to be implemented on child\n    if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run)\n      return lazy_run(iterator, split_index)\n    end\n\n    iterator = iterator.to_a\n    run(iterator, split_index)\n  end\n\n  def prepared?\n    !!@prepared\n  end\n\n  # This is called before execution. Executing will be stopped if\n  # some command contains error (e.g. 
badly serialized lambda).\n  #\n  # == What is doing?\n  # * evaluate lambda\n  # * evaluate method\n  # * make new lambda\n  #\n  def prepare\n    return if prepared?\n\n    to_function = settings.variables.select {|_, options| options[:function]}\n    to_function.each do |name, options|\n      name = \"@#{name}\"\n      data = instance_variable_get(name)\n\n      case data[:type]\n      when 'proc'\n        result = eval(data[:content])\n      when 'symbol'\n        result = lambda(&data[:content])\n      when 'method'\n        # Method must me added to instance not Class\n        instance_eval(data[:content])\n        # Method will be available as Proc\n        result = lambda(&method(data[:name]))\n      end\n\n      instance_variable_set(name, result)\n    end\n\n    @prepared = true\n  end\n\n  # This method is called before every execution.\n  def before_run\n  end\n\n\n  # ===============================================================================================\n  # Bound objects\n\n  attr_accessor :__objects__\n\n  def method_missing(method, *args, &block)\n    if __objects__ && __objects__.has_key?(method)\n      return __objects__[method]\n    end\n\n    super\n  end\n\nend\n"
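\n# == Example (illustrative sketch; prepare is normally invoked by Spark::Command#execute):\n#\n#   # A function travels to the worker as a hash; prepare rebuilds a callable:\n#   command = Spark::Command::Map.new(type: 'proc', content: 'lambda{|x| x * 2}')\n#   command.prepare\n#   command.execute([1, 2, 3], 0)\n#   # => [2, 4, 6]\n"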
  },
  {
    "path": "lib/spark/command/basic.rb",
    "content": "_Base = Spark::Command::Base\n\n# -------------------------------------------------------------------------------------------------\n# Map\n\nclass Spark::Command::Map < _Base\n  variable :map_function\n\n  def run(iterator, *)\n    iterator.map! do |item|\n      @map_function.call(item)\n    end\n    iterator\n  end\n\n  def lazy_run(iterator, *)\n    iterator.map do |item|\n      @map_function.call(item)\n    end\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# FlatMap\n\nclass Spark::Command::FlatMap < Spark::Command::Map\n  def run(iterator, *)\n    iterator = super\n    iterator.flatten!(1)\n    iterator\n  end\n\n  def lazy_run(iterator, *)\n    iterator.flat_map do |item|\n      @map_function.call(item)\n    end\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# MapPartitionsWithIndex\n\nclass Spark::Command::MapPartitionsWithIndex < _Base\n  variable :partition_function\n\n  def run(iterator, index)\n    iterator = @partition_function.call(iterator, index)\n    iterator\n  end\n\n  # User should controll if there is Enumerator or not\n  # alias_method :lazy_run, :run\nend\n\n# -------------------------------------------------------------------------------------------------\n# MapPartitions\n\nclass Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex\n  def run(iterator, *)\n    # Do not use `super` because `@partition_function` can be method with 1 argument\n    iterator = @partition_function.call(iterator)\n    iterator\n  end\n  # alias_method :lazy_run, :run\nend\n\n# -------------------------------------------------------------------------------------------------\n# Filter\n\nclass Spark::Command::Filter < _Base\n  variable :filter_function\n\n  def run(iterator, *)\n    iterator.select! do |item|\n      @filter_function.call(item)\n    end\n    iterator\n  end\n\n  def lazy_run(iterator, *)\n    iterator.select do |item|\n      @filter_function.call(item)\n    end\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Compact\n\nclass Spark::Command::Compact < _Base\n  def run(iterator, *)\n    iterator.compact!\n    iterator\n  end\n\n  def lazy_run(iterator, *)\n    iterator.select do |item|\n      !item.nil?\n    end\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Glom\n\nclass Spark::Command::Glom < _Base\n  def run(iterator, *)\n    [iterator]\n  end\n\n  def lazy_run(iterator, *)\n    run(iterator.to_a)\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Shuffle\n\nclass Spark::Command::Shuffle < _Base\n  variable :seed, function: false, type: Integer\n\n  def run(iterator, *)\n    iterator.shuffle!(random: rng)\n    iterator\n  end\n\n  def rng\n    Random.new(@seed)\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# PartitionBy\n\nclass Spark::Command::PartitionBy\n\n  class Base < Spark::Command::Base\n    include Spark::Helper::Serialize\n\n    def prepare\n      super\n\n      # Default. Keep it after super because Sorting has own key_function.\n      @key_function ||= lambda{|x| x[0]}\n    end\n\n    def run(iterator, *)\n      iterator.map! 
do |item|\n        make_partition_item(item)\n      end\n      iterator.flatten!(1)\n      iterator\n    end\n\n    def lazy_run(iterator, *)\n      iterator.flat_map do |item|\n        make_partition_item(item)\n      end\n    end\n\n    private\n\n      def make_partition_item(item)\n        [\n          pack_long(@partition_func.call(@key_function[item])),\n          item\n        ]\n      end\n  end\n\n  class Basic < Base\n    variable :partition_func\n  end\n\n  class Sorting < Base\n    variable :key_function\n    variable :bounds, function: false, type: Array\n    variable :ascending, function: false, type: [TrueClass, FalseClass]\n    variable :num_partitions, function: false, type: Numeric\n\n    def prepare\n      super\n\n      # Index by bisect alghoritm\n      @partition_func ||= Proc.new do |key|\n        count = 0\n        @bounds.each{|i|\n          break if i >= key\n          count += 1\n        }\n\n        if @ascending\n          count\n        else\n          @num_partitions - 1 - count\n        end\n      end\n    end\n\n  end # Sorting\nend # PartitionBy\n\n# -------------------------------------------------------------------------------------------------\n# Aggregate\n\nclass Spark::Command::Aggregate < _Base\n  variable :reduce_func\n  variable :zero_value, function: false, type: Object\n\n  def run(iterator, *)\n    [iterator.reduce(@zero_value, &@reduce_func)]\n  end\n\n  def lazy_run(iterator, *)\n    run(iterator)\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Reduce\n\nclass Spark::Command::Reduce < Spark::Command::Aggregate\n  def run(iterator, *)\n    [iterator.reduce(&@reduce_func)]\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Foreach\n\nclass Spark::Command::Foreach < _Base\n  variable :each_function\n\n  def run(iterator, *)\n    iterator.each do |item|\n      @each_function.call(item)\n    end\n    nil\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# ForeachPartition\n\nclass Spark::Command::ForeachPartition < _Base\n  variable :partition_function\n\n  def run(iterator, *)\n    @partition_function.call(iterator)\n    nil\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# KeyBy\n\nclass Spark::Command::KeyBy < _Base\n  variable :key_function\n\n  def run(iterator, *)\n    iterator.map! 
do |item|\n      [@key_function.call(item), item]\n    end\n    iterator\n  end\n\n  def lazy_run(iterator, *)\n    iterator.map do |item|\n      [@key_function.call(item), item]\n    end\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Take\n\nclass Spark::Command::Take < _Base\n  variable :total,     function: false, type: Numeric\n  variable :last_part, function: false, type: Numeric\n\n  def run(iterator, index)\n    if index == @last_part && iterator.size > @total\n      return iterator.slice!(0, @total)\n    end\n\n    iterator\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Pipe\n\nclass Spark::Command::Pipe < _Base\n  variable :cmds, function: false, type: Array\n\n  def before_run\n    require 'open3'\n\n    @in, @out, @threads = Open3.pipeline_rw(*@cmds)\n  end\n\n  def run(iterator, *)\n    create_writing_thread(iterator)\n\n    new_iterator = []\n\n    # Read full input\n    begin\n      loop {\n        new_iterator << @out.readline.rstrip\n      }\n    rescue EOFError\n    end\n\n    new_iterator\n  end\n\n  def lazy_run(iterator, *)\n    create_writing_thread(iterator)\n\n    Enumerator::Lazy.new([nil]) do |yielder, _|\n      begin\n        loop {\n          yielder << @out.readline.rstrip\n        }\n      rescue EOFError\n      end\n    end\n  end\n\n  private\n\n    def create_writing_thread(iterator)\n      @writing_thread = Thread.new do\n        # Send complete iterator to the pipe\n        iterator.each do |item|\n          @in.puts(item.to_s.rstrip)\n        end\n\n        # Input must be closed for EOFError\n        @in.close\n      end\n    end\n\nend\n"
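\n# == Example (illustrative sketch):\n#\n#   map = Spark::Command::Map.new(type: 'proc', content: 'lambda{|x| x + 1}')\n#   map.prepare\n#\n#   map.execute([1, 2, 3], 0)            # eager path: run\n#   # => [2, 3, 4]\n#   map.execute([1, 2, 3].lazy, 0).to_a  # lazy path: lazy_run\n#   # => [2, 3, 4]\n"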
  },
  {
    "path": "lib/spark/command/pair.rb",
    "content": "_Base = Spark::Command::Base\n\n# -------------------------------------------------------------------------------------------------\n# CombineByKey\n\nclass Spark::Command::CombineByKey\n\n  # ---------------\n\n  class Base < Spark::Command::Base\n    def run(iterator, *)\n      _run(iterator).to_a\n    end\n\n    def lazy_run(iterator, *)\n      _run(iterator).lazy\n    end\n  end\n\n  # ---------------\n\n  class Combine < Base\n    variable :create_combiner\n    variable :merge_value\n\n    def _run(iterator)\n      # Not use combiners[key] ||= ..\n      # it tests nil and not has_key?\n      combiners = {}\n      iterator.each do |key, value|\n        if combiners.has_key?(key)\n          combiners[key] = @merge_value.call(combiners[key], value)\n        else\n          combiners[key] = @create_combiner.call(value)\n        end\n      end\n      combiners\n    end\n  end\n\n  # ---------------\n\n  class Merge < Base\n    variable :merge_combiners\n\n    def _run(iterator, *)\n      combiners = {}\n      iterator.each do |key, value|\n        if combiners.has_key?(key)\n          combiners[key] = @merge_combiners.call(combiners[key], value)\n        else\n          combiners[key] = value\n        end\n      end\n      combiners\n    end\n  end\n\n  # ---------------\n\n  class CombineWithZero < Base\n    variable :zero_value, function: false, type: Object\n    variable :merge_value\n\n    def _run(iterator)\n      # Not use combiners[key] ||= ..\n      # it tests nil and not has_key?\n      combiners = {}\n      iterator.each do |key, value|\n        unless combiners.has_key?(key)\n          combiners[key] = @zero_value\n        end\n\n        combiners[key] = @merge_value.call(combiners[key], value)\n      end\n      combiners\n    end\n  end\n\n\n  # ---------------\n\nend\n\n# -------------------------------------------------------------------------------------------------\n# MapValues\n\nclass Spark::Command::MapValues < _Base\n  variable :map_function\n\n  def run(iterator, *)\n    iterator.map! do |item|\n      item[1] = @map_function.call(item[1])\n      item\n    end\n    iterator\n  end\n\n  def lazy_run(iterator, *)\n    iterator.map do |item|\n      item[1] = @map_function.call(item[1])\n      item\n    end\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# FlatMapValues\n\nclass Spark::Command::FlatMapValues < _Base\n  variable :map_function\n\n  def run(iterator, *)\n    iterator.map! do |(key, values)|\n      values = @map_function.call(values)\n      values.flatten!(1)\n      values.map! do |value|\n        [key, value]\n      end\n    end\n    iterator.flatten!(1)\n    iterator\n  end\nend\n"
  },
  {
    "path": "lib/spark/command/sort.rb",
    "content": "_Base = Spark::Command::Base\n\n# -------------------------------------------------------------------------------------------------\n# Sort\n\nclass Spark::Command::SortByKey < _Base\n  variable :key_function\n  variable :ascending,  function: false, type: [TrueClass, FalseClass]\n  variable :spilling,   function: false, type: [TrueClass, FalseClass]\n  variable :memory,     function: false, type: [Numeric, NilClass]\n  variable :serializer, function: false, type: Spark::Serializer::Base\n\n  # Currently disabled\n  def before_run\n    @spilling = false\n  end\n\n  def run(iterator, _)\n    if @spilling\n      iterator = run_with_spilling(iterator.each)\n    else\n      run_without_spilling(iterator)\n    end\n\n    iterator\n  end\n\n  def run_with_enum(iterator, _)\n    if @spilling\n      iterator = run_with_spilling(iterator)\n    else\n      iterator = iterator.to_a\n      run_without_spilling(iterator)\n    end\n\n    iterator\n  end\n\n  private\n\n    def run_with_spilling(iterator)\n      sorter = Spark::ExternalSorter.new(@memory, @serializer)\n      sorter.sort_by(iterator, @ascending, @key_function)\n    end\n\n    def run_without_spilling(iterator)\n      iterator.sort_by!(&@key_function)\n      iterator.reverse! unless @ascending\n    end\n\nend\n"
  },
  {
    "path": "lib/spark/command/statistic.rb",
    "content": "_Base = Spark::Command::Base\n\n# -------------------------------------------------------------------------------------------------\n# Sample\n\nclass Spark::Command::Sample < _Base\n  variable :with_replacement, function: false, type: [TrueClass, FalseClass]\n  variable :fraction,         function: false, type: Numeric\n  variable :seed,             function: false, type: [NilClass, Numeric]\n\n  def run(iterator, _)\n    sampler.sample(iterator)\n  end\n\n  def lazy_run(iterator, _)\n    sampler.lazy_sample(iterator)\n  end\n\n  def sampler\n    @sampler ||= _sampler\n  end\n\n  def _sampler\n    if @with_replacement\n      sampler = Spark::Sampler::Poisson\n    else\n      sampler = Spark::Sampler::Uniform\n    end\n\n    sampler = sampler.new(@fraction, @seed)\n  end\nend\n\n# -------------------------------------------------------------------------------------------------\n# Stats\n\nclass Spark::Command::Stats < _Base\n\n  def run(iterator, *)\n    [Spark::StatCounter.new(iterator)]\n  end\n\n  def lazy_run(iterator, *)\n    run(iterator)\n  end\n\nend\n\n# -------------------------------------------------------------------------------------------------\n# Histogram\n\nclass Spark::Command::Histogram < _Base\n  include Spark::Helper::Statistic\n\n  variable :even,    function: false, type: [TrueClass, FalseClass]\n  variable :buckets, function: false, type: Array\n\n  def run(iterator, *)\n    counters = Array.new(counter_size) { 0 }\n    iterator.each do |item|\n      if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min\n        next\n      end\n\n      x = bucket_function.call(item)\n      if x.nil?\n        # next\n      else\n        counters[x] += 1\n      end\n    end\n    [counters]\n  end\n\n  def lazy_run(iterator, *)\n    run(iterator)\n  end\n\n  private\n\n    def min\n      @buckets.first\n    end\n\n    def max\n      @buckets.last\n    end\n\n    def counter_size\n      @buckets.size-1\n    end\n\n    def increment\n      @buckets[1]-@buckets[0]\n    end\n\n    # Decide which bucket function to pass. We decide here rather than having\n    # a general function so that the decission need only be made once.\n    def bucket_function\n      @bucket_function ||= _bucket_function\n    end\n\n    def _bucket_function\n      if @even\n        fast_bucket_function\n      else\n        basic_bucket_function\n      end\n    end\n\n    # Determine the bucket function in constant time.\n    # Requires that buckets are evenly spaced\n    def fast_bucket_function\n      Proc.new do |item|\n        if item.is_a?(Float) && item.nan?\n          nil\n        else\n          bucket_number = (item - min)/increment\n          if bucket_number > counter_size || bucket_number < 0\n            nil\n          else\n            [bucket_number.to_i, counter_size-1].min\n          end\n        end\n      end\n    end\n\n    # Basic bucket function. Same as right bisect.\n    def basic_bucket_function\n      Proc.new do |item|\n        bucket_number = bisect_right(@buckets, item) - 1\n\n        # Counters is @buckets.size - 1\n        # [bucket_number, counter_size-1].min\n\n        if bucket_number > counter_size-1\n          counter_size-1\n        else\n          bucket_number\n        end\n      end\n    end\n\nend\n"
  },
  {
    "path": "lib/spark/command.rb",
    "content": "module Spark\n  ##\n  # Container which includes all commands and other things for worker\n  # Every RDD have own copy of Command\n  #\n  class Command\n\n    attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects\n\n    def initialize\n      @serializer = nil\n      @deserializer = nil\n      @commands = []\n      @libraries = []\n      @bound_objects = {}\n    end\n\n    def execute(iterator, split_index)\n      # Require necessary libraries\n      libraries.each{|lib| require lib}\n\n      # Prepare bound objects\n      @commands.each do |command|\n        command.__objects__ = bound_objects\n      end\n\n      # Prepare for running\n      @commands.each(&:prepare)\n\n      # Run all task\n      @commands.each do |command|\n        iterator = command.execute(iterator, split_index)\n      end\n\n      # Return changed iterator. This is not be necessary for some tasks\n      # because of using inplace changing but some task can return\n      # only one value (for example reduce).\n      iterator\n    end\n\n    def last\n      @commands.last\n    end\n\n    def bound_objects\n      # Objects from users\n      # Already initialized objects on worker\n      return @bound_objects if @bound_objects\n\n      if @serialized_bound_objects\n        # Still serialized\n        @bound_objects = Marshal.load(@serialized_bound_objects)\n      else\n        # Something else\n        @bound_objects = {}\n      end\n    end\n\n    # Bound objects can depend on library which is loaded during @execute\n    # In that case worker raise \"undefined class/module\"\n    def marshal_dump\n      [@serializer, @deserializer, @commands, @libraries, serialized_bound_objects]\n    end\n\n    def marshal_load(array)\n      @serializer = array.shift\n      @deserializer = array.shift\n      @commands = array.shift\n      @libraries = array.shift\n      @serialized_bound_objects = array.shift\n    end\n\n    private\n\n      def serialized_bound_objects\n        @serialized_bound_objects ||= Marshal.dump(@bound_objects)\n      end\n\n  end\nend\n\nrequire 'spark/command/base'\nrequire 'spark/command/basic'\nrequire 'spark/command/pair'\nrequire 'spark/command/statistic'\nrequire 'spark/command/sort'\n"
  },
  {
    "path": "lib/spark/command_builder.rb",
    "content": "require 'spark/command_validator'\n\nmodule Spark\n  ##\n  # Builder for building correct {Spark::Command}\n  #\n  class CommandBuilder\n\n    extend Forwardable\n\n    include Spark::Helper::Serialize\n    include Spark::Helper::System\n    include Spark::CommandValidator\n\n    attr_reader :command\n\n    def_delegators :@command, :serializer, :serializer=, :deserializer, :deserializer=, :commands,\n                              :commands=, :libraries, :libraries=, :bound_objects, :bound_objects=\n\n    def initialize(serializer, deserializer=nil)\n      create_command\n      self.serializer   = serializer\n      self.deserializer = deserializer || serializer.dup\n    end\n\n    def create_command\n      @command = Spark::Command.new\n    end\n\n    # Do not user Marshal.dump(Marshal.load(self)) because some variables\n    # have marshal_dump prepared for worker.\n    def deep_copy\n      copy = self.dup\n      copy.create_command\n      copy.serializer    = self.serializer.deep_copy\n      copy.deserializer  = self.deserializer.deep_copy\n      copy.commands      = self.commands.dup\n      copy.libraries     = self.libraries.dup\n      copy.bound_objects = self.bound_objects.dup\n      copy\n    end\n\n    # Serialize Command class for worker\n    # Java use signed number\n    def build\n      unpack_chars(Marshal.dump(@command))\n    end\n\n    def add_command(klass, *args)\n      variables = klass.settings.variables\n      validate_size(variables, args)\n\n      built_args = []\n      variables.values.zip(args) do |var, arg|\n        if var[:function]\n          arg = serialize_function(arg)\n        end\n\n        validate(arg, var)\n        built_args << arg\n      end\n\n      comm = klass.new(*built_args)\n      @command.commands << comm\n      self\n    end\n\n    def add_library(*libraries)\n      @command.libraries += libraries\n    end\n\n    def bind(objects)\n      objects.symbolize_keys!\n      @command.bound_objects.merge!(objects)\n    end\n\n    private\n\n        # Serialized can be Proc and Method\n        #\n        # === Func\n        # * *string:* already serialized proc\n        # * *proc:* proc\n        # * *symbol:* name of method\n        # * *method:* Method class\n        #\n        def serialize_function(func)\n          case func\n          when String\n            serialize_function_from_string(func)\n          when Symbol\n            serialize_function_from_symbol(func)\n          when Proc\n            serialize_function_from_proc(func)\n          when Method\n            serialize_function_from_method(func)\n          else\n            raise Spark::CommandError, 'You must enter String, Symbol, Proc or Method.'\n          end\n        end\n\n        def serialize_function_from_string(string)\n          {type: 'proc', content: string}\n        end\n\n        def serialize_function_from_symbol(symbol)\n          {type: 'symbol', content: symbol}\n        end\n\n        # Serialize Proc as String\n        #\n        #   lambda{|x| x*x}.to_source\n        #   # => \"proc { |x| (x * x) }\"\n        #\n        def serialize_function_from_proc(proc)\n          serialize_function_from_string(proc.to_source)\n        rescue\n          raise Spark::SerializeError, 'Proc can not be serialized. 
Use String instead.'\n        end\n\n        # Serialize method as string\n        #\n        #   def test(x)\n        #     x*x\n        #   end\n        #   serialize_function_from_method(method(:test))\n        #\n        #   # => \"def test(x)\\n  x*x\\nend\\n\"\n        #\n        def serialize_function_from_method(meth)\n          if pry?\n            meth = Pry::Method.new(meth)\n          end\n\n          {type: 'method', name: meth.name, content: meth.source}\n        rescue\n          raise Spark::SerializeError, 'Method can not be serialized. Use full path or Proc.'\n        end\n\n  end\nend\n"
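\n# == serialize_function results (illustrative note; the exact to_source output is an assumption):\n#\n#   serialize_function(:to_s)           # => {type: 'symbol', content: :to_s}\n#   serialize_function('lambda{|x| x}') # => {type: 'proc', content: 'lambda{|x| x}'}\n#   serialize_function(lambda{|x| x})   # => {type: 'proc', content: 'proc { |x| x }'}\n"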
  },
  {
    "path": "lib/spark/command_validator.rb",
    "content": "module Spark\n  module CommandValidator\n\n    def validate(value, options)\n      validate_type(value, options[:type])\n    end\n\n    def valid?(value, options)\n      begin\n        validate(value, options)\n        return true\n      rescue\n        return false\n      end\n    end\n\n    def validate_type(value, types)\n      types = [types] if !types.is_a?(Array)\n\n      types.each do |type|\n        return if value.is_a?(type)\n      end\n\n      error \"Value: #{value} should be a #{types.join(' or ')} but is #{value.class}.\"\n    end\n\n    def validate_size(array1, array2)\n      if array1.size != array2.size\n        error \"Wrong number of arguments (#{array1.size} for #{array2.size})\"\n      end\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/config.rb",
    "content": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n  # Common configuration for RubySpark and Spark\n  class Config\n\n    include Spark::Helper::System\n\n    TYPES = {\n      'spark.shuffle.spill' => :boolean,\n      'spark.ruby.serializer.compress' => :boolean\n    }\n\n    # Initialize java SparkConf and load default configuration.\n    def initialize\n      @spark_conf = SparkConf.new(true)\n      set_default\n      from_file(Spark::DEFAULT_CONFIG_FILE)\n    end\n\n    def from_file(file)\n      check_read_only\n\n      if file && File.exist?(file)\n        file = File.expand_path(file)\n        RubyUtils.loadPropertiesFile(spark_conf, file)\n      end\n    end\n\n    def [](key)\n      get(key)\n    end\n\n    def []=(key, value)\n      set(key, value)\n    end\n\n    def spark_conf\n      if Spark.started?\n        # Get latest configuration\n        Spark.context.jcontext.conf\n      else\n        @spark_conf\n      end\n    end\n\n    def valid!\n      errors = []\n\n      if !contains?('spark.app.name')\n        errors << 'An application name must be set in your configuration.'\n      end\n\n      if !contains?('spark.master')\n        errors << 'A master URL must be set in your configuration.'\n      end\n\n      if Spark::Serializer.find(get('spark.ruby.serializer')).nil?\n        errors << 'Unknow serializer.'\n      end\n\n      scanned = get('spark.ruby.executor.command').scan('%s')\n\n      if scanned.size == 0\n        errors << \"Executor command must contain '%s'.\"\n      end\n\n      if scanned.size > 1\n        errors << \"Executor command can contain only one '%s'.\"\n      end\n\n      if errors.any?\n        errors.map!{|error| \"- #{error}\"}\n\n        raise Spark::ConfigurationError, \"Configuration is not valid:\\r\\n#{errors.join(\"\\r\\n\")}\"\n      end\n    end\n\n    def read_only?\n      Spark.started?\n    end\n\n    # Rescue from NoSuchElementException\n    def get(key)\n      value = spark_conf.get(key.to_s)\n\n      case TYPES[key]\n      when :boolean\n        parse_boolean(value)\n      when :integer\n        parse_integer(value)\n      else\n        value\n      end\n    rescue\n      nil\n    end\n\n    def get_all\n      Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]\n    end\n\n    def contains?(key)\n      spark_conf.contains(key.to_s)\n    end\n\n    def set(key, value)\n      check_read_only\n      spark_conf.set(key.to_s, value.to_s)\n    end\n\n    def set_app_name(name)\n      set('spark.app.name', name)\n    end\n\n    def set_master(master)\n      set('spark.master', master)\n    end\n\n    def parse_boolean(value)\n      case value\n      when 'true'\n        true\n      when 'false'\n        false\n      end\n    end\n\n    def parse_integer(value)\n      value.to_i\n    end\n\n    # =============================================================================\n    # Defaults\n\n    def set_default\n      set_app_name('RubySpark')\n      set_master('local[*]')\n      set('spark.ruby.driver_home', Spark.home)\n      set('spark.ruby.serializer', default_serializer)\n      set('spark.ruby.serializer.compress', default_serializer_compress)\n      set('spark.ruby.serializer.batch_size', default_serializer_batch_size)\n      set('spark.ruby.executor.command', default_executor_command)\n      set('spark.ruby.executor.options', default_executor_options)\n      set('spark.ruby.worker.type', default_worker_type)\n      load_executor_envs\n      # set('spark.ruby.executor.install', default_executor_install)\n    
end\n\n    def default_serializer\n      ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME\n    end\n\n    def default_serializer_compress\n      ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS\n    end\n\n    def default_serializer_batch_size\n      ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE\n    end\n\n    # Command template which is applied when scala want create a ruby\n    # process (e.g. master, home request). Command is represented by '%s'.\n    #\n    # == Example:\n    #   bash --norc -i -c \"export HOME=/home/user; cd; source .bashrc; %s\"\n    #\n    def default_executor_command\n      ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'\n    end\n\n    # Options for every worker.\n    #\n    # == Example:\n    #   -J-Xmx512m\n    #\n    def default_executor_options\n      ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''\n    end\n\n    # # Install command which is triggered before on start.\n    # # This command using executor command template.\n    # #\n    # # == Example:\n    # #   gem install ruby-spark -v 1.2.0\n    # #\n    # def default_executor_install\n    #   ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''\n    # end\n\n    # Type of worker.\n    #\n    # == Options:\n    # process:: (default)\n    # thread:: (experimental)\n    #\n    def default_worker_type\n      ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'\n    end\n\n    # Load environment variables for executor from ENV.\n    #\n    # == Examples:\n    #   SPARK_RUBY_EXECUTOR_ENV_KEY1=\"1\"\n    #   SPARK_RUBY_EXECUTOR_ENV_KEY2=\"2\"\n    #\n    def load_executor_envs\n      prefix = 'SPARK_RUBY_EXECUTOR_ENV_'\n\n      envs = ENV.select{|key, _| key.start_with?(prefix)}\n      envs.each do |key, value|\n        key = key.dup # ENV keys are frozen\n        key.slice!(0, prefix.size)\n\n        set(\"spark.ruby.executor.env.#{key}\", value)\n      end\n    end\n\n    # Aliases\n    alias_method :getAll,     :get_all\n    alias_method :setAppName, :set_app_name\n    alias_method :setMaster,  :set_master\n\n    private\n\n      def check_read_only\n        if read_only?\n          raise Spark::ConfigurationError, 'Configuration is ready only'\n        end\n      end\n\n  end\nend\n"
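\n# == Executor ENV loading (illustrative note):\n#\n#   ENV['SPARK_RUBY_EXECUTOR_ENV_KEY1'] = '1'\n#   # load_executor_envs (called from set_default) then performs:\n#   # set('spark.ruby.executor.env.KEY1', '1')\n"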
  },
  {
    "path": "lib/spark/constant.rb",
    "content": "module Spark\n  # Commond constant for Ruby and Spark\n  module Constant\n    DATA_EOF = -2\n    WORKER_ERROR = -1\n    WORKER_DONE = 0\n    CREATE_WORKER = 1\n    KILL_WORKER = 2\n    KILL_WORKER_AND_WAIT = 3\n    SUCCESSFULLY_KILLED = 4\n    UNSUCCESSFUL_KILLING = 5\n    ACCUMULATOR_ACK = 6\n  end\nend\n"
  },
  {
    "path": "lib/spark/context.rb",
    "content": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n  ##\n  # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark\n  # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.\n  #\n  class Context\n\n    include Spark::Helper::System\n    include Spark::Helper::Parser\n    include Spark::Helper::Logger\n\n    attr_reader :jcontext, :jaccumulator, :temp_dir\n\n    # Constructor for Ruby context. Configuration is automatically is taken\n    # from Spark. Config will be automatically set to default if user start\n    # context first.\n    #\n    def initialize\n      Spark.config.valid!\n      @jcontext = JavaSparkContext.new(Spark.config.spark_conf)\n      @jcontext.addJar(Spark.ruby_spark_jar)\n\n      # Does not work on 1.2\n      # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))\n\n      spark_local_dir = JUtils.getLocalDir(sc.conf)\n      @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath\n\n      accum_server = Spark::Accumulator::Server\n      accum_server.start\n      @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))\n\n      log_info(\"Ruby accumulator server is running on port #{accum_server.port}\")\n\n      set_call_site('Ruby') # description of stage\n    end\n\n    def inspect\n      result  = %{#<#{self.class.name}:0x#{object_id}\\n}\n      result << %{Tempdir: \"#{temp_dir}\">}\n      result\n    end\n\n    def stop\n      Spark::Accumulator::Server.stop\n      log_info('Ruby accumulator server was stopped')\n      @jcontext.stop\n    end\n\n    def sc\n      @jcontext.sc\n    end\n\n    def ui\n      sc.ui\n    end\n\n    # Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD)\n    #\n    def default_parallelism\n      sc.defaultParallelism\n    end\n\n    # Default serializer\n    #\n    # Batch -> Compress -> Basic\n    #\n    def default_serializer\n      # Basic\n      serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new\n\n      # Compress\n      if config('spark.ruby.serializer.compress')\n        serializer = Spark::Serializer.compressed(serializer)\n      end\n\n      # Bactching\n      batch_size = default_batch_size\n      if batch_size == 'auto'\n        serializer = Spark::Serializer.auto_batched(serializer)\n      else\n        serializer = Spark::Serializer.batched(serializer, batch_size)\n      end\n\n      # Finally, \"container\" contains serializers\n      serializer\n    end\n\n    def default_batch_size\n      size = config('spark.ruby.serializer.batch_size').to_i\n      if size >= 1\n        size\n      else\n        'auto'\n      end\n    end\n\n    # Set a local property that affects jobs submitted from this thread, such as the\n    # Spark fair scheduler pool.\n    #\n    def set_local_property(key, value)\n      jcontext.setLocalProperty(key, value)\n    end\n\n    # Get a local property set in this thread, or null if it is missing\n    #\n    def get_local_property(key)\n      jcontext.getLocalProperty(key)\n    end\n\n    # Support function for API backtraces.\n    #\n    def set_call_site(site)\n      jcontext.setCallSite(site)\n    end\n\n    def clear_call_site\n      jcontext.clearCallSite\n    end\n\n    # Return a copy of this SparkContext's configuration. 
The configuration *cannot*\n    # be changed at runtime.\n    #\n    def config(key=nil)\n      if key\n        Spark.config.get(key)\n      else\n        Spark.config\n      end\n    end\n\n    # Add a file to be downloaded with this Spark job on every node.\n    # The path of file passed can be either a local file, a file in HDFS\n    # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.\n    #\n    # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the\n    # filename to find its download location.\n    #\n    # == Example:\n    #   `echo 10 > test.txt`\n    #\n    #   $sc.add_file('test.txt')\n    #   $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect\n    #   # => [0, 10, 20, 30, 40, 50]\n    #\n    def add_file(*files)\n      files.each do |file|\n        sc.addFile(file)\n      end\n    end\n\n    # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast\n    # object for reading it in distributed functions. The variable will\n    # be sent to each cluster only once.\n    #\n    # == Example:\n    #   broadcast1 = $sc.broadcast('a')\n    #   broadcast2 = $sc.broadcast('b')\n    #\n    #   rdd = $sc.parallelize(0..5, 4)\n    #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)\n    #   rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })\n    #   rdd.collect\n    #   # => [\"\", \"\", \"a\", \"b\", \"aa\", \"bb\", \"aaa\", \"bbb\"]\n    #\n    def broadcast(value)\n      Spark::Broadcast.new(self, value)\n    end\n\n    # Create an Accumulator with the given initial value, using a given\n    # accum_param helper object to define how to add values of the\n    # data type if provided.\n    #\n    # == Example:\n    #   accum = $sc.accumulator(7)\n    #\n    #   rdd = $sc.parallelize(0..5, 4)\n    #   rdd = rdd.bind(accum: accum)\n    #   rdd = rdd.map_partitions(lambda{|_| accum.add(1) })\n    #   rdd = rdd.collect\n    #\n    #   accum.value\n    #   # => 11\n    #\n    def accumulator(value, accum_param=:+, zero_value=0)\n      Spark::Accumulator.new(value, accum_param, zero_value)\n    end\n\n    # Distribute a local Ruby collection to form an RDD\n    # Direct method can be slow so be careful, this method update data inplace\n    #\n    # == Parameters:\n    # data:: Range or Array\n    # num_slices:: number of slice\n    # serializer:: custom serializer (default: serializer based on configuration)\n    #\n    # == Examples:\n    #   $sc.parallelize([\"1\", \"2\", \"3\"]).map(lambda{|x| x.to_i}).collect\n    #   #=> [1, 2, 3]\n    #\n    #   $sc.parallelize(1..3).map(:to_s).collect\n    #   #=> [\"1\", \"2\", \"3\"]\n    #\n    def parallelize(data, num_slices=nil, serializer=nil)\n      num_slices ||= default_parallelism\n      serializer ||= default_serializer\n\n      serializer.check_each(data)\n\n      # Through file\n      file = Tempfile.new('to_parallelize', temp_dir)\n      serializer.dump_to_io(data, file)\n      file.close # not unlink\n      jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)\n\n      Spark::RDD.new(jrdd, self, serializer)\n    ensure\n      file && file.unlink\n    end\n\n    # Read a text file from HDFS, a local file system (available on all nodes), or any\n    # Hadoop-supported file system URI, and return it as an RDD of Strings.\n    #\n    # == Example:\n    #   f = Tempfile.new(\"test\")\n    #   f.puts(\"1\")\n    #   f.puts(\"2\")\n    #   f.close\n    #\n    # 
  $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect\n    #   # => [1, 2]\n    #\n    def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)\n      min_partitions ||= default_parallelism\n      serializer     ||= default_serializer\n      deserializer     = Spark::Serializer.build { __text__(encoding) }\n\n      Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)\n    end\n\n    # Read a directory of text files from HDFS, a local file system (available on all nodes), or any\n    # Hadoop-supported file system URI. Each file is read as a single record and returned in a\n    # key-value pair, where the key is the path of each file, the value is the content of each file.\n    #\n    # == Example:\n    #   dir = Dir.mktmpdir\n    #   f1 = Tempfile.new(\"test1\", dir)\n    #   f2 = Tempfile.new(\"test2\", dir)\n    #   f1.puts(\"1\"); f1.puts(\"2\");\n    #   f2.puts(\"3\"); f2.puts(\"4\");\n    #   f1.close\n    #   f2.close\n    #\n    #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect\n    #   # => [\"1\", \"2\", \"3\", \"4\"]\n    #\n    def whole_text_files(path, min_partitions=nil, serializer=nil)\n      min_partitions ||= default_parallelism\n      serializer     ||= default_serializer\n      deserializer     = Spark::Serializer.build{ __pair__(__text__, __text__) }\n\n      Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)\n    end\n\n    # Executes the given partition function f on the specified set of partitions,\n    # returning the result as an array of elements.\n    #\n    # If partitions is not specified, this will run over all partitions.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10, 5)\n    #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])\n    #   # => [\"[0, 1]\", \"[4, 5]\"]\n    #\n    def run_job(rdd, f, partitions=nil, allow_local=false)\n      run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)\n    end\n\n    # Execute the given command on specific set of partitions.\n    #\n    def run_job_with_command(rdd, partitions, allow_local, command, *args)\n      if !partitions.nil? && !partitions.is_a?(Array)\n        raise Spark::ContextError, 'Partitions must be nil or Array'\n      end\n\n      partitions_size = rdd.partitions_size\n\n      # Execute all parts\n      if partitions.nil?\n        partitions = (0...partitions_size).to_a\n      end\n\n      # Can happend when you use coalesce\n      partitions.delete_if {|part| part >= partitions_size}\n\n      # Rjb represent Fixnum as Integer but Jruby as Long\n      partitions = to_java_array_list(convert_to_java_int(partitions))\n\n      # File for result\n      file = Tempfile.new('collect', temp_dir)\n\n      mapped = rdd.new_rdd_from_command(command, *args)\n      RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)\n\n      mapped.collect_from_file(file)\n    end\n\n\n    # Aliases\n    alias_method :textFile, :text_file\n    alias_method :wholeTextFiles, :whole_text_files\n    alias_method :defaultParallelism, :default_parallelism\n    alias_method :setLocalProperty, :set_local_property\n    alias_method :getLocalProperty, :get_local_property\n    alias_method :setCallSite, :set_call_site\n    alias_method :clearCallSite, :clear_call_site\n    alias_method :runJob, :run_job\n    alias_method :runJobWithCommand, :run_job_with_command\n    alias_method :addFile, :add_file\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/error.rb",
    "content": "module Spark\n  # Extension cannot be built\n  class BuildError < StandardError\n  end\n\n  # Proc.to_source\n  # Java object cannot be converted\n  class SerializeError < StandardError\n  end\n\n  # Serializer method\n  # Non-existing serializer\n  class NotImplemented < StandardError\n  end\n\n  # Missison app_name or master\n  class ConfigurationError < StandardError\n  end\n\n  # Wrong parameters\n  class RDDError < StandardError\n  end\n\n  # Validations\n  class CommandError < StandardError\n  end\n\n  # Parser helper\n  # SQL DataType\n  class ParseError < StandardError\n  end\n\n  # Validation in context\n  class ContextError < StandardError\n  end\n\n  # Broadcasts\n  # Missing path\n  class BroadcastError < StandardError\n  end\n\n  # Accumulators\n  # Existing keys\n  # Wrong ID\n  class AccumulatorError < StandardError\n  end\n\n  # Wrong instances\n  class MllibError < StandardError\n  end\n\n  # Wrong datatype\n  class SQLError < StandardError\n  end\n\n  # Missing Java class\n  class JavaBridgeError < StandardError\n  end\nend\n"
  },
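  {
    "path": "example/error_handling_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. Every class in\n# lib/spark/error.rb subclasses StandardError, so errors can be rescued\n# individually or with a single generic rescue.\nrequire 'ruby-spark'\n\nbegin\n  raise Spark::ConfigurationError, 'Missing app_name or master'\nrescue Spark::ConfigurationError => e\n  puts \"configuration problem: #{e.message}\"\nrescue StandardError => e\n  # Any other Spark::*Error still lands here.\n  puts \"unexpected failure: #{e.message}\"\nend\n"
  },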
  {
    "path": "lib/spark/ext/hash.rb",
    "content": "module Spark\n  module CoreExtension\n    module Hash\n      module ClassMethods\n      end\n\n      module InstanceMethods\n        # Destructively convert all keys to strings.\n        def stringify_keys_with_spark!\n          transform_keys!{ |key| key.to_s }\n        end\n\n        # Destructively convert all keys to symbols, as long as they respond\n        def symbolize_keys_with_spark!\n          transform_keys!{ |key| key.to_sym rescue key }\n        end\n\n        # Destructively convert all keys using the block operations.\n        # Same as transform_keys but modifies +self+.\n        def transform_keys_with_spark!\n          keys.each do |key|\n            self[yield(key)] = delete(key)\n          end\n          self\n        end\n      end\n\n      def self.included(base)\n        base.extend(ClassMethods)\n        base.send(:include, InstanceMethods)\n        base.class_eval do\n          patch_unless_exist :stringify_keys!, :spark\n          patch_unless_exist :symbolize_keys!, :spark\n          patch_unless_exist :transform_keys!, :spark\n        end\n      end\n    end\n  end\nend\n\nHash.__send__(:include, Spark::CoreExtension::Hash)\n"
  },
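  {
    "path": "example/ext_hash_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. The Hash core extension\n# adds these methods only when nothing else (e.g. ActiveSupport) already\n# defines them, thanks to patch_unless_exist.\nrequire 'ruby-spark'\n\nh = { 'a' => 1, 'b' => 2 }\n\nh.symbolize_keys!\np h # => {:a=>1, :b=>2}\n\nh.stringify_keys!\np h # => {\"a\"=>1, \"b\"=>2}\n\n# transform_keys! rewrites keys in place using the given block.\nh.transform_keys!{ |key| key.upcase }\np h # => {\"A\"=>1, \"B\"=>2}\n"
  },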
  {
    "path": "lib/spark/ext/integer.rb",
    "content": "module Spark\n  module CoreExtension\n    module Integer\n      module ClassMethods\n      end\n\n      module InstanceMethods\n      end\n\n      def self.included(base)\n        base.extend(ClassMethods)\n        base.send(:include, InstanceMethods)\n        base.class_eval do\n          const_set :MAX_WITH_SPARK, 1 << (1.size * 8 - 2) - 1\n          const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1\n\n          path_const_unless_exist :MAX, :SPARK\n          path_const_unless_exist :MIN, :SPARK\n        end\n      end\n    end\n  end\nend\n\nInteger.__send__(:include, Spark::CoreExtension::Integer)\n"
  },
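  {
    "path": "example/ext_integer_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. Integer::MAX and\n# Integer::MIN are the machine Fixnum bounds: one bit of the native word is\n# a type tag and one is the sign, so 64-bit MRI gives 2**62 - 1 and -2**62.\nrequire 'ruby-spark'\n\nputs Integer::MAX # => 4611686018427387903 (on 64-bit MRI)\nputs Integer::MIN # => -4611686018427387904\nputs Integer::MAX == (1 << (1.size * 8 - 2)) - 1 # => true\n"
  },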
  {
    "path": "lib/spark/ext/io.rb",
    "content": "module Spark\n  module CoreExtension\n    module IO\n      module ClassMethods\n      end\n\n      module InstanceMethods\n\n        # Reading\n\n        def read_int\n          unpack_int(read(4))\n        end\n\n        def read_int_or_eof\n          bytes = read(4)\n          return Spark::Constant::DATA_EOF if bytes.nil?\n          unpack_int(bytes)\n        end\n\n        def read_long\n          unpack_long(read(8))\n        end\n\n        def read_string\n          read(read_int)\n        end\n\n        def read_data\n          Marshal.load(read_string)\n        end\n\n\n        # Writing\n\n        def write_int(data)\n          write(pack_int(data))\n        end\n\n        def write_long(data)\n          write(pack_long(data))\n        end\n\n        # Size and data can have different encoding\n        # Marshal: both ASCII\n        # Oj: ASCII and UTF-8\n        def write_string(data)\n          write_int(data.bytesize)\n          write(data)\n        end\n\n        def write_data(data)\n          write_string(Marshal.dump(data))\n        end\n      end\n\n      def self.included(base)\n        base.extend(ClassMethods)\n        base.send(:include, Spark::Helper::Serialize)\n        base.send(:include, InstanceMethods)\n      end\n    end\n  end\nend\n\nIO.__send__(:include, Spark::CoreExtension::IO)\nStringIO.__send__(:include, Spark::CoreExtension::IO)\n"
  },
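  {
    "path": "example/ext_io_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. The IO extension frames\n# values the way the Java side expects: a 4-byte big-endian int (or an\n# 8-byte long) followed by raw bytes. It works on StringIO as well as on\n# sockets.\nrequire 'ruby-spark'\nrequire 'stringio'\n\nio = StringIO.new\nio.write_int(42)\nio.write_long(2**40)\nio.write_data(key: 'value') # Marshal.dump behind the scenes\n\nio.rewind\nputs io.read_int  # => 42\nputs io.read_long # => 1099511627776\np io.read_data    # => {:key=>\"value\"}\n"
  },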
  {
    "path": "lib/spark/ext/ip_socket.rb",
    "content": "module Spark\n  module CoreExtension\n    module IPSocket\n      module ClassMethods\n      end\n\n      module InstanceMethods\n        def port\n          addr[1]\n        end\n\n        def hostname\n          addr(true)[2]\n        end\n\n        def numeric_address\n          addr[3]\n        end\n      end\n\n      def self.included(base)\n        base.extend(ClassMethods)\n        base.send(:include, InstanceMethods)\n      end\n    end\n  end\nend\n\nIPSocket.__send__(:include, Spark::CoreExtension::IPSocket)\n"
  },
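  {
    "path": "example/ext_ip_socket_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. The IPSocket extension\n# only exposes slices of Socket#addr under readable names.\nrequire 'ruby-spark'\nrequire 'socket'\n\nserver = TCPServer.new('127.0.0.1', 0) # port 0 lets the OS pick a free port\nputs server.port            # addr[1], the assigned port number\nputs server.numeric_address # addr[3] => \"127.0.0.1\"\nputs server.hostname        # addr(true)[2], the reverse-resolved name\nserver.close\n"
  },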
  {
    "path": "lib/spark/ext/module.rb",
    "content": "module Spark\n  module CoreExtension\n    module Module\n\n      # Patch method to class unless already exist\n      #\n      # == Example:\n      #\n      #   class Hash\n      #     def a\n      #       1\n      #     end\n      #   end\n      #\n      #   module HashExtension\n      #     module InstanceMethods\n      #       def a_with_spark\n      #         2\n      #       end\n      #\n      #       def b_with_spark\n      #         1\n      #       end\n      #     end\n      #\n      #     def self.included(base)\n      #       base.send(:include, InstanceMethods)\n      #       base.class_eval do\n      #         patch_unless_exist :a, :spark\n      #         patch_unless_exist :b, :spark\n      #       end\n      #     end\n      #   end\n      #\n      #   Hash.include(HashExtension)\n      #\n      #   Hash.new.a # => 1\n      #   Hash.new.b # => 1\n      #\n      def patch_unless_exist(target, suffix)\n        unless method_defined?(target)\n          aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1\n\n          alias_method target, \"#{aliased_target}_with_#{suffix}#{punctuation}\"\n        end\n      end\n\n      def path_const_unless_exist(target, suffix)\n        unless const_defined?(target)\n          const_set(target, const_get(\"#{target}_WITH_#{suffix}\"))\n        end\n      end\n\n    end\n  end\nend\n\nModule.__send__(:include, Spark::CoreExtension::Module)\n"
  },
  {
    "path": "lib/spark/ext/object.rb",
    "content": "module Spark\n  module CoreExtension\n    module Object\n      module ClassMethods\n      end\n\n      module InstanceMethods\n        def deep_copy_with_spark\n          Marshal.load(Marshal.dump(self))\n        end\n\n        def silence_warnings\n          old_verbose, $VERBOSE = $VERBOSE, nil\n          yield\n        ensure\n          $VERBOSE = old_verbose\n        end\n\n        def cattr_reader_with_spark(*syms)\n          syms.each do |sym|\n            raise NameError.new(\"Invalid attribute name: #{sym}\") unless sym =~ /^[_A-Za-z]\\w*$/\n\n            class_eval(<<-EOS, __FILE__, __LINE__ + 1)\n              @@#{sym} = nil unless defined? @@#{sym}\n              def self.#{sym}\n                @@#{sym}\n              end\n            EOS\n\n            class_eval(<<-EOS, __FILE__, __LINE__ + 1)\n              def #{sym}\n                @@#{sym}\n              end\n            EOS\n          end\n        end\n\n        def cattr_writer_with_spark(*syms)\n          syms.each do |sym|\n            raise NameError.new(\"Invalid attribute name: #{sym}\") unless sym =~ /^[_A-Za-z]\\w*$/\n\n            class_eval(<<-EOS, __FILE__, __LINE__ + 1)\n              @@#{sym} = nil unless defined? @@#{sym}\n              def self.#{sym}=(obj)\n                @@#{sym} = obj\n              end\n            EOS\n\n            class_eval(<<-EOS, __FILE__, __LINE__ + 1)\n              def #{sym}=(obj)\n                @@#{sym} = obj\n              end\n            EOS\n          end\n        end\n\n        def cattr_accessor_with_spark(*syms)\n          cattr_reader_with_spark(*syms)\n          cattr_writer_with_spark(*syms)\n        end\n      end\n\n      def self.included(base)\n        base.extend(ClassMethods)\n        base.send(:include, InstanceMethods)\n        base.class_eval do\n          patch_unless_exist :deep_copy, :spark\n          patch_unless_exist :silence_warnings, :spark\n          patch_unless_exist :cattr_accessor, :spark\n        end\n      end\n    end\n  end\nend\n\nObject.__send__(:include, Spark::CoreExtension::Object)\n"
  },
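  {
    "path": "example/ext_object_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. cattr_accessor defines a\n# class variable with readers/writers on both the class and its instances,\n# mirroring the ActiveSupport helper when it is not loaded.\nrequire 'ruby-spark'\n\nclass Settings\n  cattr_accessor :master\nend\n\nSettings.master = 'local[2]'\nputs Settings.master     # => \"local[2]\"\nputs Settings.new.master # => \"local[2]\" (same @@master)\n\n# deep_copy duplicates nested structures via a Marshal round trip.\noriginal = { list: [1, 2] }\ncopy = original.deep_copy\ncopy[:list] << 3\np original # => {:list=>[1, 2]} (untouched)\n"
  },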
  {
    "path": "lib/spark/ext/string.rb",
    "content": "module Spark\n  module CoreExtension\n    module String\n      module ClassMethods\n      end\n\n      module InstanceMethods\n        def camelize_with_spark\n          self.gsub(/\\/(.?)/) { \"::#{$1.upcase}\" }.gsub(/(?:^|_)(.)/) { $1.upcase }\n        end\n      end\n\n      def self.included(base)\n        base.extend(ClassMethods)\n        base.send(:include, InstanceMethods)\n        base.class_eval do\n          patch_unless_exist :camelize, :spark\n        end\n      end\n    end\n  end\nend\n\nString.__send__(:include, Spark::CoreExtension::String)\n"
  },
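  {
    "path": "example/ext_string_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. camelize maps snake_case\n# and slash-separated paths onto Ruby constant names, which is how the gem\n# resolves class names from identifiers.\nrequire 'ruby-spark'\n\nputs 'logistic_regression'.camelize # => \"LogisticRegression\"\nputs 'spark/mllib'.camelize         # => \"Spark::Mllib\"\n"
  },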
  {
    "path": "lib/spark/helper/logger.rb",
    "content": "module Spark\n  module Helper\n    module Logger\n\n      def self.included(base)\n        base.send :extend,  Methods\n        base.send :include, Methods\n      end\n\n      module Methods\n        def log_info(message)\n          Spark.logger.info(message)\n        end\n\n        def log_debug(message)\n          Spark.logger.debug(message)\n        end\n\n        def log_trace(message)\n          Spark.logger.trace(message)\n        end\n\n        def log_warning(message)\n          Spark.logger.warning(message)\n        end\n\n        def log_error(message)\n          Spark.logger.error(message)\n        end\n\n        alias_method :logInfo,    :log_info\n        alias_method :logDebug,   :log_debug\n        alias_method :logTrace,   :log_trace\n        alias_method :logWarning, :log_warning\n        alias_method :logError,   :log_error\n\n      end # Methods\n    end # Logger\n  end # Helper\nend # Spark\n"
  },
  {
    "path": "lib/spark/helper/parser.rb",
    "content": "module Spark\n  module Helper\n    module Parser\n      \n      def self.included(base)\n        base.send :extend,  Methods\n        base.send :include, Methods\n      end\n     \n      module Methods\n        def to_java_hash(hash)\n          hash_map = HashMap.new\n          hash.each_pair do |key, value|\n            begin\n              # RJB raise Object is NULL (but new record is put correctly)\n              hash_map.put(key, value)\n            rescue RuntimeError\n            end\n          end\n          hash_map\n        end\n\n        def convert_to_java_int(data)\n          if data.is_a?(Array)\n            data.map{|x| JInteger.new(x)}\n          else\n            JInteger.new(data)\n          end\n        end\n\n        def to_java_array_list(array)\n          array_list = ArrayList.new\n          array.each do |item|\n            array_list.add(item)\n          end\n          array_list\n        end\n\n        # Parse and convert memory size. Shifting be better but Float doesn't support it.\n        #\n        # == Examples:\n        #   to_memory_size(\"512mb\")\n        #   # => 524288\n        #\n        #   to_memory_size(\"512 MB\")\n        #   # => 524288\n        #\n        #   to_memory_size(\"512mb\", \"GB\")\n        #   # => 0.5\n        #\n        def to_memory_size(memory, result_unit=\"KB\")\n          match = memory.match(/([\\d]+)[\\s]*([\\w]*)/)\n          if match.nil?\n            raise Spark::ParseError, \"Memory has wrong format. Use: 'SIZE UNIT'\"\n          end\n\n          size = match[1].to_f\n          unit = match[2]\n\n          size *= memory_multiplier_based_kb(unit)\n          size /= memory_multiplier_based_kb(result_unit)\n          size.round(2)\n        end\n\n        # Based to KB\n        def memory_multiplier_based_kb(type)\n          case type.to_s.upcase\n          when \"G\", \"GB\"\n            1048576\n          when \"M\", \"MB\"\n            1024\n          when \"K\", \"KB\"\n            1\n          else\n            raise Spark::ParseError, \"Unsupported type #{type}\"\n          end\n        end\n\n      end # Methods\n\n    end # Parser\n  end # Helper\nend # Spark\n\n\n"
  },
  {
    "path": "lib/spark/helper/serialize.rb",
    "content": "module Spark\n  module Helper\n    module Serialize\n\n      DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'\n      DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*'\n      DIRECTIVE_LONG_BIG_ENDIAN = 'q>'\n      DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*'\n      DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G'\n      DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*'\n      DIRECTIVE_UNSIGNED_CHARS = 'C*'\n      DIRECTIVE_CHARS = 'c*'\n\n      # Packing\n\n      def pack_int(data)\n        [data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN)\n      end\n\n      def pack_long(data)\n        [data].pack(DIRECTIVE_LONG_BIG_ENDIAN)\n      end\n\n      def pack_double(data)\n        [data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN)\n      end\n\n      def pack_unsigned_chars(data)\n        data.pack(DIRECTIVE_UNSIGNED_CHARS)\n      end\n\n      def pack_ints(data)\n        __check_array(data)\n        data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN)\n      end\n\n      def pack_longs(data)\n        __check_array(data)\n        data.pack(DIRECTIVE_LONGS_BIG_ENDIAN)\n      end\n\n      def pack_doubles(data)\n        __check_array(data)\n        data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN)\n      end\n\n      # Unpacking\n\n      def unpack_int(data)\n        data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0]\n      end\n\n      def unpack_long(data)\n        data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0]\n      end\n\n      def unpack_chars(data)\n        data.unpack(DIRECTIVE_CHARS)\n      end\n\n      private\n\n        def __check_array(data)\n          unless data.is_a?(Array)\n            raise ArgumentError, 'Data must be an Array.'\n          end\n        end\n\n    end\n  end\nend\n"
  },
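  {
    "path": "example/helper_serialize_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. The pack/unpack helpers\n# are thin wrappers over Array#pack with fixed big-endian directives, so a\n# round trip through a binary string is lossless.\nrequire 'ruby-spark'\n\ninclude Spark::Helper::Serialize\n\nbytes = pack_int(2015)\nputs bytes.bytesize    # => 4\nputs unpack_int(bytes) # => 2015\n\nlongs = pack_longs([1, 2**40])\nputs longs.bytesize           # => 16 (two 8-byte big-endian longs)\nputs unpack_long(longs[0, 8]) # => 1\n"
  },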
  {
    "path": "lib/spark/helper/statistic.rb",
    "content": "module Spark\n  module Helper\n    module Statistic\n\n      # Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.\n      #\n      # == How the sampling rate is determined:\n      # Let p = num / total, where num is the sample size and total is the total number of\n      # datapoints in the RDD. We're trying to compute q > p such that\n      # * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q),\n      #   where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total),\n      #   i.e. the failure rate of not having a sufficiently large sample < 0.0001.\n      #   Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for\n      #   num > 12, but we need a slightly larger q (9 empirically determined).\n      # * when sampling without replacement, we're drawing each datapoint with prob_i\n      #   ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success\n      #   rate, where success rate is defined the same as in sampling with replacement.\n      #\n      def compute_fraction(lower_bound, total, with_replacement)\n        lower_bound = lower_bound.to_f\n\n        if with_replacement\n          upper_poisson_bound(lower_bound) / total\n        else\n          fraction = lower_bound / total\n          upper_binomial_bound(0.00001, total, fraction)\n        end\n      end\n\n      def upper_poisson_bound(bound)\n        num_std = if bound < 6\n          12\n        elsif bound < 16\n          9\n        else\n          6\n        end.to_f\n\n        [bound + num_std * Math.sqrt(bound), 1e-10].max\n      end\n\n      def upper_binomial_bound(delta, total, fraction)\n        gamma = -Math.log(delta) / total\n        [1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min\n      end\n\n      # Bisect right\n      #\n      # == Examples:\n      #   data = [1,5,6,8,96,120,133]\n      #\n      #   bisect_right(data, 0)   # => 0\n      #   bisect_right(data, 1)   # => 1\n      #   bisect_right(data, 5)   # => 2\n      #   bisect_right(data, 9)   # => 4\n      #   bisect_right(data, 150) # => 7\n      #\n      def bisect_right(data, value, low=0, high=data.size)\n        if low < 0\n          raise ArgumentError, 'Low must be >= 0.'\n        end\n\n        while low < high\n          mid = (low + high) / 2\n          if value < data[mid]\n            high = mid\n          else\n            low = mid + 1\n          end\n        end\n\n        low\n      end\n\n      # Determine bound of partitioning\n      #\n      # == Example:\n      #   data = [0,1,2,3,4,5,6,7,8,9,10]\n      #   determine_bounds(data, 3)\n      #   # => [3, 7]\n      #\n      def determine_bounds(data, num_partitions)\n        if num_partitions > data.size\n          return data\n        end\n\n        bounds = []\n        count = data.size\n        (0...(num_partitions-1)).each do |index|\n          bounds << data[count * (index+1) / num_partitions]\n        end\n        bounds\n      end\n\n    end\n  end\nend\n"
  },
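  {
    "path": "example/helper_statistic_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. bisect_right returns the\n# insertion index that keeps a sorted array sorted; determine_bounds picks\n# range-partition split points from sampled keys.\nrequire 'ruby-spark'\n\ninclude Spark::Helper::Statistic\n\ndata = [1, 5, 6, 8, 96, 120, 133]\nputs bisect_right(data, 9)   # => 4 (9 would go before 96)\nputs bisect_right(data, 150) # => 7 (past the end)\n\n# 11 sampled keys split into 3 partitions yield 2 boundary values.\np determine_bounds((0..10).to_a, 3) # => [3, 7]\n"
  },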
  {
    "path": "lib/spark/helper/system.rb",
    "content": "module Spark\n  module Helper\n    module System\n\n      def self.included(base)\n        base.send :extend,  Methods\n        base.send :include, Methods\n      end\n     \n      module Methods\n        def windows?\n          RbConfig::CONFIG['host_os'] =~ /mswin|mingw/\n        end\n\n        def mri?\n          RbConfig::CONFIG['ruby_install_name'] == 'ruby'\n        end\n\n        def jruby?\n          RbConfig::CONFIG['ruby_install_name'] == 'jruby'\n        end\n\n        def pry?\n          !!Thread.current[:__pry__]\n        end\n\n        # Memory usage in kb\n        def memory_usage\n          if jruby?\n            runtime = java.lang.Runtime.getRuntime\n            (runtime.totalMemory - runtime.freeMemory) >> 10\n          elsif windows?\n            # not yet\n          else\n            `ps -o rss= -p #{Process.pid}`.to_i\n          end\n        end\n      end # Methods\n\n    end # System\n  end # Helper\nend # Spark\n"
  },
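  {
    "path": "example/helper_system_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. The System helper\n# exposes platform predicates plus the resident set size of the current\n# process.\nrequire 'ruby-spark'\n\ninclude Spark::Helper::System\n\nputs jruby? ? 'running on JRuby' : 'running on MRI'\nputs \"RSS: #{memory_usage} KB\" # nil on Windows (not implemented there)\n"
  },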
  {
    "path": "lib/spark/helper.rb",
    "content": "module Spark\n  module Helper\n    autoload :System,    \"spark/helper/system\"\n    autoload :Logger,    \"spark/helper/logger\"\n    autoload :Statistic, \"spark/helper/statistic\"\n    autoload :Serialize, \"spark/helper/serialize\"\n    autoload :Partition, \"spark/helper/partition\"\n    autoload :Parser,    \"spark/helper/parser\"\n  end\nend\n"
  },
  {
    "path": "lib/spark/java_bridge/base.rb",
    "content": "##\n# Spark::JavaBridge::Base\n#\n# Parent for all adapter (ruby - java)\n#\nmodule Spark\n  module JavaBridge\n    class Base\n\n      include Spark::Helper::System\n\n      JAVA_OBJECTS = [\n        'java.util.ArrayList',\n        'scala.collection.mutable.HashMap',\n        'org.apache.spark.SparkConf',\n        'org.apache.spark.api.java.JavaSparkContext',\n        'org.apache.spark.api.ruby.RubyRDD',\n        'org.apache.spark.api.ruby.RubyUtils',\n        'org.apache.spark.api.ruby.RubyWorker',\n        'org.apache.spark.api.ruby.PairwiseRDD',\n        'org.apache.spark.api.ruby.RubyAccumulatorParam',\n        'org.apache.spark.api.ruby.RubySerializer',\n        'org.apache.spark.api.python.PythonRDD',\n        'org.apache.spark.api.python.PythonPartitioner',\n        'org.apache.spark.api.python.PythonUtils',\n        'org.apache.spark.ui.ruby.RubyTab',\n        'org.apache.spark.mllib.api.ruby.RubyMLLibAPI',\n        :JInteger  => 'java.lang.Integer',\n        :JLong     => 'java.lang.Long',\n        :JLogger   => 'org.apache.log4j.Logger',\n        :JLevel    => 'org.apache.log4j.Level',\n        :JPriority => 'org.apache.log4j.Priority',\n        :JUtils    => 'org.apache.spark.util.Utils',\n        :JDataType => 'org.apache.spark.sql.types.DataType',\n        :JSQLContext => 'org.apache.spark.sql.SQLContext',\n        :JDenseVector => 'org.apache.spark.mllib.linalg.DenseVector',\n        :JDenseMatrix => 'org.apache.spark.mllib.linalg.DenseMatrix',\n        :JStorageLevel => 'org.apache.spark.storage.StorageLevel',\n        :JSQLFunctions => 'org.apache.spark.sql.functions'\n      ]\n\n      JAVA_TEST_OBJECTS = [\n        'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI'\n      ]\n\n      RUBY_TO_JAVA_SKIP = [Fixnum, Integer]\n\n      def initialize(target)\n        @target = target\n      end\n\n      # Import all important classes into Objects\n      def import_all\n        return if @imported\n\n        java_objects.each do |name, klass|\n          import(name, klass)\n        end\n\n        @imported = true\n        nil\n      end\n\n      # Import classes for testing\n      def import_all_test\n        return if @imported_test\n\n        java_test_objects.each do |name, klass|\n          import(name, klass)\n        end\n\n        @imported_test = true\n        nil\n      end\n\n      # Call java object\n      def call(klass, method, *args)\n        # To java\n        args.map!{|item| to_java(item)}\n\n        # Call java\n        result = klass.__send__(method, *args)\n\n        # To ruby\n        to_ruby(result)\n      end\n\n      def to_array_list(array)\n        array_list = ArrayList.new\n        array.each do |item|\n          array_list.add(to_java(item))\n        end\n        array_list\n      end\n\n      def to_seq(array)\n        PythonUtils.toSeq(to_array_list(array))\n      end\n\n      def to_long(number)\n        return nil if number.nil?\n        JLong.new(number)\n      end\n\n      def to_java(object)\n        if RUBY_TO_JAVA_SKIP.include?(object.class)\n          # Some object are convert automatically\n          # This is for preventing errors\n          # For example: jruby store integer as long so 1.to_java is Long\n          object\n        elsif object.respond_to?(:to_java)\n          object.to_java\n        elsif object.is_a?(Array)\n          to_array_list(object)\n        else\n          object\n        end\n      end\n\n      # Array problem:\n      #   Rjb:   object.toArray -> Array\n      #   Jruby: object.toArray -> 
java.lang.Object\n      #\n      def to_ruby(object)\n        # Java object\n        if java_object?(object)\n          class_name = object.getClass.getSimpleName\n          case class_name\n          when 'ArraySeq'\n            result = []\n            iterator = object.iterator\n            while iterator.hasNext\n              result << to_ruby(iterator.next)\n            end\n            result\n          when 'Map2', 'Map3', 'Map4', 'HashTrieMap'\n            Hash[\n              object.toSeq.array.to_a.map!{|item| [item._1, item._2]}\n            ]\n          when 'SeqWrapper'; object.toArray.to_a.map!{|item| to_ruby(item)}\n          when 'ofRef';      object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef\n          when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object)\n          when 'DenseVector';  Spark::Mllib::DenseVector.from_java(object)\n          when 'KMeansModel';  Spark::Mllib::KMeansModel.from_java(object)\n          when 'DenseMatrix';  Spark::Mllib::DenseMatrix.from_java(object)\n          when 'GenericRowWithSchema'; Spark::SQL::Row.from_java(object, true)\n          else\n            # Some RDD\n            if class_name != 'JavaRDD' && class_name.end_with?('RDD')\n              object = object.toJavaRDD\n              class_name = 'JavaRDD'\n            end\n\n            # JavaRDD\n            if class_name == 'JavaRDD'\n              jrdd = RubyRDD.toRuby(object)\n\n              serializer = Spark::Serializer.build { __batched__(__marshal__) }\n              serializer = Spark::Serializer.build { __batched__(__marshal__, 2) }\n\n              return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)\n            end\n\n            # Unknow\n            Spark.logger.warn(\"Java object '#{object.getClass.name}' was not converted.\")\n            object\n          end\n\n        # Array can be automatically transfered but content not\n        elsif object.is_a?(Array)\n          object.map! do |item|\n            to_ruby(item)\n          end\n          object\n\n        # Already transfered\n        else\n          object\n        end\n      end\n\n      alias_method :java_to_ruby, :to_ruby\n      alias_method :ruby_to_java, :to_java\n\n      private\n\n        def jars\n          result = Dir.glob(File.join(@target, '*.jar'))\n          result.flatten!\n          result\n        end\n\n        def objects_with_names(objects)\n          hash = {}\n          objects.each do |object|\n            if object.is_a?(Hash)\n              hash.merge!(object)\n            else\n              key = object.split('.').last.to_sym\n              hash[key] = object\n            end\n          end\n          hash\n        end\n\n        def java_objects\n          objects_with_names(JAVA_OBJECTS)\n        end\n\n        def java_test_objects\n          objects_with_names(JAVA_TEST_OBJECTS)\n        end\n\n        def raise_missing_class(klass)\n          raise Spark::JavaBridgeError, \"Class #{klass} is missing. Make sure that Spark and RubySpark is assembled.\"\n        end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/java_bridge/jruby.rb",
    "content": "require 'java'\n\nmodule Spark\n  module JavaBridge\n    class JRuby < Base\n\n      def initialize(*args)\n        super\n        jars.each {|jar| require jar}\n      end\n\n      def import(name, klass)\n        klass = \"Java::#{klass}\"\n        Object.const_set(name, eval(klass))\n      rescue NameError\n        raise_missing_class(klass)\n      end\n\n      def java_object?(object)\n        object.is_a?(JavaProxy)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/java_bridge/rjb.rb",
    "content": "if !ENV.has_key?('JAVA_HOME')\n  raise Spark::ConfigurationError, 'Environment variable JAVA_HOME is not set'\nend\n\nrequire 'rjb'\n\nmodule Spark\n  module JavaBridge\n    class RJB < Base\n\n      def initialize(*args)\n        super\n        Rjb.load(jars)\n        Rjb.primitive_conversion = true\n      end\n\n      def import(name, klass)\n        Object.const_set(name, silence_warnings { Rjb.import(klass) })\n      rescue NoClassDefFoundError\n        raise_missing_class(klass)\n      end\n\n      def java_object?(object)\n        object.is_a?(Rjb::Rjb_JavaProxy)\n      end\n\n      private\n\n        def jars\n          separator = windows? ? ';' : ':'\n          super.join(separator)\n        end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/java_bridge.rb",
    "content": "module Spark\n  module JavaBridge\n\n    autoload :Base,  'spark/java_bridge/base'\n    autoload :JRuby, 'spark/java_bridge/jruby'\n    autoload :RJB,   'spark/java_bridge/rjb'\n\n    include Spark::Helper::System\n\n    def self.init(*args)\n      if jruby?\n        klass = JRuby\n      else\n        klass = RJB\n      end\n\n      klass.new(*args)\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/library.rb",
    "content": "module Spark\n  module Library\n\n    def autoload(klass, location, import=true)\n      if import\n        @for_importing ||= []\n        @for_importing << klass\n      end\n\n      super(klass, location)\n    end\n\n    def autoload_without_import(klass, location)\n      autoload(klass, location, false)\n    end\n\n    def import(to=Object)\n      @for_importing.each do |klass|\n        to.const_set(klass, const_get(klass))\n      end\n      nil\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/logger.rb",
    "content": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n  class Logger\n\n    attr_reader :jlogger\n\n    def initialize\n      @jlogger = JLogger.getLogger('Ruby')\n    end\n\n    def level_off\n      JLevel.toLevel('OFF')\n    end\n\n    # Disable all Spark log\n    def disable\n      jlogger.setLevel(level_off)\n      JLogger.getLogger('org').setLevel(level_off)\n      JLogger.getLogger('akka').setLevel(level_off)\n      JLogger.getRootLogger.setLevel(level_off)\n    end\n\n    def enabled?\n      !disabled?\n    end\n\n    def info(message)\n      jlogger.info(message) if info?\n    end\n\n    def debug(message)\n      jlogger.debug(message) if debug?\n    end\n\n    def trace(message)\n      jlogger.trace(message) if trace?\n    end\n\n    def warning(message)\n      jlogger.warn(message) if warning?\n    end\n\n    def error(message)\n      jlogger.error(message) if error?\n    end\n\n    def info?\n      level_enabled?('info')\n    end\n\n    def debug?\n      level_enabled?('debug')\n    end\n\n    def trace?\n      level_enabled?('trace')\n    end\n\n    def warning?\n      level_enabled?('warn')\n    end\n\n    def error?\n      level_enabled?('error')\n    end\n\n    def level_enabled?(type)\n      jlogger.isEnabledFor(JPriority.toPriority(type.upcase))\n    end\n\n    alias_method :warn, :warning\n\n  end\nend\n"
  },
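  {
    "path": "example/logger_example.rb",
    "content": "# Hypothetical usage sketch -- not part of the gem. Spark::Logger wraps the\n# log4j logger exposed through the Java bridge, so it needs a started\n# context/JVM before use.\nrequire 'ruby-spark'\n\nSpark.start\n\nSpark.logger.info('visible while the INFO level is enabled')\n\n# disable switches the Ruby, org, akka and root loggers to OFF,\n# after which every log call becomes a no-op.\nSpark.logger.disable\nSpark.logger.warn('this is silently dropped')\n"
  },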
  {
    "path": "lib/spark/mllib/classification/common.rb",
    "content": "module Spark\n  module Mllib\n    class ClassificationModel\n\n      attr_reader :weights, :intercept, :threshold\n\n      def initialize(weights, intercept)\n        @weights = Spark::Mllib::Vectors.to_vector(weights)\n        @intercept = intercept.to_f\n        @threshold = nil\n      end\n\n      def threshold=(value)\n        @threshold = value.to_f\n      end\n\n      def clear_threshold\n        @threshold = nil\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    class ClassificationMethodBase < RegressionMethodBase\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/classification/logistic_regression.rb",
    "content": "module Spark\n  module Mllib\n    ##\n    # LogisticRegressionModel\n    #\n    # A linear binary classification model derived from logistic regression.\n    #\n    # == Examples:\n    #\n    #   Spark::Mllib.import\n    #\n    #   # Dense vectors\n    #   data = [\n    #     LabeledPoint.new(0.0, [0.0, 1.0]),\n    #     LabeledPoint.new(1.0, [1.0, 0.0]),\n    #   ]\n    #   lrm = LogisticRegressionWithSGD.train($sc.parallelize(data))\n    #\n    #   lrm.predict([1.0, 0.0])\n    #   # => 1\n    #   lrm.predict([0.0, 1.0])\n    #   # => 0\n    #\n    #   lrm.clear_threshold\n    #   lrm.predict([0.0, 1.0])\n    #   # => 0.123...\n    #\n    #\n    #   # Sparse vectors\n    #   data = [\n    #     LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})),\n    #     LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})),\n    #     LabeledPoint.new(0.0, SparseVector.new(2, {0 => 1.0})),\n    #     LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0}))\n    #   ]\n    #   lrm = LogisticRegressionWithSGD.train($sc.parallelize(data))\n    #\n    #   lrm.predict([0.0, 1.0])\n    #   # => 1\n    #   lrm.predict([1.0, 0.0])\n    #   # => 0\n    #   lrm.predict(SparseVector.new(2, {1 => 1.0}))\n    #   # => 1\n    #   lrm.predict(SparseVector.new(2, {0 => 1.0}))\n    #   # => 0\n    #\n    #\n    #   # LogisticRegressionWithLBFGS\n    #   data = [\n    #     LabeledPoint.new(0.0, [0.0, 1.0]),\n    #     LabeledPoint.new(1.0, [1.0, 0.0]),\n    #   ]\n    #   lrm = LogisticRegressionWithLBFGS.train($sc.parallelize(data))\n    #\n    #   lrm.predict([1.0, 0.0])\n    #   # => 1\n    #   lrm.predict([0.0, 1.0])\n    #   # => 0\n    #\n    class LogisticRegressionModel < ClassificationModel\n\n      def initialize(*args)\n        super\n        @threshold = 0.5\n      end\n\n      # Predict values for a single data point or an RDD of points using\n      # the model trained.\n      def predict(vector)\n        vector = Spark::Mllib::Vectors.to_vector(vector)\n        margin = weights.dot(vector) + intercept\n        score = 1.0 / (1.0 + Math.exp(-margin))\n\n        if threshold.nil?\n          return score\n        end\n\n        if score > threshold\n          1\n        else\n          0\n        end\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    class LogisticRegressionWithSGD < ClassificationMethodBase\n\n      DEFAULT_OPTIONS = {\n        iterations: 100,\n        step: 1.0,\n        mini_batch_fraction: 1.0,\n        initial_weights: nil,\n        reg_param: 0.01,\n        reg_type: 'l2',\n        intercept: false,\n        validate: true,\n        convergence_tol: 0.001\n      }\n\n      # Train a logistic regression model on the given data.\n      #\n      # == Arguments:\n      # rdd::\n      #   The training data, an RDD of LabeledPoint.\n      #\n      # iterations::\n      #   The number of iterations (default: 100).\n      #\n      # step::\n      #   The step parameter used in SGD (default: 1.0).\n      #\n      # mini_batch_fraction::\n      #   Fraction of data to be used for each SGD iteration.\n      #\n      # initial_weights::\n      #   The initial weights (default: nil).\n      #\n      # reg_param::\n      #   The regularizer parameter (default: 0.01).\n      #\n      # reg_type::\n      #   The type of regularizer used for training our model (default: \"l2\").\n      #\n      #   Allowed values:\n      #   - \"l1\" for using L1 regularization\n      #   - \"l2\" for using L2 regularization\n      #   - nil for no regularization\n      #\n      
# intercept::\n      #   Boolean parameter which indicates the use\n      #   or not of the augmented representation for\n      #   training data (i.e. whether bias features\n      #   are activated or not).\n      #   (default: false)\n      #\n      # validate::\n      #   Boolean parameter which indicates if the\n      #   algorithm should validate data before training.\n      #   (default: true)\n      #\n      # convergence_tol::\n      #   A condition which decides iteration termination.\n      #   (default: 0.001)\n      #\n      def self.train(rdd, options={})\n        super\n\n        weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithSGD', rdd,\n                                           options[:iterations].to_i,\n                                           options[:step].to_f,\n                                           options[:mini_batch_fraction].to_f,\n                                           options[:initial_weights],\n                                           options[:reg_param].to_f,\n                                           options[:reg_type],\n                                           options[:intercept],\n                                           options[:validate],\n                                           options[:convergence_tol])\n\n        LogisticRegressionModel.new(weights, intercept)\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    class LogisticRegressionWithLBFGS < ClassificationMethodBase\n\n      DEFAULT_OPTIONS = {\n        iterations: 100,\n        initial_weights: nil,\n        reg_param: 0.01,\n        reg_type: 'l2',\n        intercept: false,\n        corrections: 10,\n        tolerance: 0.0001\n      }\n\n      # Train a logistic regression model on the given data.\n      #\n      # == Arguments:\n      # rdd::\n      #   The training data, an RDD of LabeledPoint.\n      #\n      # iterations::\n      #   The number of iterations (default: 100).\n      #\n      # initial_weights::\n      #   The initial weights (default: nil).\n      #\n      # reg_param::\n      #   The regularizer parameter (default: 0.01).\n      #\n      # reg_type::\n      #   The type of regularizer used for training our model (default: \"l2\").\n      #\n      #   Allowed values:\n      #   - \"l1\" for using L1 regularization\n      #   - \"l2\" for using L2 regularization\n      #   - nil for no regularization\n      #\n      # intercept::\n      #   Boolean parameter which indicates the use\n      #   or not of the augmented representation for\n      #   training data (i.e. 
whether bias features\n      #   are activated or not).\n      #\n      # corrections::\n      #   The number of corrections used in the LBFGS update (default: 10).\n      #\n      # tolerance::\n      #   The convergence tolerance of iterations for L-BFGS (default: 0.0001).\n      #\n      def self.train(rdd, options={})\n        super\n\n        weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLogisticRegressionModelWithLBFGS', rdd,\n                                           options[:iterations].to_i,\n                                           options[:initial_weights],\n                                           options[:reg_param].to_f,\n                                           options[:reg_type],\n                                           options[:intercept],\n                                           options[:corrections].to_i,\n                                           options[:tolerance].to_f)\n\n        LogisticRegressionModel.new(weights, intercept)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/classification/naive_bayes.rb",
    "content": "module Spark\n  module Mllib\n    ##\n    # NaiveBayesModel\n    #\n    # Model for Naive Bayes classifiers.\n    #\n    # Contains two parameters:\n    # pi:: vector of logs of class priors (dimension C)\n    # theta:: matrix of logs of class conditional probabilities (CxD)\n    #\n    # == Examples:\n    #\n    #   Spark::Mllib.import\n    #\n    #   # Dense vectors\n    #   data = [\n    #     LabeledPoint.new(0.0, [0.0, 0.0]),\n    #     LabeledPoint.new(0.0, [0.0, 1.0]),\n    #     LabeledPoint.new(1.0, [1.0, 0.0])\n    #   ]\n    #   model = NaiveBayes.train($sc.parallelize(data))\n    #\n    #   model.predict([0.0, 1.0])\n    #   # => 0.0\n    #   model.predict([1.0, 0.0])\n    #   # => 1.0\n    #\n    #\n    #   # Sparse vectors\n    #   data = [\n    #     LabeledPoint.new(0.0, SparseVector.new(2, {1 => 0.0})),\n    #     LabeledPoint.new(0.0, SparseVector.new(2, {1 => 1.0})),\n    #     LabeledPoint.new(1.0, SparseVector.new(2, {0 => 1.0}))\n    #   ]\n    #   model = NaiveBayes.train($sc.parallelize(data))\n    #\n    #   model.predict(SparseVector.new(2, {1 => 1.0}))\n    #   # => 0.0\n    #   model.predict(SparseVector.new(2, {0 => 1.0}))\n    #   # => 1.0\n    #\n    class NaiveBayesModel\n\n      attr_reader :labels, :pi, :theta\n\n      def initialize(labels, pi, theta)\n        @labels = labels\n        @pi = pi\n        @theta = theta\n      end\n\n      # Predict values for a single data point or an RDD of points using\n      # the model trained.\n      def predict(vector)\n        vector = Spark::Mllib::Vectors.to_vector(vector)\n        array = (vector.dot(theta) + pi).to_a\n        index = array.index(array.max)\n        labels[index]\n      end\n\n    end\n  end\nend\n\n\nmodule Spark\n  module Mllib\n    class NaiveBayes\n\n      # Trains a Naive Bayes model given an RDD of (label, features) pairs.\n      #\n      # This is the Multinomial NB (http://tinyurl.com/lsdw6p) which can handle all kinds of\n      # discrete data.  For example, by converting documents into TF-IDF vectors, it can be used for\n      # document classification.  By making every vector a 0-1 vector, it can also be used as\n      # Bernoulli NB (http://tinyurl.com/p7c96j6). The input feature values must be nonnegative.\n      #\n      # == Arguments:\n      # rdd:: RDD of LabeledPoint.\n      # lambda:: The smoothing parameter.\n      #\n      def self.train(rdd, lambda=1.0)\n        # Validation\n        first = rdd.first\n        unless first.is_a?(LabeledPoint)\n          raise Spark::MllibError, \"RDD should contains LabeledPoint, got #{first.class}\"\n        end\n\n        labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda)\n        theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta)\n\n        NaiveBayesModel.new(labels, pi, theta)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/classification/svm.rb",
    "content": "module Spark\n  module Mllib\n    ##\n    # SVMModel\n    #\n    # A support vector machine.\n    #\n    # == Examples:\n    #\n    #   Spark::Mllib.import\n    #\n    #   # Dense vectors\n    #   data = [\n    #       LabeledPoint.new(0.0, [0.0]),\n    #       LabeledPoint.new(1.0, [1.0]),\n    #       LabeledPoint.new(1.0, [2.0]),\n    #       LabeledPoint.new(1.0, [3.0])\n    #   ]\n    #   svm = SVMWithSGD.train($sc.parallelize(data))\n    #\n    #   svm.predict([1.0])\n    #   # => 1\n    #   svm.clear_threshold\n    #   svm.predict([1.0])\n    #   # => 1.25...\n    #\n    #\n    #   # Sparse vectors\n    #   data = [\n    #       LabeledPoint.new(0.0, SparseVector.new(2, {0 => -1.0})),\n    #       LabeledPoint.new(1.0, SparseVector.new(2, {1 => 1.0})),\n    #       LabeledPoint.new(0.0, SparseVector.new(2, {0 => 0.0})),\n    #       LabeledPoint.new(1.0, SparseVector.new(2, {1 => 2.0}))\n    #   ]\n    #   svm = SVMWithSGD.train($sc.parallelize(data))\n    #\n    #   svm.predict(SparseVector.new(2, {1 => 1.0}))\n    #   # => 1\n    #   svm.predict(SparseVector.new(2, {0 => -1.0}))\n    #   # => 0\n    #\n    class SVMModel < ClassificationModel\n\n      def initialize(*args)\n        super\n        @threshold = 0.0\n      end\n\n      # Predict values for a single data point or an RDD of points using\n      # the model trained.\n      def predict(vector)\n        vector = Spark::Mllib::Vectors.to_vector(vector)\n        margin = weights.dot(vector) + intercept\n\n        if threshold.nil?\n          return margin\n        end\n\n        if margin > threshold\n          1\n        else\n          0\n        end\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    class SVMWithSGD < ClassificationMethodBase\n\n      DEFAULT_OPTIONS = {\n        iterations: 100,\n        step: 1.0,\n        reg_param: 0.01,\n        mini_batch_fraction: 1.0,\n        initial_weights: nil,\n        reg_type: 'l2',\n        intercept: false,\n        validate: true,\n        convergence_tol: 0.001\n      }\n\n      # Train a support vector machine on the given data.\n      #\n      # rdd::\n      #   The training data, an RDD of LabeledPoint.\n      #\n      # iterations::\n      #   The number of iterations (default: 100).\n      #\n      # step::\n      #   The step parameter used in SGD (default: 1.0).\n      #\n      # reg_param::\n      #   The regularizer parameter (default: 0.01).\n      #\n      # mini_batch_fraction::\n      #   Fraction of data to be used for each SGD iteration.\n      #\n      # initial_weights::\n      #   The initial weights (default: nil).\n      #\n      # reg_type::\n      #   The type of regularizer used for training our model (default: \"l2\").\n      #\n      #   Allowed values:\n      #   - \"l1\" for using L1 regularization\n      #   - \"l2\" for using L2 regularization\n      #   - nil for no regularization\n      #\n      # intercept::\n      #   Boolean parameter which indicates the use\n      #   or not of the augmented representation for\n      #   training data (i.e. 
whether bias features\n      #   are activated or not).\n      #   (default: false)\n      #\n      # validateData::\n      #   Boolean parameter which indicates if the\n      #   algorithm should validate data before training.\n      #   (default: true)\n      #\n      # convergence_tol::\n      #   A condition which decides iteration termination.\n      #   (default: 0.001)\n      #\n      def self.train(rdd, options={})\n        super\n\n        weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainSVMModelWithSGD', rdd,\n                                           options[:iterations].to_i,\n                                           options[:step].to_f,\n                                           options[:reg_param].to_f,\n                                           options[:mini_batch_fraction].to_f,\n                                           options[:initial_weights],\n                                           options[:reg_type],\n                                           options[:intercept],\n                                           options[:validate],\n                                           options[:convergence_tol])\n\n        SVMModel.new(weights, intercept)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/clustering/gaussian_mixture.rb",
    "content": "module Spark\n  module Mllib\n    ##\n    # GaussianMixtureModel\n    #\n    # A clustering model derived from the Gaussian Mixture Model method.\n    #\n    # == Examples:\n    #\n    #   Spark::Mllib.import\n    #\n    #   data = [\n    #     DenseVector.new([-0.1, -0.05]),\n    #     DenseVector.new([-0.01, -0.1]),\n    #     DenseVector.new([0.9, 0.8]),\n    #     DenseVector.new([0.75, 0.935]),\n    #     DenseVector.new([-0.83, -0.68]),\n    #     DenseVector.new([-0.91, -0.76])\n    #   ]\n    #\n    #   model = GaussianMixture.train($sc.parallelize(data), 3, convergence_tol: 0.0001, max_iterations: 50, seed: 10)\n    #\n    #   labels = model.predict($sc.parallelize(data)).collect\n    #\n    class GaussianMixtureModel\n\n      attr_reader :weights, :gaussians, :k\n\n      def initialize(weights, gaussians)\n        @weights = weights\n        @gaussians = gaussians\n        @k = weights.size\n      end\n\n      # Find the cluster to which the points in 'x' has maximum membership\n      # in this model.\n      def predict(rdd)\n        if rdd.is_a?(Spark::RDD)\n          predict_soft(rdd).map('lambda{|x| x.index(x.max)}')\n        else\n          raise ArgumentError, 'Argument must be a RDD.'\n        end\n      end\n\n      # Find the membership of each point in 'x' to all mixture components.\n      def predict_soft(rdd)\n        Spark.jb.call(RubyMLLibAPI.new, 'predictSoftGMM', rdd, weights, means, sigmas)\n      end\n\n      def means\n        @means ||= @gaussians.map(&:mu)\n      end\n\n      def sigmas\n        @sigmas ||= @gaussians.map(&:sigma)\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    class GaussianMixture\n\n      def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil)\n        weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd,\n                                               k, convergence_tol, max_iterations, Spark.jb.to_long(seed))\n\n        means.map! {|mu|    Spark.jb.java_to_ruby(mu)}\n        sigmas.map!{|sigma| Spark.jb.java_to_ruby(sigma)}\n\n        mvgs = Array.new(k) do |i|\n          MultivariateGaussian.new(means[i], sigmas[i])\n        end\n\n        GaussianMixtureModel.new(weights, mvgs)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/clustering/kmeans.rb",
    "content": "module Spark\n  module Mllib\n    ##\n    # KMeansModel\n    #\n    # A clustering model derived from the k-means method.\n    #\n    # == Examples:\n    #\n    #   Spark::Mllib.import\n    #\n    #   # Dense vectors\n    #   data = [\n    #     DenseVector.new([0.0,0.0]),\n    #     DenseVector.new([1.0,1.0]),\n    #     DenseVector.new([9.0,8.0]),\n    #     DenseVector.new([8.0,9.0])\n    #   ]\n    #\n    #   model = KMeans.train($sc.parallelize(data), 2, max_iterations: 10,\n    #                        runs: 30, initialization_mode: \"random\")\n    #\n    #   model.predict([0.0, 0.0]) == model.predict([1.0, 1.0])\n    #   # => true\n    #   model.predict([8.0, 9.0]) == model.predict([9.0, 8.0])\n    #   # => true\n    #\n    #\n    #   # Sparse vectors\n    #   data = [\n    #       SparseVector.new(3, {1 => 1.0}),\n    #       SparseVector.new(3, {1 => 1.1}),\n    #       SparseVector.new(3, {2 => 1.0}),\n    #       SparseVector.new(3, {2 => 1.1})\n    #   ]\n    #   model = KMeans.train($sc.parallelize(data), 2, initialization_mode: \"k-means||\")\n    #\n    #   model.predict([0.0, 1.0, 0.0]) == model.predict([0, 1.1, 0.0])\n    #   # => true\n    #   model.predict([0.0, 0.0, 1.0]) == model.predict([0, 0, 1.1])\n    #   # => true\n    #   model.predict(data[0]) == model.predict(data[1])\n    #   # => true\n    #   model.predict(data[2]) == model.predict(data[3])\n    #   # => true\n    #\n    class KMeansModel\n\n      attr_reader :centers\n\n      def initialize(centers)\n        @centers = centers\n      end\n\n      # Find the cluster to which x belongs in this model.\n      def predict(vector)\n        vector = Spark::Mllib::Vectors.to_vector(vector)\n        best = 0\n        best_distance = Float::INFINITY\n\n        @centers.each_with_index do |center, index|\n          distance = vector.squared_distance(center)\n          if distance < best_distance\n            best = index\n            best_distance = distance\n          end\n        end\n\n        best\n      end\n\n      def self.from_java(object)\n        centers = object.clusterCenters\n        centers.map! do |center|\n          Spark.jb.java_to_ruby(center)\n        end\n\n        KMeansModel.new(centers)\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    class KMeans\n\n      # Trains a k-means model using the given set of parameters.\n      #\n      # == Arguments:\n      # rdd::\n      #   The training data, an RDD of Vectors.\n      #\n      # k::\n      #   Number of clusters.\n      #\n      # max_iterations::\n      #   Max number of iterations.\n      #\n      # runs::\n      #   Number of parallel runs, defaults to 1. The best model is returned.\n      #\n      # initialization_mode::\n      #   Initialization model, either \"random\" or \"k-means||\" (default).\n      #\n      # seed::\n      #   Random seed value for cluster initialization.\n      #\n      # epsilon::\n      #   The distance threshold within which we've consider centers to have converged.\n      #\n      def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil,\n                             initialization_steps: 5, epsilon: 0.0001)\n\n        cluster_initial_model = []\n\n        # Call returns KMeansModel\n        Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd,\n                      k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon, cluster_initial_model)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/matrix.rb",
    "content": "module Spark\n  module Mllib\n    module Matrices\n\n      def self.dense(*args)\n        DenseMatrix.new(*args)\n      end\n\n      def self.sparse(*args)\n        SparseMatrix.new(*args)\n      end\n\n      def self.to_matrix(data)\n        if data.is_a?(SparseMatrix) || data.is_a?(DenseMatrix)\n          data\n        elsif data.is_a?(Array)\n          DenseMatrix.new(data)\n        end\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    # @abstract Parent for all type of matrices\n    class MatrixBase < MatrixAdapter\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    ##\n    # DenseMatrix\n    #\n    #   DenseMatrix.new(2, 3, [[1,2,3], [4,5,6]]).values\n    #   # => [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]\n    #\n    class DenseMatrix < MatrixBase\n\n      def initialize(rows, cols, values)\n        super(:dense, rows, cols, values.to_a)\n      end\n\n      def to_java\n        JDenseMatrix.new(shape[0], shape[1], values.flatten)\n      end\n\n      def self.from_java(object)\n        rows = object.numRows\n        cols = object.numCols\n        values = object.values\n\n        DenseMatrix.new(rows, cols, values)\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    ##\n    # SparseMatrix\n    #\n    # == Arguments:\n    # rows::\n    #   Number of rows.\n    #\n    # cols::\n    #   Number of columns.\n    #\n    # col_pointers::\n    #   The index corresponding to the start of a new column.\n    #\n    # row_indices::\n    #   The row index of the entry. They must be in strictly\n    #   increasing order for each column.\n    #\n    # values::\n    #   Nonzero matrix entries in column major.\n    #\n    # == Examples:\n    #\n    #   SparseMatrix.new(3, 3, [0, 2, 3, 6], [0, 2, 1, 0, 1, 2], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).values\n    #\n    #   # => [\n    #   #      [1.0, 0.0, 4.0],\n    #   #      [0.0, 3.0, 5.0],\n    #   #      [2.0, 0.0, 6.0]\n    #   #    ]\n    #\n    class SparseMatrix < MatrixBase\n\n      attr_reader :col_pointers, :row_indices\n\n      def initialize(rows, cols, col_pointers, row_indices, values)\n        super(:sparse, rows, cols)\n\n        @col_pointers = col_pointers\n        @row_indices = row_indices\n        @values = values\n\n        j = 0\n        while j < cols\n          idx = col_pointers[j]\n          idx_end = col_pointers[j+1]\n          while idx < idx_end\n            self[row_indices[idx], j] = values[idx]\n            idx += 1\n          end\n          j += 1\n        end\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/regression/common.rb",
    "content": "module Spark\n  module Mllib\n    ##\n    # RegressionModel\n    #\n    # A linear model that has a vector of coefficients and an intercept.\n    #\n    class RegressionModel\n\n      attr_reader :weights, :intercept\n\n      def initialize(weights, intercept)\n        @weights = Spark::Mllib::Vectors.to_vector(weights)\n        @intercept = intercept.to_f\n      end\n\n      # Predict the value of the dependent variable given a vector data\n      # containing values for the independent variables.\n      #\n      # == Examples:\n      #   lm = RegressionModel.new([1.0, 2.0], 0.1)\n      #\n      #   lm.predict([-1.03, 7.777]) - 14.624 < 1e-6\n      #   # => true\n      #\n      #   lm.predict(SparseVector.new(2, {0 => -1.03, 1 => 7.777})) - 14.624 < 1e-6\n      #   # => true\n      #\n      def predict(data)\n        data = Spark::Mllib::Vectors.to_vector(data)\n        @weights.dot(data) + @intercept\n      end\n\n    end\n  end\nend\n\n\nmodule Spark\n  module Mllib\n    ##\n    # RegressionMethodBase\n    #\n    # Parent for regression methods\n    #\n    class RegressionMethodBase\n\n      def self.train(rdd, options)\n        # String keys to symbols\n        options.symbolize_keys!\n\n        # Reverse merge\n        self::DEFAULT_OPTIONS.each do |key, value|\n          if options.has_key?(key)\n            # value from user\n          else\n            options[key] = value\n          end\n        end\n\n        # Validation\n        first = rdd.first\n        unless first.is_a?(LabeledPoint)\n          raise Spark::MllibError, \"RDD should contains LabeledPoint, got #{first.class}\"\n        end\n\n        # Initial weights is optional for user (not for Spark)\n        options[:initial_weights] = Vectors.to_vector(options[:initial_weights] || [0.0] * first.features.size)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/regression/labeled_point.rb",
    "content": "module Spark\n  module Mllib\n    ##\n    # LabeledPoint\n    #\n    # The features and labels of a data point.\n    #\n    # == Parameters:\n    # label::\n    #   Label for this data point.\n    #\n    # features::\n    #   Vector of features for this point\n    #\n    class LabeledPoint\n\n      attr_reader :label, :features\n\n      def initialize(label, features)\n        @label = label.to_f\n        @features = Spark::Mllib::Vectors.to_vector(features)\n      end\n\n      def self.from_java(object)\n        LabeledPoint.new(\n          object.label,\n          Spark.jb.java_to_ruby(object.features)\n        )\n      end\n\n      def marshal_dump\n        [@label, @features]\n      end\n\n      def marshal_load(array)\n        initialize(array[0], array[1])\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/regression/lasso.rb",
    "content": "##\n# LassoModel\n#\n# Train a regression model with L1-regularization using Stochastic Gradient Descent.\n# This solves the l1-regularized least squares regression formulation\n#   f(weights) = 1/2n ||A weights-y||^2^  + regParam ||weights||_1\n# Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with\n# its corresponding right hand side label y.\n# See also the documentation for the precise formulation.\n#\n# == Examples:\n#\n#   Spark::Mllib.import\n#\n#   # Dense vectors\n#   data = [\n#       LabeledPoint.new(0.0, [0.0]),\n#       LabeledPoint.new(1.0, [1.0]),\n#       LabeledPoint.new(3.0, [2.0]),\n#       LabeledPoint.new(2.0, [3.0])\n#   ]\n#   lrm = LassoWithSGD.train($sc.parallelize(data), initial_weights: [1.0])\n#\n#   lrm.predict([0.0]) - 0 < 0.5\n#   # => true\n#\n#   lrm.predict([1.0]) - 1 < 0.5\n#   # => true\n#\n#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5\n#   # => true\n#\n#\n#   # Sparse vectors\n#   data = [\n#       LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})),\n#       LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})),\n#       LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})),\n#       LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0}))\n#   ]\n#   lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])\n#\n#   lrm.predict([0.0]) - 0 < 0.5\n#   # => true\n#\n#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5\n#   # => true\n#\nclass Spark::Mllib::LassoModel < Spark::Mllib::RegressionModel\nend\n\nmodule Spark\n  module Mllib\n    class LassoWithSGD < RegressionMethodBase\n\n      DEFAULT_OPTIONS = {\n        iterations: 100,\n        step: 1.0,\n        reg_param: 0.01,\n        mini_batch_fraction: 1.0,\n        initial_weights: nil,\n        intercept: false,\n        validate: true,\n        convergence_tol: 0.001\n      }\n\n      # Train a Lasso regression model on the given data.\n      #\n      # == Parameters:\n      # rdd::\n      #   The training data (RDD instance).\n      #\n      # iterations::\n      #   The number of iterations (default: 100).\n      #\n      # step::\n      #   The step parameter used in SGD (default: 1.0).\n      #\n      # reg_param::\n      #   The regularizer parameter (default: 0.0).\n      #\n      # mini_batch_fraction::\n      #   Fraction of data to be used for each SGD iteration (default: 1.0).\n      #\n      # initial_weights::\n      #   The initial weights (default: nil).\n      #\n      # intercept::\n      #   Boolean parameter which indicates the use\n      #   or not of the augmented representation for\n      #   training data (i.e. 
\n        weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLassoModelWithSGD', rdd,\n                                           options[:iterations].to_i,\n                                           options[:step].to_f,\n                                           options[:reg_param].to_f,\n                                           options[:mini_batch_fraction].to_f,\n                                           options[:initial_weights],\n                                           options[:intercept],\n                                           options[:validate],\n                                           options[:convergence_tol])\n\n        LassoModel.new(weights, intercept)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/regression/linear.rb",
    "content": "##\n# LinearRegressionModel\n#\n# Train a linear regression model with no regularization using Stochastic Gradient Descent.\n# This solves the least squares regression formulation\n#   f(weights) = 1/n ||A weights-y||^2^\n# (which is the mean squared error).\n# Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with\n# its corresponding right hand side label y.\n# See also the documentation for the precise formulation.\n#\n# == Examples:\n#\n#   Spark::Mllib.import\n#\n#   # Dense vectors\n#   data = [\n#     LabeledPoint.new(0.0, [0.0]),\n#     LabeledPoint.new(1.0, [1.0]),\n#     LabeledPoint.new(3.0, [2.0]),\n#     LabeledPoint.new(2.0, [3.0])\n#   ]\n#   lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])\n#\n#   lrm.intercept # => 0.0\n#   lrm.weights   # => [0.9285714285714286]\n#\n#   lrm.predict([0.0]) < 0.5\n#   # => true\n#\n#   lrm.predict([1.0]) - 1 < 0.5\n#   # => true\n#\n#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5\n#   # => true\n#\n#   # Sparse vectors\n#   data = [\n#     LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})),\n#     LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})),\n#     LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})),\n#     LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0}))\n#   ]\n#   lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])\n#\n#   lrm.intercept # => 0.0\n#   lrm.weights   # => [0.9285714285714286]\n#\n#   lrm.predict([0.0]) < 0.5\n#   # => true\n#\n#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5\n#   # => true\n#\nclass Spark::Mllib::LinearRegressionModel < Spark::Mllib::RegressionModel\nend\n\nmodule Spark\n  module Mllib\n    class LinearRegressionWithSGD < RegressionMethodBase\n\n      DEFAULT_OPTIONS = {\n        iterations: 100,\n        step: 1.0,\n        mini_batch_fraction: 1.0,\n        initial_weights: nil,\n        reg_param: 0.0,\n        reg_type: nil,\n        intercept: false,\n        validate: true,\n        convergence_tol: 0.001\n      }\n\n      # Train a linear regression model on the given data.\n      #\n      # == Parameters:\n      # rdd::\n      #   The training data (RDD instance).\n      #\n      # iterations::\n      #   The number of iterations (default: 100).\n      #\n      # step::\n      #   The step parameter used in SGD (default: 1.0).\n      #\n      # mini_batch_fraction::\n      #   Fraction of data to be used for each SGD iteration (default: 1.0).\n      #\n      # initial_weights::\n      #   The initial weights (default: nil).\n      #\n      # reg_param::\n      #   The regularizer parameter (default: 0.0).\n      #\n      # reg_type::\n      #   The type of regularizer used for training our model (default: nil).\n      #\n      #   Allowed values:\n      #   - \"l1\" for using L1 regularization (lasso),\n      #   - \"l2\" for using L2 regularization (ridge),\n      #   - None for no regularization\n      #\n      # intercept::\n      #   Boolean parameter which indicates the use\n      #   or not of the augmented representation for\n      #   training data (i.e. 
\n        weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainLinearRegressionModelWithSGD', rdd,\n                                           options[:iterations].to_i,\n                                           options[:step].to_f,\n                                           options[:mini_batch_fraction].to_f,\n                                           options[:initial_weights],\n                                           options[:reg_param].to_f,\n                                           options[:reg_type],\n                                           options[:intercept],\n                                           options[:validate],\n                                           options[:convergence_tol])\n\n        LinearRegressionModel.new(weights, intercept)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/regression/ridge.rb",
    "content": "##\n# RidgeRegressionModel\n#\n# Train a regression model with L2-regularization using Stochastic Gradient Descent.\n# This solves the l1-regularized least squares regression formulation\n#   f(weights) = 1/2n ||A weights-y||^2^  + regParam/2 ||weights||^2^\n# Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with\n# its corresponding right hand side label y.\n# See also the documentation for the precise formulation.\n#\n# == Examples:\n#\n#   Spark::Mllib.import\n#\n#   data = [\n#       LabeledPoint.new(0.0, [0.0]),\n#       LabeledPoint.new(1.0, [1.0]),\n#       LabeledPoint.new(3.0, [2.0]),\n#       LabeledPoint.new(2.0, [3.0])\n#   ]\n#   lrm = RidgeRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])\n#\n#   lrm.predict([0.0]) - 0 < 0.5\n#   # => true\n#\n#   lrm.predict([1.0]) - 1 < 0.5\n#   # => true\n#\n#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5\n#   # => true\n#\n#   data = [\n#       LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})),\n#       LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})),\n#       LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})),\n#       LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0}))\n#   ]\n#   lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])\n#\n#   lrm.predict([0.0]) - 0 < 0.5\n#   # => true\n#\n#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5\n#   # => true\n#\nclass Spark::Mllib::RidgeRegressionModel < Spark::Mllib::RegressionModel\nend\n\nmodule Spark\n  module Mllib\n    class RidgeRegressionWithSGD < RegressionMethodBase\n\n      DEFAULT_OPTIONS = {\n        iterations: 100,\n        step: 1.0,\n        reg_param: 0.01,\n        mini_batch_fraction: 1.0,\n        initial_weights: nil,\n        intercept: false,\n        validate: true,\n        convergence_tol: 0.001\n      }\n\n      # Train a ridge regression model on the given data.\n      #\n      # == Parameters:\n      # rdd::\n      #   The training data (RDD instance).\n      #\n      # iterations::\n      #   The number of iterations (default: 100).\n      #\n      # step::\n      #   The step parameter used in SGD (default: 1.0).\n      #\n      # reg_param::\n      #   The regularizer parameter (default: 0.0).\n      #\n      # mini_batch_fraction::\n      #   Fraction of data to be used for each SGD iteration (default: 1.0).\n      #\n      # initial_weights::\n      #   The initial weights (default: nil).\n      #\n      # intercept::\n      #   Boolean parameter which indicates the use\n      #   or not of the augmented representation for\n      #   training data (i.e. 
\n        weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainRidgeModelWithSGD', rdd,\n                                           options[:iterations].to_i,\n                                           options[:step].to_f,\n                                           options[:reg_param].to_f,\n                                           options[:mini_batch_fraction].to_f,\n                                           options[:initial_weights],\n                                           options[:intercept],\n                                           options[:validate],\n                                           options[:convergence_tol])\n\n        RidgeRegressionModel.new(weights, intercept)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/ruby_matrix/matrix_adapter.rb",
    "content": "require 'matrix'\n\nmodule Spark\n  module Mllib\n    class MatrixAdapter < ::Matrix\n\n      def self.new(*args)\n        object = self.allocate\n\n        if args.size == 2\n          # Matrix is initialized from Matrix\n          # Arguments: rows, column count\n          object.__send__(:original_initialize, *args)\n        else\n          object.__send__(:initialize, *args)\n        end\n\n        object\n      end\n\n      alias_method :original_initialize, :initialize\n\n      def initialize(type, rows, cols, values=nil)\n        case type\n        when :dense\n          values = values.dup\n          if rows * cols == values.size\n            # Values are on one row\n            # 2x2 => [1,2,3,4]\n            values = values.each_slice(cols).to_a\n          else\n            # 2x2 => [[1,2], [3,4]]\n          end\n        when :sparse\n          values = Array.new(rows) { Array.new(cols) { 0.0 } }\n        else\n          raise Spark::MllibError, 'Unknow vector type.'\n        end\n\n        super(values, cols)\n      end\n\n      def shape\n        [row_count, column_count]\n      end\n\n      def values\n        @values || to_a\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/ruby_matrix/vector_adapter.rb",
    "content": "require 'matrix'\n\n# Based on ruby 2.1\n\nclass Vector\n  def self.elements(array, copy=true)\n    DenseVector.new(convert_to_array(array, copy))\n  end\nend\n\nmodule Spark\n  module Mllib\n    class VectorAdapter < ::Vector\n\n      def self.new(*args)\n        object = self.allocate\n        object.__send__(:initialize, *args)\n        object\n      end\n\n      def initialize(*args)\n        case args.shift\n        when :dense\n          values = args.shift.dup\n        when :sparse\n          values = [0.0] * args.shift.to_i\n        else\n          raise Spark::MllibError, 'Unknow vector type.'\n        end\n\n        super(values)\n      end\n\n      def []=(index, value)\n        @elements[index] = value\n      end\n\n      def dot(other)\n        if other.is_a?(Spark::Mllib::MatrixBase)\n          other * self\n        else\n          inner_product(other)\n        end\n      end\n\n      def squared_distance(other)\n        diff = self - other\n        diff.dot(diff)\n      end\n\n      def values\n        @values || to_a\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib/stat/distribution.rb",
    "content": "##\n# MultivariateGaussian\n#\n# This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In\n# the event that the covariance matrix is singular, the density will be computed in a\n# reduced dimensional subspace under which the distribution is supported.\n#\n# == Arguments:\n# mu:: The mean vector of the distribution\n# sigma:: The covariance matrix of the distribution\n#\nSpark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)\n"
  },
  {
    "path": "lib/spark/mllib/vector.rb",
    "content": "module Spark\n  module Mllib\n    module Vectors\n\n      def self.dense(*args)\n        DenseVector.new(*args)\n      end\n\n      def self.sparse(*args)\n        SparseVector.new(*args)\n      end\n\n      def self.parse(data)\n        if data.start_with?('[') && data.end_with?(']')\n          DenseVector.parse(data)\n        elsif data.start_with?('(') && data.end_with?(')')\n          SparseVector.parse(data)\n        else\n          raise ArgumentError, 'Unknow vector.'\n        end\n      end\n\n      def self.to_vector(data)\n        if data.is_a?(SparseVector) || data.is_a?(DenseVector)\n          data\n        elsif data.is_a?(Array)\n          DenseVector.new(data)\n        end\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    # @abstract Parent for all type of vectors\n    class VectorBase < VectorAdapter\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    ##\n    # A dense vector represented by a value array.\n    #\n    # Dense vector is a vector in which most of the elements are non-zero.\n    #\n    # == Example:\n    #   DenseVector.new([1,2,3,4,5]).values\n    #   # => [1, 2, 3, 4, 5]\n    #\n    #   DenseVector.new(1..5).values\n    #   # => [1, 2, 3, 4, 5]\n    #\n    class DenseVector < VectorBase\n\n      def initialize(values)\n        super(:dense, values.to_a)\n      end\n\n      # Covert string to vector\n      #\n      #   DenseVector.parse(\"[1.0,2.0,3.0,4.0,5.0]\")\n      #\n      def self.parse(data)\n        unless data =~ /\\[[0-9., ]+\\]/\n          raise ArgumentError, 'Unknow format for DenseVector.'\n        end\n\n        data.sub!('[', '')\n        data.sub!(']', '')\n\n        data = data.split(',')\n        data.map!(&:to_f)\n\n        DenseVector.new(data)\n      end\n\n      # Convert vector to string\n      #\n      #   DenseVector.new([1,2,3,4,5]).to_s\n      #   # => \"[1.0,2.0,3.0,4.0,5.0]\"\n      #\n      def to_s\n        \"[#{values.join(',')}]\"\n      end\n\n      def to_java\n        JDenseVector.new(values)\n      end\n\n      def self.from_java(object)\n        DenseVector.new(object.values)\n      end\n\n      def marshal_dump\n        values\n      end\n\n      def marshal_load(array)\n        initialize(array)\n      end\n\n    end\n  end\nend\n\nmodule Spark\n  module Mllib\n    ##\n    # A sparse vector represented by an index array and an value array.\n    #\n    # Sparse vector is a vector in which most of the elements are zero.\n    #\n    # == Example:\n    #   SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values\n    #   # => [0, 1.0, 0, 5.5]\n    #\n    #   SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values\n    #   # => [0, 1.0, 0, 5.5]\n    #\n    #   SparseVector.new(4, [1, 3], [1.0, 5.5]).values\n    #   # => [0, 1.0, 0, 5.5]\n    #\n    class SparseVector < VectorBase\n\n      attr_reader :indices\n\n      def initialize(arg1, arg2=nil, arg3=nil)\n          super(:sparse, arg1)\n\n          if arg2.is_a?(Hash)\n            @indices = arg2.keys\n            @values = arg2.values\n          else\n            @indices = arg2\n            @values = arg3\n          end\n\n          @indices.zip(@values).each do |(index, value)|\n            self[index] = value\n          end\n      end\n\n      # Covert string to vector\n      #\n      #   SparseVector.parse(\"(5,[1,4],[3.0,5.0])\")\n      #\n      def self.parse(data)\n        data = data.match(/\\(([0-9]+)[ ]*,[ ]*\\[([0-9,. ]*)\\][ ]*,[ ]*\\[([0-9,. 
\n        data = data.match(/\\(([0-9]+)[ ]*,[ ]*\\[([0-9,. ]*)\\][ ]*,[ ]*\\[([0-9,. ]*)\\]\\)/)\n        if data\n          size = data[1].to_i\n          indices = data[2].split(',')\n          indices.map!(&:to_i)\n          values = data[3].split(',')\n          values.map!(&:to_f)\n\n          SparseVector.new(size, indices, values)\n        else\n          raise ArgumentError, 'Unknown format for SparseVector.'\n        end\n      end\n\n      # Convert vector to string\n      #\n      #   SparseVector.new(5, {1 => 3, 4 => 5}).to_s\n      #   # => \"(5,[1,4],[3.0,5.0])\"\n      #\n      def to_s\n        \"(#{size},[#{indices.join(',')}],[#{values.join(',')}])\"\n      end\n\n      def marshal_dump\n        [size, indices, values]\n      end\n\n      def marshal_load(array)\n        initialize(array[0], array[1], array[2])\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/mllib.rb",
    "content": "module Spark\n  # MLlib is Spark’s scalable machine learning library consisting of common learning algorithms and utilities,\n  # including classification, regression, clustering, collaborative filtering, dimensionality reduction,\n  # as well as underlying optimization primitives.\n  module Mllib\n    extend Spark::Library\n\n    # Base classes\n    autoload_without_import :VectorBase, 'spark/mllib/vector'\n    autoload_without_import :MatrixBase, 'spark/mllib/matrix'\n    autoload_without_import :RegressionMethodBase,     'spark/mllib/regression/common'\n    autoload_without_import :ClassificationMethodBase, 'spark/mllib/classification/common'\n\n    # Linear algebra\n    autoload :Vectors,      'spark/mllib/vector'\n    autoload :DenseVector,  'spark/mllib/vector'\n    autoload :SparseVector, 'spark/mllib/vector'\n    autoload :Matrices,     'spark/mllib/matrix'\n    autoload :DenseMatrix,  'spark/mllib/matrix'\n    autoload :SparseMatrix, 'spark/mllib/matrix'\n\n    # Regression\n    autoload :LabeledPoint,            'spark/mllib/regression/labeled_point'\n    autoload :RegressionModel,         'spark/mllib/regression/common'\n    autoload :LinearRegressionModel,   'spark/mllib/regression/linear'\n    autoload :LinearRegressionWithSGD, 'spark/mllib/regression/linear'\n    autoload :LassoModel,              'spark/mllib/regression/lasso'\n    autoload :LassoWithSGD,            'spark/mllib/regression/lasso'\n    autoload :RidgeRegressionModel,    'spark/mllib/regression/ridge'\n    autoload :RidgeRegressionWithSGD,  'spark/mllib/regression/ridge'\n\n    # Classification\n    autoload :ClassificationModel,         'spark/mllib/classification/common'\n    autoload :LogisticRegressionWithSGD,   'spark/mllib/classification/logistic_regression'\n    autoload :LogisticRegressionWithLBFGS, 'spark/mllib/classification/logistic_regression'\n    autoload :SVMModel,                    'spark/mllib/classification/svm'\n    autoload :SVMWithSGD,                  'spark/mllib/classification/svm'\n    autoload :NaiveBayesModel,             'spark/mllib/classification/naive_bayes'\n    autoload :NaiveBayes,                  'spark/mllib/classification/naive_bayes'\n\n    # Clustering\n    autoload :KMeans,               'spark/mllib/clustering/kmeans'\n    autoload :KMeansModel,          'spark/mllib/clustering/kmeans'\n    autoload :GaussianMixture,      'spark/mllib/clustering/gaussian_mixture'\n    autoload :GaussianMixtureModel, 'spark/mllib/clustering/gaussian_mixture'\n\n    # Stat\n    autoload :MultivariateGaussian, 'spark/mllib/stat/distribution'\n\n    def self.prepare\n      return if @prepared\n\n      # if narray?\n      #   require 'spark/mllib/narray/vector'\n      #   require 'spark/mllib/narray/matrix'\n      # elsif mdarray?\n      #   require 'spark/mllib/mdarray/vector'\n      #   require 'spark/mllib/mdarray/matrix'\n      # else\n      #   require 'spark/mllib/matrix/vector'\n      #   require 'spark/mllib/matrix/matrix'\n      # end\n\n      require 'spark/mllib/ruby_matrix/vector_adapter'\n      require 'spark/mllib/ruby_matrix/matrix_adapter'\n\n      @prepared = true\n      nil\n    end\n\n    def self.narray?\n      Gem::Specification::find_all_by_name('narray').any?\n    end\n\n    def self.mdarray?\n      Gem::Specification::find_all_by_name('mdarray').any?\n    end\n  end\nend\n\nSpark::Mllib.prepare\n"
  },
  {
    "path": "lib/spark/rdd.rb",
    "content": "module Spark\n  ##\n  # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,\n  # partitioned collection of elements that can be operated on in parallel. This class contains the\n  # basic operations available on all RDDs, such as `map`, `filter`, and `persist`.\n  #\n  class RDD\n\n    extend Forwardable\n\n    attr_reader :jrdd, :context, :command\n\n    include Spark::Helper::Logger\n    include Spark::Helper::Parser\n    include Spark::Helper::Statistic\n\n    def_delegators :@command, :serializer, :deserializer, :libraries, :files\n\n    # Initializing RDD, this method is root of all Pipelined RDD - its unique\n    # If you call some operations on this class it will be computed in Java\n    #\n    # == Parameters:\n    # jrdd:: org.apache.spark.api.java.JavaRDD\n    # context:: {Spark::Context}\n    # serializer:: {Spark::Serializer}\n    #\n    def initialize(jrdd, context, serializer, deserializer=nil)\n      @jrdd = jrdd\n      @context = context\n\n      @cached = false\n      @checkpointed = false\n\n      @command = Spark::CommandBuilder.new(serializer, deserializer)\n    end\n\n    def inspect\n      comms = @command.commands.join(' -> ')\n\n      result  = %{#<#{self.class.name}:0x#{object_id}}\n      result << %{ (#{comms})} unless comms.empty?\n      result << %{ (cached)} if cached?\n      result << %{\\n}\n      result << %{  Serializer: \"#{serializer}\"\\n}\n      result << %{Deserializer: \"#{deserializer}\"}\n      result << %{>}\n      result\n    end\n\n\n    # =============================================================================\n    # Operators\n\n    def +(other)\n      self.union(other)\n    end\n\n\n    # =============================================================================\n    # Commad and serializer\n\n    def add_command(klass, *args)\n      @command.deep_copy.add_command(klass, *args)\n    end\n\n    # Add ruby library\n    # Libraries will be included before computing\n    #\n    # == Example:\n    #   rdd.add_library('pry').add_library('nio4r', 'distribution')\n    #\n    def add_library(*libraries)\n      @command.add_library(*libraries)\n      self\n    end\n\n    # Bind object to RDD\n    #\n    # == Example:\n    #   text = \"test\"\n    #\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd = rdd.map(lambda{|x| x.to_s + \" \" + text})\n    #   rdd = rdd.bind(text: text)\n    #\n    #   rdd.collect\n    #   # => [\"0 test\", \"1 test\", \"2 test\", \"3 test\", \"4 test\", \"5 test\"]\n    #\n    def bind(objects)\n      unless objects.is_a?(Hash)\n        raise ArgumentError, 'Argument must be a Hash.'\n      end\n\n      @command.bind(objects)\n      self\n    end\n\n    def new_rdd_from_command(klass, *args)\n      comm = add_command(klass, *args)\n      PipelinedRDD.new(self, comm)\n    end\n\n\n    # =============================================================================\n    # Variables and non-computing functions\n\n    def config\n      @context.config\n    end\n\n    def default_reduce_partitions\n      config['spark.default.parallelism'] || partitions_size\n    end\n\n    # Count of ParallelCollectionPartition\n    def partitions_size\n      jrdd.rdd.partitions.size\n    end\n\n    # A unique ID for this RDD (within its SparkContext).\n    def id\n      jrdd.id\n    end\n\n    # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.\n    def cache\n      persist('memory_only_ser')\n    end\n\n    # Set this RDD's storage level to 
\n    # Set this RDD's storage level to persist its values across operations after the first time\n    # it is computed. This can only be used to assign a new storage level if the RDD does not\n    # have a storage level set yet.\n    #\n    # See StorageLevel for type of new_level\n    #\n    def persist(new_level)\n      @cached = true\n      jrdd.persist(Spark::StorageLevel.java_get(new_level))\n      self\n    end\n\n    # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.\n    #\n    # == Parameters:\n    # blocking:: whether to block until all blocks are deleted.\n    #\n    def unpersist(blocking=true)\n      @cached = false\n      jrdd.unpersist(blocking)\n      self\n    end\n\n    def cached?\n      @cached\n    end\n\n    def checkpointed?\n      @checkpointed\n    end\n\n    # Return the name of this RDD.\n    #\n    def name\n      _name = jrdd.name\n      _name && _name.encode(Encoding::UTF_8)\n    end\n\n    # Assign a name to this RDD.\n    #\n    def set_name(value)\n      jrdd.setName(value)\n      value\n    end\n\n    def name=(value)\n      set_name(value)\n    end\n\n    def to_java\n      marshal = Spark::Serializer.marshal\n\n      if deserializer.batched?\n        ser = deserializer.deep_copy\n        ser.serializer = marshal\n      else\n        ser = Spark::Serializer.batched(marshal)\n      end\n\n      rdd = self.reserialize(ser)\n      RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)\n    end\n\n\n    # =============================================================================\n    # Actions which return value\n\n    # Return an array that contains all of the elements in this RDD.\n    # RJB raises an error if the stage is killed.\n    def collect(as_enum=false)\n      file = Tempfile.new('collect', context.temp_dir)\n\n      context.set_call_site(caller.first)\n      RubyRDD.writeRDDToFile(jrdd.rdd, file.path)\n\n      collect_from_file(file, as_enum)\n    rescue => e\n      raise Spark::RDDError, e.message\n    ensure\n      context.clear_call_site\n    end\n\n    def collect_from_file(file, as_enum=false)\n      if self.is_a?(PipelinedRDD)\n        klass = @command.serializer\n      else\n        klass = @command.deserializer\n      end\n\n      if as_enum\n        result = klass.load_from_file(file)\n      else\n        result = klass.load_from_io(file).to_a\n        file.close\n        file.unlink\n      end\n\n      result\n    end\n\n    # Convert an Array to Hash\n    #\n    def collect_as_hash\n      Hash[collect]\n    end\n\n    # Take the first num elements of the RDD.\n    #\n    # It works by first scanning one partition, and uses the results from\n    # that partition to estimate the number of additional partitions needed\n    # to satisfy the limit.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..100, 20)\n    #   rdd.take(5)\n    #   # => [0, 1, 2, 3, 4]\n    #\n    def take(count)\n      buffer = []\n\n      parts_count = self.partitions_size\n      # No partitions have been scanned yet\n      last_scanned = -1\n\n      while buffer.empty?\n        last_scanned += 1\n        buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)\n      end\n\n      # Assumption: depends on batch_size and on how Spark divided the data.
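\n      # items_per_part estimates how many items a single partition yields; it is\n      # used to guess how many more partitions must be scanned to fill the buffer.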
\n      items_per_part = buffer.size\n      left = count - buffer.size\n\n      while left > 0 && last_scanned < parts_count\n        parts_to_take = (left.to_f/items_per_part).ceil\n        parts_for_scanned = Array.new(parts_to_take) do\n          last_scanned += 1\n        end\n\n        # We cannot take exact number of items because workers are isolated from each other.\n        # => once you take e.g. 50% from the last part and left is still > 0, it's very\n        # difficult to merge new items\n        items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)\n        buffer += items\n\n        left = count - buffer.size\n        # Average size of all parts\n        items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}\n      end\n\n      buffer.slice!(0, count)\n    end\n\n    # Return the first element in this RDD.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..100)\n    #   rdd.first\n    #   # => 0\n    #\n    def first\n      self.take(1)[0]\n    end\n\n    # Reduces the elements of this RDD using the specified lambda or method.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd.reduce(lambda{|sum, x| sum+x})\n    #   # => 55\n    #\n    def reduce(f)\n      _reduce(Spark::Command::Reduce, f, f)\n    end\n\n    # Aggregate the elements of each partition, and then the results for all the partitions, using a\n    # given associative function and a neutral \"zero value\".\n    #\n    # The function f(x, y) is allowed to modify x and return it as its result value to avoid\n    # object allocation; however, it should not modify y.\n    #\n    # Be careful, zero_value is applied to all stages. See example.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10, 2)\n    #   rdd.fold(1, lambda{|sum, x| sum+x})\n    #   # => 58\n    #\n    def fold(zero_value, f)\n      self.aggregate(zero_value, f, f)\n    end\n\n    # Aggregate the elements of each partition, and then the results for all the partitions, using\n    # given combine functions and a neutral \"zero value\".\n    #\n    # This function can return a different result type. We need one operation for merging.\n    #\n    # Result must be an Array otherwise Serializer Array's zero value will be sent\n    # as multiple values and not just one.\n    #\n    # == Example:\n    #   # 1 2 3 4 5  => 15 + 1 = 16\n    #   # 6 7 8 9 10 => 40 + 1 = 41\n    #   # 16 * 41 = 656\n    #\n    #   seq = lambda{|x,y| x+y}\n    #   com = lambda{|x,y| x*y}\n    #\n    #   rdd = $sc.parallelize(1..10, 2)\n    #   rdd.aggregate(1, seq, com)\n    #   # => 656\n    #\n    def aggregate(zero_value, seq_op, comb_op)\n      _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)\n    end\n\n    # Return the max of this RDD\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd.max\n    #   # => 10\n    #\n    def max\n      self.reduce('lambda{|memo, item| memo > item ? memo : item }')\n    end\n\n    # Return the min of this RDD\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd.min\n    #   # => 0\n    #\n    def min\n      self.reduce('lambda{|memo, item| memo < item ? memo : item }')\n    end\n
\n    # Return the sum of this RDD\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd.sum\n    #   # => 55\n    #\n    def sum\n      self.reduce('lambda{|sum, item| sum + item}')\n    end\n\n    # Return the number of values in this RDD\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd.count\n    #   # => 11\n    #\n    def count\n      # nil is for seq_op => it means all results go directly to one worker for combining\n      @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')\n                     .aggregate(0, nil, 'lambda{|sum, item| sum + item }')\n    end\n\n    # Return a {Spark::StatCounter} object that captures the mean, variance\n    # and count of the RDD's elements in one operation.\n    def stats\n      @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')\n    end\n\n    # Compute the mean of this RDD's elements.\n    #\n    # == Example:\n    #   $sc.parallelize([1, 2, 3]).mean\n    #   # => 2.0\n    #\n    def mean\n      stats.mean\n    end\n\n    # Compute the variance of this RDD's elements.\n    #\n    # == Example:\n    #   $sc.parallelize([1, 2, 3]).variance\n    #   # => 0.666...\n    #\n    def variance\n      stats.variance\n    end\n\n    # Compute the standard deviation of this RDD's elements.\n    #\n    # == Example:\n    #   $sc.parallelize([1, 2, 3]).stdev\n    #   # => 0.816...\n    #\n    def stdev\n      stats.stdev\n    end\n\n    # Compute the sample standard deviation of this RDD's elements (which\n    # corrects for bias in estimating the standard deviation by dividing by\n    # N-1 instead of N).\n    #\n    # == Example:\n    #   $sc.parallelize([1, 2, 3]).sample_stdev\n    #   # => 1.0\n    #\n    def sample_stdev\n      stats.sample_stdev\n    end\n\n    # Compute the sample variance of this RDD's elements (which corrects\n    # for bias in estimating the variance by dividing by N-1 instead of N).\n    #\n    # == Example:\n    #   $sc.parallelize([1, 2, 3]).sample_variance\n    #   # => 1.0\n    #\n    def sample_variance\n      stats.sample_variance\n    end\n\n    # Compute a histogram using the provided buckets. The buckets\n    # are all open to the right except for the last which is closed.\n    # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],\n    # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1\n    # and 50 we would have a histogram of 1,0,1.\n    #
\n    # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),\n    # this can be switched from an O(log n) insertion to O(1) per\n    # element (where n = # buckets).\n    #\n    # Buckets must be sorted, must not contain any duplicates, and must\n    # have at least two elements.\n    #\n    # == Examples:\n    #   rdd = $sc.parallelize(0..50)\n    #\n    #   rdd.histogram(2)\n    #   # => [[0.0, 25.0, 50], [25, 26]]\n    #\n    #   rdd.histogram([0, 5, 25, 50])\n    #   # => [[0, 5, 25, 50], [5, 20, 26]]\n    #\n    #   rdd.histogram([0, 15, 30, 45, 60])\n    #   # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]\n    #\n    def histogram(buckets)\n\n      # -----------------------------------------------------------------------\n      # Integer\n      #\n      if buckets.is_a?(Integer)\n\n        # Validation\n        if buckets < 1\n          raise ArgumentError, \"Bucket count must be >= 1, #{buckets} given.\"\n        end\n\n        # Filter invalid values\n        # Nil and NaN\n        func = 'lambda{|x|\n          if x.nil? || (x.is_a?(Float) && x.nan?)\n            false\n          else\n            true\n          end\n        }'\n        filtered = self.filter(func)\n\n        # Compute the minimum and the maximum\n        func = 'lambda{|memo, item|\n          [memo[0] < item[0] ? memo[0] : item[0],\n           memo[1] > item[1] ? memo[1] : item[1]]\n        }'\n        min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)\n\n        # Min, max must be valid numbers\n        if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)\n          raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'\n        end\n\n        # Already finished\n        if min == max || buckets == 1\n          return [min, max], [filtered.count]\n        end\n\n        # Custom range\n        begin\n          span = max - min # increment\n          buckets = (0...buckets).map do |x|\n            min + (x * span) / buckets.to_f\n          end\n          buckets << max\n        rescue NoMethodError\n          raise Spark::RDDError, 'Cannot generate buckets with non-number in RDD'\n        end\n\n        even = true\n\n      # -----------------------------------------------------------------------\n      # Array\n      #\n      elsif buckets.is_a?(Array)\n\n        if buckets.size < 2\n          raise ArgumentError, 'Buckets should have more than one value.'\n        end\n
\n        if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}\n          raise ArgumentError, 'Cannot have nil or NaN values in buckets.'\n        end\n\n        if buckets.detect{|x| buckets.count(x) > 1}\n          raise ArgumentError, 'Buckets should not contain duplicated values.'\n        end\n\n        if buckets.sort != buckets\n          raise ArgumentError, 'Buckets must be sorted.'\n        end\n\n        even = false\n\n      # -----------------------------------------------------------------------\n      # Other\n      #\n      else\n        raise Spark::RDDError, 'Buckets should be a number or an array.'\n      end\n\n      reduce_func = 'lambda{|memo, item|\n        memo.size.times do |i|\n          memo[i] += item[i]\n        end\n        memo\n      }'\n\n      return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)\n    end\n\n    # Applies a function f to all elements of this RDD.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd.foreach(lambda{|x| puts x})\n    #   # => nil\n    #\n    def foreach(f, options={})\n      new_rdd_from_command(Spark::Command::Foreach, f).collect\n      nil\n    end\n\n    # Applies a function f to each partition of this RDD.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd.foreachPartition(lambda{|x| puts x.to_s})\n    #   # => nil\n    #\n    def foreach_partition(f, options={})\n      new_rdd_from_command(Spark::Command::ForeachPartition, f).collect\n      nil\n    end\n\n\n    # =============================================================================\n    # Transformations of RDD\n\n    # Return a new RDD by applying a function to all elements of this RDD.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd.map(lambda {|x| x*2}).collect\n    #   # => [0, 2, 4, 6, 8, 10]\n    #\n    def map(f)\n      new_rdd_from_command(Spark::Command::Map, f)\n    end\n\n    # Return a new RDD by first applying a function to all elements of this\n    # RDD, and then flattening the results.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd.flat_map(lambda {|x| [x, 1]}).collect\n    #   # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]\n    #\n    def flat_map(f)\n      new_rdd_from_command(Spark::Command::FlatMap, f)\n    end\n\n    # Return a new RDD by applying a function to each partition of this RDD.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10, 2)\n    #   rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect\n    #   # => [15, 40]\n    #\n    def map_partitions(f)\n      new_rdd_from_command(Spark::Command::MapPartitions, f)\n    end\n\n    # Return a new RDD by applying a function to each partition of this RDD, while tracking the index\n    # of the original partition.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0...4, 4)\n    #   rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect\n    #   # => [0, 1, 4, 9]\n    #\n    def map_partitions_with_index(f, options={})\n      new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)\n    end\n\n    # Return a new RDD containing only the elements that satisfy a predicate.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd.filter(lambda{|x| x.even?}).collect\n    #   # => [0, 2, 4, 6, 8, 10]\n    #\n    def filter(f)\n      new_rdd_from_command(Spark::Command::Filter, f)\n    end\n\n    # Return a new RDD containing non-nil elements.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([1, nil, 2, 
nil, 3])\n    #   rdd.compact.collect\n    #   # => [1, 2, 3]\n    #\n    def compact\n      new_rdd_from_command(Spark::Command::Compact)\n    end\n\n    # Return an RDD created by coalescing all elements within each partition into an array.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10, 3)\n    #   rdd.glom.collect\n    #   # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]\n    #\n    def glom\n      new_rdd_from_command(Spark::Command::Glom)\n    end\n\n    # Return a new RDD that is reduced into num_partitions partitions.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10, 3)\n    #   rdd.coalesce(2).glom.collect\n    #   # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]\n    #\n    def coalesce(num_partitions)\n      if self.is_a?(PipelinedRDD)\n        deser = @command.serializer\n      else\n        deser = @command.deserializer\n      end\n\n      new_jrdd = jrdd.coalesce(num_partitions)\n      RDD.new(new_jrdd, context, @command.serializer, deser)\n    end\n\n    # Return the Cartesian product of this RDD and another one, that is, the\n    # RDD of all pairs of elements `(a, b)` where `a` is in `self` and\n    # `b` is in `other`.\n    #\n    # == Example:\n    #   rdd1 = $sc.parallelize([1,2,3])\n    #   rdd2 = $sc.parallelize([4,5,6])\n    #\n    #   rdd1.cartesian(rdd2).collect\n    #   # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]\n    #\n    def cartesian(other)\n      _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)\n\n      new_jrdd = jrdd.cartesian(other.jrdd)\n      RDD.new(new_jrdd, context, serializer, _deserializer)\n    end\n\n    # Return a new RDD containing the distinct elements in this RDD.\n    # Ordering is not preserved because of reducing\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([1,1,1,2,3])\n    #   rdd.distinct.collect\n    #   # => [1, 2, 3]\n    #\n    def distinct\n      self.map('lambda{|x| [x, nil]}')\n          .reduce_by_key('lambda{|x,_| x}')\n          .map('lambda{|x| x[0]}')\n    end\n\n    # Return a shuffled RDD.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd.shuffle.collect\n    #   # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]\n    #\n    def shuffle(seed=nil)\n      seed ||= Random.new_seed\n\n      new_rdd_from_command(Spark::Command::Shuffle, seed)\n    end\n\n    # Return the union of this RDD and another one. Any identical elements will appear multiple\n    # times (use .distinct to eliminate them).\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([1, 2, 3])\n    #   rdd.union(rdd).collect\n    #   # => [1, 2, 3, 1, 2, 3]\n    #\n    def union(other)\n      if self.serializer != other.serializer\n        other = other.reserialize(serializer)\n      end\n\n      new_jrdd = jrdd.union(other.jrdd)\n      RDD.new(new_jrdd, context, serializer, deserializer)\n    end\n\n    # Return a new RDD with different serializer. This method is useful during union\n    # and join operations.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([1, 2, 3], nil, serializer: \"marshal\")\n    #   rdd = rdd.map(lambda{|x| x.to_s})\n    #   rdd.reserialize(\"oj\").collect\n    #   # => [\"1\", \"2\", \"3\"]\n    #\n    def reserialize(new_serializer)\n      if serializer == new_serializer\n        return self\n      end\n\n      new_command = @command.deep_copy\n      new_command.serializer = new_serializer\n\n      PipelinedRDD.new(self, new_command)\n    end\n\n    # Return the intersection of this RDD and another one. 
The output will not contain\n    # any duplicate elements, even if the input RDDs did.\n    #\n    # == Example:\n    #   rdd1 = $sc.parallelize([1,2,3,4,5])\n    #   rdd2 = $sc.parallelize([1,4,5,6,7])\n    #   rdd1.intersection(rdd2).collect\n    #   # => [1, 4, 5]\n    #\n    def intersection(other)\n      mapping_function = 'lambda{|item| [item, nil]}'\n      filter_function  = 'lambda{|(key, values)| values.size > 1}'\n\n      self.map(mapping_function)\n          .cogroup(other.map(mapping_function))\n          .filter(filter_function)\n          .keys\n    end\n\n    # Return a copy of the RDD partitioned using the specified partitioner.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([\"1\",\"2\",\"3\",\"4\",\"5\"]).map(lambda {|x| [x, 1]})\n    #   rdd.partitionBy(2).glom.collect\n    #   # => [[[\"3\", 1], [\"4\", 1]], [[\"1\", 1], [\"2\", 1], [\"5\", 1]]]\n    #\n    def partition_by(num_partitions, partition_func=nil)\n      num_partitions ||= default_reduce_partitions\n      partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'\n\n      _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)\n    end\n\n    # Return a sampled subset of this RDD. Operations are based on Poisson and Uniform\n    # distributions.\n    # TODO: Replace Uniform with Bernoulli\n    #\n    # == Examples:\n    #   rdd = $sc.parallelize(0..100)\n    #\n    #   rdd.sample(true, 10).collect\n    #   # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]\n    #\n    #   rdd.sample(false, 0.1).collect\n    #   # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]\n    #\n    def sample(with_replacement, fraction, seed=nil)\n      new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)\n    end\n\n    # Return a fixed-size sampled subset of this RDD in an array\n    #\n    # == Examples:\n    #   rdd = $sc.parallelize(0..100)\n    #\n    #   rdd.take_sample(true, 10)\n    #   # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]\n    #\n    #   rdd.take_sample(false, 10)\n    #   # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]\n    #\n    def take_sample(with_replacement, num, seed=nil)\n\n      if num < 0\n        raise Spark::RDDError, 'Size has to be greater than 0'\n      elsif num == 0\n        return []\n      end\n\n      # Taken from scala\n      num_st_dev = 10.0\n\n      # Number of items\n      initial_count = self.count\n      return [] if initial_count == 0\n\n      # Create new generator\n      seed ||= Random.new_seed\n      rng = Random.new(seed)\n\n      # Shuffle elements if requested num is greater than array size\n      if !with_replacement && num >= initial_count\n        return self.shuffle(seed).collect\n      end\n\n      # Max num\n      max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i\n      if num > max_sample_size\n        raise Spark::RDDError, \"Size cannot be greater than #{max_sample_size}\"\n      end\n\n      # Approximate fraction with tolerance\n      fraction = compute_fraction(num, initial_count, with_replacement)\n\n      # Compute first sampled subset\n      samples = self.sample(with_replacement, fraction, seed).collect\n\n      # If the first sample didn't turn out large enough, keep trying to take samples;\n      # this shouldn't happen often because we use a big multiplier for their initial size.\n      index = 0
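\n      # Keep drawing a new sample with a fresh seed until enough items are collected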
\n      while samples.size < num\n        log_warning(\"Needed to re-sample due to insufficient sample size. Repeat #{index}\")\n        samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect\n        index += 1\n      end\n\n      samples.shuffle!(random: rng)\n      samples[0, num]\n    end\n\n    # Return an RDD created by piping elements to a forked external process.\n    #\n    # == Cmds:\n    #   cmd = [env,] command... [,options]\n    #\n    #   env: hash\n    #     name => val : set the environment variable\n    #     name => nil : unset the environment variable\n    #   command...:\n    #     commandline                 : command line string which is passed to the standard shell\n    #     cmdname, arg1, ...          : command name and one or more arguments (This form does\n    #                                   not use the shell. See below for caveats.)\n    #     [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)\n    #   options: hash\n    #\n    #   See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn\n    #\n    # == Examples:\n    #   $sc.parallelize(0..5).pipe('cat').collect\n    #   # => [\"0\", \"1\", \"2\", \"3\", \"4\", \"5\"]\n    #\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd = rdd.pipe('cat', \"awk '{print $1*10}'\")\n    #   rdd = rdd.map(lambda{|x| x.to_i + 1})\n    #   rdd.collect\n    #   # => [1, 11, 21, 31, 41, 51]\n    #\n    def pipe(*cmds)\n      new_rdd_from_command(Spark::Command::Pipe, cmds)\n    end\n\n\n    # =============================================================================\n    # Pair functions\n\n    # Merge the values for each key using an associative reduce function. This will also perform\n    # the merging locally on each mapper before sending results to a reducer, similarly to a\n    # \"combiner\" in MapReduce. Output will be hash-partitioned with the existing partitioner/\n    # parallelism level.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([\"a\",\"b\",\"c\",\"a\",\"b\",\"c\",\"a\",\"c\"]).map(lambda{|x| [x, 1]})\n    #   rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash\n    #   # => {\"a\"=>3, \"b\"=>2, \"c\"=>3}\n    #\n    def reduce_by_key(f, num_partitions=nil)\n      combine_by_key('lambda {|x| x}', f, f, num_partitions)\n    end\n\n    # Generic function to combine the elements for each key using a custom set of aggregation\n    # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a\n    # \"combined type\" C. Note that V and C can be different -- for example, one might group an\n    # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). 
Users provide three\n    # functions:\n    #\n    # == Parameters:\n    # create_combiner:: which turns a V into a C (e.g., creates a one-element list)\n    # merge_value:: to merge a V into a C (e.g., adds it to the end of a list)\n    # merge_combiners:: to combine two C's into a single one.\n    #\n    # == Example:\n    #   def combiner(x)\n    #     x\n    #   end\n    #\n    #   def merge(x,y)\n    #     x+y\n    #   end\n    #\n    #   rdd = $sc.parallelize([\"a\",\"b\",\"c\",\"a\",\"b\",\"c\",\"a\",\"c\"], 2).map(lambda{|x| [x, 1]})\n    #   rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash\n    #   # => {\"a\"=>3, \"b\"=>2, \"c\"=>3}\n    #\n    def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)\n      _combine_by_key(\n        [Spark::Command::CombineByKey::Combine, create_combiner, merge_value],\n        [Spark::Command::CombineByKey::Merge, merge_combiners],\n        num_partitions\n      )\n    end\n\n    # Return an RDD of grouped items.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd.group_by(lambda{|x| x%2}).collect\n    #   # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]\n    #\n    def group_by(f, num_partitions=nil)\n      self.key_by(f).group_by_key(num_partitions)\n    end\n\n    # Group the values for each key in the RDD into a single sequence. Allows controlling the\n    # partitioning of the resulting key-value pair RDD by passing a Partitioner.\n    #\n    # Note: If you are grouping in order to perform an aggregation (such as a sum or average)\n    # over each key, using reduce_by_key or combine_by_key will provide much better performance.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([[\"a\", 1], [\"a\", 2], [\"b\", 3]])\n    #   rdd.group_by_key.collect\n    #   # => [[\"a\", [1, 2]], [\"b\", [3]]]\n    #\n    def group_by_key(num_partitions=nil)\n      create_combiner = 'lambda{|item| [item]}'\n      merge_value     = 'lambda{|combiner, item| combiner << item; combiner}'\n      merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'\n\n      combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)\n    end\n\n    # Merge the values for each key using an associative function f\n    # and a neutral `zero_value` which may be added to the result an\n    # arbitrary number of times, and must not change the result\n    # (e.g., 0 for addition, or 1 for multiplication.).\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([[\"a\", 1], [\"b\", 2], [\"a\", 3], [\"a\", 4], [\"c\", 5]])\n    #   rdd.fold_by_key(1, lambda{|x,y| x+y})\n    #   # => [[\"a\", 9], [\"c\", 6], [\"b\", 3]]\n    #\n    def fold_by_key(zero_value, f, num_partitions=nil)\n      self.aggregate_by_key(zero_value, f, f, num_partitions)\n    end\n\n    # Aggregate the values of each key, using given combine functions and a neutral zero value.\n    #\n    # == Example:\n    #   def combine(x,y)\n    #     x+y\n    #   end\n    #\n    #   def merge(x,y)\n    #     x*y\n    #   end\n    #\n    #   rdd = $sc.parallelize([[\"a\", 1], [\"b\", 2], [\"a\", 3], [\"a\", 4], [\"c\", 5]], 2)\n    #   rdd.aggregate_by_key(1, method(:combine), method(:merge))\n    #   # => [[\"b\", 3], [\"a\", 16], [\"c\", 6]]\n    #\n    def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)\n      _combine_by_key(\n        [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],\n        [Spark::Command::CombineByKey::Merge, comb_func],\n    
    num_partitions\n      )\n    end\n\n    # The same functionality as cogroup but this can group only 2 RDDs and you\n    # can change num_partitions.\n    #\n    # == Example:\n    #   rdd1 = $sc.parallelize([[\"a\", 1], [\"a\", 2], [\"b\", 3]])\n    #   rdd2 = $sc.parallelize([[\"a\", 4], [\"a\", 5], [\"b\", 6]])\n    #   rdd1.group_with(rdd2).collect\n    #   # => [[\"a\", [1, 2, 4, 5]], [\"b\", [3, 6]]]\n    #\n    def group_with(other, num_partitions=nil)\n      self.union(other).group_by_key(num_partitions)\n    end\n\n    # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the\n    # list of values for that key in `this` as well as `other`.\n    #\n    # == Example:\n    #   rdd1 = $sc.parallelize([[\"a\", 1], [\"a\", 2], [\"b\", 3]])\n    #   rdd2 = $sc.parallelize([[\"a\", 4], [\"a\", 5], [\"b\", 6]])\n    #   rdd3 = $sc.parallelize([[\"a\", 7], [\"a\", 8], [\"b\", 9]])\n    #   rdd1.cogroup(rdd2, rdd3).collect\n    #   # => [[\"a\", [1, 2, 4, 5, 7, 8]], [\"b\", [3, 6, 9]]]\n    #\n    def cogroup(*others)\n      unioned = self\n      others.each do |other|\n        unioned = unioned.union(other)\n      end\n\n      unioned.group_by_key\n    end\n\n    # Return each (key, value) pair in self RDD that has no pair with matching\n    # key in other RDD.\n    #\n    # == Example:\n    #   rdd1 = $sc.parallelize([[\"a\", 1], [\"a\", 2], [\"b\", 3], [\"c\", 4]])\n    #   rdd2 = $sc.parallelize([[\"b\", 5], [\"c\", 6]])\n    #   rdd1.subtract_by_key(rdd2).collect\n    #   # => [[\"a\", 1], [\"a\", 2]]\n    #\n    def subtract_by_key(other, num_partitions=nil)\n      create_combiner = 'lambda{|item| [[item]]}'\n      merge_value     = 'lambda{|combiner, item| combiner.first << item; combiner}'\n      merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'\n\n      self.union(other)\n          .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)\n          .filter('lambda{|(key,values)| values.size == 1}')\n          .flat_map_values('lambda{|item| item.first}')\n    end\n\n    # Return an RDD with the elements from self that are not in other.\n    #\n    # == Example:\n    #   rdd1 = $sc.parallelize([[\"a\", 1], [\"a\", 2], [\"b\", 3], [\"c\", 4]])\n    #   rdd2 = $sc.parallelize([[\"a\", 2], [\"c\", 6]])\n    #   rdd1.subtract(rdd2).collect\n    #   # => [[\"a\", 1], [\"b\", 3], [\"c\", 4]]\n    #\n    def subtract(other, num_partitions=nil)\n      mapping_function = 'lambda{|x| [x,nil]}'\n\n      self.map(mapping_function)\n          .subtract_by_key(other.map(mapping_function), num_partitions)\n          .keys\n    end\n\n    # Sort the RDD by key\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([[\"c\", 1], [\"b\", 2], [\"a\", 3]])\n    #   rdd.sort_by_key.collect\n    #   # => [[\"a\", 3], [\"b\", 2], [\"c\", 1]]\n    #\n    def sort_by_key(ascending=true, num_partitions=nil)\n      self.sort_by('lambda{|(key, _)| key}')\n    end\n\n    # Sort the RDD by value\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([[\"a\", 3], [\"b\", 1], [\"c\", 2]])\n    #   rdd.sort_by_value.collect\n    #   # => [[\"b\", 1], [\"c\", 2], [\"a\", 3]]\n    #\n    def sort_by_value(ascending=true, num_partitions=nil)\n      self.sort_by('lambda{|(_, value)| value}')\n    end\n
\n    # Sorts this RDD by the given key_function\n    #\n    # This is a different implementation than Spark's. sort_by doesn't use\n    # the key_by method first. It can be slower but takes less memory and\n    # you can always use map.sort_by_key\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([\"aaaaaaa\", \"cc\", \"b\", \"eeee\", \"ddd\"])\n    #\n    #   rdd.sort_by.collect\n    #   # => [\"aaaaaaa\", \"b\", \"cc\", \"ddd\", \"eeee\"]\n    #\n    #   rdd.sort_by(lambda{|x| x.size}).collect\n    #   # => [\"b\", \"cc\", \"ddd\", \"eeee\", \"aaaaaaa\"]\n    #\n    def sort_by(key_function=nil, ascending=true, num_partitions=nil)\n      key_function   ||= 'lambda{|x| x}'\n      num_partitions ||= default_reduce_partitions\n\n      command_klass = Spark::Command::SortByKey\n\n      # Allow spilling data to disk due to memory limit\n      # spilling = config['spark.shuffle.spill'] || false\n      spilling = false\n      memory = ''\n\n      # Set spilling to false if worker has unlimited memory\n      if memory.empty?\n        spilling = false\n        memory   = nil\n      else\n        memory = to_memory_size(memory)\n      end\n\n      # Sorting should be done by one worker\n      if num_partitions == 1\n        rdd = self\n        rdd = rdd.coalesce(1) if partitions_size > 1\n        return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)\n      end\n\n      # Compute boundary of collection\n      # Collection should be evenly distributed\n      # 20.0 is from scala RangePartitioner (for roughly balanced output partitions)\n      count = self.count\n      sample_size = num_partitions * 20.0\n      fraction = [sample_size / [count, 1].max, 1.0].min\n      samples = self.sample(false, fraction, 1).map(key_function).collect\n      samples.sort!\n      # Reverse is much faster than reverse sort_by\n      samples.reverse! if !ascending\n\n      # Determine part bounds\n      bounds = determine_bounds(samples, num_partitions)\n\n      shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)\n      shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)\n    end\n\n    # Creates tuples of the elements in this RDD by applying function f.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..5)\n    #   rdd.key_by(lambda{|x| x%2}).collect\n    #   # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]\n    #\n    def key_by(f)\n      new_rdd_from_command(Spark::Command::KeyBy, f)\n    end\n
\n    # This also retains the original RDD's partitioning.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([\"ruby\", \"scala\", \"java\"])\n    #   rdd = rdd.map(lambda{|x| [x, x]})\n    #   rdd = rdd.map_values(lambda{|x| x.upcase})\n    #   rdd.collect\n    #   # => [[\"ruby\", \"RUBY\"], [\"scala\", \"SCALA\"], [\"java\", \"JAVA\"]]\n    #\n    def map_values(f)\n      new_rdd_from_command(Spark::Command::MapValues, f)\n    end\n\n    # Pass each value in the key-value pair RDD through a flat_map function\n    # without changing the keys; this also retains the original RDD's\n    # partitioning.\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([[\"a\", [1,2]], [\"b\", [3]]])\n    #   rdd = rdd.flat_map_values(lambda{|x| x*2})\n    #   rdd.collect\n    #   # => [[\"a\", 1], [\"a\", 2], [\"a\", 1], [\"a\", 2], [\"b\", 3], [\"b\", 3]]\n    #\n    def flat_map_values(f)\n      new_rdd_from_command(Spark::Command::FlatMapValues, f)\n    end\n\n    # Return an RDD with the first elements (keys) of this PairRDD\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])\n    #   rdd.keys.collect\n    #   # => [1, 3, 5]\n    #\n    def keys\n      self.map('lambda{|(key, _)| key}')\n    end\n\n    # Return an RDD with the second elements (values) of this PairRDD\n    #\n    # == Example:\n    #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])\n    #   rdd.values.collect\n    #   # => [2, 4, 6]\n    #\n    def values\n      self.map('lambda{|(_, value)| value}')\n    end\n\n    # Return the list of values in the RDD for key `key`.\n    # TODO: add a Partitioner for efficient searching\n    #\n    # == Example:\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd = rdd.group_by(lambda {|x| x%3})\n    #   rdd.lookup(2)\n    #   # => [[2, 5, 8]]\n    #\n    #   rdd = $sc.parallelize(0..10)\n    #   rdd = rdd.key_by(lambda{|x| x.even?})\n    #   rdd.lookup(true)\n    #   # => [0, 2, 4, 6, 8, 10]\n    #\n    def lookup(key)\n      lookup_key = \"lookup_key_#{object_id}\"\n\n      self.filter(\"lambda{|(key, _)| key == #{lookup_key}}\")\n          .bind(lookup_key => key)\n          .values\n          .collect\n    end\n\n    # Aliases\n    alias_method :partitionsSize, :partitions_size\n    alias_method :defaultReducePartitions, :default_reduce_partitions\n    alias_method :setName, :set_name\n    alias_method :addLibrary, :add_library\n    alias_method :require, :add_library\n\n    alias_method :flatMap, :flat_map\n    alias_method :mapPartitions, :map_partitions\n    alias_method :mapPartitionsWithIndex, :map_partitions_with_index\n    alias_method :reduceByKey, :reduce_by_key\n    alias_method :combineByKey, :combine_by_key\n    alias_method :groupByKey, :group_by_key\n    alias_method :groupWith, :group_with\n    alias_method :partitionBy, :partition_by\n    alias_method :foreachPartition, :foreach_partition\n    alias_method :mapValues, :map_values\n    alias_method :takeSample, :take_sample\n    alias_method :sortBy, :sort_by\n    alias_method :sortByKey, :sort_by_key\n    alias_method :keyBy, :key_by\n    alias_method :groupBy, :group_by\n    alias_method :foldByKey, :fold_by_key\n    alias_method :aggregateByKey, :aggregate_by_key\n    alias_method :subtractByKey, :subtract_by_key\n    alias_method :sampleStdev, :sample_stdev\n    alias_method :sampleVariance, :sample_variance\n\n    private\n\n      # This is the base method for reduce operations; it is used by reduce,\n      # fold and aggregate. The only difference is that fold has a zero value.
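\n      #\n      # A hedged sketch of the call flow (the command class names here are\n      # assumed; see the public reduce/fold methods defined earlier in this\n      # file for the real call sites):\n      #\n      #   reduce(f)     # roughly _reduce(Spark::Command::Reduce, f, f)\n      #   fold(zero, f) # roughly _reduce(Spark::Command::Fold, f, f, zero)\n      #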
\n      def _reduce(klass, seq_op, comb_op, zero_value=nil)\n        if seq_op.nil?\n          # Partitions are already reduced\n          rdd = self\n        else\n          rdd = new_rdd_from_command(klass, seq_op, zero_value)\n        end\n\n        # Send all results to one worker and combine results\n        rdd = rdd.coalesce(1).compact\n\n        # Add the same function to the new RDD\n        comm = rdd.add_command(klass, comb_op, zero_value)\n        comm.deserializer = @command.serializer\n\n        # Value is returned in an array\n        PipelinedRDD.new(rdd, comm).collect[0]\n      end\n\n      def _partition_by(num_partitions, klass, *args)\n        # The RDD is transformed from [key, value] to [hash, [key, value]]\n        keyed = new_rdd_from_command(klass, *args)\n        keyed.serializer.unbatch!\n\n        # PairwiseRDD and PythonPartitioner are borrowed from Python\n        # but work great on Ruby too\n        pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD\n        partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)\n        new_jrdd = pairwise_rdd.partitionBy(partitioner).values\n\n        # Reset deserializer\n        RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)\n      end\n\n      # Shared implementation for the combine-by-key variants\n      #\n      # == Used for:\n      # * combine_by_key\n      # * fold_by_key (with zero value)\n      #\n      def _combine_by_key(combine, merge, num_partitions)\n        num_partitions ||= default_reduce_partitions\n\n        # Combine key\n        combined = new_rdd_from_command(combine.shift, *combine)\n\n        # Merge items\n        shuffled = combined.partition_by(num_partitions)\n        merge_comm = shuffled.add_command(merge.shift, *merge)\n\n        PipelinedRDD.new(shuffled, merge_comm)\n      end\n\n  end\n\n  # Pipelined Resilient Distributed Dataset; operations are pipelined and sent to the worker\n  #\n  #   RDD\n  #   `-- map\n  #       `-- map\n  #           `-- map\n  #\n  # Code is executed from top to bottom\n  #\n  class PipelinedRDD < RDD\n\n    attr_reader :prev_jrdd, :command\n\n    def initialize(prev, command)\n\n      if prev.is_a?(PipelinedRDD) && prev.pipelinable?\n        # Second, ... stages\n        @prev_jrdd = prev.prev_jrdd\n      else\n        # First stage\n        @prev_jrdd = prev.jrdd\n      end\n\n      @cached = false\n      @checkpointed = false\n\n      @context = prev.context\n      @command = command\n    end\n\n    def pipelinable?\n      !(cached? || checkpointed?)\n    end\n\n    # Serializes necessary things and sends them to RubyRDD (Scala extension)\n    def jrdd\n      @jrdd ||= _jrdd\n    end\n\n    private\n\n      def _jrdd\n        command = @command.build\n\n        broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values\n        broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))\n\n        ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)\n        ruby_rdd.asJavaRDD\n      end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/sampler.rb",
    "content": "require 'distribution'\n\n# Random Generators\nmodule Spark\n  module RandomGenerator\n    class Poisson\n\n      def initialize(mean, seed)\n        generator = Random.new(seed)\n        @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator)\n      end\n\n      def rand\n        t = 0.0\n        number = 0\n\n        loop{\n          t += @exp_rng.call\n          if t > 1\n            return number\n          end\n          number += 1\n        }\n      end\n\n    end\n  end\nend\n\n# Samplers\nmodule Spark\n  module Sampler\n\n    class Base\n      attr_reader :fraction, :seed\n\n      def initialize(fraction, seed=nil)\n        @fraction = fraction\n        @seed = seed || Random.new_seed\n      end\n    end\n\n    # Poisson Sampler\n    # -------------------------------------------------------------------------\n    class Poisson < Base\n\n      def sample(iterator)\n        iterator.map! do |item|\n          count = rng.rand\n          Array.new(count) { item }\n        end\n        iterator.flatten!\n        iterator.compact!\n        iterator\n      end\n\n      def lazy_sample(iterator)\n        Enumerator::Lazy.new(iterator) do |yielder, value|\n          count = rng.rand\n          count.times { yielder << value }\n        end\n      end\n\n      def rng\n        @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed)\n      end\n\n    end\n\n    # Uniform Sampler\n    # -------------------------------------------------------------------------\n    class Uniform < Base\n\n      def sample(iterator)\n        iterator.select!{|item| rng.rand <= fraction}\n        iterator\n      end\n\n      def lazy_sample(iterator)\n        iterator.select do |item|\n          rng.rand <= fraction\n        end\n      end\n\n      def rng\n        @rng ||= Random.new(seed)\n      end\n\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/serializer/auto_batched.rb",
    "content": "module Spark\n  module Serializer\n    ##\n    # AutoBatched serializator\n    #\n    # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer.\n    #\n    class AutoBatched < Batched\n\n      MAX_RATIO = 10\n\n      def initialize(serializer, best_size=65536)\n        @serializer = serializer\n        @best_size = best_size.to_i\n\n        error('Batch size must be greater than 1') if @best_size < 2\n      end\n\n      def batched?\n        true\n      end\n\n      def unbatch!\n      end\n\n      def name\n        \"AutoBatched(#{@best_size})\"\n      end\n\n      def dump_to_io(data, io)\n        check_each(data)\n\n        # Only Array have .slice\n        data = data.to_a\n\n        index = 0\n        batch = 2\n        max = @best_size * MAX_RATIO\n\n        loop do\n          chunk = data.slice(index, batch)\n          if chunk.nil? || chunk.empty?\n            break\n          end\n\n          serialized = @serializer.dump(chunk)\n          io.write_string(serialized)\n\n          index += batch\n\n          size = serialized.bytesize\n          if size < @best_size\n            batch *= 2\n          elsif size > max && batch > 1\n            batch /= 2\n          end\n        end\n\n        io.flush\n      end\n\n    end\n  end\nend\n\nSpark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)\n"
  },
  {
    "path": "lib/spark/serializer/base.rb",
    "content": "module Spark\n  module Serializer\n    # @abstract Parent for all serializers\n    class Base\n\n      def load_from_io(io)\n        return to_enum(__callee__, io) unless block_given?\n\n        loop do\n          size = io.read_int_or_eof\n          break if size == Spark::Constant::DATA_EOF\n\n          yield load(io.read(size))\n        end\n      end\n\n      def load_from_file(file, *args)\n        return to_enum(__callee__, file, *args) unless block_given?\n\n        load_from_io(file, *args).each do |item|\n          yield item\n        end\n\n        file.close\n        file.unlink\n      end\n\n      def ==(other)\n        self.to_s == other.to_s\n      end\n\n      def batched?\n        false\n      end\n\n      def unbatch!\n      end\n\n      def check_each(data)\n        unless data.respond_to?(:each)\n          error('Data must be iterable.')\n        end\n      end\n\n      def error(message)\n        raise Spark::SerializeError, message\n      end\n\n      def name\n        self.class.name.split('::').last\n      end\n\n      def to_s\n        name\n      end\n\n      def inspect\n        %{#<Spark::Serializer:0x#{object_id}  \"#{self}\">}\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/serializer/batched.rb",
    "content": "module Spark\n  module Serializer\n    class Batched < Base\n\n      attr_writer :serializer\n\n      def initialize(serializer, batch_size=nil)\n        batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE\n\n        @serializer = serializer\n        @batch_size = batch_size.to_i\n\n        error('Batch size must be greater than 0') if @batch_size < 1\n      end\n\n      # Really batched\n      def batched?\n        @batch_size > 1\n      end\n\n      def unbatch!\n        @batch_size = 1\n      end\n\n      def load(data)\n        @serializer.load(data)\n      end\n\n      def dump(data)\n        @serializer.dump(data)\n      end\n\n      def name\n        \"Batched(#{@batch_size})\"\n      end\n\n      def to_s\n        \"#{name} -> #{@serializer}\"\n      end\n\n\n      # === Dump ==============================================================\n\n      def dump_to_io(data, io)\n        check_each(data)\n\n        if batched?\n          data = data.each_slice(@batch_size)\n        end\n\n        data.each do |item|\n          serialized = dump(item)\n          io.write_string(serialized)\n        end\n\n        io.flush\n      end\n\n\n      # === Load ==============================================================\n\n      def load_from_io(io)\n        return to_enum(__callee__, io) unless block_given?\n\n        loop do\n          size = io.read_int_or_eof\n          break if size == Spark::Constant::DATA_EOF\n\n          data = io.read(size)\n          data = load(data)\n\n          if batched?\n            data.each{|item| yield item }\n          else\n            yield data\n          end\n        end\n      end\n\n    end\n  end\nend\n\nSpark::Serializer.register('batched', Spark::Serializer::Batched)\n"
  },
  {
    "path": "lib/spark/serializer/cartesian.rb",
    "content": "module Spark\n  module Serializer\n    class Cartesian < Pair\n\n      def aggregate(item1, item2)\n        item1.product(item2)\n      end\n\n    end\n  end\nend\n\nSpark::Serializer.register('cartesian', Spark::Serializer::Cartesian)\n"
  },
  {
    "path": "lib/spark/serializer/compressed.rb",
    "content": "module Spark\n  module Serializer\n    class Compressed < Base\n\n      def initialize(serializer)\n        @serializer = serializer\n      end\n\n      def dump(data)\n        Zlib::Deflate.deflate(@serializer.dump(data))\n      end\n\n      def load(data)\n        @serializer.load(Zlib::Inflate.inflate(data))\n      end\n\n    end\n  end\nend\n\nbegin\n  # TODO: require only if it is necessary\n  require 'zlib'\n\n  Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed)\nrescue LoadError\nend\n"
  },
  {
    "path": "lib/spark/serializer/marshal.rb",
    "content": "module Spark\n  module Serializer\n    class Marshal < Base\n\n      def dump(data)\n        ::Marshal.dump(data)\n      end\n\n      def load(data)\n        ::Marshal.load(data)\n      end\n\n    end\n  end\nend\n\nSpark::Serializer.register('marshal', Spark::Serializer::Marshal)\n"
  },
  {
    "path": "lib/spark/serializer/message_pack.rb",
    "content": "module Spark\n  module Serializer\n    class MessagePack < Base\n\n      def dump(data)\n        ::MessagePack.dump(data)\n      end\n\n      def load(data)\n        ::MessagePack.load(data)\n      end\n\n    end\n  end\nend\n\nbegin\n  # TODO: require only if it is necessary\n  require 'msgpack'\n\n  Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack)\nrescue LoadError\nend\n"
  },
  {
    "path": "lib/spark/serializer/oj.rb",
    "content": "module Spark\n  module Serializer\n    class Oj < Base\n\n      def dump(data)\n        ::Oj.dump(data)\n      end\n\n      def load(data)\n        ::Oj.load(data)\n      end\n\n    end\n  end\nend\n\nbegin\n  # TODO: require only if it is necessary\n  require 'oj'\n\n  Spark::Serializer.register('oj', Spark::Serializer::Oj)\nrescue LoadError\nend\n"
  },
  {
    "path": "lib/spark/serializer/pair.rb",
    "content": "module Spark\n  module Serializer\n    class Pair < Base\n\n      def initialize(serializer1, serializer2)\n        @serializer1 = serializer1\n        @serializer2 = serializer2\n      end\n\n      def to_s\n        \"#{name}(#{@serializer1}, #{@serializer2})\"\n      end\n\n      def aggregate(item1, item2)\n        item1.zip(item2)\n      end\n\n      def load_from_io(io)\n        return to_enum(__callee__, io) unless block_given?\n\n        loop do\n          size = io.read_int_or_eof\n          break if size == Spark::Constant::DATA_EOF\n\n          item1 = @serializer1.load(io.read(size))\n          item2 = @serializer2.load(io.read_string)\n\n          item1 = [item1] unless @serializer1.batched?\n          item2 = [item2] unless @serializer2.batched?\n\n          aggregate(item1, item2).each do |item|\n            yield item\n          end\n        end\n      end\n\n    end\n  end\nend\n\nSpark::Serializer.register('pair', Spark::Serializer::Pair)\n"
  },
  {
    "path": "lib/spark/serializer/text.rb",
    "content": "module Spark\n  module Serializer\n    class Text < Base\n\n      attr_reader :encoding\n\n      def initialize(encoding=Encoding::UTF_8)\n        error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding)\n\n        @encoding = encoding\n      end\n\n      def load(data)\n        data.to_s.force_encoding(@encoding)\n      end\n\n      def to_s\n        \"Text(#{@encoding})\"\n      end\n\n    end\n  end\nend\n\nSpark::Serializer.register('string', 'text', Spark::Serializer::Text)\n"
  },
  {
    "path": "lib/spark/serializer.rb",
    "content": "module Spark\n  ##\n  # Serializer\n  #\n  module Serializer\n\n    DEFAULT_COMPRESS = false\n    DEFAULT_BATCH_SIZE = 1024\n    DEFAULT_SERIALIZER_NAME = 'marshal'\n\n    @@registered = {}\n\n    # Register class and create method for quick access.\n    # Class will be available also as __name__ for using\n    # in build method (Proc binding problem).\n    #\n    # == Examples:\n    #   register('test1', 'test2', Class)\n    #\n    #   Spark::Serializer.test1\n    #   Spark::Serializer.test2\n    #\n    #   # Proc binding problem\n    #   build { marshal } # => Spark::Serializer::Marshal\n    #\n    #   marshal = 1\n    #   build { marshal } # => 1\n    #\n    #   build { __marshal__ } # => Spark::Serializer::Marshal\n    #\n    def self.register(*args)\n      klass = args.pop\n      args.each do |arg|\n        @@registered[arg] = klass\n        define_singleton_method(arg.to_sym){|*args| klass.new(*args) }\n        define_singleton_method(\"__#{arg}__\".to_sym){|*args| klass.new(*args) }\n      end\n    end\n\n    def self.find(name)\n      @@registered[name.to_s.downcase]\n    end\n\n    def self.find!(name)\n      klass = find(name)\n\n      if klass.nil?\n        raise Spark::SerializeError, \"Unknow serializer #{name}.\"\n      end\n\n      klass\n    end\n\n    def self.build(text=nil, &block)\n      if block_given?\n        class_eval(&block)\n      else\n        class_eval(text.to_s.downcase)\n      end\n    end\n\n  end\nend\n\n# Parent\nrequire 'spark/serializer/base'\n\n# Basic\nrequire 'spark/serializer/oj'\nrequire 'spark/serializer/marshal'\nrequire 'spark/serializer/message_pack'\nrequire 'spark/serializer/text'\n\n# Others\nrequire 'spark/serializer/batched'\nrequire 'spark/serializer/auto_batched'\nrequire 'spark/serializer/compressed'\nrequire 'spark/serializer/pair'\nrequire 'spark/serializer/cartesian'\n"
  },
  {
    "path": "lib/spark/sort.rb",
    "content": "module Spark\n  module InternalSorter\n    class Base\n      def initialize(key_function)\n        @key_function = key_function\n      end\n    end\n\n    class Ascending < Base\n      def sort(data)\n        data.sort_by!(&@key_function)\n      end\n    end\n\n    class Descending < Ascending\n      def sort(data)\n        super\n        data.reverse!\n      end\n    end\n\n    def self.get(ascending, key_function)\n      if ascending\n        type = Ascending\n      else\n        type = Descending\n      end\n\n      type.new(key_function)\n    end\n  end\nend\n\n\nmodule Spark\n  class ExternalSorter\n\n    include Spark::Helper::System\n\n    # Items from GC cannot be destroyed so #make_parts need some reserve\n    MEMORY_RESERVE = 50 # %\n\n    # How big will be chunk for adding new memory because GC not cleaning\n    # immediately un-referenced variables\n    MEMORY_FREE_CHUNK = 10 # %\n\n    # How many items will be evaluate from iterator at start\n    START_SLICE_SIZE = 10\n\n    # Maximum of slicing. Memory control can be avoided by large value.\n    MAX_SLICE_SIZE = 10_000\n\n    # How many values will be taken from each enumerator.\n    EVAL_N_VALUES = 10\n\n    # Default key function\n    KEY_FUNCTION = lambda{|item| item}\n\n    attr_reader :total_memory, :memory_limit, :memory_chunk, :serializer\n\n    def initialize(total_memory, serializer)\n      @total_memory = total_memory\n      @memory_limit = total_memory * (100-MEMORY_RESERVE)    / 100\n      @memory_chunk = total_memory * (100-MEMORY_FREE_CHUNK) / 100\n      @serializer   = serializer\n    end\n\n    def add_memory!\n      @memory_limit += memory_chunk\n    end\n\n    def sort_by(iterator, ascending=true, key_function=KEY_FUNCTION)\n      return to_enum(__callee__, iterator, key_function) unless block_given?\n\n      create_temp_folder\n      internal_sorter = Spark::InternalSorter.get(ascending, key_function)\n\n      # Make N sorted enumerators\n      parts = make_parts(iterator, internal_sorter)\n\n      return [] if parts.empty?\n\n      # Need new key function because items have new structure\n      # From: [1,2,3] to [[1, Enumerator],[2, Enumerator],[3, Enumerator]]\n      key_function_with_enum = lambda{|(key, _)| key_function[key]}\n      internal_sorter = Spark::InternalSorter.get(ascending, key_function_with_enum)\n\n      heap  = []\n      enums = []\n\n      # Load first items to heap\n      parts.each do |part|\n        EVAL_N_VALUES.times {\n          begin\n            heap << [part.next, part]\n          rescue StopIteration\n            break\n          end\n        }\n      end\n\n      # Parts can be empty but heap not\n      while parts.any? 
 || heap.any?\n        internal_sorter.sort(heap)\n\n        # Since every part is sorted and the heap holds EVAL_N_VALUES items per\n        # live part, the method can safely add EVAL_N_VALUES items to the result\n        EVAL_N_VALUES.times {\n          break if heap.empty?\n\n          item, enum = heap.shift\n          enums << enum\n\n          yield item\n        }\n\n        # Refill the heap from the parts whose items were just yielded\n        while (enum = enums.shift)\n          begin\n            heap << [enum.next, enum]\n          rescue StopIteration\n            parts.delete(enum)\n            enums.delete(enum)\n          end\n        end\n      end\n\n    ensure\n      destroy_temp_folder\n    end\n\n    private\n\n      def create_temp_folder\n        @dir = Dir.mktmpdir\n      end\n\n      def destroy_temp_folder\n        FileUtils.remove_entry_secure(@dir) if @dir\n      end\n\n      # A new part is created when the current part exceeds the memory limit\n      # (which is variable). Every new part gets more memory because of Ruby GC.\n      def make_parts(iterator, internal_sorter)\n        slice = START_SLICE_SIZE\n\n        parts = []\n        part  = []\n\n        loop do\n          begin\n            # Enumerator does not have a slice method\n            slice.times { part << iterator.next }\n          rescue StopIteration\n            break\n          end\n\n          # Careful: memory_limit is variable\n          if memory_usage > memory_limit\n            # Sort the current part with the original key_function\n            internal_sorter.sort(part)\n            # Tempfile for the current part;\n            # it will be destroyed on #destroy_temp_folder\n            file = Tempfile.new(\"part\", @dir)\n            serializer.dump(part, file)\n            # Position is at the end of the file; rewind\n            file.seek(0)\n            parts << serializer.load(file)\n\n            # Some memory will be released, but not immediately;\n            # reserve some new memory before the next part starts\n            part.clear\n            add_memory!\n          else\n            slice = [slice*2, MAX_SLICE_SIZE].min\n          end\n        end\n\n        # Last part, which is not in a file\n        if part.any?\n          internal_sorter.sort(part)\n          parts << part.each\n        end\n\n        parts\n      end\n\n  end # ExternalSorter\nend # Spark\n"
  },
  {
    "path": "lib/spark/sql/column.rb",
    "content": "module Spark\n  module SQL\n    class Column\n\n      # =============================================================================\n      # Creating\n\n      def self.to_java(col)\n        if col.is_a?(Column)\n          col.jcolumn\n        else\n          from_name(col)\n        end\n      end\n\n      def self.from_literal(literal)\n        JSQLFunctions.lit(literal)\n      end\n\n      def self.from_name(name)\n        JSQLFunctions.col(name)\n      end\n\n\n      # =============================================================================\n      # Functions for virtual columns\n\n      # Evaluates a list of conditions and returns one of multiple possible result expressions.\n      # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions.\n      #\n      # == Parameters:\n      # condition:: a boolean {Column} expression\n      # value:: a literal value, or a {Column} expression\n      #\n      # == Example:\n      #   df.select(when(df['age'] == 2, 3).otherwise(4).alias(\"age\")).collect()\n      #   # [Row(age=3), Row(age=4)]\n      #\n      #   df.select(when(df.age == 2, df.age + 1).alias(\"age\")).collect()\n      #   # [Row(age=3), Row(age=nil)]\n      #\n      def self.when(condition, value)\n        Column.new(JSQLFunctions).when(condition, value)\n      end\n\n\n      # =============================================================================\n      # Initialized column\n\n      attr_reader :jcolumn\n\n      def initialize(jcolumn)\n        @jcolumn = jcolumn\n      end\n\n      FUNC_OPERATORS = {\n        '!' => 'not',\n        '~' => 'negate',\n        '-@' => 'negate'\n      }\n\n      BIN_OPERATORS = {\n        '[]' => 'apply',\n        '+' => 'plus',\n        '-' => 'minus',\n        '*' => 'multiply',\n        '/' => 'divide',\n        '%' => 'mod',\n        '==' => 'equalTo',\n        '!=' => 'notEqual',\n        '<' => 'lt',\n        '<=' => 'leq',\n        '>' => 'gt',\n        '>=' => 'geq',\n        '&' => 'and',\n        '|' => 'or',\n        'like' => 'like',\n        'starts_with' => 'startsWith',\n        'ends_with' => 'endsWith',\n        'bitwiseOR' => 'bitwiseOR',\n        'bitwiseAND' => 'bitwiseAND',\n        'bitwiseXOR' => 'bitwiseXOR',\n      }\n\n      UNARY_OPERATORS = {\n        'asc' => 'asc',\n        'desc' => 'desc',\n        'is_null' => 'isNull',\n        'is_not_null' => 'isNotNull'\n      }\n\n      FUNC_OPERATORS.each do |op, func|\n        eval <<-METHOD\n          def #{op}\n            func_op('#{func}')\n          end\n        METHOD\n      end\n\n      BIN_OPERATORS.each do |op, func|\n        eval <<-METHOD\n          def #{op}(item)\n            bin_op('#{func}', item)\n          end\n        METHOD\n      end\n\n      UNARY_OPERATORS.each do |op, func|\n        eval <<-METHOD\n          def #{op}\n            unary_op('#{func}')\n          end\n        METHOD\n      end\n\n      # An expression that gets an item at position ordinal out of a list,\n      # or gets an item by key out of a Hash.\n      #\n      # == Example:\n      #   df.select(df.l.get_item(0), df.d.get_item(\"key\")).show\n      #   # +----+------+\n      #   # |l[0]|d[key]|\n      #   # +----+------+\n      #   # |   1| value|\n      #   # +----+------+\n      #\n      #   df.select(df.l[0], df.d[\"key\"]).show\n      #   # +----+------+\n      #   # |l[0]|d[key]|\n      #   # +----+------+\n      #   # |   1| value|\n      #   # +----+------+\n      #\n      def get_item(key)\n        self[key]\n      end\n\n      # 
An expression that gets a field by name in a StructField.\n      #\n      # == Example:\n      #   df.select(df.r.get_field(\"b\")).show\n      #   # +----+\n      #   # |r[b]|\n      #   # +----+\n      #   # |   b|\n      #   # +----+\n      #\n      #   df.select(df.r.a).show\n      #   # +----+\n      #   # |r[a]|\n      #   # +----+\n      #   # |   1|\n      #   # +----+\n      #\n      def get_field(name)\n        self[name]\n      end\n\n      # Return a {Column} which is a substring of the column.\n      #\n      # == Parameters:\n      # start:: start position (Integer or Column)\n      # length:: length of the substring (Integer or Column)\n      #\n      # == Example:\n      #   df.select(df.name.substr(1, 3).alias(\"col\")).collect\n      #   # => [#<Row(col: \"Ali\")>, #<Row(col: \"Bob\")>]\n      #\n      def substr(start, length)\n        if start.is_a?(Integer) && length.is_a?(Integer)\n          new_jcolumn = jcolumn.substr(start, length)\n        elsif start.is_a?(Column) && length.is_a?(Column)\n          new_jcolumn = jcolumn.substr(start.jcolumn, length.jcolumn)\n        else\n          raise ArgumentError, \"Unsupported type: #{start.class} and #{length.class}.\"\n        end\n\n        Column.new(new_jcolumn)\n      end\n\n      # A boolean expression that is evaluated to true if the value of this\n      # expression is contained by the evaluated values of the arguments.\n      #\n      # == Example:\n      #   df[df.name.isin(\"Bob\", \"Mike\")].collect\n      #   # => [#<Row(age: \"5\", name: \"Bob\")>]\n      #\n      #   df[df.age.isin(1, 2, 3)].collect\n      #   # => [#<Row(age: \"2\", name: \"Alice\")>]\n      #\n      def isin(*cols)\n        if cols.size == 1 && cols.first.is_a?(Array)\n          cols = cols.first\n        end\n\n        cols = cols.map do |col|\n          Column.from_literal(col)\n        end\n\n        new_jcolumn = jcolumn.isin(Spark.jb.to_seq(cols))\n        Column.new(new_jcolumn)\n      end\n\n      # Returns this column aliased with a new name or names (in the case of expressions that\n      # return more than one column, such as explode).\n      #\n      # == Example:\n      #   df.select(df.age.alias(\"age2\")).collect\n      #   # => [#<Row(age2: \"2\")>, #<Row(age2: \"5\")>]\n      #\n      def alias(name)\n        Column.new(jcolumn.as(name))\n      end\n\n      # Convert the column into type data_type.\n      #\n      # == Example:\n      #   df.select(df.age.cast(\"string\").alias('ages')).collect\n      #   # => [#<Row(ages: \"2\")>, #<Row(ages: \"5\")>]\n      #\n      #   df.select(df.age.cast(StringType.new).alias('ages')).collect\n      #   # => [#<Row(ages: \"2\")>, #<Row(ages: \"5\")>]\n      #\n      def cast(data_type)\n        case data_type\n        when String\n          new_jcolumn = jcolumn.cast(data_type)\n        when DataType\n          jdata_type = JDataType.fromJson(data_type.json)\n          new_jcolumn = jcolumn.cast(jdata_type)\n        else\n          raise ArgumentError, \"Unsupported type: #{data_type.class}\"\n        end\n\n        Column.new(new_jcolumn)\n      end\n\n      # A boolean expression that is evaluated to true if the value of this\n      # expression is between the given columns.\n      #\n      # == Example:\n      #   df.select(df.name, df.age.between(2, 4)).show\n      #   # +-----+--------------------------+\n      #   # | name|((age >= 2) && (age <= 4))|\n      #   # +-----+--------------------------+\n      #   # |Alice|                      true|\n      #   # |  Bob|               
       false|\n      #   # +-----+--------------------------+\n      #\n      def between(lower, upper)\n        (self >= lower) & (self <= upper)\n      end\n\n      # Evaluates a list of conditions and returns one of multiple possible result expressions.\n      # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions.\n      #\n      # == Parameters:\n      # condition:: a boolean {Column} expression.\n      # value:: a literal value, or a {Column} expression.\n      #\n      # == Example:\n      #   df.select(df.name, Column.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show\n      #   # +-----+--------------------------------------------------------+\n      #   # | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0|\n      #   # +-----+--------------------------------------------------------+\n      #   # |Alice|                                                      -1|\n      #   # |  Bob|                                                       1|\n      #   # +-----+--------------------------------------------------------+\n      #\n      def when(condition, value)\n        unless condition.is_a?(Column)\n          raise ArgumentError, \"Condition must be a Column\"\n        end\n\n        if value.is_a?(Column)\n          value = value.jcolumn\n        end\n        new_jcolumn = jcolumn.when(condition.jcolumn, value)\n        Column.new(new_jcolumn)\n      end\n\n\n      # Evaluates a list of conditions and returns one of multiple possible result expressions.\n      # If {Column.otherwise} is not invoked, nil is returned for unmatched conditions.\n      #\n      # == Example:\n      #   df.select(df.name, Column.when(df.age > 3, 1).otherwise(0)).show\n      #   # +-----+---------------------------------+\n      #   # | name|CASE WHEN (age > 3) THEN 1 ELSE 0|\n      #   # +-----+---------------------------------+\n      #   # |Alice|                                0|\n      #   # |  Bob|                                1|\n      #   # +-----+---------------------------------+\n      #\n      def otherwise(value)\n        if value.is_a?(Column)\n          value = value.jcolumn\n        end\n\n        new_jcolumn = jcolumn.otherwise(value)\n        Column.new(new_jcolumn)\n      end\n\n      def over(*)\n        raise Spark::NotImplemented\n      end\n\n      # Unknown methods are treated as field access: df.r.a == df.r['a']\n      def method_missing(method, *args)\n        get_field(method.to_s)\n      end\n\n      def to_s\n        \"Column(\\\"#{jcolumn.toString}\\\")\"\n      end\n\n      def inspect\n        \"#<#{to_s}>\"\n      end\n\n\n      alias_method :as, :alias\n      alias_method :slice, :substr\n      alias_method :astype, :cast\n\n      private\n\n        def func_op(name)\n          new_jcolumn = JSQLFunctions.__send__(name, jcolumn)\n          Column.new(new_jcolumn)\n        end\n\n        def bin_op(name, item)\n          if item.is_a?(Column)\n            col = item.jcolumn\n          else\n            col = item\n          end\n\n          new_jcolumn = jcolumn.__send__(name, col)\n          Column.new(new_jcolumn)\n        end\n\n        def unary_op(name)\n          new_jcolumn = jcolumn.__send__(name)\n          Column.new(new_jcolumn)\n        end\n\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/sql/context.rb",
    "content": "module Spark\n  module SQL\n    class Context\n\n      attr_reader :spark_context, :jsql_context\n\n      def initialize(spark_context)\n        @spark_context = spark_context\n        @jsql_context = JSQLContext.new(spark_context.sc)\n      end\n\n      def read\n        DataFrameReader.new(self)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/sql/data_frame.rb",
    "content": "module Spark\n  module SQL\n    ##\n    # Spark::SQL::DataFrame\n    #\n    # All example are base on people.json\n    #\n    class DataFrame\n\n      attr_reader :jdf, :sql_context\n\n      def initialize(jdf, sql_context)\n        @jdf = jdf\n        @sql_context = sql_context\n      end\n\n      # Returns the column as a {Column}.\n      #\n      # == Examples:\n      #   df.select(df['age']).collect\n      #   # => [#<Row {\"age\"=>2}>, #<Row {\"age\"=>5}>]\n      #\n      #   df[ [\"name\", \"age\"] ].collect\n      #   # => [#<Row {\"name\"=>\"Alice\", \"age\"=>2}>, #<Row {\"name\"=>\"Bob\", \"age\"=>5}>]\n      #\n      #   df[ df.age > 3 ].collect\n      #   # => [#<Row {\"age\"=>5, \"name\"=>\"Bob\"}>]\n      #\n      #   df[df[0] > 3].collect\n      #   # => [#<Row {\"age\"=>5, \"name\"=>\"Bob\"}>]\n      #\n      def [](item)\n        case item\n        when String\n          jcolumn = jdf.apply(item)\n          Column.new(jcolumn)\n        when Array\n          select(*item)\n        when Numeric\n          jcolumn = jdf.apply(columns[item])\n          Column.new(jcolumn)\n        when Column\n          where(item)\n        else\n          raise ArgumentError, \"Unsupported type: #{item.class}\"\n        end\n      end\n\n      # Returns all column names as a Array.\n      #\n      # == Example:\n      #   df.columns\n      #   # => ['age', 'name']\n      #\n      def columns\n        schema.fields.map(&:name)\n      end\n\n      # Returns the schema of this {DataFrame} as a {StructType}.\n      def schema\n        return @schema if @schema\n\n        begin\n          @schema = DataType.parse(JSON.parse(jdf.schema.json))\n        rescue => e\n          raise Spark::ParseError, 'Unable to parse datatype from schema'\n        end\n      end\n\n      def show_string(n=20, truncate=true)\n        jdf.showString(n, truncate)\n      end\n\n      # Prints the first n rows to the console.\n      #\n      # == Parameters:\n      # n:: Number of rows to show.\n      # truncate:: Whether truncate long strings and align cells right.\n      #\n      def show(n=20, truncate=true)\n        puts show_string(n, truncate)\n      end\n\n      # Prints out the schema in the tree format.\n      #\n      # == Example:\n      #   df.print_schema\n      #   # root\n      #   #  |-- age: integer (nullable = true)\n      #   #  |-- name: string (nullable = true)\n      #\n      def print_schema\n        puts jdf.schema.treeString\n      end\n\n      def explain(extended=false)\n        if extended\n          jdf.queryExecution.toString\n        else\n          jdf.queryExecution.executedPlan.toString\n        end\n      end\n\n      # Prints the (logical and physical) plans to the console for debugging purpose.\n      #\n      # == Example:\n      #   df.print_explain\n      #   # Scan PhysicalRDD[age#0,name#1]\n      #\n      #   df.print_explain(true)\n      #   # == Parsed Logical Plan ==\n      #   # ...\n      #   # == Analyzed Logical Plan ==\n      #   # ...\n      #   # == Optimized Logical Plan ==\n      #   # ...\n      #   # == Physical Plan ==\n      #   # ...\n      #\n      def print_explain(extended=false)\n        puts explain(extended)\n      end\n\n      # Returns all column names and their data types as a list.\n      #\n      # == Example:\n      #   df.dtypes\n      #   # => [('age', 'int'), ('name', 'string')]\n      #\n      def dtypes\n        schema.fields.map do |field|\n          [field.name, field.data_type.simple_string]\n        end\n      end\n\n      def 
\n      def inspect\n        types = dtypes.map do |(name, type)|\n          \"#{name}: #{type}\"\n        end\n\n        \"#<DataFrame(#{types.join(', ')})>\"\n      end\n\n      # Get column by name\n      def method_missing(method, *args, &block)\n        name = method.to_s\n        if columns.include?(name)\n          self[name]\n        else\n          super\n        end\n      end\n\n\n      # =============================================================================\n      # Collect\n\n      # Returns all the records as an Array of {Row}.\n      #\n      # == Example:\n      #   df.collect\n      #   # => [#<Row {\"age\"=>2, \"name\"=>\"Alice\"}>, #<Row {\"age\"=>5, \"name\"=>\"Bob\"}>]\n      #\n      def collect\n        Spark.jb.call(jdf, 'collect')\n      end\n\n      def collect_as_hash\n        result = collect\n        result.map!(&:to_h)\n        result\n      end\n\n      def values\n        result = collect\n        result.map! do |item|\n          item.to_h.values\n        end\n        result\n      end\n\n      # Returns the number of rows in this {DataFrame}.\n      def count\n        jdf.count.to_i\n      end\n\n      # Returns the first num rows as an Array of {Row}.\n      def take(num)\n        limit(num).collect\n      end\n\n      # Returns the first {Row}.\n      def first\n        take(1).first\n      end\n\n\n      # =============================================================================\n      # Queries\n\n      # Projects a set of expressions and returns a new {DataFrame}\n      #\n      # == Parameters:\n      # *cols::\n      #   List of column names (string) or expressions {Column}.\n      #   If one of the column names is '*', that column is expanded to include all columns\n      #   in the current DataFrame.\n      #\n      # == Example:\n      #   df.select('*').collect\n      #   # => [#<Row {\"age\"=>2, \"name\"=>\"Alice\"}>, #<Row {\"age\"=>5, \"name\"=>\"Bob\"}>]\n      #\n      #   df.select('name', 'age').collect\n      #   # => [#<Row {\"name\"=>\"Alice\", \"age\"=>2}>, #<Row {\"name\"=>\"Bob\", \"age\"=>5}>]\n      #\n      #   df.select(df.name, (df.age + 10).alias('age')).collect\n      #   # => [#<Row {\"name\"=>\"Alice\", \"age\"=>12}>, #<Row {\"name\"=>\"Bob\", \"age\"=>15}>]\n      #\n      def select(*cols)\n        jcols = cols.map do |col|\n          Column.to_java(col)\n        end\n\n        new_jdf = jdf.select(jcols)\n        DataFrame.new(new_jdf, sql_context)\n      end\n\n      # Filters rows using the given condition.\n      #\n      # == Examples:\n      #   df.filter(df.age > 3).collect\n      #   # => [#<Row {\"age\"=>5, \"name\"=>\"Bob\"}>]\n      #\n      #   df.where(df.age == 2).collect\n      #   # => [#<Row {\"age\"=>2, \"name\"=>\"Alice\"}>]\n      #\n      #   df.filter(\"age > 3\").collect\n      #   # => [#<Row {\"age\"=>5, \"name\"=>\"Bob\"}>]\n      #\n      #   df.where(\"age = 2\").collect\n      #   # => [#<Row {\"age\"=>2, \"name\"=>\"Alice\"}>]\n      #\n      def filter(condition)\n        case condition\n        when String\n          new_jdf = jdf.filter(condition)\n        when Column\n          new_jdf = jdf.filter(condition.jcolumn)\n        else\n          raise ArgumentError, 'Condition must be String or Column'\n        end\n\n        DataFrame.new(new_jdf, sql_context)\n      end\n\n      # Limits the result count to the number specified.\n      def limit(num)\n        new_jdf = jdf.limit(num)\n        DataFrame.new(new_jdf, sql_context)\n      end\n\n\n      alias_method :where, :filter\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/sql/data_frame_reader.rb",
    "content": "module Spark\n  module SQL\n    class DataFrameReader\n\n      attr_reader :sql_context, :jreader\n\n      def initialize(sql_context)\n        @sql_context = sql_context\n        @jreader = sql_context.jsql_context.read\n      end\n\n      def df(jdf)\n        DataFrame.new(jdf, sql_context)\n      end\n\n      # Specifies the input data source format.\n      # Parameter is name of the data source, e.g. 'json', 'parquet'.\n      def format(source)\n        jreader.format(source)\n        self\n      end\n\n      # Adds an input option for the underlying data source.\n      def option(key, value)\n        jreader.option(key, value.to_s)\n        self\n      end\n\n      # Adds input options for the underlying data source.\n      def options(options)\n        options.each do |key, value|\n          jreader.option(key, value.to_s)\n        end\n        self\n      end\n\n      # Loads data from a data source and returns it as a :class`DataFrame`.\n      #\n      # == Parameters:\n      # path:: Optional string for file-system backed data sources.\n      # format:: Optional string for format of the data source. Default to 'parquet'.\n      # schema:: Optional {StructType} for the input schema.\n      # options:: All other string options.\n      #\n      def load(path=nil, new_format=nil, new_schema=nil, new_options=nil)\n        new_format && format(new_format)\n        new_schema && schema(new_schema)\n        new_options && options(new_options)\n\n        if path.nil?\n          df(jreader.load)\n        else\n          df(jreader.load(path))\n        end\n      end\n\n      # Specifies the input schema.\n      #\n      # Some data sources (e.g. JSON) can infer the input schema automatically from data.\n      # By specifying the schema here, the underlying data source can skip the schema\n      # inference step, and thus speed up data loading.\n      #\n      # Parameter schema must be StructType object.\n      #\n      def schema(new_schema)\n        unless new_schema.is_a?(StructType)\n          raise ArgumentError, 'Schema must be a StructType.'\n        end\n\n        jschema = sql_context.jsql_context.parseDataType(new_schema.json)\n        jreader.schema(jschema)\n        self\n      end\n\n      # Loads a JSON file (one object per line) and returns the result as {DataFrame}\n      #\n      # If the schema parameter is not specified, this function goes\n      # through the input once to determine the input schema.\n      #\n      # == Parameters:\n      # path:: string, path to the JSON dataset\n      # schema:: an optional {StructType} for the input schema.\n      #\n      # == Example:\n      #   df = sql.read.json('people.json')\n      #   df.dtypes\n      #   # => [('age', 'bigint'), ('name', 'string')]\n      #\n      def json(path, new_schema=nil)\n        # ClassNotFoundException: Failed to load class for data source: json\n        # df(jreader.json(path))\n\n        load(path, 'org.apache.spark.sql.execution.datasources.json', new_schema)\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/sql/data_type.rb",
    "content": "module Spark\n  module SQL\n    ##\n    # Spark::SQL::DataType\n    #\n    class DataType\n\n      cattr_accessor :atomic_types\n      self.atomic_types = {}\n\n      cattr_accessor :complex_types\n      self.complex_types = {}\n\n      def self.parse(data)\n        if data.is_a?(Hash)\n          type = data['type']\n          if complex_types.has_key?(type)\n            complex_types[type].from_json(data)\n          # elsif type == 'udt'\n          #   UserDefinedType.from_json(data)\n          else\n            raise Spark::SQLError, \"Unsupported type: #{type}\"\n          end\n        else\n          if atomic_types.has_key?(data)\n            atomic_types[data].new\n          else\n            raise Spark::SQLError, \"Unsupported type: #{type}\"\n          end\n        end\n      end\n\n      def self.class_name\n        name.split('::').last\n      end\n\n      def self.type_name\n        class_name.sub('Type', '').downcase\n      end\n\n      def self.complex\n        complex_types[type_name] = self\n      end\n\n      def self.atomic\n        atomic_types[type_name] = self\n      end\n\n      def ==(other)\n        self.class == other.class && self.to_s == other.to_s\n      end\n\n      def type_name\n        self.class.type_name\n      end\n\n      def simple_string\n        type_name\n      end\n\n      def json_value\n        type_name\n      end\n\n      def json\n        json_value.to_json\n      end\n\n      def to_s\n        self.class.class_name\n      end\n\n      def inspect\n        \"#<#{to_s}>\"\n      end\n\n    end\n\n    ##\n    # Spark::SQL::StructType\n    #\n    # Struct type, consisting of a list of {StructField}.\n    # This is the data type representing a {Row}.\n    #\n    # == Example:\n    #   struct1 = StructType.new([StructField.new('f1', StringType.new, true)])\n    #   struct2 = StructType.new([StructField.new('f2', StringType.new, true)])\n    #   struct1 == struct2\n    #   # => true\n    #\n    class StructType < DataType\n      complex\n\n      attr_reader :fields\n\n      def self.from_json(json)\n        fields = json['fields'].map do |field|\n          StructField.from_json(field)\n        end\n\n        StructType.new(fields)\n      end\n\n      def initialize(fields=[])\n        @fields = fields\n        @names = fields.map(&:name)\n      end\n\n      def json_value\n        {\n          'type' => type_name,\n          'fields' => fields.map(&:json_value)\n        }\n      end\n\n      def to_s\n        \"StructType(#{fields.join(', ')})\"\n      end\n    end\n\n\n    ##\n    # Spark::SQL::StructField\n    #\n    class StructField < DataType\n\n      attr_reader :name, :data_type, :nullable, :metadata\n\n      def self.from_json(json)\n        StructField.new(json['name'], DataType.parse(json['type']), json['nullable'], json['metadata'])\n      end\n\n      # A field in {StructType}.\n      #\n      # == Parameters:\n      # name:: string, name of the field.\n      # data_type:: {DataType} of the field.\n      # nullable:: boolean, whether the field can be null (nil) or not.\n      # metadata:: a dict from string to simple type that can be to_internald to JSON automatically\n      #\n      # == Example:\n      #   f1 = StructField.new('f1', StringType.new, true)\n      #   f2 = StructField.new('f2', StringType.new, true)\n      #   f1 == f2\n      #   # => true\n      #\n      def initialize(name, data_type, nullable=true, metadata={})\n        @name = name\n        @data_type = data_type\n        @nullable = nullable\n        
@metadata = metadata\n      end\n\n      def json_value\n        {\n          'name' => name,\n          'type' => data_type.json_value,\n          'nullable' => nullable,\n          'metadata' => metadata,\n        }\n      end\n\n      def to_s\n        %{StructField(#{name}, #{data_type}, #{nullable})}\n      end\n    end\n\n    ##\n    # Spark::SQL::AtomicType\n    #\n    # An internal type used to represent everything that is not\n    # null, UDTs, arrays, structs, and maps.\n    #\n    class AtomicType < DataType\n    end\n\n    ##\n    # Spark::SQL::BooleanType\n    #\n    # Boolean data type.\n    #\n    class BooleanType < AtomicType\n      atomic\n    end\n\n    ##\n    # Spark::SQL::NumericType\n    #\n    # Numeric data types.\n    #\n    class NumericType < AtomicType\n    end\n\n\n    ##\n    # Spark::SQL::IntegralType\n    #\n    # Integral data types.\n    #\n    class IntegralType < NumericType\n    end\n\n\n    ##\n    # Spark::SQL::StringType\n    #\n    # String data type.\n    #\n    class StringType < AtomicType\n      atomic\n    end\n\n\n    ##\n    # Spark::SQL::LongType\n    #\n    # Long data type, i.e. a signed 64-bit integer.\n    #\n    # If the values are beyond the range of [-9223372036854775808, 9223372036854775807],\n    # please use {DecimalType}.\n    #\n    class LongType < IntegralType\n      atomic\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/sql/row.rb",
    "content": "module Spark\n  module SQL\n    ##\n    # Spark::SQL::Row\n    #\n    class Row\n      attr_reader :data\n\n      def self.from_java(object, with_schema=true)\n        if with_schema\n          fields = object.schema.fieldNames\n        else\n          # Create virtual schema (t0, t1, t2, ...)\n          raise Spark::NotImplemented, 'Row must have a schema'\n        end\n\n        if object.anyNull\n          data = {}\n          object.size.times do |i|\n            if object.isNullAt(i)\n              value = nil\n            else\n              value = Spark.jb.to_ruby(object.get(i))\n            end\n\n            data[ fields[i] ] = value\n          end\n        else\n          data = fields.zip(Spark.jb.to_ruby(object.values))\n        end\n\n        Row.new(data)\n      end\n\n      def initialize(data={})\n        @data = data.to_h\n      end\n\n      def [](item)\n        @data[item]\n      end\n\n      def to_h\n        @data\n      end\n\n      def inspect\n        formated = data.map do |key, value|\n          \"#{key}: \\\"#{value}\\\"\"\n        end\n\n        %{#<Row(#{formated.join(', ')})>}\n      end\n\n    end\n  end\nend\n"
  },
  {
    "path": "lib/spark/sql.rb",
    "content": "module Spark\n  module SQL\n    extend Spark::Library\n\n    autoload_without_import :Context,         'spark/sql/context'\n    autoload_without_import :DataType,        'spark/sql/data_type'\n    autoload_without_import :DataFrame,       'spark/sql/data_frame'\n    autoload_without_import :DataFrameReader, 'spark/sql/data_frame_reader'\n\n    autoload :Row,    'spark/sql/row'\n    autoload :Column, 'spark/sql/column'\n\n    # Types\n    autoload :StructType,   'spark/sql/data_type'\n    autoload :StructField,  'spark/sql/data_type'\n    autoload :AtomicType,   'spark/sql/data_type'\n    autoload :NumericType,  'spark/sql/data_type'\n    autoload :IntegralType, 'spark/sql/data_type'\n    autoload :StringType,   'spark/sql/data_type'\n    autoload :LongType,     'spark/sql/data_type'\n  end\n\n  SQLContext = Spark::SQL::Context\nend\n"
  },
  {
    "path": "lib/spark/stat_counter.rb",
    "content": "module Spark\n  class StatCounter\n\n    attr_reader :n   # count of our values\n    attr_reader :mu  # mean of our values\n    attr_reader :m2  # variance numerator (sum of (x - mean)^2)\n    attr_reader :max # max of our values\n    attr_reader :min # min of our values\n\n    def initialize(iterator)\n      @n = 0\n      @mu = 0.0\n      @m2 = 0.0\n      @max = -Float::INFINITY\n      @min = Float::INFINITY\n\n      merge(iterator)\n    end\n\n    def merge(other)\n      if other.is_a?(Spark::StatCounter)\n        merge_stat_counter(other)\n      elsif other.respond_to?(:each)\n        merge_array(other)\n      else\n        merge_value(other)\n      end\n\n      self\n    end\n\n    def sum\n      @n * @mu\n    end\n\n    # Return the variance of the values.\n    def variance\n      if @n == 0\n        Float::NAN\n      else\n        @m2 / @n\n      end\n    end\n\n    # Return the sample variance, which corrects for bias in estimating the variance by dividing\n    # by N-1 instead of N.\n    def sample_variance\n      if @n <= 1\n        Float::NAN\n      else\n        @m2 / (@n - 1)\n      end\n    end\n\n    # Return the standard deviation of the values.\n    def stdev\n      Math.sqrt(variance)\n    end\n\n    # Return the sample standard deviation of the values, which corrects for bias in estimating the\n    # variance by dividing by N-1 instead of N.\n    def sample_stdev\n      Math.sqrt(sample_variance)\n    end\n\n    def to_s\n      \"(count: #{count}, mean: #{mean}, stdev: #{stdev}, max: #{max}, min: #{min})\"\n    end\n\n    alias_method :count, :n\n    alias_method :mean, :mu\n    alias_method :max_value, :max\n    alias_method :min_value, :min\n    alias_method :sampleStdev, :sample_stdev\n    alias_method :sampleVariance, :sample_variance\n\n    private\n\n      def merge_stat_counter(other)\n        if other == self\n          other = self.deep_copy\n        end\n\n        if @n == 0\n          @n = other.n\n          @mu = other.mu\n          @m2 = other.m2\n          @max = other.max\n          @min = other.min\n        elsif other.n != 0\n          delta = other.mu - @mu\n\n          if other.n * 10 < @n\n            @mu = @mu + (delta * other.n) / (@n + other.n)\n          elsif @n * 10 < other.n\n            @mu = other.mu - (delta * @n) / (@n + other.n)\n          else\n            @mu = (@mu * @n + other.mu * other.n) / (@n + other.n)\n          end\n\n          @max = [@max, other.max].max\n          @min = [@min, other.min].min\n\n          @m2 += other.m2 + (delta * delta * @n * other.n) / (@n + other.n)\n          @n += other.n\n        end\n      end\n\n      def merge_array(array)\n        array.each do |item|\n          merge_value(item)\n        end\n      end\n\n      def merge_value(value)\n        delta = value - @mu\n        @n += 1\n        @mu += delta / @n\n        @m2 += delta * (value - @mu)\n        @max = [@max, value].max\n        @min = [@min, value].min\n      end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/storage_level.rb",
    "content": "# Necessary libraries\nSpark.load_lib\n\nmodule Spark\n  class StorageLevel\n\n    def self.reload\n      return if @reloaded\n      reload!\n      @reloaded = true\n    end\n\n    def self.reload!\n      self.const_set(:NONE,                  JStorageLevel.NONE)\n      self.const_set(:DISK_ONLY,             JStorageLevel.DISK_ONLY)\n      self.const_set(:DISK_ONLY_2,           JStorageLevel.DISK_ONLY_2)\n      self.const_set(:MEMORY_ONLY,           JStorageLevel.MEMORY_ONLY)\n      self.const_set(:MEMORY_ONLY_SER,       JStorageLevel.MEMORY_ONLY_SER)\n      self.const_set(:MEMORY_ONLY_2,         JStorageLevel.MEMORY_ONLY_2)\n      self.const_set(:MEMORY_ONLY_SER_2,     JStorageLevel.MEMORY_ONLY_SER_2)\n      self.const_set(:MEMORY_AND_DISK,       JStorageLevel.MEMORY_AND_DISK)\n      self.const_set(:MEMORY_AND_DISK_2,     JStorageLevel.MEMORY_AND_DISK_2)\n      self.const_set(:MEMORY_AND_DISK_SER,   JStorageLevel.MEMORY_AND_DISK_SER)\n      self.const_set(:MEMORY_AND_DISK_SER_2, JStorageLevel.MEMORY_AND_DISK_SER_2)\n      self.const_set(:OFF_HEAP,              JStorageLevel.OFF_HEAP)\n    end\n\n    def self.java_get(arg)\n      reload\n\n      if arg.is_a?(String)\n        const_get(arg.upcase)\n      else\n        arg\n      end\n    end\n\n  end\nend\n"
  },
  {
    "path": "lib/spark/version.rb",
    "content": "module Spark\n  VERSION = '1.2.1'\nend\n"
  },
  {
    "path": "lib/spark/worker/master.rb",
    "content": "#!/usr/bin/env ruby\n\n$PROGRAM_NAME = 'RubySparkMaster'\n\nrequire 'socket'\nrequire 'io/wait'\nrequire 'nio'\n\nrequire_relative 'worker'\n\n# New process group\n# Otherwise master can be killed from pry console\nProcess.setsid\n\n# =================================================================================================\n# Master\n#\nmodule Master\n\n  def self.create\n    case ARGV[0].to_s.strip\n    when 'thread'\n      Master::Thread.new\n    else\n      Master::Process.new\n    end\n  end\n\n  class Base\n    include Spark::Constant\n\n    def initialize\n      @port = ARGV[1].to_s.strip.to_i\n      @socket = TCPSocket.open('localhost', @port)\n      @worker_arguments = @socket.read_string\n    end\n\n    def run\n      selector = NIO::Selector.new\n      monitor = selector.register(@socket, :r)\n      monitor.value = Proc.new { receive_message }\n      loop {\n        selector.select {|monitor| monitor.value.call}\n      }\n    end\n\n    def receive_message\n      command = @socket.read_int\n\n      case command\n      when CREATE_WORKER\n        create_worker\n      when KILL_WORKER\n        kill_worker\n      when KILL_WORKER_AND_WAIT\n        kill_worker_and_wait\n      end\n    end\n\n    def kill_worker_and_wait\n      if kill_worker\n        @socket.write_int(SUCCESSFULLY_KILLED)\n      else\n        @socket.write_int(UNSUCCESSFUL_KILLING)\n      end\n    end\n  end\n\n  # ===============================================================================================\n  # Worker::Process\n  #\n  class Process < Base\n\n    def create_worker\n      if fork?\n        pid = ::Process.fork do\n          Worker::Process.new(@port).run\n        end\n      else\n        pid = ::Process.spawn(\"ruby #{@worker_arguments} worker.rb #{@port}\")\n      end\n\n      # Detach child from master to avoid zombie process\n      ::Process.detach(pid)\n    end\n\n    def kill_worker\n      worker_id = @socket.read_long\n      ::Process.kill('TERM', worker_id)\n    rescue\n      nil\n    end\n\n    def fork?\n      @can_fork ||= _fork?\n    end\n\n    def _fork?\n      return false if !::Process.respond_to?(:fork)\n\n      pid = ::Process.fork\n      exit unless pid # exit the child immediately\n      true\n    rescue NotImplementedError\n      false\n    end\n\n  end\n\n  # ===============================================================================================\n  # Worker::Thread\n  #\n  class Thread < Base\n\n    def initialize\n      ::Thread.abort_on_exception = true\n\n      # For synchronous access to socket IO\n      $mutex_for_command  = Mutex.new\n      $mutex_for_iterator = Mutex.new\n\n      super\n    end\n\n    def create_worker\n      ::Thread.new do\n        Worker::Thread.new(@port).run\n      end\n    end\n\n    def kill_worker\n      worker_id = @socket.read_long\n\n      thread = ObjectSpace._id2ref(worker_id)\n      thread.kill\n    rescue\n      nil\n    end\n\n  end\nend\n\n# Create proper master by worker_type\nMaster.create.run\n"
  },
  {
    "path": "lib/spark/worker/spark_files.rb",
    "content": "class SparkFiles\n\n  class << self\n    attr_accessor :root_directory\n  end\n\n  def self.get(file_name)\n    File.join(root_directory, file_name)\n  end\n\n  def self.get_content(file_name)\n    File.read(get(file_name))\n  end\n\nend\n"
  },
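  {
    "path": "examples/spark_files_sketch.rb",
    "content": "# Hypothetical usage sketch, not part of the original gem: SparkFiles only\n# joins file names against a root directory that the worker receives over\n# its socket (see worker.rb). The /tmp path below is an arbitrary example.\nrequire 'fileutils'\nrequire_relative '../lib/spark/worker/spark_files'\n\n# Prepare a demo file so the sketch is runnable\nFileUtils.mkdir_p('/tmp/spark-files-demo')\nFile.write('/tmp/spark-files-demo/lookup.txt', \"hello\\n\")\n\n# In a real worker this assignment is done from the socket protocol\nSparkFiles.root_directory = '/tmp/spark-files-demo'\n\nputs SparkFiles.get('lookup.txt')         # => /tmp/spark-files-demo/lookup.txt\nputs SparkFiles.get_content('lookup.txt') # => hello\n"
  },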
  {
    "path": "lib/spark/worker/worker.rb",
    "content": "#!/usr/bin/env ruby\n\n# Load root of the gem\nlib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__))\n$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)\n\nrequire 'ruby-spark.rb'\nrequire 'socket'\n\nrequire_relative 'spark_files'\n\n\n# =================================================================================================\n# Worker\n#\n# Iterator is LAZY !!!\n#\nmodule Worker\n  class Base\n\n    include Spark::Helper::Serialize\n    include Spark::Helper::System\n    include Spark::Constant\n\n    attr_accessor :socket\n\n    def initialize(port)\n      # Open socket to Spark\n      @socket = TCPSocket.open('localhost', port)\n\n      # Send back worker ID\n      socket.write_long(id)\n    end\n\n    def run\n      begin\n        compute\n      rescue => e\n        send_error(e)\n      else\n        successful_finish\n      end\n    end\n\n    private\n\n      def before_start\n        # Should be implemented in sub-classes\n      end\n\n      def before_end\n        # Should be implemented in sub-classes\n      end\n\n      # These methods must be on one method because iterator is Lazy\n      # which mean that exception can be raised at `serializer` or `compute`\n      def compute\n        before_start\n\n        # Load split index\n        @split_index = socket.read_int\n\n        # Load files\n        SparkFiles.root_directory = socket.read_string\n\n        # Load broadcast\n        count = socket.read_int\n        count.times do\n          Spark::Broadcast.register(socket.read_long, socket.read_string)\n        end\n\n        # Load command\n        @command = socket.read_data\n\n        # Load iterator\n        @iterator = @command.deserializer.load_from_io(socket).lazy\n\n        # Compute\n        @iterator = @command.execute(@iterator, @split_index)\n\n        # Result is not iterable\n        @iterator = [@iterator] unless @iterator.respond_to?(:each)\n\n        # Send result\n        @command.serializer.dump_to_io(@iterator, socket)\n      end\n\n      def send_error(e)\n        # Flag\n        socket.write_int(WORKER_ERROR)\n\n        # Message\n        socket.write_string(e.message)\n\n        # Backtrace\n        socket.write_int(e.backtrace.size)\n        e.backtrace.each do |item|\n          socket.write_string(item)\n        end\n\n        socket.flush\n\n        # Wait for spark\n        # Socket is closed before throwing an exception\n        # Singal that ruby exception was fully received\n        until socket.closed?\n          sleep(0.1)\n        end\n\n        # Depend on type of worker\n        kill_worker\n      end\n\n      def successful_finish\n        # Finish\n        socket.write_int(WORKER_DONE)\n\n        # Send changed accumulator\n        changed = Spark::Accumulator.changed\n        socket.write_int(changed.size)\n        changed.each do |accumulator|\n          socket.write_data([accumulator.id, accumulator.value])\n        end\n\n        # Send it\n        socket.flush\n\n        before_end\n      end\n\n      def log(message=nil)\n        return if !$DEBUG\n\n        $stdout.puts %{==> #{Time.now.strftime('%H:%M:%S')} [#{id}] #{message}}\n        $stdout.flush\n      end\n\n  end\n\n  # ===============================================================================================\n  # Worker::Process\n  #\n  class Process < Base\n\n    def id\n      ::Process.pid\n    end\n\n    private\n\n      def before_start\n        $PROGRAM_NAME = 'RubySparkWorker'\n      end\n\n      def 
kill_worker\n        Process.exit(false)\n      end\n\n  end\n\n  # ===============================================================================================\n  # Worker::Thread\n  #\n  class Thread < Base\n\n    def id\n      ::Thread.current.object_id\n    end\n\n    private\n\n      def load_command\n        $mutex_for_command.synchronize { super }\n      end\n\n      # Threads changing for reading is very slow\n      # Faster way is do it one by one\n      def load_iterator\n        # Wait for incoming connection for preventing deadlock\n        if jruby?\n          socket.io_wait\n        else\n          socket.wait_readable\n        end\n\n        $mutex_for_iterator.synchronize { super }\n      end\n\n      def kill_worker\n        Thread.current.kill\n      end\n\n  end\nend\n\n# Worker is loaded as standalone\nif $PROGRAM_NAME == __FILE__\n  worker = Worker::Process.new(ARGV[0])\n  worker.run\nend\n"
  },
  {
    "path": "lib/spark.rb",
    "content": "# Gems and libraries\nrequire 'method_source'\nrequire 'securerandom'\nrequire 'forwardable'\nrequire 'sourcify'\nrequire 'socket'\nrequire 'tempfile'\nrequire 'tmpdir'\nrequire 'json'\n\nmodule Spark\n  autoload :Context,        'spark/context'\n  autoload :Config,         'spark/config'\n  autoload :RDD,            'spark/rdd'\n  autoload :CLI,            'spark/cli'\n  autoload :Build,          'spark/build'\n  autoload :Serializer,     'spark/serializer'\n  autoload :Helper,         'spark/helper'\n  autoload :StorageLevel,   'spark/storage_level'\n  autoload :Command,        'spark/command'\n  autoload :CommandBuilder, 'spark/command_builder'\n  autoload :Sampler,        'spark/sampler'\n  autoload :Logger,         'spark/logger'\n  autoload :JavaBridge,     'spark/java_bridge'\n  autoload :ExternalSorter, 'spark/sort'\n  autoload :Constant,       'spark/constant'\n  autoload :Broadcast,      'spark/broadcast'\n  autoload :Accumulator,    'spark/accumulator'\n  autoload :StatCounter,    'spark/stat_counter'\n  autoload :Library,        'spark/library'\n\n  # Mllib\n  autoload :Mllib, 'spark/mllib'\n\n  # SQL\n  autoload :SQL,        'spark/sql'\n  autoload :SQLContext, 'spark/sql'\n\n  include Helper::System\n\n  DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf')\n\n  def self.print_logo(message=nil)\n    puts <<-STRING\n\n    Welcome to\n                  __           ____              __\n        ______ __/ /  __ __   / __/__  ___ _____/ /__\n       / __/ // / _ \\\\/ // /  _\\\\ \\\\/ _ \\\\/ _ `/ __/  '_/\n      /_/  \\\\_,_/_.__/\\\\_, /  /___/ .__/\\\\_,_/_/ /_/\\\\_\\\\   version #{Spark::VERSION}\n                    /___/      /_/\n\n    #{message}\n\n    STRING\n  end\n\n  # Returns current configuration. Configurations can be changed until\n  # context is initialized. In this case config is locked only for reading.\n  #\n  # == Configuration can be changed:\n  #\n  #   Spark.config.set('spark.app.name', 'RubySpark')\n  #\n  #   Spark.config['spark.app.name'] = 'RubySpark'\n  #\n  #   Spark.config do\n  #     set 'spark.app.name', 'RubySpark'\n  #   end\n  #\n  def self.config(&block)\n    @config ||= Spark::Config.new\n\n    if block_given?\n      @config.instance_eval(&block)\n    else\n      @config\n    end\n  end\n\n  # Destroy current configuration. This can be useful for restarting config\n  # to set new. It has no effect if context is already started.\n  def self.clear_config\n    @config = nil\n  end\n\n  # Return a current active context or nil.\n  def self.context\n    @context\n  end\n\n  # Current active SQLContext or nil.\n  def self.sql_context\n    @sql_context\n  end\n\n  # Initialize spark context if not already. Config will be automatically\n  # loaded on constructor. 
From that point `config` will use configuration\n  # from running Spark and will be locked only for reading.\n  def self.start\n    @context ||= Spark::Context.new\n  end\n\n  def self.start_sql\n    @sql_context ||= Spark::SQL::Context.new(start)\n  end\n\n  def self.stop\n    @context.stop\n    RubyWorker.stopServer\n    logger.info('Workers were stopped')\n  rescue\n    nil\n  ensure\n    @context = nil\n    @sql_context = nil\n    clear_config\n  end\n\n  def self.started?\n    !!@context\n  end\n\n\n  # ===============================================================================\n  # Defaults\n\n  # Load default configuration for Spark and RubySpark\n  # By default are values stored at ~/.ruby-spark.conf\n  # File is automatically created\n  def self.load_defaults\n    unless File.exists?(DEFAULT_CONFIG_FILE)\n      save_defaults_to(DEFAULT_CONFIG_FILE)\n    end\n\n    load_defaults_from(DEFAULT_CONFIG_FILE)\n  end\n\n  # Clear prev setting and load new from file\n  def self.load_defaults_from(file_path)\n    # Parse values\n    values = File.readlines(file_path)\n    values.map!(&:strip)\n    values.select!{|value| value.start_with?('gem.')}\n    values.map!{|value| value.split(nil, 2)}\n    values = Hash[values]\n\n    # Clear prev values\n    @target_dir = nil\n    @ruby_spark_jar = nil\n    @spark_home = nil\n\n    # Load new\n    @target_dir = values['gem.target']\n  end\n\n  # Create target dir and new config file\n  def self.save_defaults_to(file_path)\n    dir = File.join(Dir.home, \".ruby-spark.#{SecureRandom.uuid}\")\n\n    if Dir.exist?(dir)\n      save_defaults_to(file_path)\n    else\n      Dir.mkdir(dir, 0700)\n      file = File.open(file_path, 'w')\n      file.puts \"# Directory where will be Spark saved\"\n      file.puts \"gem.target   #{dir}\"\n      file.puts \"\"\n      file.puts \"# You can also defined spark properties\"\n      file.puts \"# spark.master                       spark://master:7077\"\n      file.puts \"# spark.ruby.serializer              marshal\"\n      file.puts \"# spark.ruby.serializer.batch_size   2048\"\n      file.close\n    end\n  end\n\n\n  # ===============================================================================\n  # Global settings and variables\n\n  def self.logger\n    @logger ||= Spark::Logger.new\n  end\n\n  # Root of the gem\n  def self.root\n    @root ||= File.expand_path('..', File.dirname(__FILE__))\n  end\n\n  # Default directory for java extensions\n  def self.target_dir\n    @target_dir ||= File.join(root, 'target')\n  end\n\n  # Directory where is worker.rb\n  def self.worker_dir\n    @worker_dir ||= File.join(root, 'lib', 'spark', 'worker')\n  end\n\n  def self.ruby_spark_jar\n    @ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar')\n  end\n\n  def self.spark_ext_dir\n    @spark_ext_dir ||= File.join(root, 'ext', 'spark')\n  end\n\n\n  # ===============================================================================\n  # Load JVM and jars\n\n  # Load dependent libraries, can be use once\n  # Cannot load before CLI::install\n  #\n  # == Parameters:\n  # target::\n  #   path to directory where are located sparks .jar files or single Spark jar\n  #\n  def self.load_lib(target=nil)\n    return if @java_bridge\n\n    target ||= Spark.target_dir\n\n    @java_bridge = JavaBridge.init(target)\n    @java_bridge.import_all\n    nil\n  end\n\n  def self.java_bridge\n    @java_bridge\n  end\n\n\n  # Aliases\n  class << self\n    alias_method :sc, :context\n    alias_method :jb, :java_bridge\n    alias_method :home, 
:root\n  end\n\nend\n\n# C/Java extensions\nrequire 'ruby_spark_ext'\n\n# Ruby core extensions\nrequire 'spark/ext/module'\nrequire 'spark/ext/object'\nrequire 'spark/ext/hash'\nrequire 'spark/ext/string'\nrequire 'spark/ext/integer'\nrequire 'spark/ext/ip_socket'\nrequire 'spark/ext/io'\n\n# Other requirments\nrequire 'spark/version'\nrequire 'spark/error'\n\n# Load default settings for gem and Spark\nSpark.load_defaults\n\n# Make sure that Spark be always stopped\nKernel.at_exit do\n  begin\n    Spark.started? && Spark.stop\n  rescue\n  end\nend\n"
  },
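  {
    "path": "examples/context_lifecycle_sketch.rb",
    "content": "# Hypothetical usage sketch, not part of the original gem: it walks through\n# the lifecycle defined in lib/spark.rb. Configuration is writable until\n# Spark.start; Spark.stop shuts the workers down and clears the config\n# (and is also triggered by the at_exit hook). Assumes the gem is installed\n# and its jar was built with `ruby-spark build`.\nrequire 'ruby-spark'\n\n# The three equivalent configuration styles from the docs in lib/spark.rb:\nSpark.config.set('spark.app.name', 'RubySpark')\nSpark.config['spark.ruby.serializer'] = 'marshal'\nSpark.config do\n  set 'spark.ruby.serializer.batch_size', 2048\nend\n\nSpark.start\nputs Spark.started?                    # => true\nputs Spark.sc.parallelize(1..10).count # => 10\nSpark.stop\n"
  },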
  {
    "path": "ruby-spark.gemspec",
    "content": "# coding: utf-8\n\nlib = File.expand_path('../lib', __FILE__)\n$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)\n\nrequire 'spark/version'\n\nGem::Specification.new do |spec|\n  spec.name          = 'ruby-spark'\n  spec.version       = Spark::VERSION\n  spec.authors       = ['Ondřej Moravčík']\n  spec.email         = ['moravcik.ondrej@gmail.com']\n  spec.summary       = %q{Ruby wrapper for Apache Spark}\n  spec.description   = %q{}\n  spec.homepage      = ''\n  spec.license       = 'MIT'\n\n  spec.files         = `git ls-files -z`.split(\"\\x0\")\n  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }\n  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})\n  spec.require_paths = ['lib']\n\n  if RUBY_PLATFORM =~ /java/\n    spec.platform = 'java'\n\n    extensions = ['ext/ruby_java/extconf.rb']\n  else\n    extensions = ['ext/ruby_c/extconf.rb']\n\n    spec.add_dependency 'rjb'\n  end\n\n  spec.extensions = extensions\n  spec.required_ruby_version = '>= 2.0'\n\n  spec.requirements << 'java, scala'\n\n  spec.add_dependency 'sourcify', '0.6.0.rc4'\n  spec.add_dependency 'method_source'\n  spec.add_dependency 'commander'\n  spec.add_dependency 'pry'\n  spec.add_dependency 'nio4r'\n  spec.add_dependency 'distribution'\n\n  spec.add_development_dependency 'bundler', '~> 1.6'\n  spec.add_development_dependency 'rake'\nend\n"
  },
  {
    "path": "spec/generator.rb",
    "content": "class Generator\n  def self.numbers(size=1000)\n    Array.new(size){ rand(1..1000) }\n  end\n\n  def self.numbers_with_zero(size=1000)\n    Array.new(size){ rand(0..1000) }\n  end\n\n  def self.words(size=1000)\n    Array.new(size) { word }\n  end\n\n  def self.word(size=10)\n    Array.new(rand(1..size)){(97+rand(26)).chr}.join\n  end\n\n  def self.lines(size=1000, letters=3)\n    Array.new(size) do\n      Array.new(rand(50..100)){\n        (97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0))\n      }.join\n    end\n  end\n\n  def self.hash(size=1000)\n    Array.new(size) do\n      [word(2), rand(1..10)]\n    end\n  end\n\n  def self.hash_with_values(size=1000, values_count=10)\n    Array.new(size) do\n      [word(2), Array.new(values_count) { rand(1..10) }]\n    end\n  end\nend\n"
  },
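  {
    "path": "spec/generator_demo.rb",
    "content": "# Hypothetical demo, not part of the original specs: shows the shape of the\n# random fixtures produced by spec/generator.rb (all values are random, so\n# the inline results are only examples).\nrequire_relative 'generator'\n\np Generator.numbers(3)             # e.g. [271, 14, 903]\np Generator.word                   # e.g. \"qkzfa\"\np Generator.hash(2)                # e.g. [[\"ab\", 4], [\"qa\", 9]] (pairs, not a Hash)\np Generator.hash_with_values(1, 3) # e.g. [[\"xy\", [2, 7, 5]]]\n"
  },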
  {
    "path": "spec/inputs/lorem_300.txt",
    "content": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ligula neque, ultricies et lorem\nvel, accumsan cursus felis. Maecenas ullamcorper, magna eu lobortis gravida, diam leo rutrum diam,\neget elementum sapien felis non magna. Etiam scelerisque, mauris et cursus fermentum, ipsum nisl\nvulputate nisl, sit amet pulvinar libero sem at lectus. Vivamus nibh lectus, elementum eget dui non,\nfermentum volutpat orci. Nam imperdiet, dui id placerat pellentesque, purus sem semper augue, id\ndictum est ipsum et erat. Integer arcu tortor, ullamcorper ac libero a, iaculis sollicitudin orci.\nSed dapibus hendrerit neque, ac aliquet arcu elementum sed. Phasellus ornare interdum erat, eget\nfringilla sapien ornare vitae. In condimentum, mi sed condimentum viverra, nisl sapien scelerisque\nmi, vel varius metus dolor eu lorem. Nulla pulvinar ac metus eu volutpat. Suspendisse potenti. Duis\nvitae mauris arcu. Proin et dignissim dolor, eget congue purus. Ut malesuada neque massa. Ut viverra\nfaucibus turpis, in pharetra nulla iaculis quis. Morbi imperdiet risus eu eros varius facilisis.\nAenean nec dapibus sapien. Fusce tempus, risus vitae volutpat faucibus, dolor diam cursus risus, sit\namet faucibus mauris mauris quis orci. Aliquam massa ante, accumsan non sapien quis, ullamcorper\nfermentum elit. Pellentesque risus orci, rhoncus ac mi sed, volutpat vehicula sem. Mauris suscipit\nodio vel mi scelerisque, at cursus libero ullamcorper. Nulla aliquam metus arcu, in vestibulum sem\nullamcorper eu. Pellentesque laoreet venenatis metus ut accumsan. Quisque ut enim interdum,\nfringilla lorem nec, dignissim orci. Fusce vel diam sed ante dictum scelerisque. Vestibulum lectus\nenim, gravida sit amet ullamcorper sit amet, rhoncus nec dui. Praesent eget molestie tellus, quis\niaculis sapien. Sed ut rutrum velit. Pellentesque habitant morbi tristique senectus et netus et\nmalesuada fames ac turpis egestas. Donec tortor quam, venenatis ac rhoncus et, gravida non orci. Ut\nlacus dolor, auctor id ante varius, pharetra placerat nulla. Nulla facilisi. Nam quis feugiat nibh,\nut ultrices est. Nulla at mi nec metus porttitor tempor. Donec leo lorem, rhoncus ut arcu eu,\nvenenatis eleifend risus. Phasellus non porttitor neque, sit amet accumsan nisl. Pellentesque non\nurna tempor, interdum orci non, gravida enim. Sed in urna et dolor cursus aliquet et vel magna.\nQuisque vestibulum tortor scelerisque orci mattis, eu aliquet sem condimentum. Proin ac ultricies\nerat. Integer sodales, turpis quis volutpat pretium, justo lacus lobortis mauris, nec commodo orci\nleo sit amet metus. Ut ornare ipsum vitae malesuada aliquam. Quisque lobortis semper elit id\nconsectetur. Aenean facilisis sapien eu ipsum adipiscing mattis. Praesent malesuada aliquet\nvenenatis. Ut aliquet vel sapien nec euismod. Morbi eros urna, rutrum ut iaculis sed, vulputate sit\namet nunc. Nulla facilisi. Morbi sagittis nec magna sed scelerisque. Maecenas a euismod eros.\nVestibulum suscipit pharetra velit porta fermentum. Phasellus euismod auctor metus ut interdum.\nQuisque lectus lorem, tristique ut libero vel, rhoncus tincidunt tellus. Sed malesuada vestibulum\npurus, at tincidunt massa imperdiet vitae. Ut mollis eleifend elit, et sodales nisl facilisis eu.\nFusce ligula ligula, porta id est sed, tincidunt malesuada odio. Maecenas ultricies dignissim nunc,\nquis adipiscing urna auctor commodo. Phasellus tincidunt odio non nulla luctus sollicitudin. Mauris\npharetra porttitor est iaculis sollicitudin. 
Curabitur quam sem, fringilla id tellus vitae,\nelementum convallis eros. Morbi sollicitudin eleifend leo, ut euismod ligula ornare sagittis. Nullam\nluctus, mi eget dapibus elementum, diam purus fringilla lectus, sit amet sodales neque turpis sed\nmi. Sed volutpat sem euismod posuere mollis. Integer viverra egestas lacinia. Quisque viverra metus\nmassa, in condimentum sem tincidunt a. Proin ac ipsum non leo sollicitudin consectetur id a sem.\nCras tempus venenatis nisl sit amet venenatis. Nulla facilisi. Morbi scelerisque mi est, vitae\nlobortis sem ultricies faucibus. In urna ante, faucibus ac eros et, dignissim mollis justo. Quisque\naliquet tortor sem, ac mattis tortor faucibus sed. Donec tortor lacus, egestas in convallis at,\nvulputate eu nibh. Aenean ligula augue, imperdiet in tempor id, consequat vitae erat. Sed id eros a\njusto semper ultricies. Curabitur nunc nisi, placerat at leo sed, vehicula pulvinar velit. Nullam ut\nipsum augue. Fusce condimentum quam commodo, venenatis massa eleifend, dignissim neque. Curabitur\nsit amet hendrerit tortor, a condimentum sem. Morbi lobortis porta porttitor. Maecenas mollis ipsum\nac est venenatis auctor at vel lectus. Mauris luctus euismod dolor. Cras vitae nibh eget sem\nplacerat adipiscing. Pellentesque ac molestie ligula. Vivamus sit amet lectus odio. Duis lacinia\nrutrum faucibus. Curabitur luctus ultricies enim, id imperdiet ipsum viverra vitae. Mauris et\niaculis erat, vel faucibus purus. Fusce non nisl tristique, dignissim lacus id, fermentum velit. Sed\nfacilisis sapien at interdum viverra. Aliquam erat volutpat. Maecenas suscipit diam vitae velit\nvulputate tincidunt. Nulla facilisi. Sed eget tortor et ante mollis cursus. Nullam vitae porttitor\nmagna. Quisque iaculis massa dui, id rutrum purus blandit eu. Duis convallis ipsum id commodo\niaculis. Praesent sagittis ut tortor ut varius. Curabitur consequat volutpat scelerisque. Cras\npharetra lectus eget urna imperdiet ullamcorper. Sed lacinia ut eros non malesuada. Quisque\nhendrerit suscipit convallis. Vivamus posuere vestibulum massa, non accumsan diam tincidunt eu.\nNulla bibendum dictum mi sit amet faucibus. Nullam egestas lorem nunc, vel malesuada elit imperdiet\nvitae. Sed luctus ligula at erat tempus tristique. Proin varius mi quis libero sollicitudin\nullamcorper. In hac habitasse platea dictumst. Praesent auctor arcu vel luctus consequat. Curabitur\nconsequat magna sit amet ante feugiat dictum. Morbi scelerisque faucibus urna, ac dapibus sem\nultricies eu. Pellentesque rhoncus sapien nec eros facilisis consectetur. Duis eleifend vestibulum\nsuscipit. Morbi orci metus, malesuada sit amet urna ac, laoreet vehicula lacus. Quisque gravida,\nnunc fringilla tincidunt vestibulum, lacus urna commodo nisl, quis sodales lectus ipsum et augue. Ut\nnon erat sit amet neque fermentum ultricies. Vestibulum tincidunt est elit, ac dapibus velit\nfaucibus id. Praesent in viverra libero. Proin eleifend, odio eget sodales dignissim, nunc arcu\nullamcorper libero, sit amet sodales diam ipsum in tellus. Suspendisse enim nunc, accumsan non\nligula et, vulputate viverra ante. Ut id elit eu dui dictum malesuada at id orci. Vivamus sed felis\naliquam metus consequat euismod nec eu libero. Phasellus mattis malesuada ipsum eu posuere. Nullam\nat massa enim. Duis vitae urna blandit, ultricies nisi in, consequat elit. Quisque nec nibh ut\ntortor pulvinar euismod. Praesent molestie felis ac risus elementum sollicitudin. Donec eu leo in\naugue convallis mattis. 
Cum sociis natoque penatibus et magnis dis parturient montes, nascetur\nridiculus mus. Integer ut dignissim lectus. Vivamus eros felis, gravida et auctor ut, volutpat vitae\ndui. Nunc adipiscing sapien et lectus rutrum vestibulum. Mauris fermentum, metus eu sollicitudin\nmalesuada, lorem diam vestibulum metus, ut elementum metus nibh sed augue. Cras lectus risus,\nfeugiat eget fringilla a, cursus et eros. Praesent aliquam justo vel condimentum lacinia. Sed\ncondimentum dui nec leo blandit, vel elementum odio laoreet. Quisque suscipit molestie iaculis.\nNullam dignissim, mauris sit amet condimentum aliquet, magna sapien scelerisque nisl, tincidunt\nauctor purus libero at lectus. Nulla facilisi. Sed egestas erat at dictum egestas. Cras non mauris\nut dolor interdum condimentum. Fusce quis hendrerit purus, dictum cursus mi. Maecenas mattis, turpis\nsit amet mollis ultricies, mi turpis ornare velit, eget suscipit magna eros sit amet purus. Integer\nut viverra elit. Praesent eu augue viverra nunc convallis porta. Etiam venenatis dignissim nisl et\nsemper. Cras eu nisl vitae justo ornare porttitor vel nec augue. Pellentesque faucibus mollis neque,\nnec ullamcorper purus mollis sed. Suspendisse ut molestie lectus, faucibus aliquet libero. Aliquam\ntristique, neque ut lobortis ultricies, tellus elit ultrices risus, sodales dapibus sem mauris et\nmagna. Sed et sem porttitor, fringilla mauris vestibulum, porttitor dui. Proin vitae viverra elit.\nInteger nec adipiscing velit. Nunc quis urna tristique, ultrices orci eget, aliquet lorem. Curabitur\nconsequat adipiscing sodales. In elementum condimentum ante id placerat. Cras ac turpis tristique\nlacus vulputate dictum vel nec libero. Curabitur fringilla interdum tempus. Integer placerat dolor\nut magna aliquet bibendum. Cras ac metus magna. Curabitur vehicula magna ut sapien viverra ornare.\nDonec risus nisi, imperdiet eu laoreet in, tempor lobortis urna. Etiam malesuada et lacus ac\nconsectetur. Morbi facilisis sapien quis nisl laoreet semper. Suspendisse volutpat sapien vel quam\nblandit faucibus. Nam sagittis velit eros, vitae suscipit tortor elementum ac. Pellentesque habitant\nmorbi tristique senectus et netus et malesuada fames ac turpis egestas. Donec nec nibh dictum,\npretium nulla eu, pharetra mauris. Vestibulum leo mi, convallis et euismod ac, molestie in ligula.\nVestibulum tempor tincidunt porttitor. Integer nisl orci, dignissim ac volutpat a, auctor eget\naugue. Suspendisse eget euismod nunc, eu elementum ipsum. Cras libero tortor, gravida quis\nvestibulum a, tincidunt aliquam mauris. Integer elementum pellentesque posuere. Donec accumsan\nfeugiat pulvinar. Aliquam eros justo, dictum non elementum nec, tristique vel massa. Nulla a velit\nporttitor, aliquam turpis nec, ultricies ligula. Nam id dignissim dui. Ut placerat arcu nec accumsan\nvarius. Sed quis accumsan nunc, in dapibus lorem. Morbi egestas sagittis pulvinar. Morbi id mauris\nante. Sed magna nibh, venenatis quis lacinia in, congue quis metus. Nunc lacus lectus, adipiscing\nsed consequat id, luctus vel dui. Mauris eu nisi erat. Proin eleifend lectus sit amet ligula\nfringilla semper. Suspendisse tristique, quam ac pharetra dictum, libero risus rutrum ipsum, eget\ntristique arcu neque vel nisi. Ut auctor nulla vitae porta faucibus. Suspendisse ut tellus enim.\nMorbi commodo posuere quam. Proin consequat in quam pulvinar posuere. Nunc id ullamcorper est. Cras\nac molestie massa. Cras leo tellus, tempus id nibh quis, porttitor laoreet elit. Mauris in ornare\nnisi. 
Duis vel velit felis. Suspendisse gravida felis nec nulla hendrerit pretium. Cras at orci\nneque. Phasellus vehicula, ipsum at tempus sodales, mauris est condimentum metus, a vehicula ante\ntellus sit amet diam. Suspendisse fermentum elit in volutpat viverra. Nullam gravida in augue sed\nmollis. Curabitur aliquam diam non quam aliquam ultrices. Quisque pretium semper diam eget\nmalesuada. Suspendisse porttitor sagittis sem at malesuada. Donec euismod elementum nulla, sit amet\neleifend enim adipiscing nec. Nullam porta, enim ac tincidunt molestie, turpis mi porta justo,\nornare tristique sem orci quis turpis. Nullam leo dolor, pellentesque ac hendrerit et, tempus quis\nnisi. Fusce pretium mattis tortor sagittis suscipit. Vestibulum vitae suscipit libero. Mauris\nconsequat sagittis mi, id tempus est condimentum et. In eget condimentum odio, a malesuada quam.\nVivamus id turpis non nulla eleifend cursus ut sit amet tellus. Proin ultrices luctus nibh, eget\ncondimentum ligula vestibulum in. Aliquam pharetra aliquet erat nec lacinia. Cras fringilla est\nfringilla ante tristique, vitae bibendum dolor malesuada. Praesent ut dui pulvinar, suscipit velit\ngravida, malesuada nunc. Cras tempus feugiat interdum. Vivamus lectus lorem, rutrum ut neque at,\nsollicitudin euismod nulla. Vestibulum ac ligula suscipit, ultricies felis eget, adipiscing lectus.\nMaecenas nec enim vel eros molestie lobortis faucibus sit amet urna. Sed ac consequat nulla. Nulla\net libero nisi. Pellentesque euismod nunc quis ipsum tristique, suscipit elementum magna aliquam.\nPraesent sit amet tincidunt leo. Duis tempor arcu eget est posuere imperdiet. Quisque vel dui\nadipiscing, auctor nibh vel, vulputate sapien. Curabitur eu sodales lacus. Aliquam felis eros,\nmattis a diam eu, ullamcorper vestibulum turpis. Vivamus vitae vulputate lacus, sed convallis lorem.\nVestibulum mattis sollicitudin vulputate. Mauris cursus erat eget nisi accumsan, nec commodo tellus\nblandit. Etiam gravida nulla et lorem molestie auctor. Mauris venenatis iaculis nulla vel mollis.\nMorbi pretium sed eros at commodo. Aliquam eu justo turpis. Pellentesque lobortis, nisl eget\nultricies dictum, augue sem placerat elit, vitae pretium lectus massa eget tortor. Nulla accumsan,\nmassa eu rutrum pharetra, mi sapien aliquam massa, viverra facilisis metus nisi in dolor. Duis felis\nvelit, interdum a elit non, cursus pellentesque libero. Cum sociis natoque penatibus et magnis dis\nparturient montes, nascetur ridiculus mus. Nunc vel nisi quis augue accumsan aliquam. Suspendisse\nante lectus, lobortis nec suscipit at, ullamcorper at diam. Aliquam hendrerit, eros ac egestas\ncondimentum, enim metus lobortis nibh, sit amet convallis augue nulla nec lorem. Lorem ipsum dolor\nsit amet, consectetur adipiscing elit. Ut ac ligula eget est blandit scelerisque at vitae nunc. Sed\nvenenatis eros non quam auctor posuere. Curabitur convallis dapibus semper. Fusce et leo sed massa\nposuere porta. Morbi convallis lobortis eros. Quisque ac nisl dictum, sagittis eros et, pellentesque\nmetus. Quisque mattis sodales lorem quis malesuada. Aenean neque sapien, rutrum vitae euismod quis,\neuismod eu mi. Etiam ante tellus, auctor vitae pulvinar a, mattis nec tellus. Morbi libero lectus,\nmattis sit amet convallis at, viverra et nisi. Proin a ante tristique, blandit urna at, lobortis\nleo. Praesent nec odio sit amet ligula adipiscing pretium at rhoncus felis. Ut ut velit turpis. Sed\ntempor lectus massa, vel gravida libero gravida a. 
Nunc mollis, lorem id dapibus hendrerit, mi orci\ngravida orci, at vehicula neque nisl quis nibh. Mauris feugiat, ligula sit amet interdum laoreet,\nlectus leo accumsan dolor, eu cursus tortor quam eget lectus. Sed commodo, est in bibendum\ncondimentum, magna neque dictum sapien, at lacinia sem ipsum ut eros. In eget erat eu nulla\nhendrerit tincidunt id vulputate nibh. Nunc sed imperdiet urna, eu tempor orci. Phasellus\npellentesque sapien eu risus tincidunt, ut iaculis risus fermentum. Suspendisse condimentum erat\nvitae porta malesuada. Ut a vulputate lorem. Nulla ullamcorper, neque in posuere vulputate, neque\nmagna tempor erat, sit amet luctus nisi nibh quis ligula. Duis porta urna et fermentum interdum. Sed\npellentesque odio euismod nisi auctor rutrum. Suspendisse mi nibh, dignissim eget porttitor quis,\ncommodo a massa. Nunc vel eleifend turpis. Sed iaculis, massa quis egestas pellentesque, nibh ante\nfeugiat ante, a euismod lacus nunc et felis. Nam in aliquet odio. Nulla eget enim aliquam, faucibus\nest at, fringilla tellus. Duis molestie massa ornare, sodales leo eget, lobortis nibh. Nam bibendum\nmi a facilisis mattis. Duis ultrices arcu tellus, vitae interdum tortor dictum et. Sed id luctus\nlectus, eu tempus quam. Duis mi nisl, iaculis vel tortor sit amet, vulputate sodales risus. Cras\nvitae lobortis nisi, eu adipiscing ante. Nam eget scelerisque libero. Nulla pulvinar, velit et\nposuere sagittis, odio risus venenatis sapien, at tristique enim augue quis sem. Integer rutrum\nblandit eros eu faucibus. Etiam eget iaculis felis, in fermentum ante. Nullam a placerat risus, id\naccumsan quam. Donec est orci, elementum eu sapien non, ultricies ullamcorper leo. Praesent\ntincidunt, mauris in viverra hendrerit, dolor nisi cursus orci, vel lacinia neque ante eu magna. Nam\nfacilisis massa at nisi accumsan, non condimentum turpis facilisis. Cras quis ipsum at orci ornare\nvenenatis vitae et ante. Morbi vitae luctus lacus. Nullam eu felis at mi hendrerit commodo a eu\ndiam. Maecenas ultricies, urna sit amet egestas tempor, dolor ligula dictum nibh, vehicula commodo\nipsum diam at nunc. Proin facilisis tincidunt elit, sed vulputate leo lobortis sed. In tincidunt\nrisus lorem, venenatis pellentesque tellus accumsan vitae. Integer ullamcorper mi ut risus\nconsectetur dictum in quis dui. Pellentesque sed diam sed purus egestas mollis id at sapien. Nunc\ncursus mi nec accumsan porta. Nullam pulvinar pharetra felis. Etiam porta massa et diam scelerisque,\nut iaculis nisl luctus. Curabitur vel metus id lacus faucibus tempus. Nullam ornare neque orci, nec\nscelerisque erat mattis nec. Phasellus ultrices ultrices nisi quis venenatis. Sed ultrices iaculis\ndiam a faucibus. Phasellus quis suscipit nulla. Nulla ultricies, turpis et dictum ullamcorper, urna\nmetus porta tellus, quis congue dolor libero quis sem. Nam tempus metus risus, sed rutrum nibh\ncursus malesuada. Vivamus bibendum odio eget mi aliquet, sed tempor eros tincidunt. Suspendisse eu\nultricies ligula, non commodo sem. Ut aliquet elit sed leo laoreet aliquam. Vivamus feugiat a justo\nnon auctor. Sed rhoncus orci ut dictum dignissim. Duis eros libero, tempus non venenatis quis,\nsuscipit eget turpis. Aliquam sed ullamcorper velit, in tincidunt tellus. Ut dapibus erat vel nunc\nfeugiat elementum. Cras congue, erat sit amet lacinia venenatis, nisi magna rhoncus nulla, eu\nblandit eros neque ac eros. Donec vulputate placerat dapibus. Integer dignissim odio eget iaculis\nultrices. 
Vestibulum ligula neque, tincidunt at pretium ac, tincidunt sit amet tellus. Sed fermentum\negestas tortor, non volutpat sapien. Aliquam erat volutpat. Duis semper placerat sapien at placerat.\nPraesent facilisis pharetra dignissim. Morbi laoreet sed tortor eu rhoncus. Vivamus eleifend felis\neu dui ornare ornare sed at urna. Nulla nulla justo, hendrerit id enim vitae, blandit consequat\nnibh. Aliquam mattis diam mattis fringilla tempor. Suspendisse suscipit est sed pulvinar commodo.\nLorem ipsum dolor sit amet, consectetur adipiscing elit. In in scelerisque enim. Phasellus ornare\nnisl consequat volutpat bibendum. Vivamus et nunc viverra, ultrices lorem a, cursus purus. Curabitur\nnibh libero, hendrerit lobortis malesuada sit amet, fringilla et augue. Vestibulum est lacus,\nfringilla sit amet dictum pulvinar, lacinia at leo. Proin iaculis felis vitae metus viverra blandit.\nMauris accumsan sagittis semper. Quisque non diam a quam volutpat faucibus. Pellentesque eros orci,\ncommodo eget fringilla eu, euismod et turpis. Duis molestie et eros ac ullamcorper. Phasellus\nconsequat risus eget elementum semper. Donec at mi a justo laoreet condimentum porttitor in purus.\nNulla sit amet libero consectetur, iaculis neque nec, scelerisque turpis. Aliquam interdum nibh eget\naccumsan dictum. Ut lobortis, mi non eleifend lobortis, lorem mauris pretium urna, at fermentum\ntellus felis eu nunc. Aliquam in nibh tristique, tempus purus a, cursus massa. Suspendisse potenti.\nMaecenas porttitor et erat in sollicitudin. Cum sociis natoque penatibus et magnis dis parturient\nmontes, nascetur ridiculus mus. Vestibulum commodo placerat velit, vel pellentesque neque sagittis\neget. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nullam eu massa placerat,\niaculis eros eget, viverra orci. Aliquam ac lacus porttitor, eleifend elit id, vehicula mauris. Sed\nac interdum libero. Sed laoreet suscipit mi, ac accumsan massa condimentum nec. Suspendisse sodales\nlibero sollicitudin, malesuada quam ac, viverra enim. Sed sapien libero, egestas sit amet orci non,\nvenenatis interdum augue. In hac habitasse platea dictumst. Fusce gravida orci at ligula fringilla\nadipiscing. Nunc quis ipsum quis nibh egestas porta. Proin et faucibus elit. Etiam in neque at nunc\npharetra adipiscing nec vel magna. Donec at nunc scelerisque, tincidunt risus ut, bibendum nisi.\nDonec pulvinar fermentum purus, ac adipiscing urna iaculis at. Nulla ut nunc vitae lorem dapibus\nfringilla. Ut placerat dignissim nulla ornare mattis. Mauris rutrum tellus quis odio dictum, ac\ntempor velit scelerisque. Quisque ligula elit, convallis nec volutpat vitae, pulvinar id mauris.\nVivamus vel accumsan tortor. Donec eu sollicitudin dolor. Pellentesque egestas congue tristique.\nPhasellus ut sollicitudin nisl. Praesent diam neque, malesuada id tincidunt id, malesuada in eros.\nPhasellus adipiscing ipsum vel justo molestie vulputate. Praesent ultricies dapibus lacus pulvinar\ngravida. Donec consequat, orci et mattis ultrices, nibh enim sagittis metus, vitae eleifend enim\ntellus vitae augue. Suspendisse placerat iaculis risus nec iaculis. Ut ullamcorper ultrices dui, sed\nblandit mauris hendrerit vitae. Nulla ac dolor lectus. Etiam pellentesque neque at odio bibendum, at\nvenenatis tellus fermentum. Maecenas a condimentum metus. Phasellus semper scelerisque feugiat.\nFusce varius varius tincidunt. Ut vel auctor magna. Cras dui turpis, euismod in enim a, scelerisque\nadipiscing lectus. Duis mollis pharetra risus, sed ultrices nulla blandit non. 
Integer ac pulvinar\nmagna. Aenean fermentum auctor magna. Ut in viverra sapien. Proin ac bibendum magna, cursus gravida\nelit. Phasellus vehicula facilisis nibh, tempor sagittis mauris accumsan et. Vestibulum sed lacus\nluctus diam ornare venenatis non vel felis. Morbi posuere sit amet nisl quis pulvinar. Suspendisse\nblandit tempus risus quis pretium. Nullam gravida libero vel aliquam suscipit. Nunc vel nunc at leo\npharetra tempor et ut mi. Aliquam erat volutpat. Nulla placerat odio tellus. Nam adipiscing massa\nnec varius posuere. Proin placerat tellus posuere lorem suscipit, sit amet sagittis sem condimentum.\nUt pharetra odio quis tellus mattis facilisis. Quisque eget interdum est. Quisque mattis, felis eu\nsemper feugiat, quam augue interdum mauris, eget sodales nisi neque quis erat. Curabitur semper, mi\nposuere luctus molestie, neque ante sagittis nulla, sit amet vehicula eros eros in justo. Integer\naliquet vehicula arcu, quis iaculis justo. Sed tincidunt sem id est porta volutpat. Mauris varius\nfelis ut est venenatis, ornare porttitor arcu adipiscing. Sed luctus rutrum ante, consectetur\nsollicitudin sapien accumsan vulputate. Vivamus id diam vehicula, fermentum nunc id, viverra justo.\nQuisque porttitor, odio in molestie hendrerit, libero eros vehicula odio, id vestibulum sapien neque\nquis nibh. Donec vel faucibus est. Ut nec sapien vitae nibh congue egestas vel euismod tellus. Lorem\nipsum dolor sit amet, consectetur adipiscing elit. Vestibulum quis lacus lorem. Integer egestas\neuismod ante, vitae condimentum neque eleifend non. Sed posuere bibendum ante, ut facilisis dui\ncondimentum at. In ut varius augue. Vivamus bibendum eu odio vel convallis. Vivamus cursus sodales\niaculis. Nullam convallis facilisis blandit. Phasellus iaculis porttitor elit, eget vestibulum ipsum\nconvallis eu. Quisque volutpat justo ipsum, eleifend cursus urna facilisis a. Sed at diam nec sem\nsemper scelerisque. Aliquam euismod erat quis nisi dictum, at sodales leo fermentum. Nam at nisl\nmetus. Proin luctus porttitor ante in tincidunt. Maecenas laoreet vitae enim eget elementum. Nulla\nid sagittis enim, nec ultrices tortor. Nam rutrum ipsum sit amet erat auctor, eu venenatis libero\nultricies. Ut condimentum neque non diam ullamcorper, ultrices feugiat neque egestas. Pellentesque\nat lobortis est, in blandit mi. Maecenas tincidunt eros id massa pulvinar, quis varius eros\nlobortis. Curabitur vitae sodales orci. Suspendisse potenti. Pellentesque eu fringilla nibh. Etiam\nsed pretium enim, lacinia consequat lectus. Quisque sed mi risus. Praesent posuere dolor sed mauris\ndapibus, id tristique mi mattis. Quisque nec urna rutrum, consectetur mauris ut, egestas libero.\nFusce a justo orci. Etiam vitae aliquet ipsum. Curabitur consequat tempor eros, ut placerat lectus\ntempus et. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis\negestas. Sed ligula mi, laoreet sit amet nunc id, ullamcorper fermentum magna. Maecenas enim dui,\nviverra at nulla ut, lacinia pretium nunc. Donec at ultricies nulla, nec cursus odio. Donec\nullamcorper nec turpis imperdiet hendrerit. Sed euismod aliquam vehicula. Nunc sed enim eleifend\nturpis venenatis sagittis. Sed laoreet velit erat. Proin nisl erat, vulputate et fermentum iaculis,\nmollis suscipit magna. Sed porta, augue ut accumsan fermentum, arcu tortor rutrum tellus, sit amet\nsollicitudin lectus turpis non felis. Vestibulum ante leo, interdum sed venenatis non, porttitor ut\nnibh. Sed sit amet luctus erat. 
Duis id rhoncus justo, non rutrum lorem. Mauris ut laoreet elit.\nPraesent sed diam porta, rhoncus massa a, tincidunt lorem. Mauris bibendum nunc nec est ullamcorper\nbibendum. Nullam venenatis libero sed ligula scelerisque euismod quis at dui. Donec ac velit luctus,\nmolestie mi at, tempor leo. Pellentesque a ultricies risus. Maecenas malesuada faucibus nulla quis\nconsectetur. Phasellus pretium interdum risus sit amet aliquet. Nullam eleifend sem id magna\nlaoreet, ut lobortis mi tincidunt. Maecenas in justo tempor, viverra ipsum eu, tincidunt nulla. Sed\nsed molestie turpis. Pellentesque imperdiet, eros non vulputate fringilla, turpis odio luctus\nlectus, eu lacinia purus nisl vitae justo. Etiam non dapibus dolor. Fusce non urna scelerisque,\ninterdum massa vitae, venenatis metus. Vestibulum scelerisque dolor ac lectus sollicitudin, eget\nfringilla sapien fringilla. Suspendisse non quam massa. Donec a sollicitudin eros, ut mollis turpis.\nNullam gravida congue semper. Phasellus vitae tellus vitae nulla cursus tempor et non elit.\nVestibulum pharetra in ligula a venenatis. Maecenas at erat sed nulla vulputate pulvinar et eu\nlibero. Donec pulvinar arcu nisi, sed posuere turpis cursus a. In nec turpis interdum, condimentum\nvelit in, consectetur lacus. Duis porta, felis a rhoncus ornare, ligula est elementum nunc, eu\nadipiscing massa lorem in nibh. In consequat gravida eros. Phasellus condimentum malesuada sapien\nultrices tempor. Suspendisse sit amet diam in est pulvinar iaculis nec vitae nibh. Vivamus rhoncus\nenim lorem, elementum posuere est pretium ut. Duis lectus lorem, ultricies ac dignissim in, egestas\net ipsum. Proin nec est ac dui sagittis dictum. Cras dictum augue ipsum, sit amet gravida ligula\nscelerisque nec. Ut congue blandit porta. Nunc porta vitae risus at sagittis. Donec viverra, ante id\nporta consectetur, felis turpis fringilla dui, ut vulputate nulla eros sit amet augue. Donec\naliquet, felis ut tempor pretium, enim leo suscipit risus, eget mollis justo ipsum ut augue. Nullam\nat lacus eu orci dapibus laoreet nec convallis leo. Fusce rhoncus sed neque sit amet viverra. Donec\narcu nisl, hendrerit non pulvinar eu, blandit ac neque. Curabitur porta velit metus, non ullamcorper\nnibh volutpat non. Proin tristique orci nec pretium lobortis. Curabitur quam neque, lacinia vitae\nmassa id, molestie pellentesque risus. Praesent vitae lectus bibendum, tincidunt augue vel, volutpat\nmagna. Curabitur quis feugiat magna. In libero risus, commodo eu mauris vitae, euismod ullamcorper\nlibero. Cras elementum rutrum lacus eu euismod. Morbi purus metus, rutrum nec varius sed, dignissim\neget nisi. Vivamus mauris nibh, hendrerit eu massa sed, ultrices suscipit est. Cras id odio dui.\nNulla condimentum luctus ipsum, eu molestie turpis commodo sed. Aliquam erat volutpat. Ut sodales\nurna sit amet est dapibus pharetra. In nec vestibulum mi. Nullam mattis fringilla venenatis. Sed\nrisus sem, tempor vitae suscipit a, viverra in quam. In malesuada odio nec laoreet accumsan. Donec\njusto diam, lacinia eu ante eget, pulvinar molestie mauris. Interdum et malesuada fames ac ante\nipsum primis in faucibus. Sed vulputate ornare dolor a tempor. Maecenas egestas, augue et semper\negestas, elit ipsum varius sem, a dapibus eros velit in sapien. Nulla sit amet eros ullamcorper,\nhendrerit nunc eu, aliquet ipsum. Sed sit amet lacus enim. Curabitur faucibus rutrum dui, a tempor\nvelit vestibulum sed. Curabitur sed nunc id lorem semper malesuada. Maecenas semper eros eu\npellentesque vulputate. 
Nulla accumsan dolor placerat eros euismod facilisis. Nam vitae velit\ntortor. Fusce tincidunt felis luctus, scelerisque dui in, rutrum nulla. Proin a pharetra tellus.\nAenean varius dolor nec risus eleifend fringilla. Proin at tellus ligula. Cras imperdiet mollis nisi\neget auctor. Etiam libero nunc, dictum at fermentum vitae, vehicula tincidunt justo. Proin tempor\nrisus elit, vestibulum auctor erat tristique vel. Etiam varius dui ante, a fringilla erat\nullamcorper vel. Quisque cursus quam imperdiet ornare dictum. Suspendisse turpis nunc, scelerisque a\ncongue eget, faucibus ut mauris. Suspendisse venenatis nisi nec dolor pharetra, id euismod sem\naccumsan. Quisque et accumsan justo, elementum vulputate nulla. Etiam et sapien scelerisque,\nmalesuada lacus non, pretium enim. Curabitur ultrices, ipsum hendrerit pulvinar volutpat, dui tortor\nmattis tortor, sed tincidunt magna lectus non eros. Ut hendrerit velit non metus pellentesque\nmattis. Nullam velit nisi, ornare sit amet ipsum id, commodo tincidunt nisi. Aliquam egestas, ante\nnon placerat convallis, mi mauris posuere ligula, nec auctor lectus mi quis quam. In auctor\nfacilisis ante id elementum. Donec interdum ipsum vitae lorem sollicitudin rutrum. Etiam congue\npharetra lorem ac dictum. Donec feugiat interdum vulputate. Curabitur mollis suscipit nisi, vel\ntincidunt risus fringilla at. Phasellus tincidunt, nulla a tincidunt tempor, libero turpis imperdiet\ntortor, vel convallis orci neque vitae nisi. Nunc euismod massa quis mollis ultricies. Proin non\nante elit. Pellentesque et convallis massa. Curabitur blandit mattis metus, non aliquam erat iaculis\nut. Nam vestibulum ipsum vitae nulla varius, sit amet sodales ipsum congue. Nullam eget mauris ut\nest blandit rhoncus sit amet ac arcu. Nulla at purus consequat, lobortis massa sit amet, posuere\nante. Nam bibendum laoreet tempus. Fusce ac nulla consequat, placerat sem vitae, condimentum enim.\nVestibulum sed tellus nec elit varius venenatis. Donec et dapibus dui. Nullam est metus, ultrices\nnec lectus vel, fermentum elementum lacus. Curabitur imperdiet vestibulum enim. Aenean sollicitudin\nat leo quis ullamcorper. Suspendisse in posuere risus. In quis mattis sem, eu facilisis arcu.\nVestibulum faucibus auctor accumsan. Morbi mattis sit amet augue ac sodales. Integer varius eget\norci iaculis aliquet. Suspendisse a auctor turpis. Fusce vestibulum vestibulum ante sed mattis.\nMauris ornare rhoncus enim ac egestas. Donec turpis eros, interdum non placerat nec, adipiscing eu\nurna. Integer feugiat mi quis eros fringilla vehicula. Proin suscipit magna ultricies laoreet\ndignissim. Donec vehicula ac lacus non vehicula. Sed euismod mattis facilisis. Etiam nec risus vitae\nrisus iaculis lobortis. Duis eu dui sit amet turpis tincidunt vulputate. Nunc tortor diam, egestas\nin ante ac, scelerisque placerat ante. Nullam interdum ultricies nisl a vehicula. Integer id nunc\nelit. Sed rutrum sit amet neque quis tristique."
  },
  {
    "path": "spec/inputs/numbers/1.txt",
    "content": "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n"
  },
  {
    "path": "spec/inputs/numbers/10.txt",
    "content": "451\n452\n453\n454\n455\n456\n457\n458\n459\n460\n461\n462\n463\n464\n465\n466\n467\n468\n469\n470\n471\n472\n473\n474\n475\n476\n477\n478\n479\n480\n481\n482\n483\n484\n485\n486\n487\n488\n489\n490\n491\n492\n493\n494\n495\n496\n497\n498\n499\n500\n"
  },
  {
    "path": "spec/inputs/numbers/11.txt",
    "content": "501\n502\n503\n504\n505\n506\n507\n508\n509\n510\n511\n512\n513\n514\n515\n516\n517\n518\n519\n520\n521\n522\n523\n524\n525\n526\n527\n528\n529\n530\n531\n532\n533\n534\n535\n536\n537\n538\n539\n540\n541\n542\n543\n544\n545\n546\n547\n548\n549\n550\n"
  },
  {
    "path": "spec/inputs/numbers/12.txt",
    "content": "551\n552\n553\n554\n555\n556\n557\n558\n559\n560\n561\n562\n563\n564\n565\n566\n567\n568\n569\n570\n571\n572\n573\n574\n575\n576\n577\n578\n579\n580\n581\n582\n583\n584\n585\n586\n587\n588\n589\n590\n591\n592\n593\n594\n595\n596\n597\n598\n599\n600\n"
  },
  {
    "path": "spec/inputs/numbers/13.txt",
    "content": "601\n602\n603\n604\n605\n606\n607\n608\n609\n610\n611\n612\n613\n614\n615\n616\n617\n618\n619\n620\n621\n622\n623\n624\n625\n626\n627\n628\n629\n630\n631\n632\n633\n634\n635\n636\n637\n638\n639\n640\n641\n642\n643\n644\n645\n646\n647\n648\n649\n650\n"
  },
  {
    "path": "spec/inputs/numbers/14.txt",
    "content": "651\n652\n653\n654\n655\n656\n657\n658\n659\n660\n661\n662\n663\n664\n665\n666\n667\n668\n669\n670\n671\n672\n673\n674\n675\n676\n677\n678\n679\n680\n681\n682\n683\n684\n685\n686\n687\n688\n689\n690\n691\n692\n693\n694\n695\n696\n697\n698\n699\n700\n"
  },
  {
    "path": "spec/inputs/numbers/15.txt",
    "content": "701\n702\n703\n704\n705\n706\n707\n708\n709\n710\n711\n712\n713\n714\n715\n716\n717\n718\n719\n720\n721\n722\n723\n724\n725\n726\n727\n728\n729\n730\n731\n732\n733\n734\n735\n736\n737\n738\n739\n740\n741\n742\n743\n744\n745\n746\n747\n748\n749\n750\n"
  },
  {
    "path": "spec/inputs/numbers/16.txt",
    "content": "751\n752\n753\n754\n755\n756\n757\n758\n759\n760\n761\n762\n763\n764\n765\n766\n767\n768\n769\n770\n771\n772\n773\n774\n775\n776\n777\n778\n779\n780\n781\n782\n783\n784\n785\n786\n787\n788\n789\n790\n791\n792\n793\n794\n795\n796\n797\n798\n799\n800\n"
  },
  {
    "path": "spec/inputs/numbers/17.txt",
    "content": "801\n802\n803\n804\n805\n806\n807\n808\n809\n810\n811\n812\n813\n814\n815\n816\n817\n818\n819\n820\n821\n822\n823\n824\n825\n826\n827\n828\n829\n830\n831\n832\n833\n834\n835\n836\n837\n838\n839\n840\n841\n842\n843\n844\n845\n846\n847\n848\n849\n850\n"
  },
  {
    "path": "spec/inputs/numbers/18.txt",
    "content": "851\n852\n853\n854\n855\n856\n857\n858\n859\n860\n861\n862\n863\n864\n865\n866\n867\n868\n869\n870\n871\n872\n873\n874\n875\n876\n877\n878\n879\n880\n881\n882\n883\n884\n885\n886\n887\n888\n889\n890\n891\n892\n893\n894\n895\n896\n897\n898\n899\n900\n"
  },
  {
    "path": "spec/inputs/numbers/19.txt",
    "content": "901\n902\n903\n904\n905\n906\n907\n908\n909\n910\n911\n912\n913\n914\n915\n916\n917\n918\n919\n920\n921\n922\n923\n924\n925\n926\n927\n928\n929\n930\n931\n932\n933\n934\n935\n936\n937\n938\n939\n940\n941\n942\n943\n944\n945\n946\n947\n948\n949\n950\n"
  },
  {
    "path": "spec/inputs/numbers/2.txt",
    "content": "51\n52\n53\n54\n55\n56\n57\n58\n59\n60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n90\n91\n92\n93\n94\n95\n96\n97\n98\n99\n100\n"
  },
  {
    "path": "spec/inputs/numbers/20.txt",
    "content": "951\n952\n953\n954\n955\n956\n957\n958\n959\n960\n961\n962\n963\n964\n965\n966\n967\n968\n969\n970\n971\n972\n973\n974\n975\n976\n977\n978\n979\n980\n981\n982\n983\n984\n985\n986\n987\n988\n989\n990\n991\n992\n993\n994\n995\n996\n997\n998\n999\n1000\n"
  },
  {
    "path": "spec/inputs/numbers/3.txt",
    "content": "101\n102\n103\n104\n105\n106\n107\n108\n109\n110\n111\n112\n113\n114\n115\n116\n117\n118\n119\n120\n121\n122\n123\n124\n125\n126\n127\n128\n129\n130\n131\n132\n133\n134\n135\n136\n137\n138\n139\n140\n141\n142\n143\n144\n145\n146\n147\n148\n149\n150\n"
  },
  {
    "path": "spec/inputs/numbers/4.txt",
    "content": "151\n152\n153\n154\n155\n156\n157\n158\n159\n160\n161\n162\n163\n164\n165\n166\n167\n168\n169\n170\n171\n172\n173\n174\n175\n176\n177\n178\n179\n180\n181\n182\n183\n184\n185\n186\n187\n188\n189\n190\n191\n192\n193\n194\n195\n196\n197\n198\n199\n200\n"
  },
  {
    "path": "spec/inputs/numbers/5.txt",
    "content": "201\n202\n203\n204\n205\n206\n207\n208\n209\n210\n211\n212\n213\n214\n215\n216\n217\n218\n219\n220\n221\n222\n223\n224\n225\n226\n227\n228\n229\n230\n231\n232\n233\n234\n235\n236\n237\n238\n239\n240\n241\n242\n243\n244\n245\n246\n247\n248\n249\n250\n"
  },
  {
    "path": "spec/inputs/numbers/6.txt",
    "content": "251\n252\n253\n254\n255\n256\n257\n258\n259\n260\n261\n262\n263\n264\n265\n266\n267\n268\n269\n270\n271\n272\n273\n274\n275\n276\n277\n278\n279\n280\n281\n282\n283\n284\n285\n286\n287\n288\n289\n290\n291\n292\n293\n294\n295\n296\n297\n298\n299\n300\n"
  },
  {
    "path": "spec/inputs/numbers/7.txt",
    "content": "301\n302\n303\n304\n305\n306\n307\n308\n309\n310\n311\n312\n313\n314\n315\n316\n317\n318\n319\n320\n321\n322\n323\n324\n325\n326\n327\n328\n329\n330\n331\n332\n333\n334\n335\n336\n337\n338\n339\n340\n341\n342\n343\n344\n345\n346\n347\n348\n349\n350\n"
  },
  {
    "path": "spec/inputs/numbers/8.txt",
    "content": "351\n352\n353\n354\n355\n356\n357\n358\n359\n360\n361\n362\n363\n364\n365\n366\n367\n368\n369\n370\n371\n372\n373\n374\n375\n376\n377\n378\n379\n380\n381\n382\n383\n384\n385\n386\n387\n388\n389\n390\n391\n392\n393\n394\n395\n396\n397\n398\n399\n400\n"
  },
  {
    "path": "spec/inputs/numbers/9.txt",
    "content": "401\n402\n403\n404\n405\n406\n407\n408\n409\n410\n411\n412\n413\n414\n415\n416\n417\n418\n419\n420\n421\n422\n423\n424\n425\n426\n427\n428\n429\n430\n431\n432\n433\n434\n435\n436\n437\n438\n439\n440\n441\n442\n443\n444\n445\n446\n447\n448\n449\n450\n"
  },
  {
    "path": "spec/inputs/numbers_0_100.txt",
    "content": "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56\n57\n58\n59\n60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n90\n91\n92\n93\n94\n95\n96\n97\n98\n99\n100"
  },
  {
    "path": "spec/inputs/numbers_1_100.txt",
    "content": "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56\n57\n58\n59\n60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n90\n91\n92\n93\n94\n95\n96\n97\n98\n99\n100"
  },
  {
    "path": "spec/inputs/people.json",
    "content": "{\"id\":1,\"name\":\"Matthew Fuller\",\"age\":49,\"email\":\"mfuller0@blogger.com\",\"active\":false}\n{\"id\":2,\"name\":\"Pamela Thomas\",\"age\":58,\"email\":\"pthomas1@apache.org\",\"address\":\"92 Beilfuss Lane\",\"active\":false,\"ip_address\":\"41.52.54.168\"}\n{\"id\":3,\"name\":\"Joan Stevens\",\"age\":33,\"email\":\"jstevens2@xrea.com\",\"address\":\"1 Wayridge Circle\",\"active\":true,\"ip_address\":\"159.204.170.10\"}\n{\"id\":4,\"name\":\"Laura Reynolds\",\"email\":\"lreynolds3@admin.ch\",\"address\":\"431 Spenser Court\",\"active\":true,\"ip_address\":\"164.254.150.90\"}\n{\"id\":5,\"name\":\"Daniel Baker\",\"email\":\"dbaker4@blinklist.com\",\"active\":true,\"ip_address\":\"165.138.63.70\"}\n{\"id\":6,\"name\":\"Christina Lane\",\"email\":\"clane5@cnbc.com\",\"address\":\"7 Chinook Park\",\"active\":true,\"ip_address\":\"46.240.67.103\"}\n{\"id\":7,\"name\":\"Carlos Washington\",\"age\":50,\"email\":\"cwashington6@issuu.com\",\"address\":\"6487 Memorial Trail\",\"active\":false,\"ip_address\":\"152.45.154.18\"}\n{\"id\":8,\"name\":\"Harold Reid\",\"age\":53,\"email\":\"hreid7@seesaa.net\",\"active\":true}\n{\"id\":9,\"name\":\"Earl Harris\",\"age\":37,\"email\":\"eharris8@homestead.com\",\"active\":false}\n{\"id\":10,\"name\":\"Jack Hernandez\",\"age\":30,\"email\":\"jhernandez9@adobe.com\",\"address\":\"29407 Memorial Alley\",\"active\":false,\"ip_address\":\"129.222.144.1\"}\n{\"id\":11,\"name\":\"Nicole Torres\",\"age\":25,\"email\":\"ntorresa@amazon.de\",\"address\":\"34804 Havey Point\",\"active\":false,\"ip_address\":\"5.114.113.83\"}\n{\"id\":12,\"name\":\"Theresa Gordon\",\"age\":19,\"email\":\"tgordonb@xinhuanet.com\",\"active\":false}\n{\"id\":13,\"name\":\"Emily Schmidt\",\"age\":25,\"email\":\"eschmidtc@arstechnica.com\",\"address\":\"115 Bluestem Pass\",\"active\":true}\n{\"id\":14,\"name\":\"Dennis Ford\",\"age\":50,\"email\":\"dfordd@hc360.com\",\"address\":\"4107 Kim Avenue\",\"active\":true,\"ip_address\":\"44.170.237.89\"}\n{\"id\":15,\"name\":\"Deborah Williams\",\"age\":28,\"email\":\"dwilliamse@cmu.edu\",\"address\":\"7 Kipling Pass\",\"active\":false}\n{\"id\":16,\"name\":\"Rachel Sullivan\",\"age\":31,\"email\":\"rsullivanf@pagesperso-orange.fr\",\"address\":\"8196 Harbort Park\",\"active\":true,\"ip_address\":\"216.142.141.210\"}\n{\"id\":17,\"name\":\"Phillip Jordan\",\"email\":\"pjordang@liveinternet.ru\",\"active\":false}\n{\"id\":18,\"name\":\"Fred Mitchell\",\"email\":\"fmitchellh@shinystat.com\",\"address\":\"279 Gateway Parkway\",\"active\":false}\n{\"id\":19,\"name\":\"Antonio Dunn\",\"age\":23,\"email\":\"adunni@mediafire.com\",\"address\":\"71 Maple Place\",\"active\":true,\"ip_address\":\"39.50.250.70\"}\n{\"id\":20,\"name\":\"Alan Boyd\",\"age\":59,\"email\":\"aboydj@sbwire.com\",\"address\":\"4302 Warner Road\",\"active\":false,\"ip_address\":\"106.253.236.0\"}\n{\"id\":21,\"name\":\"Louise Wright\",\"age\":19,\"email\":\"lwrightk@so-net.ne.jp\",\"address\":\"5 Maryland Hill\",\"active\":false,\"ip_address\":\"51.0.99.116\"}\n{\"id\":22,\"name\":\"Diane Greene\",\"age\":39,\"email\":\"dgreenel@jugem.jp\",\"address\":\"38 Merrick Lane\",\"active\":false,\"ip_address\":\"146.124.156.180\"}\n{\"id\":23,\"name\":\"Emily Richardson\",\"age\":23,\"email\":\"erichardsonm@csmonitor.com\",\"active\":true}\n{\"id\":24,\"name\":\"Joseph Henderson\",\"age\":36,\"email\":\"jhendersonn@drupal.org\",\"address\":\"55 Morningstar Lane\",\"active\":true,\"ip_address\":\"54.187.254.99\"}\n{\"id\":25,\"name\":\"Chris 
Fowler\",\"age\":31,\"email\":\"cfowlero@msu.edu\",\"address\":\"4 Oakridge Center\",\"active\":false}\n{\"id\":26,\"name\":\"Helen West\",\"age\":38,\"email\":\"hwestp@time.com\",\"address\":\"93 Blaine Parkway\",\"active\":true,\"ip_address\":\"159.131.255.177\"}\n{\"id\":27,\"name\":\"Jimmy Black\",\"age\":46,\"email\":\"jblackq@house.gov\",\"address\":\"80157 Bay Drive\",\"active\":true,\"ip_address\":\"163.137.84.52\"}\n{\"id\":28,\"name\":\"Melissa Allen\",\"age\":56,\"email\":\"mallenr@upenn.edu\",\"address\":\"381 Merrick Way\",\"active\":false}\n{\"id\":29,\"name\":\"Scott Walker\",\"age\":48,\"email\":\"swalkers@etsy.com\",\"active\":true}\n{\"id\":30,\"name\":\"Jimmy Wood\",\"email\":\"jwoodt@bloomberg.com\",\"address\":\"1041 Claremont Lane\",\"active\":true}\n{\"id\":31,\"name\":\"Betty Jacobs\",\"email\":\"bjacobsu@ihg.com\",\"address\":\"6520 Anderson Junction\",\"active\":false,\"ip_address\":\"166.45.58.141\"}\n{\"id\":32,\"name\":\"Richard Stone\",\"age\":34,\"email\":\"rstonev@rakuten.co.jp\",\"address\":\"51 Bay Pass\",\"active\":true,\"ip_address\":\"9.35.132.204\"}\n{\"id\":33,\"name\":\"Melissa Henderson\",\"age\":21,\"email\":\"mhendersonw@washington.edu\",\"address\":\"06 Delaware Avenue\",\"active\":false}\n{\"id\":34,\"name\":\"David Stanley\",\"age\":57,\"email\":\"dstanleyx@ucoz.com\",\"address\":\"692 Lien Avenue\",\"active\":true,\"ip_address\":\"194.251.38.0\"}\n{\"id\":35,\"name\":\"Cynthia Murphy\",\"age\":20,\"email\":\"cmurphyy@xinhuanet.com\",\"active\":false}\n{\"id\":36,\"name\":\"Todd Henry\",\"age\":38,\"address\":\"589 Katie Center\",\"active\":true,\"ip_address\":\"177.233.117.222\"}\n{\"id\":37,\"name\":\"Christina Stephens\",\"age\":40,\"email\":\"cstephens10@illinois.edu\",\"address\":\"51039 Hermina Point\",\"active\":true}\n{\"id\":38,\"name\":\"Sharon Gomez\",\"email\":\"sgomez11@parallels.com\",\"address\":\"57089 Texas Way\",\"active\":true,\"ip_address\":\"149.85.104.141\"}\n{\"id\":39,\"name\":\"Benjamin Fisher\",\"age\":30,\"email\":\"bfisher12@gmpg.org\",\"address\":\"3 Welch Plaza\",\"active\":false,\"ip_address\":\"116.184.105.191\"}\n{\"id\":40,\"name\":\"Mark Stewart\",\"age\":38,\"email\":\"mstewart13@uiuc.edu\",\"active\":false,\"ip_address\":\"167.115.237.197\"}\n{\"id\":41,\"name\":\"Mark Black\",\"age\":45,\"email\":\"mblack14@tuttocitta.it\",\"address\":\"9 Rutledge Pass\",\"active\":false,\"ip_address\":\"108.90.166.239\"}\n{\"id\":42,\"name\":\"Christina Lawrence\",\"age\":47,\"email\":\"clawrence15@simplemachines.org\",\"address\":\"239 Eggendart Junction\",\"active\":true,\"ip_address\":\"8.118.127.22\"}\n{\"id\":43,\"name\":\"Howard Lynch\",\"age\":52,\"email\":\"hlynch16@slideshare.net\",\"active\":true}\n{\"id\":44,\"name\":\"Heather Perez\",\"age\":60,\"email\":\"hperez17@techcrunch.com\",\"address\":\"1 Almo Court\",\"active\":false,\"ip_address\":\"110.184.153.36\"}\n{\"id\":45,\"name\":\"Michael Howell\",\"age\":57,\"email\":\"mhowell18@wufoo.com\",\"address\":\"341 Shelley Alley\",\"active\":false}\n{\"id\":46,\"name\":\"Gregory Johnson\",\"age\":57,\"email\":\"gjohnson19@japanpost.jp\",\"address\":\"4 Basil Plaza\",\"active\":true,\"ip_address\":\"249.29.102.40\"}\n{\"id\":47,\"name\":\"Christopher Miller\",\"age\":50,\"email\":\"cmiller1a@google.es\",\"address\":\"76 Granby Way\",\"active\":true}\n{\"id\":48,\"name\":\"Beverly Hall\",\"age\":60,\"email\":\"bhall1b@cam.ac.uk\",\"address\":\"9 Novick Place\",\"active\":true}\n{\"id\":49,\"name\":\"Todd 
Adams\",\"age\":58,\"email\":\"tadams1c@yahoo.co.jp\",\"active\":false}\n{\"id\":50,\"name\":\"Judith Watkins\",\"age\":30,\"email\":\"jwatkins1d@comcast.net\",\"address\":\"5874 Esker Parkway\",\"active\":true,\"ip_address\":\"229.176.89.163\"}\n{\"id\":51,\"name\":\"Cheryl Howard\",\"age\":34,\"email\":\"choward1e@cam.ac.uk\",\"address\":\"492 Mandrake Lane\",\"active\":false,\"ip_address\":\"255.117.98.35\"}\n{\"id\":52,\"name\":\"Mary West\",\"email\":\"mwest1f@cnn.com\",\"address\":\"4 Vera Avenue\",\"active\":false,\"ip_address\":\"118.130.207.177\"}\n{\"id\":53,\"name\":\"Carol Welch\",\"age\":39,\"email\":\"cwelch1g@sun.com\",\"address\":\"794 Burrows Pass\",\"active\":true,\"ip_address\":\"205.98.9.218\"}\n{\"id\":54,\"name\":\"Donald Reed\",\"age\":23,\"email\":\"dreed1h@wsj.com\",\"address\":\"0769 Dryden Trail\",\"active\":true,\"ip_address\":\"35.72.239.99\"}\n{\"id\":55,\"name\":\"Michael Wells\",\"age\":29,\"email\":\"mwells1i@deviantart.com\",\"address\":\"9033 Crescent Oaks Way\",\"active\":false,\"ip_address\":\"33.18.26.152\"}\n{\"id\":56,\"name\":\"Joyce Montgomery\",\"age\":34,\"email\":\"jmontgomery1j@sciencedaily.com\",\"address\":\"29093 Lyons Circle\",\"active\":true,\"ip_address\":\"85.155.89.174\"}\n{\"id\":57,\"name\":\"Angela Garza\",\"age\":24,\"email\":\"agarza1k@hc360.com\",\"address\":\"388 Kenwood Street\",\"active\":false,\"ip_address\":\"204.191.24.172\"}\n{\"id\":58,\"name\":\"Rose Green\",\"age\":26,\"email\":\"rgreen1l@businessinsider.com\",\"address\":\"3 Mesta Pass\",\"active\":true}\n{\"id\":59,\"name\":\"Wanda Williamson\",\"age\":39,\"email\":\"wwilliamson1m@cafepress.com\",\"address\":\"18596 Westridge Crossing\",\"active\":true,\"ip_address\":\"215.98.196.209\"}\n{\"id\":60,\"name\":\"Irene Washington\",\"age\":49,\"email\":\"iwashington1n@ameblo.jp\",\"address\":\"83 Monica Crossing\",\"active\":false,\"ip_address\":\"141.46.156.186\"}\n{\"id\":61,\"name\":\"Anna Freeman\",\"age\":50,\"email\":\"afreeman1o@blogs.com\",\"address\":\"3 Gulseth Way\",\"active\":true}\n{\"id\":62,\"name\":\"Kathleen Romero\",\"age\":23,\"email\":\"kromero1p@craigslist.org\",\"address\":\"419 Leroy Court\",\"active\":true}\n{\"id\":63,\"name\":\"Matthew Alexander\",\"age\":58,\"email\":\"malexander1q@gnu.org\",\"active\":false}\n{\"id\":64,\"name\":\"Louis Moore\",\"age\":50,\"email\":\"lmoore1r@salon.com\",\"address\":\"671 Buhler Hill\",\"active\":true,\"ip_address\":\"21.247.160.104\"}\n{\"id\":65,\"name\":\"Christina Brooks\",\"age\":27,\"email\":\"cbrooks1s@google.cn\",\"address\":\"80405 Jana Circle\",\"active\":true,\"ip_address\":\"121.100.200.46\"}\n{\"id\":66,\"name\":\"Sarah Moreno\",\"age\":30,\"address\":\"03 Cottonwood Way\",\"active\":true,\"ip_address\":\"111.174.142.117\"}\n{\"id\":67,\"name\":\"Harold Rodriguez\",\"age\":24,\"email\":\"hrodriguez1u@squidoo.com\",\"address\":\"76 Green Circle\",\"active\":true}\n{\"id\":68,\"name\":\"Louise Black\",\"age\":18,\"email\":\"lblack1v@yale.edu\",\"address\":\"951 Blackbird Junction\",\"active\":false,\"ip_address\":\"212.47.220.126\"}\n{\"id\":69,\"name\":\"Adam Montgomery\",\"email\":\"amontgomery1w@mlb.com\",\"address\":\"1 Mesta Terrace\",\"active\":false}\n{\"id\":70,\"name\":\"Jacqueline Pierce\",\"age\":58,\"email\":\"jpierce1x@google.com.au\",\"address\":\"0161 Village Plaza\",\"active\":false,\"ip_address\":\"116.164.88.112\"}\n{\"id\":71,\"name\":\"Ann Stone\",\"age\":45,\"email\":\"astone1y@yelp.com\",\"address\":\"1011 Heath Terrace\",\"active\":false}\n{\"id\":72,\"name\":\"Teresa 
Arnold\",\"age\":33,\"email\":\"tarnold1z@mayoclinic.com\",\"active\":false,\"ip_address\":\"81.165.73.142\"}\n{\"id\":73,\"name\":\"Arthur Shaw\",\"age\":27,\"email\":\"ashaw20@latimes.com\",\"address\":\"9956 Hooker Road\",\"active\":true}\n{\"id\":74,\"name\":\"Wayne Garrett\",\"age\":41,\"email\":\"wgarrett21@adobe.com\",\"address\":\"34 Grasskamp Street\",\"active\":true,\"ip_address\":\"29.26.28.17\"}\n{\"id\":75,\"name\":\"Russell Castillo\",\"age\":46,\"email\":\"rcastillo22@printfriendly.com\",\"address\":\"444 South Avenue\",\"active\":false}\n{\"id\":76,\"name\":\"Shirley Burke\",\"age\":47,\"email\":\"sburke23@lulu.com\",\"address\":\"70 Florence Drive\",\"active\":false}\n{\"id\":77,\"name\":\"Tammy Washington\",\"age\":46,\"email\":\"twashington24@youtube.com\",\"address\":\"559 Hollow Ridge Road\",\"active\":true,\"ip_address\":\"230.169.245.123\"}\n{\"id\":78,\"name\":\"Diane Freeman\",\"age\":49,\"email\":\"dfreeman25@github.com\",\"address\":\"04 Transport Center\",\"active\":false,\"ip_address\":\"138.200.234.169\"}\n{\"id\":79,\"name\":\"Anne Morrison\",\"email\":\"amorrison26@telegraph.co.uk\",\"address\":\"525 Shasta Junction\",\"active\":true}\n{\"id\":80,\"name\":\"Paul Johnston\",\"age\":51,\"email\":\"pjohnston27@youku.com\",\"address\":\"16254 Ryan Center\",\"active\":false,\"ip_address\":\"214.38.125.121\"}\n{\"id\":81,\"name\":\"Virginia Welch\",\"age\":58,\"email\":\"vwelch28@china.com.cn\",\"address\":\"2 Michigan Hill\",\"active\":true}\n{\"id\":82,\"name\":\"Louis Hughes\",\"age\":44,\"email\":\"lhughes29@mysql.com\",\"address\":\"423 Meadow Valley Pass\",\"active\":false,\"ip_address\":\"213.45.167.91\"}\n{\"id\":83,\"name\":\"Betty Reynolds\",\"age\":57,\"email\":\"breynolds2a@furl.net\",\"address\":\"4486 Kedzie Road\",\"active\":true}\n{\"id\":84,\"name\":\"Norma Olson\",\"age\":18,\"email\":\"nolson2b@goo.gl\",\"active\":true}\n{\"id\":85,\"name\":\"David Ward\",\"age\":28,\"email\":\"dward2c@ibm.com\",\"address\":\"3 Kings Place\",\"active\":true}\n{\"id\":86,\"name\":\"Phyllis Williamson\",\"age\":26,\"email\":\"pwilliamson2d@nationalgeographic.com\",\"address\":\"7 Northview Street\",\"active\":false,\"ip_address\":\"234.86.8.89\"}\n{\"id\":87,\"name\":\"Kathleen Holmes\",\"age\":46,\"email\":\"kholmes2e@zdnet.com\",\"address\":\"4814 Colorado Place\",\"active\":false}\n{\"id\":88,\"name\":\"George King\",\"age\":23,\"email\":\"gking2f@ask.com\",\"address\":\"966 Morrow Junction\",\"active\":false,\"ip_address\":\"89.94.24.41\"}\n{\"id\":89,\"name\":\"Raymond Garcia\",\"age\":47,\"email\":\"rgarcia2g@quantcast.com\",\"active\":true,\"ip_address\":\"135.10.187.167\"}\n{\"id\":90,\"name\":\"Rose Meyer\",\"age\":38,\"active\":true,\"ip_address\":\"228.216.201.80\"}\n{\"id\":91,\"name\":\"Jennifer Gray\",\"age\":50,\"email\":\"jgray2i@princeton.edu\",\"address\":\"58241 Calypso Court\",\"active\":true,\"ip_address\":\"158.144.236.158\"}\n{\"id\":92,\"name\":\"Bonnie Franklin\",\"age\":24,\"email\":\"bfranklin2j@slideshare.net\",\"address\":\"629 Prairieview Center\",\"active\":false}\n{\"id\":93,\"name\":\"Sarah Martin\",\"age\":52,\"email\":\"smartin2k@cnn.com\",\"address\":\"997 Kensington Lane\",\"active\":false}\n{\"id\":94,\"name\":\"Shirley Hamilton\",\"age\":39,\"email\":\"shamilton2l@nih.gov\",\"address\":\"934 Clarendon Lane\",\"active\":false}\n{\"id\":95,\"name\":\"Gregory Kim\",\"age\":37,\"email\":\"gkim2m@tinyurl.com\",\"active\":true,\"ip_address\":\"216.24.238.78\"}\n{\"id\":96,\"name\":\"Betty 
Sanchez\",\"age\":46,\"email\":\"bsanchez2n@washington.edu\",\"active\":true}\n{\"id\":97,\"name\":\"Ann Cooper\",\"age\":41,\"email\":\"acooper2o@issuu.com\",\"active\":false}\n{\"id\":98,\"name\":\"Christopher Cole\",\"active\":true}\n{\"id\":99,\"name\":\"Debra Lopez\",\"age\":36,\"address\":\"4 Grim Drive\",\"active\":false,\"ip_address\":\"1.217.64.60\"}\n{\"id\":100,\"name\":\"Shawn Moore\",\"age\":35,\"email\":\"smoore2r@mayoclinic.com\",\"active\":true}\n"
  },
  {
    "path": "spec/lib/collect_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe Spark::RDD do\n\n  let(:mapping) { lambda{|x| [x, 1]} }\n  let(:numbers) { Generator.numbers }\n\n  it '.collect_as_hash' do\n    rdd = $sc.parallelize(numbers)\n    rdd = rdd.map(mapping)\n\n    expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)])\n  end\n\n  context '.take' do\n    let(:size)    { 1000 }\n    let(:numbers) { Generator.numbers(size) }\n    let(:rdd)     { $sc.parallelize(numbers) }\n\n    it 'nothing' do\n      expect(rdd.take(0)).to eql([])\n    end\n\n    it 'first' do\n      expect(rdd.first).to eql(numbers.first)\n    end\n\n    it 'less than limit' do\n      _size = size / 2\n      expect(rdd.take(_size)).to eql(numbers.take(_size))\n    end\n\n    it 'all' do\n      expect(rdd.take(size)).to eql(numbers)\n    end\n\n    it 'more than limit' do\n      expect(rdd.take(size*2)).to eql(numbers)\n    end\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/command_spec.rb",
    "content": "require 'spec_helper'\n\ndef to_s_method(x)\n  x.to_s\nend\n\nRSpec::describe Spark::CommandBuilder do\n  let(:numbers) { Generator.numbers }\n  let(:rdd)     { $sc.parallelize(numbers, 1) }\n\n  context '.serialize_function' do\n    let(:result)  { numbers.map(&:to_s) }\n\n    it 'string' do\n      expect(rdd.map('lambda{|x| x.to_s}').collect).to eql(result)\n    end\n\n    it 'symbol' do\n      expect(rdd.map(:to_s).collect).to eql(result)\n    end\n\n    it 'lambda' do\n      expect(rdd.map(lambda{|x| x.to_s}).collect).to eql(result)\n    end\n\n    it 'method' do\n      expect(rdd.map(method(:to_s_method)).collect).to eql(result)\n    end\n  end\n\n  context '.bind' do\n    it 'number' do\n      number = rand(0..10000000)\n      rdd2 = rdd.map(lambda{|x| x * number}).bind(number: number)\n\n      expect(rdd2.collect).to eq(numbers.map{|x| x * number})\n    end\n\n    it 'open struct' do\n      require 'ostruct'\n\n      struct = OpenStruct.new\n      struct.number = 3\n      struct.string = '3'\n      struct.array = [1, 2, 3]\n\n      func = lambda{|item|\n        item * struct.number + struct.string.to_i + struct.array[0]\n      }\n\n      rdd2 = rdd.add_library('ostruct')\n      rdd2 = rdd2.map(func)\n      rdd2 = rdd2.bind(struct: struct)\n\n      expect(rdd2.collect).to eq(numbers.map(&func))\n    end\n\n    it 'different naming' do\n      array = [1, 2, 3]\n\n      rdd2 = rdd.map(lambda{|_| my_array.size})\n      rdd2 = rdd2.bind(my_array: array)\n\n      expect(rdd2.sum).to eq(numbers.size * array.size)\n    end\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/config_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe Spark::Config do\n\n  before(:context) do\n    Spark.stop\n  end\n\n  after(:context) do\n    spark_start\n  end\n\n  it 'should be stopped' do\n    expect(Spark.started?).to be_falsy\n  end\n\n  context 'new config' do\n\n    let(:configuration) do\n      {\n        'test.test1' => 'test1',\n        'test.test2' => 'test2',\n        'test.test3' => 'test3'\n      }\n    end\n\n    before(:each) do\n      Spark.clear_config\n    end\n\n    it 'throught methods' do\n      configuration.each do |key, value|\n        Spark.config.set(key, value)\n      end\n\n      configuration.each do |key, value|\n        expect(Spark.config.get(key)).to eql(value)\n      end\n    end\n\n    it 'throught hash style' do\n      configuration.each do |key, value|\n        Spark.config[key] = value\n      end\n\n      configuration.each do |key, value|\n        expect(Spark.config[key]).to eql(value)\n      end\n    end\n\n    it 'throught dsl' do\n      configuration.each do |key, value|\n        Spark.config {\n          set key, value\n        }\n      end\n\n      configuration.each do |key, value|\n        expect(Spark.config[key]).to eql(value)\n      end\n    end\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/context_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe Spark::Context do\n\n  it '.run_job' do\n    workers = 5\n    numbers = (0...100).to_a\n    func = lambda{|part| part.size}\n\n    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }\n\n    rdd = $sc.parallelize(numbers, workers, ser)\n\n    rdd_result = $sc.run_job(rdd, func)\n    result = numbers.each_slice(numbers.size/workers).map(&func)\n    expect(rdd_result).to eql(result)\n\n    parts = [0, 2]\n    func = lambda{|part| part.to_s}\n\n    rdd_result = $sc.run_job(rdd, func, parts)\n    result = []\n    sliced_numbers = numbers.each_slice(numbers.size/workers).to_a\n    parts.each do |part|\n      result << func.call(sliced_numbers[part])\n    end\n\n    expect(rdd_result).to eql(result)\n  end\n\n  it '.broadcast' do\n    workers = rand(1..5)\n\n    values1 = [1,2,3]\n    values2 = [4,5,6]\n\n    broadcast1 = $sc.broadcast(values1)\n    broadcast2 = $sc.broadcast(values2)\n\n    rdd = $sc.parallelize(0..5, workers)\n    rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)\n    rdd = rdd.map_partitions(lambda{|_| broadcast1.value + broadcast2.value })\n\n    expect(rdd.sum).to eql(\n      (values1 + values2).reduce(:+) * workers\n    )\n  end\n\n  # context '.accumulator' do\n\n  #   it 'test' do\n  #     accum1 = $sc.accumulator(0,)\n  #     accum2 = $sc.accumulator(1, :*, 1)\n  #     accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})\n\n  #     accum1 += 1\n\n  #     accum2.add(2)\n  #     accum2.add(2)\n  #     accum2.add(2)\n\n  #     accum3.add(9)\n  #     accum3.add(6)\n  #     accum3.add(7)\n\n  #     expect(accum1.value).to eql(1)\n  #     expect(accum2.value).to eql(8)\n  #     expect(accum3.value).to eql(9)\n\n  #     func = Proc.new do |_, index|\n  #       accum1.add(1)\n  #       accum2.add(2)\n  #       accum3.add(index * 10)\n  #     end\n\n  #     rdd = $sc.parallelize(0..4, 4)\n  #     rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)\n  #     rdd = rdd.map_partitions_with_index(func)\n  #     rdd.collect\n\n  #     # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock\n  #     sleep(1)\n\n  #     expect(accum1.value).to eql(5)\n  #     expect(accum2.value).to eql(128)\n  #     expect(accum3.value).to eql(30)\n  #   end\n\n  #   context 'accum param' do\n  #     it 'symbol' do\n  #       accum1 = $sc.accumulator(1, :+, 0)\n  #       accum2 = $sc.accumulator(5, :-, 3)\n  #       accum3 = $sc.accumulator(1, :*, 1)\n  #       accum4 = $sc.accumulator(1.0, :/, 1.0)\n  #       accum5 = $sc.accumulator(2, :**, 2)\n\n  #       func = Proc.new do |_|\n  #         accum1.add(1)\n  #         accum2.add(1)\n  #         accum3.add(2)\n  #         accum4.add(2)\n  #         accum5.add(2)\n  #       end\n\n  #       rdd = $sc.parallelize(0..4, 2)\n  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)\n  #       rdd = rdd.map_partitions(func)\n  #       rdd.collect\n\n  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock\n  #       sleep(1)\n\n  #       expect(accum1.value).to eq(3)\n  #       expect(accum2.value).to eq(1)\n  #       expect(accum3.value).to eq(4)\n  #       expect(accum4.value).to eq(4)\n  #       expect(accum5.value).to eq(65536)\n  #     end\n\n  #     it 'proc' do\n  #       accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)\n  #       accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')\n  #       accum3 = 
$sc.accumulator([], lambda{|mem, val| mem << val}, [])\n\n  #       func = Proc.new do |_|\n  #         accum1.add(1)\n  #         accum2.add('a')\n  #         accum3.add(1)\n  #       end\n\n  #       rdd = $sc.parallelize(0..4, 2)\n  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)\n  #       rdd = rdd.map_partitions(func)\n  #       rdd.collect\n\n  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock\n  #       sleep(1)\n\n  #       expect(accum1.value).to eq(3)\n  #       expect(accum2.value).to eq('aaa')\n  #       expect(accum3.value).to eq([[1], [1]])\n  #     end\n\n  #     it 'string' do\n  #       expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)\n\n  #       accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)\n\n  #       func = Proc.new do |_|\n  #         accum.add(1)\n  #       end\n\n  #       rdd = $sc.parallelize(0..4, 2)\n  #       rdd = rdd.bind(accum: accum)\n  #       rdd = rdd.map_partitions(func)\n  #       rdd.collect\n\n  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock\n  #       sleep(1)\n\n  #       expect(accum.value).to eq(3)\n  #     end\n  #   end\n  # end\n\nend\n"
  },
  {
    "path": "spec/lib/ext_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe Array do\n\n  it '.deep_copy' do\n    data = ['a', 'b', 'c']\n    new_data = data.dup\n\n    data[0] << 'a'\n\n    expect(data).to eql(new_data)\n\n    new_data = data.deep_copy\n\n    data[1] << 'b'\n\n    expect(data).to_not eql(new_data)\n  end\n\nend\n\nRSpec.describe Hash do\n\n  it '.stringify_keys!' do\n    data = {\n      a: 'a',\n      b: 'b',\n      c: 'c'\n    }\n\n    data.stringify_keys!\n\n    expect(data).to eql({\n      'a' => 'a',\n      'b' => 'b',\n      'c' => 'c'\n    })\n  end\n\nend\n\nRSpec.describe String do\n\n  it '.camelize' do\n    data = 'aaa_bbb_ccc'.camelize\n    expect(data).to eql('AaaBbbCcc')\n  end\n\nend\n\nRSpec.describe IO do\n\n  it 'serialize' do\n    file = Tempfile.new('serialize')\n    file.binmode\n\n    file.write_int(1)\n    file.write_long(2)\n    file.write_string('3')\n    file.write_data([4])\n\n    file.rewind\n\n    expect(file.read_int).to eq(1)\n    expect(file.read_long).to eq(2)\n    expect(file.read_string).to eq('3')\n    expect(file.read_data).to eq([4])\n\n    file.unlink\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/external_apps_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe Spark::RDD do\n\n  context '.pipe' do\n    let(:words)   { Generator.words }\n    let(:numbers) { Generator.numbers }\n\n    it 'single program' do\n      skip if windows?\n\n      rdd = $sc.parallelize(words, 1)\n      rdd = rdd.pipe('tr a b')\n\n      result = words.dup\n      result.map! do |x|\n        x.gsub('a', 'b')\n      end\n\n      expect(rdd.collect).to eql(result)\n    end\n\n    it 'multiple program' do\n      skip if windows?\n\n      rdd = $sc.parallelize(numbers, 1)\n      rdd = rdd.pipe(\"tr 1 5\", \"awk '{print $1*10}'\")\n      rdd = rdd.map(lambda{|x| x.to_i * 100})\n\n      result = numbers.dup\n      result.map! do |x|\n        x.to_s.gsub('1', '5')\n      end\n      result.map! do |x|\n        x.to_i * 10\n      end\n      result.map! do |x|\n        x * 100\n      end\n\n      expect(rdd.collect).to eql(result)\n    end\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/filter_spec.rb",
    "content": "require 'spec_helper'\n\ndef func4(item)\n  item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106\nend\n\nRSpec.shared_examples 'a filtering' do |workers|\n  context \"with #{workers || 'default'} worker\" do\n    it 'when numbers' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.filter(func1)\n      result = numbers.select(&func1)\n\n      expect(rdd2.collect).to eql(result)\n\n      rdd3 = rdd_numbers(workers)\n      rdd3 = rdd3.filter(func1)\n      rdd3 = rdd3.filter(func2)\n\n      expect(rdd3.collect).to eql([])\n    end\n\n    it 'when words' do\n      rdd2 = rdd_words(workers)\n      rdd2 = rdd2.filter(func3)\n      result = words.select{|x| func3.call(x)}\n\n      expect(rdd2.collect).to eql(result)\n\n      rdd3 = rdd_words(workers)\n      rdd3 = rdd3.filter(method(:func4))\n      result = words.select{|x| func4(x)}\n\n      expect(rdd3.collect).to eql(result)\n    end\n  end\nend\n\nRSpec.describe 'Spark::RDD.filter' do\n  let(:func1) { lambda{|x| x.to_i.even?} }\n  let(:func2) { lambda{|x| x.to_i.odd?} }\n  let(:func3) { lambda{|x| x.to_s.start_with?('b')} }\n\n  context 'throught parallelize' do\n    let(:numbers) { Generator.numbers_with_zero }\n    let(:words)   { Generator.words }\n\n    def rdd_numbers(workers)\n      $sc.parallelize(numbers, workers)\n    end\n\n    def rdd_words(workers)\n      $sc.parallelize(words, workers)\n    end\n\n    it_behaves_like 'a filtering', 2\n    # it_behaves_like 'a filtering', nil\n    # it_behaves_like 'a filtering', rand(2..10)\n  end\n\n  context 'throught text_file' do\n    let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') }\n    let(:file_words)   { File.join('spec', 'inputs', 'lorem_300.txt') }\n\n    let(:numbers) { File.readlines(file_numbers).map(&:strip) }\n    let(:words)   { File.readlines(file_words).map(&:strip) }\n\n    def rdd_numbers(workers)\n      $sc.text_file(file_numbers, workers)\n    end\n\n    def rdd_words(workers)\n      $sc.text_file(file_words, workers)\n    end\n\n    it_behaves_like 'a filtering', 2\n    # it_behaves_like 'a filtering', nil\n    # it_behaves_like 'a filtering', rand(2..10)\n  end\nend\n"
  },
  {
    "path": "spec/lib/flat_map_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'a flat mapping' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd2 = rdd(workers).map(func1)\n    result = numbers.flat_map(&func1)\n\n    expect(rdd2.collect).to eql(result)\n\n    rdd3 = rdd(workers)\n    rdd3 = rdd3.flat_map(func1)\n    rdd3 = rdd3.flat_map(func2)\n    rdd3 = rdd3.flat_map(func3)\n    result = numbers.flat_map(&func1).flat_map(&func2).flat_map(&func3)\n\n    expect(rdd3.collect).to eql(result)\n\n    rdd4 = rdd(workers)\n    rdd4 = rdd4.flat_map(func1)\n    rdd4 = rdd4.flat_map(func2)\n    rdd4 = rdd4.flat_map(func3)\n\n    expect(rdd4.collect).to eql(rdd3.collect)\n  end\nend\n\nRSpec.shared_examples 'a flat mapping values' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd2 = rdd(workers).flat_map_values(func1)\n    result = []\n    hash_with_values.each do |(key, values)|\n      values = func1.call(values).flatten\n      values.each do |value|\n        result << [key, value]\n      end\n    end\n\n    expect(rdd2.collect).to eql(result)\n\n    rdd2 = rdd(workers).flat_map_values(func2)\n    result = []\n    hash_with_values.each do |(key, values)|\n      values = func2.call(values).flatten\n      values.each do |value|\n        result << [key, value]\n      end\n    end\n\n    expect(rdd2.collect).to eql(result)\n  end\nend\n\nRSpec.describe 'Spark::RDD' do\n  let(:func1) { lambda{|x| x*2} }\n  let(:func2) { lambda{|x| [x*3, 1, 1]} }\n  let(:func3) { lambda{|x| [x*4, 2, 2]} }\n\n  context 'throught parallelize' do\n    context '.flat_map' do\n      let(:numbers) { Generator.numbers_with_zero }\n\n      def rdd(workers)\n        $sc.parallelize(numbers, workers)\n      end\n\n      it_behaves_like 'a flat mapping', 1\n      it_behaves_like 'a flat mapping', 2\n      # it_behaves_like 'a flat mapping', nil\n      # it_behaves_like 'a flat mapping', rand(2..10)\n    end\n\n    context '.flat_map_values' do\n      let(:func1) { lambda{|x| x*2} }\n      let(:func2) { lambda{|x| [x.first]} }\n      let(:hash_with_values) { Generator.hash_with_values }\n\n      def rdd(workers)\n        $sc.parallelize(hash_with_values, workers)\n      end\n\n      it_behaves_like 'a flat mapping values', 1\n      it_behaves_like 'a flat mapping values', 2\n      # it_behaves_like 'a flat mapping values', nil\n      # it_behaves_like 'a flat mapping values', rand(2..10)\n    end\n  end\n\n  context 'throught text_file' do\n    context '.flat_map' do\n      let(:file)    { File.join('spec', 'inputs', 'numbers_0_100.txt') }\n      let(:numbers) { File.readlines(file).map(&:strip) }\n\n      def rdd(workers)\n        $sc.text_file(file, workers)\n      end\n\n      it_behaves_like 'a flat mapping', 1\n      it_behaves_like 'a flat mapping', 2\n      # it_behaves_like 'a flat mapping', nil\n      # it_behaves_like 'a flat mapping', rand(2..10)\n    end\n  end\nend\n"
  },
  {
    "path": "spec/lib/group_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'a groupping by key' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    expect(rdd_result(workers)).to eql(result)\n  end\nend\n\nRSpec.shared_examples 'a cogroupping by key' do |workers|\n  context \"with #{workers || 'default'} worker\" do\n    it '.group_with' do\n      rdd = rdd_1(workers).group_with(rdd_2(workers))\n      expect(rdd.collect_as_hash).to eql(result_12)\n    end\n\n    it '.cogroup' do\n      rdd = rdd_1(workers).cogroup(rdd_2(workers), rdd_3(workers))\n      expect(rdd.collect_as_hash).to eql(result_123)\n    end\n  end\nend\n\nRSpec.shared_examples 'a groupping by' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd = rdd_numbers(workers)\n    rdd = rdd.group_by(key_function1)\n\n    expect(rdd.collect_as_hash).to eql(numbers.group_by(&key_function1))\n\n    rdd = rdd_words(workers)\n    rdd = rdd.group_by(key_function2)\n\n    expect(rdd.collect_as_hash).to eql(words.group_by(&key_function2))\n  end\nend\n\nRSpec.describe 'Spark::RDD' do\n\n  def make_result(*hashes)\n    _result = {}\n    hashes.each do |data|\n      data.each do |key, value|\n        _result[key] ||= []\n        _result[key] << value\n      end\n    end\n    _result\n  end\n\n  context '.group_by_key' do\n    let(:hash) { Generator.hash }\n    let(:result) { make_result(hash) }\n\n    def rdd_result(workers)\n      rdd = $sc.parallelize(hash)\n      rdd.group_by_key.collect_as_hash\n    end\n\n    it_behaves_like 'a groupping by key', 1\n    it_behaves_like 'a groupping by key', 2\n    # it_behaves_like 'a groupping by key', nil\n    # it_behaves_like 'a groupping by key', rand(2..10)\n  end\n\n  context 'cogroup' do\n    let(:hash1) { Generator.hash }\n    let(:hash2) { Generator.hash }\n    let(:hash3) { Generator.hash }\n\n    let(:result_12)  { make_result(hash1, hash2) }\n    let(:result_123) { make_result(hash1, hash2, hash3) }\n\n    def rdd_1(workers)\n      $sc.parallelize(hash1)\n    end\n\n    def rdd_2(workers)\n      $sc.parallelize(hash2)\n    end\n\n    def rdd_3(workers)\n      $sc.parallelize(hash3)\n    end\n\n    it_behaves_like 'a cogroupping by key', 1\n    it_behaves_like 'a cogroupping by key', 2\n    # it_behaves_like 'a cogroupping by key', nil\n    # it_behaves_like 'a cogroupping by key', rand(2..10)\n  end\n\n  context 'group_by' do\n    let(:key_function1) { lambda{|x| x%2} }\n    let(:key_function2) { lambda{|x| x.size} }\n\n    let(:numbers) { Generator.numbers }\n    let(:words)   { Generator.words }\n\n    def rdd_numbers(workers)\n      $sc.parallelize(numbers)\n    end\n\n    def rdd_words(workers)\n      $sc.parallelize(words)\n    end\n\n    it_behaves_like 'a groupping by', 1\n    it_behaves_like 'a groupping by', 2\n    # it_behaves_like 'a groupping by', nil\n    # it_behaves_like 'a groupping by', rand(2..10)\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/helper_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.configure do |c|\n  c.include Spark::Helper::Parser\n  c.include Spark::Helper::Statistic\nend\n\nRSpec.describe Spark::Helper do\n\n  it 'memory size' do\n    expect(to_memory_size('512mb')).to eql(524288.0)\n    expect(to_memory_size('1586 mb')).to eql(1624064.0)\n    expect(to_memory_size('3 MB')).to eql(3072.0)\n    expect(to_memory_size('9gb')).to eql(9437184.0)\n    expect(to_memory_size('9gb', 'mb')).to eql(9216.0)\n    expect(to_memory_size('9mb', 'gb')).to eql(0.01)\n    expect(to_memory_size('6652548796kb', 'mb')).to eql(6496629.68)\n  end\n\n  context 'statistic' do\n    it 'compute_fraction' do\n      expect(compute_fraction(1, 1000, true)).to be_within(0.001).of(0.013)\n      expect(compute_fraction(2, 1000, true)).to be_within(0.001).of(0.018)\n      expect(compute_fraction(3, 1000, true)).to be_within(0.001).of(0.023)\n      expect(compute_fraction(4, 1000, true)).to be_within(0.001).of(0.028)\n      expect(compute_fraction(5, 1000, true)).to be_within(0.001).of(0.031)\n\n      expect(compute_fraction(1, 1000, false)).to be_within(0.001).of(0.0249)\n      expect(compute_fraction(2, 1000, false)).to be_within(0.001).of(0.0268)\n      expect(compute_fraction(3, 1000, false)).to be_within(0.001).of(0.0287)\n      expect(compute_fraction(4, 1000, false)).to be_within(0.001).of(0.0305)\n      expect(compute_fraction(5, 1000, false)).to be_within(0.001).of(0.0322)\n    end\n\n    it 'bisect_right' do\n      data = [10, 20, 30, 40, 50, 60, 70, 80, 90]\n\n      expect(bisect_right(data, 0)).to eq(0)\n      expect(bisect_right(data, 1)).to eq(0)\n      expect(bisect_right(data, 1, 2)).to eq(2)\n      expect(bisect_right(data, 1, 3)).to eq(3)\n      expect(bisect_right(data, 1, 4)).to eq(4)\n      expect(bisect_right(data, 9)).to eq(0)\n      expect(bisect_right(data, 10)).to eq(1)\n      expect(bisect_right(data, 40)).to eq(4)\n      expect(bisect_right(data, 42)).to eq(4)\n      expect(bisect_right(data, 72)).to eq(7)\n      expect(bisect_right(data, 80, 4)).to eq(8)\n      expect(bisect_right(data, 80, 5)).to eq(8)\n      expect(bisect_right(data, 80, 8)).to eq(8)\n      expect(bisect_right(data, 80, 9)).to eq(9)\n      expect(bisect_right(data, 200)).to eq(9)\n    end\n\n    it 'determine_bounds' do\n      data = [10, 20, 30, 40, 50, 60, 70, 80, 90]\n\n      expect(determine_bounds(data, 0)).to eq([])\n      expect(determine_bounds(data, 1)).to eq([])\n      expect(determine_bounds(data, 2)).to eq([50])\n      expect(determine_bounds(data, 3)).to eq([40, 70])\n      expect(determine_bounds(data, 4)).to eq([30, 50, 70])\n      expect(determine_bounds(data, 20)).to eq(data)\n    end\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/key_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'a keying by' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd = rdd_numbers(workers)\n    rdd = rdd.key_by(key_function1)\n\n    result = numbers.map{|item| [key_function1.call(item), item]}\n    expect(rdd.collect).to eql(result)\n\n    rdd = rdd_words(workers)\n    rdd = rdd.key_by(key_function2)\n\n    result = words.map{|item| [key_function2.call(item), item]}\n    expect(rdd.collect).to eql(result)\n  end\nend\n\nRSpec.describe 'Spark::RDD' do\n\n  context 'key_by' do\n    let(:key_function1) { lambda{|x| x.even?} }\n    let(:key_function2) { lambda{|x| x.include?('a')} }\n\n    let(:numbers) { Generator.numbers }\n    let(:words)   { Generator.words }\n\n    def rdd_numbers(workers)\n      $sc.parallelize(numbers)\n    end\n\n    def rdd_words(workers)\n      $sc.parallelize(words)\n    end\n\n    it_behaves_like 'a keying by', 1\n    it_behaves_like 'a keying by', 2\n    # it_behaves_like 'a keying by', nil\n    # it_behaves_like 'a keying by', rand(2..10)\n  end\n\n  it 'lookup' do\n    numbers = Generator.numbers\n    rdd_numbers = $sc.parallelize(numbers, 2)\n\n    rdd = rdd_numbers.group_by(lambda {|x| x%3})\n    rdd.lookup(2)\n\n    expect(rdd.lookup(2).first).to eq(\n      numbers.group_by{|x| x%3}[2]\n    )\n\n    rdd = rdd_numbers.key_by(lambda{|x| x.even?})\n    expect(rdd.lookup(true)).to eq(\n      numbers.select(&:even?)\n    )\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/manipulation_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe 'Spark::RDD' do\n  let(:numbers) { 1..100 }\n  let(:rand_numbers) { Generator.numbers }\n\n  it '.glom' do\n    rdd = $sc.parallelize(numbers, 1).glom\n    expect(rdd.collect).to eql([numbers.to_a])\n\n    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }\n\n    rdd = $sc.parallelize(numbers, 5, ser).glom\n    expect(rdd.collect).to eql(numbers.each_slice(20).to_a)\n  end\n\n  it '.coalesce' do\n    rdd = $sc.parallelize(numbers, 5)\n\n    rdd2 = rdd.glom\n    expect(rdd2.collect.size).to eql(5)\n\n    rdd3 = rdd.coalesce(4).glom\n    expect(rdd3.collect.size).to eql(4)\n  end\n\n  it '.distinct' do\n    rdd = $sc.parallelize(rand_numbers, 5)\n    rdd = rdd.distinct\n    expect(rdd.collect.sort).to eql(rand_numbers.uniq.sort)\n\n    rdd = $sc.parallelize(numbers, 5)\n    rdd = rdd.map(lambda{|x| 1})\n    rdd = rdd.distinct\n    expect(rdd.collect).to eql([1])\n  end\n\n  context '.union' do\n    it 'classic method' do\n      rdd = $sc.parallelize(numbers, 5)\n      rdd = rdd.union(rdd).collect\n\n      expect(rdd.collect.sort).to eql((numbers.to_a+numbers.to_a).sort)\n    end\n\n    it 'with a different serializer' do\n      rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) })\n      rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) })\n\n      expect { rdd1.union(rdd2).collect }.to_not raise_error\n    end\n\n    it 'as operator' do\n      rdd1 = $sc.parallelize(numbers)\n      rdd2 = $sc.parallelize(rand_numbers)\n\n      expect((rdd1+rdd2).sum).to eql((numbers.to_a+rand_numbers).reduce(:+))\n    end\n  end\n\n  it '.compact' do\n    data = [nil, nil , 0, 0, 1, 2, nil, 6]\n    result = data.compact\n    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }\n\n    rdd = $sc.parallelize(data, 1).compact\n    expect(rdd.collect).to eql(result)\n\n    rdd = $sc.parallelize(data, 5, ser).compact\n    expect(rdd.collect).to eql(result)\n\n    rdd = $sc.parallelize(data, 1, ser).compact\n    expect(rdd.collect).to eql(result)\n  end\n\n  it '.intersection' do\n    data1 = [0,1,2,3,4,5,6,7,8,9,10]\n    data2 = [5,6,7,8,9,10,11,12,13,14,15]\n\n    rdd1 = $sc.parallelize(data1)\n    rdd2 = $sc.parallelize(data2)\n\n    expect(rdd1.intersection(rdd2).collect.sort).to eql(data1 & data2)\n  end\n\n  it '.shuffle' do\n    data = Generator.numbers\n    rdd = $sc.parallelize(data)\n\n    expect(rdd.shuffle.collect).to_not eql(data)\n  end\n\n  context '.cartesian' do\n    let(:data1) { Generator.numbers(100) }\n    let(:data2) { Generator.numbers(100) }\n    let(:result) { data1.product(data2).map(&:to_s).sort }\n\n    it 'unbatched' do\n      ser = Spark::Serializer.build { __batched__(__marshal__, 1) }\n\n      rdd1 = $sc.parallelize(data1, 2, ser)\n      rdd2 = $sc.parallelize(data2, 2, ser)\n\n      rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})\n\n      expect(rdd.collect.sort).to eql(result)\n    end\n\n    it 'batched' do\n      ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }\n      ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }\n\n      rdd1 = $sc.parallelize(data1, 2, ser1)\n      rdd2 = $sc.parallelize(data2, 2, ser2)\n\n      rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})\n\n      expect(rdd.collect.sort).to eql(result)\n    end\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/map_partitions_spec.rb",
    "content": "require 'spec_helper'\n\ndef func3(x)\n  x.map(&:to_i).reduce(:+)\nend\n\ndef func4_with_index(data, index)\n  [{\n    index => data.map(&:to_i).reduce(:*)\n  }]\nend\n\nRSpec.shared_examples 'a map partitions' do |workers|\n  context \"with #{workers || 'default'} worker\" do\n    it 'without index' do\n      rdd2 = rdd(workers).map_partitions(func1)\n      result = func1.call(numbers)\n\n      expect(func1.call(rdd2.collect)).to eql(result)\n\n      rdd3 = rdd(workers)\n      rdd3 = rdd3.map_partitions(func1)\n      rdd3 = rdd3.map_partitions(func2)\n      rdd3 = rdd3.map_partitions(method(:func3))\n      result = func3(func2.call(func1.call(numbers)))\n\n      # Not same number of workers\n      expect(rdd3.collect.size).to be >= 1\n\n      rdd4 = rdd(workers)\n      rdd4 = rdd4.map_partitions(func1)\n      rdd4 = rdd4.map_partitions(func2)\n      rdd4 = rdd4.map_partitions(method(:func3))\n\n      expect(rdd4.collect).to eql(rdd3.collect)\n    end\n\n    it 'with index' do\n      rdd2 = rdd(workers).map_partitions_with_index(method(:func4_with_index))\n      result = rdd2.collect\n\n      expect(result).to be_a(Array)\n\n      result.each do |x|\n        expect(x).to be_a(Hash)\n      end\n\n      # Multiply by 0\n      # Some values are 0 because of batched serialization\n      expect(result.map(&:values).flatten.compact.uniq.first).to eql(0)\n    end\n  end\nend\n\nRSpec::describe 'Spark::RDD.map_partitions(_with_index)' do\n  let(:func1) { lambda{|x| x.map(&:to_i)} }\n  let(:func2) {\n    lambda{|x|\n      x.map{|y| y*2}\n    }\n  }\n\n  context 'throught parallelize' do\n    let(:numbers) { 0..1000 }\n\n    def rdd(workers)\n      $sc.parallelize(numbers, workers)\n    end\n\n    it_behaves_like 'a map partitions', 1\n    it_behaves_like 'a map partitions', 2\n    # it_behaves_like 'a map partitions', nil\n    # it_behaves_like 'a map partitions', rand(2..10)\n  end\n\n  context 'throught text_file' do\n    let(:file)    { File.join('spec', 'inputs', 'numbers_0_100.txt') }\n    let(:numbers) { File.readlines(file).map(&:strip) }\n\n    def rdd(workers)\n      $sc.text_file(file, workers)\n    end\n\n    it_behaves_like 'a map partitions', 1\n    it_behaves_like 'a map partitions', 2\n    # it_behaves_like 'a map partitions', nil\n    # it_behaves_like 'a map partitions', rand(2..10)\n  end\nend\n"
  },
  {
    "path": "spec/lib/map_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'a mapping' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd2 = rdd(workers).map(func1)\n    result = numbers.map(&func1)\n\n    expect(rdd2.collect).to eql(result)\n\n    rdd3 = rdd(workers)\n    rdd3 = rdd3.map(func1)\n    rdd3 = rdd3.map(func2)\n    rdd3 = rdd3.map(func3)\n    result = numbers.map(&func1).map(&func2).map(&func3)\n\n    expect(rdd3.collect).to eql(result)\n\n    rdd4 = rdd(workers)\n    rdd4 = rdd4.map(func3)\n    rdd4 = rdd4.map(func2)\n    rdd4 = rdd4.map(func1)\n\n    expect(rdd4.collect).to eql(rdd3.collect)\n  end\nend\n\nRSpec.shared_examples 'a mapping values' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd2 = rdd(workers).map_values(func1)\n    result = hash.map{|key, value| [key, func1.call(value)]}\n\n    expect(rdd2.collect).to eql(result)\n\n    rdd3 = rdd(workers)\n    rdd3 = rdd3.map_values(func1)\n    rdd3 = rdd3.map_values(func2)\n    rdd3 = rdd3.map_values(func3)\n    result = hash.map{|key, value| [key, func1.call(value)]}\n                 .map{|key, value| [key, func2.call(value)]}\n                 .map{|key, value| [key, func3.call(value)]}\n\n    expect(rdd3.collect).to eql(result)\n  end\nend\n\nRSpec.describe 'Spark::RDD' do\n  let(:func1) { lambda{|x| x*2} }\n  let(:func2) { lambda{|x| x*3} }\n  let(:func3) { lambda{|x| x*4} }\n\n  context 'throught parallelize' do\n    context '.map' do\n      let(:numbers) { Generator.numbers }\n\n      def rdd(workers)\n        $sc.parallelize(numbers, workers)\n      end\n\n      it_behaves_like 'a mapping', 1\n      it_behaves_like 'a mapping', 2\n      # it_behaves_like 'a mapping', nil\n      # it_behaves_like 'a mapping', rand(2..10)\n    end\n\n    context '.map_values' do\n      let!(:hash) { Generator.hash }\n\n      def rdd(workers)\n        $sc.parallelize(hash, workers)\n      end\n\n      it_behaves_like 'a mapping values', 1\n      it_behaves_like 'a mapping values', 2\n      # it_behaves_like 'a mapping values', nil\n      # it_behaves_like 'a mapping values', rand(2..10)\n    end\n  end\n\n  context 'throught text_file' do\n    context '.map' do\n      let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }\n      let(:numbers) { File.readlines(file).map(&:strip) }\n\n      def rdd(workers)\n        $sc.text_file(file, workers)\n      end\n\n      it_behaves_like 'a mapping', 1\n      it_behaves_like 'a mapping', 2\n      # it_behaves_like 'a mapping', nil\n      # it_behaves_like 'a mapping', rand(2..10)\n    end\n  end\nend\n"
  },
  {
    "path": "spec/lib/mllib/classification_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib classification' do\n\n  let(:data1) do\n    [\n      LabeledPoint.new(0.0, [1, 0, 0]),\n      LabeledPoint.new(1.0, [0, 1, 1]),\n      LabeledPoint.new(0.0, [2, 0, 0]),\n      LabeledPoint.new(1.0, [0, 2, 1])\n    ]\n  end\n\n  let(:values1) do\n    data1.map do |lp|\n      lp.features.values\n    end\n  end\n\n  let(:rdd1) { $sc.parallelize(data1) }\n\n  context 'logistic regression' do\n    it 'test' do\n      lrm = LogisticRegressionWithSGD.train(rdd1)\n\n      expect(lrm.predict(values1[0])).to be <= 0\n      expect(lrm.predict(values1[1])).to be >  0\n      expect(lrm.predict(values1[2])).to be <= 0\n      expect(lrm.predict(values1[3])).to be >  0\n    end\n  end\n\n  context 'svm' do\n    it 'test' do\n      lrm = SVMWithSGD.train(rdd1)\n\n      expect(lrm.predict(values1[0])).to be <= 0\n      expect(lrm.predict(values1[1])).to be >  0\n      expect(lrm.predict(values1[2])).to be <= 0\n      expect(lrm.predict(values1[3])).to be >  0\n    end\n  end\n\n  context 'naive bayes' do\n    it 'test' do\n      lrm = NaiveBayes.train(rdd1)\n\n      expect(lrm.predict(values1[0])).to be <= 0\n      expect(lrm.predict(values1[1])).to be >  0\n      expect(lrm.predict(values1[2])).to be <= 0\n      expect(lrm.predict(values1[3])).to be >  0\n    end\n  end\nend\n"
  },
  {
    "path": "spec/lib/mllib/clustering_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib clustering' do\n  context 'kmeans' do\n    it 'test' do\n      data = [\n        DenseVector.new([0, 1.1]),\n        DenseVector.new([0, 1.2]),\n        DenseVector.new([1.1, 0]),\n        DenseVector.new([1.2, 0])\n      ]\n      model = KMeans.train($sc.parallelize(data), 2, initialization_mode: 'k-means||')\n\n      expect(model.predict(data[0])).to eq(model.predict(data[1]))\n      expect(model.predict(data[2])).to eq(model.predict(data[3]))\n    end\n\n    it 'deterministic' do\n      data = Array.new(10) do |i|\n        i *= 10\n        DenseVector.new([i, i])\n      end\n\n      clusters1 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42)\n      clusters2 = KMeans.train($sc.parallelize(data), 3, initialization_mode: 'k-means||', seed: 42)\n\n      centers1 = clusters1.centers.to_a\n      centers2 = clusters2.centers.to_a\n\n      centers1.zip(centers2).each do |c1, c2|\n        expect(c1).to eq(c2)\n      end\n    end\n  end\nend\n"
  },
  {
    "path": "spec/lib/mllib/matrix_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib::Matrix' do\n  context 'dense' do\n    it 'construct' do\n      values = [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]]\n      matrix = DenseMatrix.new(3, 3, [[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]])\n\n      expect(matrix.shape).to eq([3, 3])\n      expect(matrix.values).to eq([[1.0, 0.0, 4.0], [0.0, 3.0, 5.0], [2.0, 0.0, 6.0]])\n    end\n  end\n\n  context 'sparse' do\n    it 'construct' do\n      values = [1.0, 2.0, 4.0, 5.0]\n      column_pointers = [0, 2, 2, 4, 4]\n      row_indices = [1, 2, 1, 2]\n\n      matrix = SparseMatrix.new(3, 4, column_pointers, row_indices, values)\n\n      expect(matrix.shape).to eq([3, 4])\n      expect(matrix.to_a).to eq(\n        [\n          [0.0, 0.0, 0.0, 0.0],\n          [1.0, 0.0, 4.0, 0.0],\n          [2.0, 0.0, 5.0, 0.0]\n        ]\n      )\n    end\n  end\nend\n"
  },
  {
    "path": "spec/lib/mllib/regression_spec.rb",
    "content": "require 'spec_helper'\n\n# Mllib functions are tested on Spark\n# This just test if ruby call proper methods\n\nRSpec.describe 'Spark::Mllib regression' do\n\n  let(:data1) do\n    [\n      LabeledPoint.new(-1.0, [0, -1]),\n      LabeledPoint.new(1.0, [0, 1]),\n      LabeledPoint.new(-1.0, [0, -2]),\n      LabeledPoint.new(1.0, [0, 2])\n    ]\n  end\n\n  let(:values1) do\n    data1.map do |lp|\n      lp.features.values\n    end\n  end\n\n  let(:rdd1) { $sc.parallelize(data1) }\n\n  context 'labeled point' do\n    let(:lp) { LabeledPoint.new(1, [1,2,3]) }\n\n    it 'from array' do\n      expect(lp.label).to eql(1.0)\n      expect(lp.features).to be_a(DenseVector)\n    end\n\n    it 'serialize' do\n      lp2 = Marshal.load(Marshal.dump(lp))\n\n      expect(lp2.label).to eql(lp.label)\n      expect(lp2.features.values).to eql(lp.features.values)\n    end\n  end\n\n  context 'linear regression' do\n    context 'test' do\n      let(:lrm) { LinearRegressionWithSGD.train(rdd1) }\n\n      it 'test' do\n        expect(lrm.predict(values1[0])).to be <= 0\n        expect(lrm.predict(values1[1])).to be >  0\n        expect(lrm.predict(values1[2])).to be <= 0\n        expect(lrm.predict(values1[3])).to be >  0\n      end\n\n      it 'test via rdd' do\n        rdd = $sc.parallelize(values1, 1)\n        rdd = rdd.map(lambda{|value| model.predict(value)})\n        rdd = rdd.bind(model: lrm)\n\n        result = rdd.collect\n\n        expect(result[0]).to be <= 0\n        expect(result[1]).to be >  0\n        expect(result[2]).to be <= 0\n        expect(result[3]).to be >  0\n      end\n    end\n\n    # Y = 3 + 10*X1 + 10*X2\n    it 'linear regression' do\n      data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 3.0, ['10.0', '10.0'], 100, 42, 0.1)\n      rdd = $sc.parallelize(data)\n\n      lrm = LinearRegressionWithSGD.train(rdd, iterations: 1000, intercept: true, step: 1.0)\n\n      expect(lrm.intercept).to be_between(2.5, 3.5)\n      expect(lrm.weights.size).to eq(2)\n      expect(lrm.weights[0]).to be_between(9.0, 11.0)\n      expect(lrm.weights[1]).to be_between(9.0, 11.0)\n    end\n  end\n\n  context 'lasso' do\n    it 'test' do\n      lrm = LassoWithSGD.train(rdd1)\n\n      expect(lrm.predict(values1[0])).to be <= 0\n      expect(lrm.predict(values1[1])).to be >  0\n      expect(lrm.predict(values1[2])).to be <= 0\n      expect(lrm.predict(values1[3])).to be >  0\n    end\n\n    it 'local random SGD with initial weights' do\n      data = Spark.jb.call(RubyMLLibUtilAPI, 'generateLinearInput', 2.0, ['-1.5', '0.01'], 1000, 42, 0.1)\n      data.map! do |lp|\n        LabeledPoint.new(lp.label, [1.0] + lp.features.values)\n      end\n\n      rdd = $sc.parallelize(data);\n\n      lrm = LassoWithSGD.train(rdd, step: 1.0, reg_param: 0.01, iterations: 40, initial_weights: [-1.0, -1.0, -1.0])\n\n      expect(lrm.weights[0]).to be_between(1.9, 2.1)\n      expect(lrm.weights[1]).to be_between(-1.60, -1.40)\n      expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2)\n    end\n  end\n\n  context 'ridge' do\n    it 'test' do\n      lrm = RidgeRegressionWithSGD.train(rdd1)\n\n      expect(lrm.predict(values1[0])).to be <= 0\n      expect(lrm.predict(values1[1])).to be >  0\n      expect(lrm.predict(values1[2])).to be <= 0\n      expect(lrm.predict(values1[3])).to be >  0\n    end\n  end\nend\n"
  },
  {
    "path": "spec/lib/mllib/vector_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe 'Spark::Mllib::Vector' do\n\n  context 'parsing' do\n    it 'dense vector' do\n      dv  = DenseVector.new([1.0, 2.0, 3.0, 4.0, 5.0])\n      dv2 = DenseVector.parse(dv.to_s)\n      dv3 = Vectors.parse(dv.to_s)\n\n      expect(dv.to_s).to eq(\"[1.0,2.0,3.0,4.0,5.0]\")\n      expect(dv2.values).to eq(dv.values)\n      expect(dv3.values).to eq(dv.values)\n    end\n\n    it 'sparse vector' do\n      sv  = SparseVector.new(5, {1 => 3, 4 => 5})\n      sv2 = SparseVector.parse(sv.to_s)\n      sv3 = Vectors.parse(sv.to_s)\n\n      expect(sv.to_s).to eq(\"(5,[1,4],[3,5])\")\n      expect(sv2.size).to eq(sv.size)\n      expect(sv2.indices).to eq(sv.indices)\n      expect(sv2.values).to eq(sv.values)\n      expect(sv3.size).to eq(sv.size)\n      expect(sv3.indices).to eq(sv.indices)\n      expect(sv3.values).to eq(sv.values)\n    end\n  end\n\n  it 'dot' do\n    sv = SparseVector.new(4, {1 => 1, 3 => 2})\n    dv = DenseVector.new([1.0, 2.0, 3.0, 4.0])\n    lst = DenseVector.new([1, 2, 3, 4])\n\n    expect(sv.dot(dv)).to eq(10.0)\n    expect(dv.dot(dv)).to eq(30.0)\n    expect(lst.dot(dv)).to eq(30.0)\n  end\n\n  it 'squared distance' do\n    sv = SparseVector.new(4, {1 => 1, 3 => 2})\n    dv = DenseVector.new([1.0, 2.0, 3.0, 4.0])\n    lst = DenseVector.new([4, 3, 2, 1])\n\n    expect(sv.squared_distance(dv)).to eq(15)\n    expect(sv.squared_distance(lst)).to eq(25)\n    expect(dv.squared_distance(lst)).to eq(20)\n    expect(dv.squared_distance(sv)).to eq(15)\n    expect(lst.squared_distance(sv)).to eq(25)\n    expect(lst.squared_distance(dv)).to eq(20)\n    expect(sv.squared_distance(sv)).to eq(0)\n    expect(dv.squared_distance(dv)).to eq(0)\n    expect(lst.squared_distance(lst)).to eq(0)\n  end\n\n  it 'sparse vector indexing' do\n    sv1 = SparseVector.new(4, {1 => 1, 3 => 2})\n    sv2 = SparseVector.new(4, [1, 3], [1, 2])\n\n    expect(sv1[0]).to eq(0)\n    expect(sv1[3]).to eq(2)\n    expect(sv1[1]).to eq(1)\n    expect(sv1[2]).to eq(0)\n    expect(sv1[-1]).to eq(2)\n    expect(sv1[-2]).to eq(0)\n    expect(sv1[-4]).to eq(0)\n\n    expect(sv2[0]).to eq(0)\n    expect(sv2[3]).to eq(2)\n    expect(sv2[1]).to eq(1)\n    expect(sv2[2]).to eq(0)\n    expect(sv2[-1]).to eq(2)\n    expect(sv2[-2]).to eq(0)\n    expect(sv2[-4]).to eq(0)\n  end\nend\n"
  },
  {
    "path": "spec/lib/reduce_by_key_spec.rb",
    "content": "require 'spec_helper'\n\ndef flat_map(line)\n  line.split\nend\n\ndef map(item)\n  [item, 1]\nend\n\ndef reduce(x,y)\n  x+y\nend\n\nRSpec.shared_examples 'a words counting' do |workers|\n  context \"with #{workers || 'default'} worker\" do\n    let(:result) do\n      keyyed = lines.flat_map{|x| x.split}.map{|x| [x,1]}\n      result = keyyed.reduce({}){|memo, item|\n        key   = item[0]\n        value = item[1]\n\n        memo[key] ||= 0\n        memo[key] += value\n        memo\n      }\n      result\n    end\n\n    it 'when lambda' do\n      rdd2 = rdd(workers)\n      rdd2 = rdd2.flat_map(lambda{|line| line.split})\n      rdd2 = rdd2.map(lambda{|word| [word, 1]})\n      rdd2 = rdd2.reduce_by_key(lambda{|x,y| x+y})\n\n      expect(rdd2.collect_as_hash).to eql(result)\n    end\n\n    it 'when method' do\n      rdd2 = rdd(workers)\n      rdd2 = rdd2.flat_map(method(:flat_map))\n      rdd2 = rdd2.map(method(:map))\n      rdd2 = rdd2.reduce_by_key(method(:reduce))\n\n      expect(rdd2.collect_as_hash).to eql(result)\n    end\n\n    it 'keys, values' do\n      rdd2 = rdd(workers)\n      rdd2 = rdd2.flat_map(method(:flat_map))\n      rdd2 = rdd2.map(method(:map))\n      rdd2 = rdd2.reduce_by_key(method(:reduce))\n\n      expect(rdd2.keys.collect.sort).to eql(result.keys.sort)\n      expect { rdd2.values.collect.reduce(:+) }.to_not raise_error\n    end\n  end\nend\n\nRSpec.describe 'Spark::RDD' do\n  context '.reduce_by_key' do\n    context 'throught parallelize' do\n      let(:lines) { Generator.lines }\n\n      def rdd(workers)\n        $sc.parallelize(lines, workers)\n      end\n\n      it_behaves_like 'a words counting', 2\n      # it_behaves_like 'a words counting', nil\n      # it_behaves_like 'a words counting', rand(2..10)\n    end\n\n    context 'throught text_file' do\n      let(:file)  { File.join('spec', 'inputs', 'lorem_300.txt') }\n      let(:lines) { File.readlines(file).map(&:strip) }\n\n      def rdd(workers)\n        $sc.text_file(file, workers)\n      end\n\n      it_behaves_like 'a words counting', 2\n      # it_behaves_like 'a words counting', nil\n      # it_behaves_like 'a words counting', rand(2..10)\n    end\n  end\n\n  context '.fold_by_key' do\n    let(:numbers)    { Generator.numbers }\n    let(:zero_value) { 0 }\n    let(:rdd)        { $sc.parallelize(numbers) }\n    let(:map)        { lambda{|x| [x, 1]} }\n    let(:add)        { lambda{|x,y| x+y} }\n\n    let(:result) do\n      _result = {}\n      numbers.map(&map).each do |key, value|\n        _result[key] ||= zero_value\n        _result[key] = add.call(_result[key], value)\n      end\n      _result\n    end\n\n    def fold_by_key(num_partitions=nil)\n      rdd.map(map).fold_by_key(zero_value, add, num_partitions).collect_as_hash\n    end\n\n    it 'default num_partitions' do\n      expect(fold_by_key).to eq(result)\n    end\n\n    it 'default num_partitions' do\n      expect(\n        fold_by_key rand(1..10)\n      ).to eq(result)\n    end\n  end\nend\n"
  },
  {
    "path": "spec/lib/reduce_spec.rb",
    "content": "require 'spec_helper'\n\ndef longest_words(memo, word)\n  memo.length > word.length ? memo : word\nend\n\nRSpec.shared_examples 'a reducing' do |workers|\n  context \"with #{workers || 'default'} worker\" do\n    it '.reduce' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.map(to_i)\n      rdd2 = rdd2.reduce(func1)\n      result = numbers.map(&:to_i).reduce(&func1)\n\n      expect(rdd2).to eql(result)\n\n      rdd3 = rdd_numbers(workers)\n      rdd3 = rdd3.map(to_i)\n      rdd3 = rdd3.reduce(func2)\n      result = numbers.map(&:to_i).reduce(&func2)\n\n      expect(rdd3).to eql(result)\n\n      rdd4 = rdd_lines(workers)\n      rdd4 = rdd4.flat_map(split)\n      rdd4 = rdd4.reduce(method(:longest_words))\n\n      result = lines.flat_map(&split).reduce(&lambda(&method(:longest_words)))\n\n      expect(rdd4).to eql(result)\n    end\n\n    it '.fold' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.map(to_i)\n      rdd_result = rdd2.fold(1, func1)\n\n      # all workers add 1 + last reducing phase\n      result = numbers.map(&:to_i).reduce(&func1) + rdd2.partitions_size + 1\n\n      expect(rdd_result).to eql(result)\n    end\n\n    it '.aggregate' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.map(to_i)\n\n      # Sum of items + their count\n      seq = lambda{|x,y| [x[0] + y, x[1] + 1]}\n      com = lambda{|x,y| [x[0] + y[0], x[1] + y[1]]}\n      rdd_result = rdd2.aggregate([0,0], seq, com)\n\n      result = [numbers.reduce(:+), numbers.size]\n\n      expect(rdd_result).to eql(result)\n    end\n\n    it '.max' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.map(to_i)\n\n      expect(rdd2.max).to eql(numbers.map(&:to_i).max)\n    end\n\n    it '.min' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.map(to_i)\n\n      expect(rdd2.min).to eql(numbers.map(&:to_i).min)\n    end\n\n    it '.sum' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.map(to_i)\n\n      expect(rdd2.sum).to eql(numbers.map(&:to_i).reduce(:+))\n    end\n\n    it '.count' do\n      rdd2 = rdd_numbers(workers)\n      rdd2 = rdd2.map(to_i)\n\n      expect(rdd2.count).to eql(numbers.size)\n    end\n  end\nend\n\nRSpec.describe 'Spark::RDD' do\n  let(:func1) { lambda{|sum, x| sum+x} }\n  let(:func2) { lambda{|product, x| product*x} }\n\n  let(:to_i)  { lambda{|item| item.to_i} }\n  let(:split) { lambda{|item| item.split} }\n\n  context 'throught parallelize' do\n    let(:numbers) { Generator.numbers }\n    let(:lines)   { Generator.lines }\n\n    def rdd_numbers(workers)\n      $sc.parallelize(numbers, workers)\n    end\n\n    def rdd_lines(workers)\n      $sc.parallelize(lines, workers)\n    end\n\n    it_behaves_like 'a reducing', 1\n    it_behaves_like 'a reducing', 2\n    # it_behaves_like 'a reducing', nil\n    # it_behaves_like 'a reducing', rand(2..10)\n  end\n\n  context 'throught text_file' do\n    let(:file)       { File.join('spec', 'inputs', 'numbers_0_100.txt') }\n    let(:file_lines) { File.join('spec', 'inputs', 'lorem_300.txt') }\n\n    let(:numbers) { File.readlines(file).map(&:strip).map(&:to_i) }\n    let(:lines)   { File.readlines(file_lines).map(&:strip) }\n\n    def rdd_numbers(workers)\n      $sc.text_file(file, workers)\n    end\n\n    def rdd_lines(workers)\n      $sc.text_file(file_lines, workers)\n    end\n\n    it_behaves_like 'a reducing', 1\n    it_behaves_like 'a reducing', 2\n    # it_behaves_like 'a reducing', nil\n    # it_behaves_like 'a reducing', rand(2..10)\n  end\nend\n"
  },
  {
    "path": "spec/lib/sample_spec.rb",
    "content": "require 'spec_helper'\n\n# Sample method can not be tested because of random generator\n# Just test it for raising error\n\nRSpec.shared_examples 'a sampler' do |workers|\n  context \"with #{workers || 'default'} worker\" do\n\n    context '.sample' do\n      it 'with replacement' do\n        rdd2 = rdd(workers).sample(true, rand)\n        expect { rdd2.collect }.to_not raise_error\n      end\n\n      it 'without replacement' do\n        rdd2 = rdd(workers).sample(false, rand)\n        expect { rdd2.collect }.to_not raise_error\n      end\n    end\n\n    context '.take_sample' do\n      it 'with replacement' do\n        size = rand(10..999)\n        expect(rdd(workers).take_sample(true, size).size).to eql(size)\n      end\n\n      it 'without replacement' do\n        size = rand(10..999)\n        expect(rdd(workers).take_sample(false, size).size).to eql(size)\n      end\n    end\n\n  end\nend\n\nRSpec.describe 'Spark::RDD' do\n  let(:numbers) { Generator.numbers(1000) }\n\n  def rdd(workers)\n    $sc.parallelize(numbers, workers)\n  end\n\n  it_behaves_like 'a sampler', 1\n  it_behaves_like 'a sampler', 2\n  # it_behaves_like 'a sampler', nil\n  # it_behaves_like 'a sampler', rand(2..10)\nend\n"
  },
  {
    "path": "spec/lib/serializer_spec.rb",
    "content": "require 'spec_helper'\nrequire 'zlib'\n\nRSpec.describe Spark::Serializer do\n  let(:data) { [1, 'test', 2.0, [3], {key: 'value'}, :test, String] }\n\n  it 'find' do\n    expect(described_class.find('not_existed_class')).to eql(nil)\n\n    expect(described_class.find('Marshal')).to eq(described_class::Marshal)\n    expect(described_class.find('marshal')).to eq(described_class::Marshal)\n    expect(described_class.find(:marshal)).to eq(described_class::Marshal)\n    expect(described_class.find('batched')).to eq(described_class::Batched)\n  end\n\n  it 'find!' do\n    expect { expect(described_class.find!('not_existed_class')) }.to raise_error(Spark::SerializeError)\n    expect { expect(described_class.find!('marshal')) }.to_not raise_error\n    expect { expect(described_class.find!('batched')) }.to_not raise_error\n  end\n\n  it 'register' do\n    NewSerializer = Class.new\n\n    expect(described_class.find('new_serializer_1')).to eql(nil)\n    expect(described_class.find('new_serializer_2')).to eql(nil)\n    expect(described_class.find('new_serializer_3')).to eql(nil)\n\n    described_class.register('new_serializer_1', 'new_serializer_2', 'new_serializer_3', NewSerializer)\n\n    expect(described_class.find('new_serializer_1')).to eql(NewSerializer)\n    expect(described_class.find('new_serializer_2')).to eql(NewSerializer)\n    expect(described_class.find('new_serializer_3')).to eql(NewSerializer)\n  end\n\n  it '==' do\n    # One class\n    marshal1 = described_class::Marshal.new\n    marshal2 = described_class::Marshal.new\n\n    expect(marshal1).to eq(marshal1)\n    expect(marshal1).to eq(marshal2)\n\n    # Two classes\n    compressed1 = described_class::Compressed.new(marshal1)\n    compressed2 = described_class::Compressed.new(marshal2)\n\n    expect(compressed1).to eq(compressed1)\n    expect(compressed1).to eq(compressed2)\n\n    # Three classes\n    batched1 = described_class::Batched.new(compressed1, 1)\n    batched2 = described_class::Batched.new(compressed2, 1)\n    batched3 = described_class::Batched.new(compressed1, 2)\n\n    expect(batched1).to eq(batched2)\n    expect(batched1).to_not eq(batched3)\n  end\n\n  context 'build' do\n    let(:marshal1)    { described_class::Marshal.new }\n    let(:compressed1) { described_class::Compressed.new(marshal1) }\n    let(:batched1)    { described_class::Batched.new(compressed1, 1) }\n\n    it 'block' do\n      expect(described_class.build{ marshal }).to eq(marshal1)\n      expect(described_class.build{ marshal }).to eq(described_class.build{ __marshal__ })\n      expect(described_class.build{ compressed(marshal) }).to eq(compressed1)\n      expect(described_class.build{ batched(compressed(marshal), 1) }).to eq(batched1)\n    end\n\n    it 'text' do\n      expect(described_class.build('marshal')).to eq(marshal1)\n      expect(described_class.build('compressed(marshal)')).to eq(compressed1)\n      expect(described_class.build('batched(compressed(marshal), 1)')).to eq(batched1)\n    end\n  end\n\n  it 'serialization' do\n    marshal1 = described_class.build{ marshal }\n    compressed1 = described_class.build{ compressed(marshal) }\n\n    expect(marshal1.dump(data)).to eq(Marshal.dump(data))\n    expect(compressed1.dump(data)).to eq(\n      Zlib::Deflate.deflate(Marshal.dump(data))\n    )\n  end\n\n  context 'Auto batched' do\n    let(:klass) { Spark::Serializer::AutoBatched }\n    let(:marshal) { Spark::Serializer::Marshal.new }\n    let(:numbers) { Generator.numbers }\n\n    it 'initialize' do\n      expect { klass.new 
}.to raise_error(ArgumentError)\n      expect { klass.new(marshal) }.to_not raise_error\n      expect { klass.new(marshal, 1) }.to raise_error(Spark::SerializeError)\n    end\n\n    it 'serialization' do\n      serializer1 = klass.new(marshal)\n      serializer2 = klass.new(marshal, 2)\n\n      rdd1 = Spark.sc.parallelize(numbers, 2, serializer1)\n      rdd2 = Spark.sc.parallelize(numbers, 2, serializer2).map(:to_i)\n\n      result = rdd1.collect\n\n      expect(rdd1.serializer).to eq(serializer1)\n      expect(result).to eq(numbers)\n      expect(result).to eq(rdd2.collect)\n    end\n\n  end\nend\n"
  },
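  {
    "path": "examples/serializer_usage.rb",
    "content": "# Illustrative sketch, not part of the original suite: shows the serializer\n# DSL exercised by spec/lib/serializer_spec.rb. The file path is hypothetical;\n# only `require 'ruby-spark'` and Spark::Serializer.build are assumed here.\n\nrequire 'ruby-spark'\n\n# A serializer chain can be built from a string description...\nfrom_text = Spark::Serializer.build('batched(compressed(marshal), 100)')\n\n# ...or from the equivalent block DSL; both resolve to the same chain.\nfrom_block = Spark::Serializer.build { batched(compressed(marshal), 100) }\n\nputs from_text == from_block # => true\n"
  },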
  {
    "path": "spec/lib/sort_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'a sorting' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd2 = rdd(workers)\n\n    rdd2 = rdd2.flat_map(split)\n    result = lines.flat_map(&split)\n\n    # Sort by self\n    rdd3 = rdd2.map(map).sort_by_key\n    result2 = result.map(&map).sort_by{|(key, _)| key}\n\n    expect(rdd3.collect).to eql(result2)\n\n    # Sort by len\n    rdd3 = rdd2.map(len_map).sort_by_key\n    result2 = result.map(&len_map).sort_by{|(key, _)| key}\n\n    expect(rdd3.collect).to eql(result2)\n  end\nend\n\n\nRSpec.describe 'Spark::RDD' do\n  let(:split)   { lambda{|x| x.split} }\n  let(:map)     { lambda{|x| [x.to_s, 1]} }\n  let(:len_map) { lambda{|x| [x.size, x]} }\n\n  context 'throught parallelize' do\n    context '.map' do\n      let(:lines) { Generator.lines }\n\n      def rdd(workers)\n        $sc.parallelize(lines, workers)\n      end\n\n      it_behaves_like 'a sorting', 1\n      it_behaves_like 'a sorting', 2\n      # it_behaves_like 'a sorting', nil\n      # it_behaves_like 'a sorting', rand(2..10)\n    end\n  end\n\n  context 'throught text_file' do\n    context '.map' do\n      let(:file)  { File.join('spec', 'inputs', 'lorem_300.txt') }\n      let(:lines) { File.readlines(file).map(&:strip) }\n\n      def rdd(workers)\n        $sc.text_file(file, workers)\n      end\n\n      it_behaves_like 'a sorting', 1\n      it_behaves_like 'a sorting', 2\n      # it_behaves_like 'a sorting', nil\n      # it_behaves_like 'a sorting', rand(2..10)\n    end\n  end\nend\n"
  },
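  {
    "path": "examples/sort_by_key_usage.rb",
    "content": "# Illustrative sketch, not part of the original suite: mirrors the word-length\n# sort covered by spec/lib/sort_spec.rb. The file path is hypothetical and a\n# compiled Spark build is assumed (see `ruby-spark build` on CI).\n\nrequire 'ruby-spark'\n\nSpark.start\nsc = Spark.sc\n\nlines = ['lorem ipsum dolor', 'sit amet']\n\n# Split lines into words, key each word by its length, then sort by that key.\npairs = sc.parallelize(lines, 2)\n          .flat_map(lambda{|line| line.split})\n          .map(lambda{|word| [word.size, word]})\n          .sort_by_key\n\nputs pairs.collect.inspect\n\nSpark.stop\n"
  },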
  {
    "path": "spec/lib/sql/column_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'binary comparison' do |op|\n  it \"#{op}\" do\n    to_test = 20\n\n    result = df.select('age').where( df.age.__send__(op, to_test) ).values.flatten\n    result.each do |item|\n      if op == '!='\n        expect(item).to_not eq(to_test)\n      else\n        expect(item).to be.__send__(op, to_test)\n      end\n    end\n  end\nend\n\nRSpec.describe Spark::SQL::Column do\n\n  let(:file) { File.join('spec', 'inputs', 'people.json') }\n  let(:df) { $sql.read.json(file) }\n\n  let(:data) do\n    # Data are line delimited\n    result = []\n    File.readlines(file).each do |line|\n      result << JSON.parse(line)\n    end\n    result\n  end\n\n  context 'operators' do\n    it 'func' do\n      result = df.select( df.id, df.active, ~df.id, !df.active ).collect_as_hash.map(&:values)\n      result.each do |item|\n        expect(item[0]).to eq(-item[2])\n        expect(item[1]).to eq(!item[3])\n      end\n    end\n\n    context 'binary' do\n      it 'arithmetic' do\n        result = df.select( df.id, df.id+1, df.id-1, df.id*2, df.id/2, df.id%2 ).collect_as_hash.map(&:values)\n        result.each do |item|\n          expect(item[1]).to eq(item[0]+1)\n          expect(item[2]).to eq(item[0]-1)\n          expect(item[3]).to eq(item[0]*2)\n          expect(item[4]).to eq(item[0]/2.0)\n          expect(item[5]).to eq(item[0]%2)\n        end\n      end\n\n      # comparison\n      it_behaves_like 'binary comparison', '=='\n      it_behaves_like 'binary comparison', '!='\n      it_behaves_like 'binary comparison', '<'\n      it_behaves_like 'binary comparison', '<='\n      it_behaves_like 'binary comparison', '>'\n      it_behaves_like 'binary comparison', '>='\n\n      it 'logical' do\n        result = df.select('id').where( (df.id >= 20) & (df.id <= 30) ).values.flatten\n        expect(result).to all( be_between(20, 30) )\n\n        result = df.select('id').where( (df.id == 1) | (df.id == 2) ).values.flatten\n        expect(result).to eq([1, 2])\n      end\n\n      it 'like' do\n        result = df.select('email').where( df.email.like('%com%') ).values.flatten\n        expect(result).to all( include('com') )\n      end\n\n      it 'null' do\n        result1 = df.select('address').where( df.address.is_null ).values.flatten\n        result2 = df.select('address').where( df.address.is_not_null ).values.flatten\n\n        expect(result1).to all( be_nil )\n        expect(result2).to all( be_an(String) )\n      end\n    end\n  end\n\n  it 'substr' do\n    result = df.select( df.name.substr(1, 3) ).values.flatten\n    result.each do |item|\n      expect(item.size).to eq(3)\n    end\n  end\n\n  it 'isin' do\n    result = df.select('age').where( df.age.isin(20, 21, 22) ).values.flatten\n    expect(result).to all( eq(20).or eq(21).or eq(22) )\n  end\n\n  it 'alias' do\n    result = df.select( df.id.as('id2') ).collect_as_hash.map(&:keys).flatten\n    expect(result).to all( eq('id2') )\n  end\n\n  it 'cast' do\n    result = df.select( df.id, df.id.cast('string').alias('age2') ).values\n    result.each do |item|\n      expect(item[0]).to be_an(Integer)\n      expect(item[0].to_s).to eq(item[1])\n    end\n  end\n\n  it 'when, otherwise' do\n    result = df.select(df.id, Spark::SQL::Column.when(df.id <= 20, 1).when(df.id >= 30, 3).otherwise(2)).values\n    result.each do |item|\n      id = item[0]\n      value = item[1]\n\n      if id <= 20\n        expect(value).to eq(1)\n      elsif id >= 30\n        expect(value).to eq(3)\n      else\n        
expect(value).to eq(2)\n      end\n    end\n  end\n\nend\n"
  },
  {
    "path": "spec/lib/sql/data_frame_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.describe Spark::SQL::DataFrame do\n\n  let(:file) { File.join('spec', 'inputs', 'people.json') }\n  let(:df) { $sql.read.json(file) }\n\n  context '[]' do\n\n    it 'String' do\n      value = df['age']\n      expect(value).to be_a(Spark::SQL::Column)\n      expect(value.to_s).to eq('Column(\"age\")')\n    end\n\n    it 'Array' do\n      value = df[ ['name', 'age'] ]\n      expect(value).to be_a(Spark::SQL::DataFrame)\n      expect(value.columns).to eq(['name', 'age'])\n    end\n\n    it 'Numeric' do\n      value = df[0]\n      expect(value).to be_a(Spark::SQL::Column)\n      expect(value.to_s).to eq('Column(\"active\")')\n    end\n\n    it 'Column' do\n      value = df[ df[0] == true ]\n      expect(value).to be_a(Spark::SQL::DataFrame)\n    end\n\n  end\n\n  it 'columns' do\n    expect(df.columns).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name'])\n  end\n\n  it 'schema' do\n    schema = df.schema\n    expect(schema).to be_a(Spark::SQL::StructType)\n    expect(schema.type_name).to eq('struct')\n  end\n\n  it 'show_string' do\n    expect(df.show_string).to start_with('+--')\n  end\n\n  it 'dtypes' do\n    expect(df.dtypes).to eq([['active', 'boolean'], ['address', 'string'], ['age', 'long'], ['email', 'string'], ['id', 'long'], ['ip_address', 'string'], ['name', 'string']])\n  end\n\n  it 'take' do\n    expect(df.take(10).size).to eq(10)\n  end\n\n  it 'count' do\n    expect(df.count).to eq(100)\n  end\n\n  context 'select' do\n\n    it '*' do\n      row = df.select('*').first\n      expect(row.data.keys).to eq(['active', 'address', 'age', 'email', 'id', 'ip_address', 'name'])\n    end\n\n    it 'with string' do\n      row = df.select('name', 'age').first\n      expect(row.data.keys).to eq(['name', 'age'])\n    end\n\n    it 'with column' do\n      row = df.select(df.name, df.age).first\n      expect(row.data.keys).to eq(['name', 'age'])\n    end\n\n  end\n\n  context 'where' do\n\n    it 'with string' do\n      eq_20 = df.filter('age = 20').collect\n      expect(eq_20.map{|c| c['age']}).to all(be == 20)\n    end\n\n    it 'with column' do\n      nil_values = df.where(df.age.is_null).collect\n      greater_or_eq_20 = df.where(df.age >= 20).collect\n      lesser_than_20 = df.where(df.age < 20).collect\n\n      expect(nil_values.size + greater_or_eq_20.size + lesser_than_20.size).to eq(df.count)\n\n      expect(nil_values.map{|c| c['age']}).to all(be_nil)\n      expect(greater_or_eq_20.map{|c| c['age']}).to all(be >= 20)\n      expect(lesser_than_20.map{|c| c['age']}).to all(be < 20)\n    end\n\n  end\n\nend\n"
  },
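  {
    "path": "examples/sql_data_frame_usage.rb",
    "content": "# Illustrative sketch, not part of the original suite: shows the DataFrame and\n# Column API covered by spec/lib/sql/*_spec.rb. The file path is hypothetical;\n# spec/inputs/people.json is the line-delimited JSON file used by the specs.\n\nrequire 'ruby-spark'\n\nSpark.start\nsql = Spark.start_sql\n\ndf = sql.read.json('spec/inputs/people.json')\n\n# Column expressions compare with the usual operators and combine with & and |.\nadults = df.select('name', 'age').where((df.age >= 20) & (df.age < 65))\n\nadults.take(5).each do |row|\n  puts \"#{row['name']}: #{row['age']}\"\nend\n\nSpark.stop\n"
  },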
  {
    "path": "spec/lib/statistic_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'a stats' do |workers|\n  let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }\n\n  context \"with #{workers || 'default'} worker\" do\n    it 'stats class' do\n      stats = $sc.parallelize(numbers, workers).stats\n\n      expect(stats.sum).to             be_within(0.1).of(20)\n      expect(stats.mean).to            be_within(0.1).of(20/6.0)\n      expect(stats.max).to             be_within(0.1).of(8.0)\n      expect(stats.min).to             be_within(0.1).of(1.0)\n      expect(stats.variance).to        be_within(0.1).of(6.22222)\n      expect(stats.sample_variance).to be_within(0.1).of(7.46667)\n      expect(stats.stdev).to           be_within(0.1).of(2.49444)\n      expect(stats.sample_stdev).to    be_within(0.1).of(2.73252)\n    end\n\n    it 'rdd methods' do\n      rdd = $sc.parallelize([1, 2, 3], workers)\n\n      expect(rdd.mean).to            be_within(0.1).of(2.0)\n      expect(rdd.variance).to        be_within(0.1).of(0.666)\n      expect(rdd.stdev).to           be_within(0.1).of(0.816)\n      expect(rdd.sample_stdev).to    be_within(0.1).of(1.0)\n      expect(rdd.sample_variance).to be_within(0.1).of(1.0)\n    end\n  end\nend\n\nRSpec.shared_examples 'a histogram' do |workers|\n\n  context \"with #{workers || 'default'} worker\" do\n    it 'empty' do\n      rdd = $sc.parallelize([], workers, ser)\n\n      expect( rdd.histogram([0, 10])[1] ).to eq([0])\n      expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])\n    end\n\n    it 'validation' do\n      rdd = $sc.parallelize([], workers, ser)\n      expect { rdd.histogram(0) }.to raise_error(ArgumentError)\n    end\n\n    it 'double' do\n      rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser)\n      buckets, counts = rdd.histogram(2)\n\n      expect(buckets).to eq([1.0, 2.5, 4.0])\n      expect(counts).to eq([2, 2])\n    end\n\n    it 'out of range' do\n      rdd = $sc.parallelize([10.01, -0.01], workers, ser)\n\n      expect( rdd.histogram([0, 10])[1] ).to eq([0])\n      expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])\n    end\n\n    it 'in range with one bucket' do\n      rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)\n\n      expect( rdd.histogram([0, 10])[1] ).to eq([4])\n      expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1])\n    end\n\n    it 'in range with one bucket exact match' do\n      rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)\n      expect( rdd.histogram([1, 4])[1] ).to eq([4])\n    end\n\n    it 'out of range with two buckets' do\n      rdd = $sc.parallelize([10.01, -0.01], workers, ser)\n      expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0])\n    end\n\n    it 'out of range with two uneven buckets' do\n      rdd = $sc.parallelize([10.01, -0.01], workers, ser)\n      expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])\n    end\n\n    it 'in range with two buckets' do\n      rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)\n      expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])\n    end\n\n    it 'in range with two bucket and nil' do\n      rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser)\n      expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])\n    end\n\n    it 'in range with two uneven buckets' do\n      rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)\n      expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2])\n    end\n\n    it 'mixed range with two uneven buckets' do\n      rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser)\n      expect( rdd.histogram([0, 5, 
11])[1] ).to eq([4, 3])\n    end\n\n    it 'mixed range with four uneven buckets' do\n      rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser)\n      expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])\n    end\n\n    it 'mixed range with uneven buckets and NaN' do\n      rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser)\n      expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])\n    end\n\n    it 'out of range with infinite buckets' do\n      rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser)\n      expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1])\n    end\n\n    it 'without buckets' do\n      rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)\n      expect( rdd.histogram(1) ).to eq([[1, 4], [4]])\n    end\n\n    it 'without buckets single element' do\n      rdd = $sc.parallelize([1], workers, ser)\n      expect( rdd.histogram(1) ).to eq([[1, 1], [1]])\n    end\n\n    it 'without bucket no range' do\n      rdd = $sc.parallelize([1, 1, 1, 1], workers, ser)\n      expect( rdd.histogram(1) ).to eq([[1, 1], [4]])\n    end\n\n    it 'without buckets basic two' do\n      rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)\n      expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]])\n    end\n\n    it 'without buckets with more requested than elements' do\n      rdd = $sc.parallelize([1, 2], workers, ser)\n      buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]\n      hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]\n\n      expect( rdd.histogram(10) ).to eq([buckets, hist])\n    end\n\n    it 'string' do\n      rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser)\n\n      expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2])\n      expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]])\n      expect { rdd.histogram(2) }.to raise_error(Spark::RDDError)\n    end\n\n  end\nend\n\nRSpec.describe Spark::RDD do\n  let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } }\n\n  context '.stats' do\n    it_behaves_like 'a stats', 1\n    it_behaves_like 'a stats', 2\n    # it_behaves_like 'a stats', rand(2..5)\n  end\n\n  context '.histogram' do\n    it_behaves_like 'a histogram', 1\n    it_behaves_like 'a histogram', 2\n    # it_behaves_like 'a histogram', rand(2..5)\n  end\nend\n"
  },
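  {
    "path": "examples/rdd_statistics_usage.rb",
    "content": "# Illustrative sketch, not part of the original suite: shows the stats and\n# histogram helpers covered by spec/lib/statistic_spec.rb. The file path is\n# hypothetical.\n\nrequire 'ruby-spark'\n\nSpark.start\nsc = Spark.sc\n\nrdd = sc.parallelize([1.0, 1.0, 2.0, 3.0, 5.0, 8.0], 2)\n\n# stats exposes sum, mean, min, max, variance and stdev in one object.\nstats = rdd.stats\nputs \"mean=#{stats.mean} stdev=#{stats.stdev}\"\n\n# histogram(n) picks n evenly spaced buckets and returns [buckets, counts];\n# passing an array instead uses explicit bucket bounds.\nbuckets, counts = rdd.histogram(2)\nputs buckets.inspect # [1.0, 4.5, 8.0]\nputs counts.inspect  # [4, 2]\n\nputs rdd.histogram([0, 4, 10])[1].inspect # counts per explicit bucket\n\nSpark.stop\n"
  },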
  {
    "path": "spec/lib/whole_text_files_spec.rb",
    "content": "require 'spec_helper'\n\nRSpec.shared_examples 'a whole_text_files' do |workers|\n  it \"with #{workers || 'default'} worker\" do\n    rdd2 = rdd(workers).map(get_numbers)\n    result = files.size\n\n    expect(rdd2.collect.size).to eql(result)\n\n    rdd3 = rdd(workers)\n    rdd3 = rdd3.flat_map(get_numbers)\n\n    result = 0\n    files.each{|f| result += File.read(f).split.map(&:to_i).reduce(:+)}\n\n    expect(rdd3.sum).to eql(result)\n  end\nend\n\nRSpec.describe 'Spark::Context' do\n  let(:get_numbers) { lambda{|file, content| content.split.map(&:to_i)} }\n\n  let(:dir)   { File.join('spec', 'inputs', 'numbers') }\n  let(:files) { Dir.glob(File.join(dir, '*')) }\n\n  def rdd(workers)\n    $sc.whole_text_files(dir, workers)\n  end\n\n  it_behaves_like 'a whole_text_files', 1\n  it_behaves_like 'a whole_text_files', 2\n  # it_behaves_like 'a whole_text_files', nil\n  # it_behaves_like 'a whole_text_files', rand(2..10)\nend\n"
  },
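  {
    "path": "examples/whole_text_files_usage.rb",
    "content": "# Illustrative sketch, not part of the original suite: shows whole_text_files\n# as covered by spec/lib/whole_text_files_spec.rb; it yields one [file, content]\n# pair per file in the directory. The file path is hypothetical.\n\nrequire 'ruby-spark'\n\nSpark.start\nsc = Spark.sc\n\n# Sum every number found across all files in the directory.\ntotal = sc.whole_text_files('spec/inputs/numbers', 2)\n          .flat_map(lambda{|file, content| content.split.map(&:to_i)})\n          .sum\n\nputs total\n\nSpark.stop\n"
  },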
  {
    "path": "spec/spec_helper.rb",
    "content": "require 'simplecov'\nSimpleCov.start\n\n$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'\nrequire 'ruby-spark'\nrequire 'generator'\n\n# Loading\nSpark.load_lib\nSpark.jb.import_all_test\nSpark::Mllib.import\n\n# Keep it on method because its called from config test\ndef spark_start\n  Spark.logger.disable\n  Spark.config do\n    set 'spark.ruby.serializer.batch_size', 100\n  end\n  $sc = Spark.start\n  $sql = Spark.start_sql\nend\n\ndef windows?\n  RbConfig::CONFIG['host_os'] =~ /mswin|mingw/\nend\n\nRSpec.configure do |config|\n  config.default_formatter = 'doc'\n  config.color = true\n  config.tty   = true\n\n  config.before(:suite) do\n    spark_start\n  end\n  config.after(:suite) do\n    Spark.stop\n  end\nend\n"
  }
]