Repository: skrusche63/spark-elastic
Branch: master
Commit: 43163e14780e
Files: 39
Total size: 109.6 KB
Directory structure:
gitextract_mstyggjy/
├── .classpath
├── .project
├── README.md
├── pom.xml
└── src/
└── main/
├── resources/
│ ├── goals.xml
│ ├── pageview.xml
│ └── server.conf
└── scala/
└── de/
└── kp/
└── spark/
└── elastic/
├── Configuration.scala
├── EsClient.scala
├── EsContext.scala
├── EsEvents.scala
├── EsService.scala
├── KafkaReader.scala
├── KafkaService.scala
├── SparkBase.scala
├── actor/
│ ├── EsMaster.scala
│ └── KafkaMaster.scala
├── apps/
│ ├── GoalApp.scala
│ ├── InsightApp.scala
│ └── SegmentApp.scala
├── bayes/
│ └── ClickPredictor.scala
├── enron/
│ ├── EnronApp.scala
│ ├── EnronEngine.scala
│ └── EnronUtils.scala
├── ml/
│ ├── EsKMeans.scala
│ ├── EsNPref.scala
│ └── EsSimilarity.scala
├── samples/
│ ├── EsCountMinSktech.scala
│ ├── EsHyperLogLog.scala
│ ├── KafkaEngine.scala
│ ├── KafkaSerializer.scala
│ ├── MessageApp.scala
│ ├── MessageGenerator.scala
│ └── MessageUtils.scala
├── specs/
│ ├── FieldSpec.scala
│ ├── GoalSpec.scala
│ └── PageViewSpec.scala
└── stream/
├── EsHistogram.scala
└── EsStream.scala
================================================
FILE CONTENTS
================================================
================================================
FILE: .classpath
================================================
================================================
FILE: .project
================================================
spark-elastic
org.eclipse.m2e.core.maven2Builder
org.scala-ide.sdt.core.scalabuilder
org.scala-ide.sdt.core.scalanature
org.eclipse.jdt.core.javanature
org.eclipse.m2e.core.maven2Nature
================================================
FILE: README.md
================================================

## Integration of Elasticsearch with Spark
This project shows how to easily integrate [Apache Spark](http://spark.apache.org), a fast and general purpose engine for
large-scale data processing, with [Elasticsearch](http://elasticsearch.org), a real-time distributed search and analytics
engine.
Spark is an in-memory processing framework that outperforms Hadoop by up to a factor of 100. Spark is accompanied by
* [MLlib](https://spark.apache.org/mllib/), a scalable machine learning library,
* [Spark SQL](https://spark.apache.org/sql/), a unified access platform for structured big data,
* [Spark Streaming](https://spark.apache.org/streaming/), a library to build scalable fault-tolerant streaming applications.
If you are more interested in an Elasticsearch plug-in that brings the power of [Predictiveworks.](http://predictiveworks.eu) to Elasticsearch,
then please refer to [Elasticinsight.](http://elasticinsight.eu)

[Predictiveworks.](http://predictiveworks.eu) is an ensemble of dedicated predictive engines that covers a wide range of today's analytics requirements, from Association Analysis
and Context-Aware Recommendations up to Text Analysis. Elasticinsight. enables Elasticsearch to seamlessly use these engines.
---
### Machine Learning with Elasticsearch
Besides linguistic and semantic enrichment, there is an increasing demand to apply knowledge discovery and data mining techniques, and even predictive analytics,
to the data in a search index in order to gain deeper insights and further increase its business value.
One of the key prerequisites is to easily connect existing data sources to state-of-the-art machine learning and predictive analytics
frameworks.
In this project, we show how to connect Elasticsearch, a powerful distributed search engine, to Apache Spark and profit from the growing number of available machine learning algorithms.
The figure shows the integration pattern for Elasticsearch and Spark from an architectural perspective and also indicates how to proceed with the enriched content (i.e. the way back to the search index).

The source code below shows the few lines of Scala that are sufficient to read from Elasticsearch and provide data for further mining
and prediction tasks:
```
val source = sc.newAPIHadoopRDD(conf, classOf[EsInputFormat[Text, MapWritable]], classOf[Text], classOf[MapWritable])
val docs = source.map(hit => {
  new EsDocument(hit._1.toString,toMap(hit._2))
})
```
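The `conf` object used above is a plain Hadoop `Configuration` that tells the elasticsearch-hadoop connector where and what to read; the settings used in this project (see `apps/SegmentApp.scala`) look as follows:
```
val conf = new Configuration()

conf.set("es.nodes","localhost")
conf.set("es.port","9200")

/* Index/mapping to read from and the query to apply */
conf.set("es.resource", "visits/pageview")
conf.set("es.query", "?q=*:*")
```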
#### Document Segmentation with KMeans
Starting from the `RDD[EsDocument]` extracted from Elasticsearch, it takes just a few lines of Scala to segment these documents with respect to their geo location (latitude, longitude).
From this data a heatmap can be drawn to visualize which region of the world most of the documents come from. The image below shows a multi-colored heatmap, where the colors red, yellow, green and blue indicate different heat ranges.

Segmenting documents into specific target groups is not restricted to their geo location. Time of day, product or service categories, total revenue, and other parameters may be used as well.
For segmentation, the [K-Means clustering](http://en.wikipedia.org/wiki/K-means_clustering) implementation
of [MLlib](https://spark.apache.org/mllib/) is used:
```
def cluster(documents:RDD[EsDocument],esConf:Configuration):RDD[(Int,EsDocument)] = {

  val fields = esConf.get("es.fields").split(",")
  val vectors = documents.map(doc => toVector(doc.data,fields))

  val clusters = esConf.get("es.clusters").toInt
  val iterations = esConf.get("es.iterations").toInt

  /* Train model */
  val model = KMeans.train(vectors, clusters, iterations)

  /* Apply model */
  documents.map(doc => (model.predict(toVector(doc.data,fields)),doc))

}
```
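The `toVector` helper used above selects the configured fields from a document and turns their numeric values into an MLlib vector; it is implemented in `EsContext.scala`:
```
private def toVector(data:Map[String,String], fields:Array[String]):Vector = {
  val features = data.filter(kv => fields.contains(kv._1)).map(_._2.toDouble)
  Vectors.dense(features.toArray)
}
```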
Clustering Elasticsearch data with K-Means is a first and simple example of how to immediately benefit from the integration with Spark. Other business cases may cover recommendations:
Suppose Elasticsearch is used to index e-commerce transactions on a per-user basis; then it is also straightforward to build a recommendation system in just two steps:
* **First**, implicit user-item ratings have to be derived from the e-commerce transactions, and
* **Second**, from these ratings item similarities are calculated to provide a recommendation model (a minimal sketch follows below).
For more information, please read [here](https://github.com/skrusche63/spark-elastic/wiki/Item-Similarity-with-Spark).
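The following is a minimal, illustrative sketch of these two steps; it is not the project's `EsSimilarity` implementation. It assumes transactions of the form (user,item), derives implicit ratings as per-user purchase counts and computes item similarities as the cosine of the item rating vectors:
```
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

object ItemSimilaritySketch {

  /* Step 1: implicit ratings, here simply how often a user bought an item */
  def ratings(transactions:RDD[(String,String)]):RDD[((String,String),Double)] =
    transactions.map{case (user,item) => ((user,item),1.0)}.reduceByKey(_ + _)

  /* Step 2: cosine similarity between items, computed from co-ratings per user */
  def similarities(ratings:RDD[((String,String),Double)]):RDD[((String,String),Double)] = {

    /* Item vector norms (collected to the driver, assuming a small item set) */
    val norms = ratings.map{case ((_,item),r) => (item,r * r)}
      .reduceByKey(_ + _).mapValues(math.sqrt).collectAsMap()

    val byUser = ratings.map{case ((user,item),r) => (user,(item,r))}

    /* Dot products of all item pairs rated by the same user */
    val dots = byUser.join(byUser)
      .collect{case (_,((i1,r1),(i2,r2))) if i1 < i2 => ((i1,i2),r1 * r2)}
      .reduceByKey(_ + _)

    dots.map{case ((i1,i2),dot) => ((i1,i2),dot / (norms(i1) * norms(i2)))}

  }

}
```
In practice, item pairs with very low co-occurrence support should be filtered out before the similarity scores are trusted.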
#### Insights from Elasticsearch with SQL
[Spark SQL](https://spark.apache.org/sql/) allows relational queries expressed in SQL to be executed using Spark. This makes it possible to apply SQL queries to Spark data structures and also to Spark data streams (see below).
As SQL queries generate Spark data structures, a mixture of SQL and native Spark operations is also possible, providing a powerful mechanism to compute valuable insights from data in real time.
The code example below illustrates how to apply SQL queries to a Spark data structure (RDD) and gain further insight by mixing in native Spark operations.
```
/*
* Elasticsearch specific configuration
*/
val esConf = new Configuration()
esConf.set("es.nodes","localhost")
esConf.set("es.port","9200")
esConf.set("es.resource", "enron/mails")
esConf.set("es.query", "?q=*:*")
esConf.set("es.table", "docs")
esConf.set("es.sql", "select subject from docs")
...
/*
* Read from ES and provide some insight with Spark & SparkSQL,
* thereby mixing SQL and other Spark operations
*/
val documents = es.documentsAsJson(esConf)
val subjects = es.query(documents, esConf).filter(row => row.getString(0).contains("Re"))
...
def query(documents:RDD[String], esConfig:Configuration):SchemaRDD = {

  val query = esConfig.get("es.sql")
  val name = esConfig.get("es.table")

  val table = sqlc.jsonRDD(documents)
  table.registerAsTable(name)

  sqlc.sql(query)

}
```
---
### Real-Time Stream Processing and Elasticsearch
Real-time analytics is a very popular topic with a wide range of application areas:
* High frequency trading (finance),
* Real-time bidding (adtech),
* Real-time social activity (social networks),
* Real-time sensoring (Internet of things),
* Real-time user behavior,
and more. All of these gain tremendous business value from real-time analytics. There exist many popular frameworks to aggregate data in real time, such as Apache Storm,
Apache S4, Apache Samza, Akka Streams, SQLStream, to name just a few.
Spark Streaming, which is capable of processing about 400,000 records per node per second for simple aggregations on small records, significantly outperforms other popular
streaming systems. This is mainly because Spark Streaming groups messages into small batches which are then processed together.
Moreover, in case of failure, each batch is processed exactly once, which greatly simplifies the logic (e.g. making sure that values are not counted multiple times).
Spark Streaming is a layer on top of Spark that transforms and batches data streams from various sources, such as Kafka, Twitter or ZeroMQ, into a sequence of
Spark RDDs (Resilient Distributed Datasets) using a sliding window. These RDDs can then be manipulated using normal Spark operations.
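In this project, the StreamingContext is created on top of a local SparkContext with a configurable batch duration, as done in `SparkBase.scala`:
```
val sc = createSCLocal(name,config)

/* spark.batch.duration is specified in seconds */
val batch = config.get("spark.batch.duration").toInt
val ssc = new StreamingContext(sc, Seconds(batch))
```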
This project provides a real-time data integration pattern based on Apache Kafka, Spark Streaming and Elasticsearch:
[Apache Kafka](http://kafka.apache.org/) is a distributed publish-subscribe messaging system that may also be seen as a real-time integration system. For example, web tracking events are easily sent to Kafka
and may then be consumed by a set of different consumers.
In this project, we use Spark Streaming as a consumer and aggregator of such tracking data streams and perform live indexing. As Spark Streaming is also able to directly
compute new insights from data streams, this data integration pattern may serve as a starting point for real-time data analytics and enrichment before search indexing.
The figure below illustrates the architecture of this pattern. For completeness, [Spray](http://spray.io/) has been included. Spray is an open-source toolkit for
building REST/HTTP-based integration layers on top of Scala and Akka. As it is asynchronous, actor-based, fast, lightweight, and modular, it is an easy way to connect Scala
applications to the Web.

The code example below illustrates that such an integration pattern may be implemented with just a few lines of Scala code:
```
val stream = KafkaUtils.createStream[String,Message,StringDecoder,MessageDecoder](ssc, kafkaConfig, kafkaTopics, StorageLevel.MEMORY_AND_DISK).map(_._2)
stream.foreachRDD(messageRDD => {
  /**
   * Live indexing of Kafka messages; note, that this is also
   * an appropriate place to integrate further message analysis
   */
  val messages = messageRDD.map(prepare)
  messages.saveAsNewAPIHadoopFile("-",classOf[NullWritable],classOf[MapWritable],classOf[EsOutputFormat],esConfig)
})
```
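The `prepare` function has to turn each Kafka message into the `(NullWritable,MapWritable)` pair expected by `EsOutputFormat`; a minimal sketch (the `Message` fields used here are illustrative) looks like this:
```
import org.apache.hadoop.io.{MapWritable,NullWritable,Text}

/* Illustrative only: serialize a message into the writables expected by EsOutputFormat */
def prepare(message:Message):(NullWritable,MapWritable) = {

  val m = new MapWritable()
  /* The field names depend on the Message case class */
  m.put(new Text("clas"), new Text(message.clas.toString))
  m.put(new Text("text"), new Text(message.text))

  (NullWritable.get, m)

}
```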
#### Most Frequent Items from Streams
Using the architecture illustrated above not only makes it possible to apply Spark to data streams; it also opens real-time streams to other data processing libraries such as [Algebird](https://github.com/twitter/algebird) from
Twitter.
Algebird brings, as the name indicates, algebraic algorithms to streaming data. An important representative is [Count-Min Sketch](http://en.wikipedia.org/wiki/Count%E2%80%93min_sketch), which makes it possible to compute the most
frequent items of a stream within a certain time window. The code example below shows how to apply Algebird's CountMinSketchMonoid to compute the most frequent messages from a Kafka stream with respect to the messages' classification:
```
object EsCountMinSktech {

  def findTopK(stream:DStream[Message]):Seq[(Long,Long)] = {

    val DELTA = 1E-3
    val EPS = 0.01
    val SEED = 1
    val PERC = 0.001

    val k = 5

    var globalCMS = new CountMinSketchMonoid(DELTA, EPS, SEED, PERC).zero

    val clases = stream.map(message => message.clas)
    val approxTopClases = clases.mapPartitions(clases => {

      val localCMS = new CountMinSketchMonoid(DELTA, EPS, SEED, PERC)
      clases.map(clas => localCMS.create(clas))

    }).reduce(_ ++ _)

    approxTopClases.foreach(rdd => {
      if (rdd.count() != 0) globalCMS ++= rdd.first()
    })

    /**
     * Retrieve approximate TopK classifiers from the provided messages
     */
    val globalTopK = globalCMS.heavyHitters.map(clas => (clas, globalCMS.frequency(clas).estimate))
      /*
       * Retrieve the top k message classifiers: it may also be interesting to
       * return the classifier frequency from this method, ignoring the line below
       */
      .toSeq.sortBy(_._2).reverse.slice(0, k)

    globalTopK

  }

}
```
***
### Technology Stack
* [Scala](http://scala-lang.org)
* [Apache Kafka](http://kafka.apache.org/)
* [Apache Spark](http://spark.apache.org)
* [Spark SQL](https://spark.apache.org/sql/)
* [Spark Streaming](https://spark.apache.org/streaming/)
* [Twitter Algebird](https://github.com/twitter/algebird)
* [Elasticsearch](http://elasticsearch.org)
* [Elasticsearch Hadoop](http://elasticsearch.org/overview/hadoop/)
* [Spray](http://spray.io/)
================================================
FILE: pom.xml
================================================
4.0.0
spark-elastic
spark-elastic
0.0.1-SNAPSHOT
Spark-ELASTIC
This project combines Apache Spark and Elasticsearch to enable mining & prediction for Elasticsearch.
2010
GPL v3
http://....
repo
1.6
1.6
UTF-8
2.10
2.10.0
org.scala-lang
scala-library
${scala.version}
junit
junit
4.11
test
org.specs2
specs2_${scala.tools.version}
1.13
test
org.scalatest
scalatest_${scala.tools.version}
2.0.M6-SNAP8
test
org.apache.spark
spark-core_2.10
1.0.2
org.codehaus.jackson
jackson-mapper-asl
org.apache.spark
spark-mllib_2.10
1.0.2
org.apache.spark
spark-sql_2.10
1.0.2
org.apache.spark
spark-streaming_2.10
1.0.2
org.apache.spark
spark-streaming-twitter_2.10
1.0.2
org.apache.spark
spark-streaming-kafka_2.10
1.0.2
com.twitter
algebird-core_2.10
0.7.0
org.elasticsearch
elasticsearch-hadoop
2.0.0
org.elasticsearch
elasticsearch
1.3.0
org.json4s
json4s-native_2.10
3.2.10
io.spray
spray-client
1.2.0
io.spray
spray-httpx
1.2.0
com.typesafe.akka
akka-actor_2.10
2.2.3
com.typesafe.akka
akka-contrib_2.10
2.2.3
com.typesafe.akka
akka-remote_2.10
2.2.3
org.apache.kafka
kafka_2.10
0.8.1.1
com.sun.jmx
jmxri
com.sun.jdmk
jmxtools
javax.jms
jms
com.twitter
twitter-text
1.9.9
spray repo
Spray Repository
http://repo.spray.io/
conjars.org
http://conjars.org/repo
src/main/scala
src/test/scala
net.alchim31.maven
scala-maven-plugin
3.1.3
compile
testCompile
-make:transitive
-dependencyfile
${project.build.directory}/.scala_dependencies
org.apache.maven.plugins
maven-surefire-plugin
2.13
false
true
**/*Test.*
**/*Suite.*
Dr. Krusche & Partner PartG
http://dr-kruscheundpartner.de
================================================
FILE: src/main/resources/goals.xml
================================================
/shoppingCart,/checkOut,/signin,/signup,/billing,/confirmShipping,/placeOrder
================================================
FILE: src/main/resources/pageview.xml
================================================
sessionid
timestamp
userid
pageurl
visittime
referrer
================================================
FILE: src/main/resources/server.conf
================================================
akka {
actor {
provider = "akka.remote.RemoteActorRefProvider"
}
remote {
enabled-transports = ["akka.remote.netty.tcp"]
netty.tcp {
hostname = "127.0.0.1"
port = 2600
}
log-sent-messages = on
log-received-messages = on
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/Configuration.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import com.typesafe.config.ConfigFactory
import java.util.Properties
object Configuration {
/* Load configuration for router */
val path = "application.conf"
val config = ConfigFactory.load(path)
def elastic():(String,String,String,String) = {
val cfg = config.getConfig("elastic")
val host = cfg.getString("host")
val port = cfg.getString("port")
val index = cfg.getString("index")
val mapping = cfg.getString("mapping")
(host,port,index,mapping)
}
def kafka():Properties = {
val cfg = config.getConfig("kafka")
val host = cfg.getString("zk.connect.host")
val port = cfg.getString("zk.connect.port")
val gid = cfg.getString("consumer.groupid")
val ctimeout = cfg.getString("consumer.timeout.ms")
val stimeout = cfg.getString("consumer.socket.timeout.ms")
val ccommit = cfg.getString("consumer.commit.ms")
val aoffset = cfg.getString("auto.offset.reset")
val params = Map(
"zookeeper.connect" -> (host + ":" + port),
"group.id" -> gid,
"socket.timeout.ms" -> stimeout,
"consumer.timeout.ms" -> ctimeout,
"auto.commit.interval.ms" -> ccommit,
"auto.offset.reset" -> aoffset
)
val props = new Properties()
params.map(kv => {
props.put(kv._1,kv._2)
})
props
}
def router():(Int,Int,Int) = {
val cfg = config.getConfig("router")
val time = cfg.getInt("time")
val retries = cfg.getInt("retries")
val workers = cfg.getInt("workers")
(time,retries,workers)
}
def topic() = config.getString("topic")
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/EsClient.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import akka.actor.ActorSystem
import spray.http.{HttpRequest,HttpResponse}
import spray.client.pipelining.{Get,Post,sendReceive }
import org.elasticsearch.client.transport.TransportClient
import org.elasticsearch.common.transport.InetSocketTransportAddress
import org.elasticsearch.transport.ConnectTransportException
import scala.concurrent.Future
import scala.util.{Success,Failure}
case class EsConfig(
hosts:Seq[String],ports:Seq[Int]
)
/**
* A Http client implementation based on Akka & Spray
*/
class EsHttpClient {
import concurrent.ExecutionContext.Implicits._
implicit val system = ActorSystem("EsClient")
val pipeline: HttpRequest => Future[HttpResponse] = sendReceive
def get(url:String):Future[HttpResponse] = pipeline(Get(url))
def post(url:String,payload:String):Future[HttpResponse] = pipeline(Post(url, payload))
def shutdown = system.shutdown
}
object EsTransportClient {
def apply(config:EsConfig):TransportClient = {
val client = try {
val transportClient = new TransportClient()
(config.hosts zip config.ports) foreach { hp =>
transportClient.addTransportAddress(
new InetSocketTransportAddress(hp._1, hp._2))
}
transportClient
} catch {
case e: ConnectTransportException =>
throw new Exception(e.getMessage)
}
client
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/EsContext.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SchemaRDD,SQLContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector,Vectors}
import org.apache.hadoop.conf.{Configuration => HadoopConfig}
import org.apache.hadoop.io.{ArrayWritable,MapWritable,NullWritable,Text}
import org.elasticsearch.hadoop.mr.EsInputFormat
import org.json4s.native.Serialization.write
import org.json4s.DefaultFormats
import scala.collection.JavaConversions._
case class EsDocument(id:String,data:Map[String,String])
/**
* ElasticContext supports access to Elasticsearch from Apache Spark using the library
* from org.elasticsearch.hadoop. For read requests, the [Text] specifies the _id field
* from Elasticsearch, and [MapWritable] specifies a (field,value) map
*
*/
class EsContext(sparkConf:HadoopConfig) extends SparkBase {
private val sc = createSCLocal("ElasticContext",sparkConf)
private val sqlc = new SQLContext(sc)
/**
* EsDocument is the common format to be used if machine learning algorithms
* have to be applied to the extracted content of an Elasticsearch index
*/
def documents(esConf:HadoopConfig):RDD[EsDocument] = {
val source = sc.newAPIHadoopRDD(esConf, classOf[EsInputFormat[Text, MapWritable]], classOf[Text], classOf[MapWritable])
source.map(hit => new EsDocument(hit._1.toString,toMap(hit._2)))
}
/**
* Json format is the common format to be used if SQL queries have to be applied
* to the extracted content of an Elasticsearch index
*/
def documentsAsJson(esConf:HadoopConfig):RDD[String] = {
implicit val formats = DefaultFormats
val source = sc.newAPIHadoopRDD(esConf, classOf[EsInputFormat[Text, MapWritable]], classOf[Text], classOf[MapWritable])
val docs = source.map(hit => {
val doc = Map("ident" -> hit._1.toString()) ++ toMap(hit._2)
write(doc)
})
docs
}
def documentsFromSpec(conf:HadoopConfig):RDD[EsDocument] = {
val fields = sc.broadcast(conf.get("es.fields").split(","))
val source = sc.newAPIHadoopRDD(conf, classOf[EsInputFormat[Text, MapWritable]], classOf[Text], classOf[MapWritable])
source.map(hit => new EsDocument(hit._1.toString,toMap(hit._2,fields.value)))
}
/**
* Cluster extracted content from an Elasticsearch index by applying KMeans
* clustering algorithm from MLLib
*/
def cluster(documents:RDD[EsDocument],conf:HadoopConfig):RDD[(Int,EsDocument)] = {
val fields = sc.broadcast(conf.get("es.fields").split(","))
val vectors = documents.map(doc => toVector(doc.data,fields.value))
val clusters = conf.get("es.clusters").toInt
val iterations = conf.get("es.iterations").toInt
/* Train model */
val model = KMeans.train(vectors, clusters, iterations)
/* Apply model */
documents.map(doc => (model.predict(toVector(doc.data,fields.value)),doc))
}
/**
* Apply SQL statement to extracted content from an Elasticsearch index
*/
def query(documents:RDD[String], esConfig:HadoopConfig):SchemaRDD = {
val query = esConfig.get("es.sql")
val name = esConfig.get("es.table")
val table = sqlc.jsonRDD(documents)
table.registerAsTable(name)
sqlc.sql(query)
}
/**
* Wrapper to stop SparkContext
*/
def shutdown = sc.stop
/**
* Wrapper to get SparkContext from ElasticContext
*/
def sparkContext = sc
/**
* A helper method to convert a MapWritable into a Map
*/
private def toMap(mw:MapWritable):Map[String,String] = {
val m = mw.map(e => {
val k = e._1.toString
val v = (if (e._2.isInstanceOf[Text]) e._2.toString()
else if (e._2.isInstanceOf[ArrayWritable]) {
val array = e._2.asInstanceOf[ArrayWritable].get()
array.map(item => {
(if (item.isInstanceOf[NullWritable]) "" else item.asInstanceOf[Text].toString)}).mkString(",")
}
else "")
k -> v
})
m.toMap
}
/**
* A helper method to convert a MapWritable into a Map
* thereby selecting predefined fields
*/
private def toMap(mw:MapWritable,fields:Array[String]):Map[String,String] = {
val m = mw.map(e => {
val k = e._1.toString
val v = (if (e._2.isInstanceOf[Text]) e._2.toString()
else if (e._2.isInstanceOf[ArrayWritable]) {
val array = e._2.asInstanceOf[ArrayWritable].get()
array.map(item => {
(if (item.isInstanceOf[NullWritable]) "" else item.asInstanceOf[Text].toString)}).mkString(",")
}
else "")
k -> v
})
m.filter(kv => fields.contains(kv._1)).toMap
}
private def toVector(data:Map[String,String], fields:Array[String]):Vector = {
val features = data.filter(kv => fields.contains(kv._1)).map(_._2.toDouble)
Vectors.dense(features.toArray)
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/EsEvents.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.elasticsearch.client.Client
import org.elasticsearch.action.index.IndexResponse
import org.elasticsearch.action.ActionListener
import scala.concurrent.{ExecutionContext,Future,Promise}
/**
* EsEvents indexes trackable events retrieved from Apache Kafka
*/
class EsEvents(client:Client,index:String,mapping:String) {
def insert(event:String)(implicit ec:ExecutionContext): Future[Either[String,String]] = {
val response = Promise[IndexResponse]
/* index/mapping = enron/mails */
client.prepareIndex(index,mapping).setSource(event)
.execute(new EsActionListener(response))
response.future
.map(r => Right(r.getId()))
.recover {
case e: Exception => Left(e.toString)
}
}
}
class EsActionListener[T](val p: Promise[T]) extends ActionListener[T]{
override def onResponse(r: T) = {
p.success(r)
}
override def onFailure(e: Throwable) = {
p.failure(e)
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/EsService.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import akka.actor.{ActorSystem,Props}
import com.typesafe.config.ConfigFactory
import de.kp.spark.elastic.actor.EsMaster
object EsService {
def main(args: Array[String]) {
val name:String = "elastic-server"
val conf:String = "server.conf"
val server = new EsService(conf, name)
while (true) {}
server.shutdown
}
}
class EsService(conf:String, name:String) {
val system = ActorSystem(name, ConfigFactory.load(conf))
sys.addShutdownHook(system.shutdown)
val master = system.actorOf(Props[EsMaster], name="elastic-master")
def shutdown = system.shutdown()
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/KafkaReader.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import akka.actor.ActorRef
import kafka.consumer.{Consumer,ConsumerConfig,Whitelist}
import kafka.serializer.DefaultDecoder
class KafkaReader(topic:String,actor:ActorRef) {
private val props = Configuration.kafka
private val connector = Consumer.create(new ConsumerConfig(props))
private val stream = connector.createMessageStreamsByFilter(new Whitelist(topic),1,new DefaultDecoder(),new DefaultDecoder())(0)
def shutdown {
connector.shutdown()
}
def read {
consume(execute)
}
private def consume(write:(Array[Byte]) => Unit) = {
for (compose <- stream) {
try {
write(compose.message)
} catch {
case e: Throwable =>
if (true) { // swallow the error and skip this message; change this condition if failures should be rethrown instead
//error("Error processing message, skipping this message: ", e)
} else {
throw e
}
}
}
}
private def execute(bytes:Array[Byte]) {
actor ! new String(bytes)
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/KafkaService.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import akka.actor.{ActorSystem,Props}
import de.kp.spark.elastic.actor.KafkaMaster
object KafkaService {
val topic = Configuration.topic
val system = ActorSystem("elastic-kafka")
sys.addShutdownHook(system.shutdown)
def main(args: Array[String]) {
val master = system.actorOf(Props(new KafkaMaster()))
val reader = new KafkaReader(topic, master)
while (true) reader.read
reader.shutdown
system.shutdown
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/SparkBase.scala
================================================
package de.kp.spark.elastic
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.spark.{SparkConf,SparkContext}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.streaming.{Seconds,StreamingContext}
import org.apache.hadoop.conf.{Configuration => HadoopConfig}
import scala.collection.JavaConversions._
trait SparkBase {
protected def createSSCLocal(name:String,config:HadoopConfig):StreamingContext = {
val sc = createSCLocal(name,config)
/*
* Batch duration is the time interval Spark Streaming uses to
* collect incoming records; with a duration of 5 seconds, for
* example, the records received every 5 seconds are gathered
* into a single RDD
*/
val batch = config.get("spark.batch.duration").toInt
new StreamingContext(sc, Seconds(batch))
}
protected def createSCLocal(name:String,config:HadoopConfig):SparkContext = {
/* Extract Spark related properties from the Hadoop configuration */
val iterator = config.iterator()
for (prop <- iterator) {
val k = prop.getKey()
val v = prop.getValue()
if (k.startsWith("spark."))System.setProperty(k,v)
}
val runtime = Runtime.getRuntime()
runtime.gc()
val cores = runtime.availableProcessors()
val conf = new SparkConf()
conf.setMaster("local["+cores+"]")
conf.setAppName(name);
conf.set("spark.serializer", classOf[KryoSerializer].getName)
/* Set the Jetty port to 0 to find a random port */
conf.set("spark.ui.port", "0")
new SparkContext(conf)
}
protected def createSSCRemote(name:String,config:HadoopConfig):StreamingContext = {
/* Not implemented yet */
null
}
protected def createSCRemote(name:String,config:HadoopConfig):SparkContext = {
/* Not implemented yet */
null
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/actor/EsMaster.scala
================================================
package de.kp.spark.elastic.actor
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import akka.actor.{Actor,ActorLogging,ActorRef,Props}
import akka.pattern.ask
import akka.util.Timeout
import akka.actor.{OneForOneStrategy, SupervisorStrategy}
import akka.routing.RoundRobinRouter
import com.typesafe.config.ConfigFactory
import scala.concurrent.duration.DurationInt
/**
* EsMaster handles remote search requests based on Akka Remoting
* feature; also see: EsService (remote service)
*/
class EsMaster extends Actor with ActorLogging {
def receive = {
case _ => log.info("Unknown request")
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/actor/KafkaMaster.scala
================================================
package de.kp.spark.elastic.actor
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import akka.actor.{Actor,ActorLogging,ActorRef,Props}
import akka.actor.{OneForOneStrategy, SupervisorStrategy}
import akka.routing.RoundRobinRouter
import de.kp.spark.elastic.{Configuration,EsConfig,EsEvents,EsTransportClient}
import scala.concurrent.duration._
import scala.concurrent.duration.Duration._
import scala.concurrent.duration.DurationInt
class KafkaMaster extends Actor with ActorLogging {
private val (esHost,esPort,esIndex,esType) = Configuration.elastic
private val esClient = EsTransportClient(EsConfig(Seq(esHost),Seq(esPort.toInt)))
private val esEvents = new EsEvents(esClient,esIndex,esType)
import concurrent.ExecutionContext.Implicits._
/* Load configuration for routers */
val (time,retries,workers) = Configuration.router
override val supervisorStrategy = OneForOneStrategy(maxNrOfRetries=retries,withinTimeRange = DurationInt(time).minutes) {
case _ : Exception => SupervisorStrategy.Restart
}
def receive = {
case req:String => {
val response = esEvents.insert(req)
}
case _ => {}
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/apps/GoalApp.scala
================================================
package de.kp.spark.elastic.apps
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.spark.rdd.RDD
import org.apache.hadoop.conf.Configuration
import de.kp.spark.elastic.{EsContext,EsDocument}
import de.kp.spark.elastic.bayes.{ClickModel,ClickTrainer}
import de.kp.spark.elastic.specs.{GoalSpec,PageViewSpec}
object GoalApp {
def run(clicks:Int,goal:String) {
val start = System.currentTimeMillis()
/* Configure Apache Spark */
val sparkConf = new Configuration()
sparkConf.set("spark.executor.memory","1g")
sparkConf.set("spark.kryoserializer.buffer.mb","256")
val es = new EsContext(sparkConf)
/* Configure Elasticsearch */
val esConf = new Configuration()
esConf.set("es.nodes","localhost")
esConf.set("es.port","9200")
esConf.set("es.resource", "visits/pageview")
esConf.set("es.query", "?q=*:*")
val fields = PageViewSpec.get.map(_._2._1).mkString(",")
esConf.set("es.fields", fields)
/*
* Read from Elasticsearch and restrict to those document fields
* specified by PageViewSpec
*/
val documents = es.documentsFromSpec(esConf)
/*
* Extract dataset: (sessionid,timestamp,userid,pageurl,visittime,referrer)
*/
val extracted = extract(documents,PageViewSpec.get)
/*
* Evaluate extracted dataset and determine whether the conversion goal provided matches the
* page urls within a session
*
* Evaluated dataset: (sessid,userid,total,starttime,timespent,referrer,exitpage,flowstatus)
*/
val evaluated = evaluate(extracted,goal)
/*
* Train a Bayes model from the evaluated dataset
*/
val model = ClickTrainer.train(evaluated)
println("Conversion Probability: " + model.predict(clicks))
val end = System.currentTimeMillis()
println("Total time: " + (end-start) + " ms")
es.shutdown
}
def evaluate(source:RDD[(String,Long,String,String,String,String)],goal:String):RDD[(String,String,Int,Long,Long,String,String,Int)] = {
/* Group source by sessionid */
val dataset = source.groupBy(group => group._1)
dataset.map(valu => {
/* Sort single session data by timestamp */
val data = valu._2.toList.sortBy(_._2)
val pages = data.map(_._4)
/* Total number of page clicks */
val total = pages.size
val (sessid,starttime,userid,pageurl,visittime,referrer) = data.head
val endtime = data.last._2
/* Total time spent for session */
val timespent = (if (total > 1) (endtime - starttime) / 1000 else 0)
val exitpage = pages(total - 1)
/*
* This is a simple session evaluation to determine whether the sequence of
* pages per session matches with a predefined page flow
*/
val flowstatus = GoalSpec.checkFlow(goal,pages)
(sessid,userid,total,starttime,timespent,referrer,exitpage,flowstatus)
})
}
private def extract(documents:RDD[EsDocument],spec:Map[String,(String,String)]):RDD[(String,Long,String,String,String,String)] = {
val sc = documents.context
val bspec = sc.broadcast(spec)
documents.map(document => {
/* sessionid */
val sessionid = document.data(bspec.value("sessionid")._1)
/* timestamp */
val timestamp = document.data(bspec.value("timestamp")._1).toLong
/* userid */
val userid = document.data(bspec.value("userid")._1)
/* pageurl */
val pageurl = document.data(bspec.value("pageurl")._1)
/* visittime */
val visittime = document.data(bspec.value("visittime")._1)
/* referrer */
val referrer = document.data(bspec.value("referrer")._1)
/* Format: (sessionid,timestamp,userid,pageurl,visittime,referrer) */
(sessionid,timestamp,userid,pageurl,visittime,referrer)
})
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/apps/InsightApp.scala
================================================
package de.kp.spark.elastic.apps
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.hadoop.conf.Configuration
import de.kp.spark.elastic.EsContext
/**
* An example of how to extract documents from Elasticsearch
* and apply a simple SQL statement to the documents
*/
object InsightApp {
def run() {
val start = System.currentTimeMillis()
/*
* Spark specific configuration
*/
val sparkConf = new Configuration()
sparkConf.set("spark.executor.memory","1g")
sparkConf.set("spark.kryoserializer.buffer.mb","256")
val es = new EsContext(sparkConf)
/*
* Elasticsearch specific configuration
*/
val esConf = new Configuration()
esConf.set("es.nodes","localhost")
esConf.set("es.port","9200")
esConf.set("es.resource", "enron/mails")
esConf.set("es.query", "?q=*:*")
esConf.set("es.table", "docs")
esConf.set("es.sql", "select subject from docs")
/*
* Read from ES and provide some insight with Spark & SparkSQL,
* thereby mixing SQL and other Spark operations
*/
val documents = es.documentsAsJson(esConf)
val subjects = es.query(documents, esConf).filter(row => row.getString(0).contains("Re"))
subjects.foreach(subject => println(subject))
val end = System.currentTimeMillis()
println("Total time: " + (end-start) + " ms")
es.shutdown
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/apps/SegmentApp.scala
================================================
package de.kp.spark.elastic.apps
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.hadoop.conf.Configuration
import de.kp.spark.elastic.EsContext
/**
* An example of how to extract documents from Elasticsearch
* and apply KMeans clustering algorithm to group documents
* by similar features
*/
object SegmentApp {
def run() {
val start = System.currentTimeMillis()
/*
* Spark specific configuration
*/
val sparkConf = new Configuration()
sparkConf.set("spark.executor.memory","1g")
sparkConf.set("spark.kryoserializer.buffer.mb","256")
val es = new EsContext(sparkConf)
/*
* Elasticsearch specific configuration
*/
val esConf = new Configuration()
esConf.set("es.nodes","localhost")
esConf.set("es.port","9200")
esConf.set("es.resource", "visits/pageview")
esConf.set("es.query", "?q=*:*")
esConf.set("es.fields", "lat,lon")
esConf.set("es.clusters", "10")
esConf.set("es.iterations", "100")
/*
* Read from Elasticsearch and apply KMeans clustering
* to the extracted documents
*/
val documents = es.documents(esConf)
val clustered = es.cluster(documents, esConf)
val end = System.currentTimeMillis()
println("Total time: " + (end-start) + " ms")
es.shutdown
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/bayes/ClickPredictor.scala
================================================
package de.kp.spark.elastic.bayes
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import de.kp.spark.elastic.specs.GoalSpec
class ClickModel(probabilities:Map[Int,Double]) {
def predict(clicks:Int):Double = {
val nearest = probabilities.map(valu => {
val k = valu._1
val d = Math.abs(k - clicks)
(k,d)
}).toList.sortBy(_._2).take(1)(0)._1
probabilities(nearest)
}
}
/**
* This Predictor is backed by the Bayesian Discriminant method
* to determine the conversion probability given a number of clicks
* within a certain web session; in this context, a web session is
* considered to be converted, if a certain sequence of page views
* appeared
*/
object ClickTrainer {
/**
* Input = (sessid,userid,total,starttime,timespent,referrer,exiturl,flowstatus)
*/
def train(dataset:RDD[(String,String,Int,Long,Long,String,String,Int)]):ClickModel = {
val histo = histogram(dataset)
/*
* p(c|v=1): probability of clicks per session, given the visitor converted in the session
*/
val prob1 = histo.filter(valu => {valu._1._2 == 1}).map(valu => {
val (clicks,converted) = valu._1
val support = valu._2
val prop = 1.toDouble / support
(clicks,prop)
}).collect().toMap
/*
* p(c|v=0): probability of clicks per session, given the visitor did not convert in the session
*/
val prob2 = histo.filter(valu => {valu._1._2 == 0}).map(valu => {
val (clicks,converted) = valu._1
val support = valu._2
val prop = 1.toDouble / support
(clicks,prop)
}).collect().toMap
val counts = conversions(dataset)
/*
* p(v=1): unconditional probability of visitor converted in a session
*/
val prob3 = 1.toDouble / counts.filter(valu => valu._1 == 1).map(valu => valu._2).collect()(0)
/*
* p(v=0): unconditional probability of visitor did not convert in a session
*/
val prob4 = 1.toDouble / counts.filter(valu => valu._1 == 0).map(valu => valu._2).collect()(0)
/*
* p(v=1|c) = p(c|v=1) * p(v=1) / (p(c|v=0) * p(v=0) + p(c|v=1) * p(v=1))
*/
val clickProbs = prob1.map(valu => {
val (clicks,prop) = valu
val numerator = prop * prob3
val denominator = numerator + prob4 * prob2(clicks)
val res = (if (denominator > 0) numerator / denominator else 0)
(clicks,res)
})
new ClickModel(clickProbs)
}
/**
* Input = (sessid,userid,total,starttime,timespent,referrer,exiturl,flowstatus)
*
*/
private def conversions(dataset:RDD[(String,String,Int,Long,Long,String,String,Int)]):RDD[(Int,Int)] = {
val counts = dataset.map(valu => {
val userConvertedPerSession = if (valu._8 == GoalSpec.FLOW_COMPLETED) 1 else 0
val k = userConvertedPerSession
val v = 1
(k,v)
}).reduceByKey(_ + _)
/*
* The output shows the session counts
* for conversion and no conversion
*/
counts
}
/**
* Input = (sessid,userid,total,starttime,timespent,referrer,exiturl,flowstatus)
*
*/
private def histogram(dataset:RDD[(String,String,Int,Long,Long,String,String,Int)]):RDD[((Int,Int),Int)] = {
/*
* The input contains one row per session. Each row contains the number of clicks
* in the session, time spent in the session and a boolean indicating whether the
* user converted during the session.
*/
val histogram = dataset.map(valu => {
val clicksPerSession = valu._3
val userConvertedPerSession = if (valu._8 == GoalSpec.FLOW_COMPLETED) 1 else 0
val k = (clicksPerSession,userConvertedPerSession)
val v = 1
(k,v)
}).reduceByKey(_ + _)
/*
* Each row of the output contains the conversion flag, click count
* per session and the number of sessions with those click counts.
*/
histogram
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/enron/EnronApp.scala
================================================
package de.kp.spark.elastic.enron
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
/**
* EnronApp is a helper to prepare and index data in ES
*/
object EnronApp {
def main(args : Array[String]) {
val settings = Map(
"dir" -> "/Work/tmp/enron/20110402/mails/allen-p",
"index" -> "enron",
"mapping" -> "mails",
"server" -> "http://localhost:9200"
)
val action = "index" // or prepare
EnronEngine.execute(action, settings)
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/enron/EnronEngine.scala
================================================
package de.kp.spark.elastic.enron
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import java.io.File
import scala.io.Source
import scala.concurrent.Future
import spray.http.HttpResponse
import org.json4s._
import org.json4s.native.Serialization
import org.json4s.native.Serialization.{read,write}
import de.kp.spark.elastic.EsHttpClient
/**
* Please note, that part of the functionality below is taken from
* the code base assigned to this blog entry:
*
* http://sujitpal.blogspot.de/2012/11/indexing-into-elasticsearch-with-akka.html
*/
object EnronEngine {
import concurrent.ExecutionContext.Implicits._
private val client = new EsHttpClient()
private val shards:Int = 1
private val replicas:Int = 1
private val es_CreateIndex:String = """
{"settings": {"index": {"number_of_shards": %s, "number_of_replicas": %s}}}""".format(shards, replicas)
private val es_CreateSchema:String = """{ "%s" : { "properties" : %s } }"""
private val parser = new EnronParser()
private val schema = new EnronSchema()
def execute(action:String,settings:Map[String,String]) {
action match {
case "index" =>
index(settings)
client.shutdown
case "prepare" =>
prepare(settings)
client.shutdown
case _ => {}
}
}
private def prepare(settings:Map[String,String]) {
/**
* Create new index
*/
val server0 = List(settings("server"), settings("index")).foldRight("")(_ + "/" + _)
client.post(server0, es_CreateIndex)
/**
* Create new schema
*/
val server1 = List(settings("server"), settings("index"), settings("mapping")).foldRight("")(_ + "/" + _)
client.post(server1 + "_mapping", es_CreateSchema.format("enron", schema.mappings))
}
private def index(settings:Map[String,String]) {
val dir = settings.get("dir").get
val filefilter = new EnronFilter()
val files = walk(new File(dir)).filter(f => filefilter.accept(f))
val server1 = List(settings("server"), settings("index"), settings("mapping")).foldRight("")(_ + "/" + _)
for (file <- files) {
val path = file.getAbsolutePath()
val doc = parser.parse(Source.fromFile(path))
val response = addDocument(doc,server1)
response.map(result => println("RESPONSE: " + result.entity.asString))
}
}
private def getProps(path:String): Map[String,String] = {
val file:File = new File(path)
Map() ++ Source.fromFile(file).getLines().toList.
filter(line => (! (line.isEmpty || line.startsWith("#")))).
map(line => (line.split("=")(0) -> line.split("=")(1)))
}
private def walk(root: File): Stream[File] = {
if (root.isDirectory) {
root #:: root.listFiles.toStream.flatMap(walk(_))
} else root #:: Stream.empty
}
private def addDocument(doc:EnronDoc, server:String):Future[HttpResponse] = {
implicit val formats = Serialization.formats(NoTypeHints)
val json = write(doc)
client.post(server, json)
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/enron/EnronUtils.scala
================================================
package de.kp.spark.elastic.enron
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import scala.io.Source
import scala.collection.immutable.HashMap
import java.io.{File,FileFilter}
import java.util.Locale
import java.text.SimpleDateFormat
/**
* Please note, that part of the functionality below is taken from
* the code base assigned to this blog entry:
*
* http://sujitpal.blogspot.de/2012/11/indexing-into-elasticsearch-with-akka.html
*/
case class EnronDoc (
message_id: String,
from: String,
to: Seq[String],
x_cc: Seq[String],
x_bcc: Seq[String],
date: String,
subject: String,
body:String
)
class EnronSchema {
def mappings(): String = """{
"message_id": {"type": "string", "index": "not_analyzed", "store": "yes"},
"from": {"type": "string", "index": "not_analyzed", "store": "yes"},
"to": {"type": "string", "index": "not_analyzed", "store": "yes", "multi_field": "yes"},
"x_cc": {"type": "string", "index": "not_analyzed", "store": "yes", "multi_field": "yes"},
"x_bcc": {"type": "string", "index": "not_analyzed", "store": "yes", "multi_field": "yes"},
"date": {"type": "date", "index": "not_analyzed", "store": "yes"},
"subject": {"type": "string", "index": "analyzed", "store": "yes"},
"body": {"type": "string", "index": "analyzed", "store": "yes"}
}"""
}
class EnronParser {
def parse(source: Source):EnronDoc = {
val map = parse(source.getLines(), HashMap[String,String](), false)
/**
* Convert map into case class
*/
val message_id = map.get("message_id").get
val from = map.get("from").get
val to = map.get("to") match {
case None => Seq()
case Some(to) => to.split(",").toSeq
}
val x_cc = map.getOrElse("x_cc","").split(",").toSeq
val x_bcc = map.getOrElse("x_bcc","").split(",").toSeq
val date = map.get("date").get
val subject = map.get("subject").get
val body = map.get("body").get
new EnronDoc(message_id,from,to,x_cc,x_bcc,date,subject,body)
}
private def parse(lines: Iterator[String], map: Map[String,String], startBody: Boolean): Map[String,String] = {
if (lines.isEmpty) map
else {
val head = lines.next()
if (head.trim.length == 0) parse(lines, map, true)
else if (startBody) {
val body = map.getOrElse("body", "") + "\n" + head
parse(lines, map + ("body" -> body), startBody)
} else {
val split = head.indexOf(':')
if (split > 0) {
val kv = (head.substring(0, split), head.substring(split + 1))
val key = kv._1.map(c => if (c == '-') '_' else c).trim.toLowerCase
val value = kv._1 match {
case "Date" => formatDate(kv._2.trim)
case _ => kv._2.trim
}
parse(lines, map + (key -> value), startBody)
} else parse(lines, map, startBody)
}
}
}
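/*
 * Illustrative example (added): a header line such as
 *
 *   Message-ID: <12345.JavaMail.evans@thyme>
 *
 * is split at the first ':'; the key is lower-cased with '-' replaced by '_',
 * so the map receives ("message_id" -> "<12345.JavaMail.evans@thyme>"). As soon
 * as an empty line is reached, all remaining lines are appended to the "body" entry.
 */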
private def formatDate(date: String): String = {
lazy val parser = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss", Locale.US)
lazy val formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss")
formatter.format(parser.parse(date.substring(0, date.lastIndexOf('-') - 1)))
}
}
/**
* We restrict to the /sent/ folders of the Enron dataset
*/
class EnronFilter extends FileFilter {
override def accept(file: File): Boolean = {
file.getAbsolutePath().contains("/sent/")
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/ml/EsKMeans.scala
================================================
package de.kp.spark.elastic.ml
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector,Vectors}
object EsKMeans {
/**
* This method segments an RDD of documents by clustering their (lat,lon) geo coordinates.
* The fields parameter specifies the names of the lat & lon coordinate fields.
*/
def segmentByLocation(docs:RDD[(String,Map[String,String])],fields:Array[String],clusters:Int,iterations:Int):RDD[(Int,String,Map[String,String])] = {
/**
* Train model
*/
val vectors = docs.map(doc => toVector(doc._2,fields))
val model = KMeans.train(vectors, clusters, iterations)
/**
* Apply model
*/
docs.map(doc => {
val vector = toVector(doc._2,fields)
(model.predict(vector),doc._1,doc._2)
})
}
private def toVector(data:Map[String,String], fields:Array[String]):Vector = {
val lat = data(fields(0)).toDouble
val lon = data(fields(1)).toDouble
Vectors.dense(Array(lat,lon))
}
}
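/**
 * A minimal usage sketch (added, not part of the original code base): it assumes
 * documents that carry their geo coordinates in "lat" and "lon" fields and runs
 * the segmentation on a local Spark context; field names, cluster count and
 * iteration count are arbitrary example values.
 */
object EsKMeansExample {

  import org.apache.spark.{SparkConf, SparkContext}

  def main(args:Array[String]) {

    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("EsKMeansExample"))
    /* Two toy documents, keyed by a document identifier */
    val docs = sc.parallelize(Seq(
      ("doc-1", Map("lat" -> "52.52", "lon" -> "13.40")),
      ("doc-2", Map("lat" -> "48.14", "lon" -> "11.58"))
    ))

    val segmented = EsKMeans.segmentByLocation(docs, Array("lat","lon"), clusters = 2, iterations = 10)
    segmented.collect.foreach(println)

    sc.stop()

  }
}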
================================================
FILE: src/main/scala/de/kp/spark/elastic/ml/EsNPref.scala
================================================
package de.kp.spark.elastic.ml
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
object EsNPref {
def build(docs:RDD[(String,Map[String,String])],fields:Array[String]):RDD[(String,String,Int)] = {
val transactions = docs.map(doc => {
/**
* Each document (doc) represents an ecommerce transaction per user
*/
val user = doc._2(fields(0))
val line = doc._2(fields(1))
(user,line)
})
build(transactions)
}
def build(transactions:RDD[(String,String)]):RDD[(String,String,Int)] = {
/**
* STEP #1
*
* Compute the total number of transactions per user. The transactions are
* grouped by user (_._1) and then mapped onto the number of transactions
* per user
*/
val total = transactions.groupBy(_._1).map(grouped => (grouped._1, grouped._2.size))
/**
* STEP #2
*
* Compute the item support per user. Each transaction (text line) is split
* into an Array[String] and all items are made unique. The result is mapped
* into (user,item,support) tuples
*/
val userItemSupport = transactions.flatMap(valu => valu._2.split(" ").toList.distinct.map(item => (valu._1,item)))
.groupBy(valu => (valu._1,valu._2))
.map(grouped => (grouped._1,grouped._2.size)).map(valu => (valu._1._1,valu._1._2,valu._2))
/**
* STEP #3
*
* Compute item preference per user. Item support and total transactions per user
* are used to compute the respective item preference:
*
* pref = Math.log(1 + supp.toDouble / total.toDouble)
*/
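/*
 * Worked example (added): a user with total = 10 transactions who bought an
 * item in supp = 4 of them gets pref = Math.log(1 + 4.0/10.0) = ln(1.4) ~ 0.34
 */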
val userItemPref = userItemSupport.keyBy(value => value._1).join(total)
.map(valu => {
val user = valu._1
val data = valu._2 // ((user,item,support),total)
val item = data._1._2
val supp = data._1._3
val total = data._2
/**
* Math.log means natural logarithm in Scala
*/
val pref = Math.log(1 + supp.toDouble / total.toDouble)
(user, item, pref)
})
/**
* The user-item preferences are solely based on the purchase data of a
* particular user; the resulting value, however, is far from representing
* a real-life preference, as it only takes the purchase frequency into account.
*
* The purchase frequency differs strongly with item price, item lifetime,
* and the like: expensive items or items with a long lifespan, such as
* jewelry or electronic home appliances, are purchased infrequently.
*
* The preferences derived for such items can therefore never be as high as
* those for cheap items or items with a short lifespan, such as hand creams
* or tissues. Also, if a user purchases item i in four out of ten transactions,
* we may conclude that he does not really prefer item i when other users
* purchased the same item in eight out of ten transactions.
*
* It is therefore necessary to define a relative preference that is comparable
* among all users. To this end we compute the maximum preference per item
* across all users and use this value to normalize the user-item preference
* derived above.
*/
/**
* STEP #4
*
* Compute the maximum preference per item (independent of the user)
*/
val itemMaxPref = userItemPref.map(valu => (valu._2,valu._3)).groupBy(valu => valu._1)
.map(grouped => {
def max(pref1:Double, pref2:Double):Double = if (pref1 > pref2) pref1 else pref2
val item = grouped._1
val mpref = grouped._2.map(valu => valu._2).reduceLeft(max)
(item,mpref)
})
/**
* STEP #5
*
* Finally compute the user-item rating with scores from 1..5
*/
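/*
 * Worked example (added): with a user-item preference pref = 0.34 and a maximum
 * item preference mpref = 0.69, the rating becomes Math.round(5 * 0.34/0.69) = 2
 */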
val userItemRating = userItemPref.keyBy(valu => valu._2).join(itemMaxPref)
.map(valu => {
val item = valu._1
val data = valu._2
val uid = data._1._1
val pref = data._1._3
val mpref = data._2
val npref = Math.round( 5* (pref.toDouble / mpref.toDouble) ).toInt
(uid,item,npref)
})
userItemRating
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/ml/EsSimilarity.scala
================================================
package de.kp.spark.elastic.ml
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
class EsSimilarity(ratings:RDD[(String,String,Int)]) {
/**
* Parameters to regularize correlation.
*/
val PRIOR_COUNT = 10
val PRIOR_CORRELATION = 0
val model = build()
def recommend(item:String, k:Int):Array[(String,Double,Double,Double,Double)] = {
/**
* Retrieve all similarities where the first item
* is equal to the provided one
*/
val similarities = model.filter(valu => {
val pair = valu._1
pair._1 == item
})
val result = similarities.map(valu => {
val (item1,item2) = valu._1
val (corr,rcorr,cos,jac) = valu._2
(item2, corr, rcorr, cos, jac)
}).collect().filter(valu => !valu._3.isNaN)
.sortBy(valu => -valu._4).take(k)
result
}
private def build():RDD[((String,String),(Double,Double,Double,Double))] = {
/**
* Compute the number of raters per item
*/
val itemSupport = ratings.groupBy(valu => valu._2)
.map(grouped => (grouped._1, grouped._2.size))
/**
* Join rating with item support: the result contains
* the following data (user,item,rating,support)
*/
val ratingsSupport = ratings.groupBy(valu => valu._2).join(itemSupport)
.flatMap(joined => joined._2._1.map(valu => (valu._1, valu._2, valu._3, joined._2._2)))
/**
* Clone data, join on user and filter pairs to make sure
* that we do not double count and exclude self pairs
*/
val ratingsSupportClone = ratingsSupport.keyBy(valu => valu._1)
val ratingsPairs = ratingsSupportClone.join(ratingsSupportClone).filter(valu => valu._2._1._2 < valu._2._2._2)
/**
* Compute raw inputs to similarity metrics
*/
val vectorCalcs = ratingsPairs.map(valu => {
val (user1,item1,rating1,support1) = valu._2._1
val (user2,item2,rating2,support2) = valu._2._2
val key = (item1, item2)
val stats = (
rating1 * rating2,
rating1,
rating2,
math.pow(rating1, 2),
math.pow(rating2, 2),
support1,
support2
)
(key, stats)
}).groupByKey().map(valu => {
val key = valu._1
val stats = valu._2
val size = stats.size
val dotProduct = stats.map(f => f._1).sum
val rating1Sum = stats.map(f => f._2).sum
val rating2Sum = stats.map(f => f._3).sum
val rating1Sq = stats.map(f => f._4).sum
val rating2Sq = stats.map(f => f._5).sum
val support1 = stats.map(f => f._6).max
val support2 = stats.map(f => f._7).max
(key, (size, dotProduct, rating1Sum, rating2Sum, rating1Sq, rating2Sq, support1, support2))
})
/**
* Compute similarity metrics for each item pair
*/
vectorCalcs.map(valu => {
val key = valu._1
val (size, dotProduct, ratingSum, rating2Sum, ratingNormSq, rating2NormSq, support, support2) = valu._2
/*
* Correlation
*/
val corr = correlation(size, dotProduct, ratingSum, rating2Sum, ratingNormSq, rating2NormSq)
val regCorr = regularizedCorrelation(size, dotProduct, ratingSum, rating2Sum, ratingNormSq, rating2NormSq, PRIOR_COUNT, PRIOR_CORRELATION)
/*
* Cosine similarity
*/
val cosSim = cosineSimilarity(dotProduct, scala.math.sqrt(ratingNormSq), scala.math.sqrt(rating2NormSq))
/*
* Jaccard Similarity
*/
val jaccard = jaccardSimilarity(size, support, support2)
(key, (corr, regCorr, cosSim, jaccard))
})
}
/**
* The correlation between two vectors A, B is cov(A, B) / (stdDev(A) * stdDev(B))
*
* This is equivalent to:
*
* [n * dotProduct(A, B) - sum(A) * sum(B)] / sqrt{ [n * norm(A)^2 - sum(A)^2] [n * norm(B)^2 - sum(B)^2] }
*/
private def correlation (
size:Double,
dotProduct:Double,
rating1Sum:Double,
rating2Sum:Double,
rating1NormSq:Double,
rating2NormSq:Double) = {
val numerator = size * dotProduct - rating1Sum * rating2Sum
val denominator = scala.math.sqrt(size * rating1NormSq - rating1Sum * rating1Sum) * scala.math.sqrt(size * rating2NormSq - rating2Sum * rating2Sum)
numerator / denominator
}
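/*
 * Sanity check (added, illustrative): for co-ratings A = (1,2,3) and B = (2,4,6)
 * we have n = 3, dotProduct = 28, sum(A) = 6, sum(B) = 12, norm(A)^2 = 14 and
 * norm(B)^2 = 56; the numerator is 3*28 - 6*12 = 12 and the denominator is
 * sqrt(3*14 - 36) * sqrt(3*56 - 144) = sqrt(6) * sqrt(24) = 12, i.e. a
 * correlation of 1, as expected for perfectly linearly related ratings.
 */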
/**
* Regularize correlation by adding virtual pseudocounts over a prior:
*
* RegularizedCorrelation = w * ActualCorrelation + (1 - w) * PriorCorrelation
* where w = # actualPairs / (# actualPairs + # virtualPairs).
*/
private def regularizedCorrelation (
size:Double,
dotProduct:Double,
rating1Sum:Double,
rating2Sum:Double,
rating1NormSq:Double,
rating2NormSq:Double,
virtualCount:Double,
priorCorrelation:Double) = {
val unregularizedCorrelation = correlation(size,dotProduct,rating1Sum,rating2Sum,rating1NormSq,rating2NormSq)
val w = size / (size + virtualCount)
w * unregularizedCorrelation + (1 - w) * priorCorrelation
}
/**
* The cosine similarity between two vectors A, B is dotProduct(A, B) / (norm(A) * norm(B))
*/
private def cosineSimilarity (
dotProduct:Double,
rating1Norm:Double,
rating2Norm:Double) = {
dotProduct / (rating1Norm * rating2Norm)
}
/**
* The Jaccard Similarity between two sets A, B is |Intersection(A, B)| / |Union(A, B)|
*/
private def jaccardSimilarity (
usersInCommon:Double,
totalUsers1:Double,
totalUsers2:Double) = {
val union = totalUsers1 + totalUsers2 - usersInCommon
usersInCommon / union
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/samples/EsCountMinSktech.scala
================================================
package de.kp.spark.elastic.samples
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import scala.util.parsing.json._
import com.twitter.algebird._
import org.apache.spark.streaming.dstream.DStream
/**
* Frequency Estimation
*/
object EsCountMinSktech {
def findTopK(stream:DStream[Message]):Seq[(Long,Long)] = {
val DELTA = 1E-3
val EPS = 0.01
val SEED = 1
val PERC = 0.001
val k = 5
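/*
 * Note (added): in Algebird's Count-Min Sketch, EPS bounds the additive
 * estimation error as a fraction of the total count, DELTA is the probability
 * of exceeding that bound, and PERC is the minimum frequency share an element
 * needs in order to be reported as a heavy hitter.
 */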
var globalCMS = new CountMinSketchMonoid(EPS, DELTA, SEED, PERC).zero
val clases = stream.map(message => message.clas)
val approxTopClases = clases.mapPartitions(clases => {
val localCMS = new CountMinSketchMonoid(EPS, DELTA, SEED, PERC)
clases.map(clas => localCMS.create(clas))
}).reduce(_ ++ _)
approxTopClases.foreach(rdd => {
if (rdd.count() != 0) globalCMS ++= rdd.first()
})
/**
* Retrieve approximate TopK classifiers from the provided messages
*/
val globalTopK = globalCMS.heavyHitters.map(clas => (clas, globalCMS.frequency(clas).estimate))
/*
* Retrieve the top k message classifiers; it may also be interesting to
* return the classifier frequencies from this method, in which case the
* sort-and-slice line below can be dropped
*/
.toSeq.sortBy(_._2).reverse.slice(0, k)
globalTopK
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/samples/EsHyperLogLog.scala
================================================
package de.kp.spark.elastic.samples
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import com.twitter.algebird._
import org.apache.spark.streaming.dstream.DStream
import java.nio.ByteBuffer
object EsHyperLogLog {
def estimateCardinality(stream:DStream[Message]):Double = {
val BIT_SIZE = 12
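/*
 * Note (added): with 12 bits the HyperLogLog standard error is roughly
 * 1.04 / sqrt(2^12), i.e. about 1.6 percent of the true cardinality.
 */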
val clases = stream.map(message => message.clas)
val approxClases = clases.mapPartitions(clases => {
/* BIT_SIZE: number of bits */
val hll = new HyperLogLogMonoid(BIT_SIZE)
clases.map(clas => {
val bytes = ByteBuffer.allocate(8).putLong(clas).array()
hll(bytes)
})
}).reduce(_ + _)
val hll = new HyperLogLogMonoid(BIT_SIZE)
var globalHll = hll.zero
approxClases.foreach(rdd => {
if (rdd.count() != 0) {
globalHll += rdd.first()
}
})
/*
* Approximate number of distinct classifiers in the observed messages
*/
globalHll.estimatedSize
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/samples/KafkaEngine.scala
================================================
package de.kp.spark.elastic.samples
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import kafka.serializer.StringDecoder
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkContext._
import org.apache.spark.storage.StorageLevel
import org.apache.hadoop.io.{MapWritable,NullWritable,Text}
import org.apache.hadoop.conf.{Configuration => HConf}
import org.elasticsearch.hadoop.mr.EsOutputFormat
import de.kp.spark.elastic.SparkBase
class KafkaEngine(name:String,conf:HConf) extends SparkBase with Serializable {
/* Elasticsearch configuration */
val ec = getEsConf(conf)
/* Kafka configuration */
val (kc,topics) = getKafkaConf(conf)
def run() {
val ssc = createSSCLocal(name,conf)
val stream = KafkaUtils.createStream[String,Message,StringDecoder,MessageDecoder](ssc,kc,topics, StorageLevel.MEMORY_AND_DISK).map(_._2)
stream.foreachRDD(messageRDD => {
/**
* Live indexing of Kafka messages; note that this is also
* an appropriate place to integrate further message analysis
*/
val messages = messageRDD.map(prepare)
messages.saveAsNewAPIHadoopFile("-",classOf[NullWritable],classOf[MapWritable],classOf[EsOutputFormat],ec)
})
ssc.start()
ssc.awaitTermination()
}
private def prepare(msg:Message):(Object,Object) = {
val m = MessageUtils.messageToMap(msg)
/**
* Prepare (Keywritable, ValueWritable)
*/
val kw = NullWritable.get
val vw = new MapWritable
for ((k, v) <- m) vw.put(new Text(k), new Text(v))
(kw, vw)
}
private def getEsConf(config:HConf):HConf = {
val conf = new HConf()
conf.set("es.nodes", conf.get("es.nodes"))
conf.set("es.port", conf.get("es.port"))
conf.set("es.resource", conf.get("es.resource"))
conf
}
private def getKafkaConf(config:HConf):(Map[String,String],Map[String,Int]) = {
val cfg = Map(
"group.id" -> conf.get("kafka.group"),
"zookeeper.connect" -> conf.get("kafka.zklist"),
"zookeeper.connection.timeout.ms" -> conf.get("kafka.timeout")
)
val topics = conf.get("kafka.topics").split(",").map((_,conf.get("kafka.threads").toInt)).toMap
(cfg,topics)
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/samples/KafkaSerializer.scala
================================================
package de.kp.spark.elastic.samples
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import kafka.serializer.{Decoder, Encoder}
import kafka.utils.VerifiableProperties
import org.apache.commons.io.Charsets
import org.json4s._
import org.json4s.native.Serialization
import org.json4s.native.Serialization.{read,write}
/**
* Message refers to any Scala case class that is serializable or deserializable
* with json4s
*/
class MessageDecoder(props: VerifiableProperties) extends Decoder[Message] {
implicit val formats = Serialization.formats(NoTypeHints)
def fromBytes(bytes: Array[Byte]): Message = {
read[Message](new String(bytes, Charsets.UTF_8))
}
}
class MessageEncoder(props: VerifiableProperties) extends Encoder[Message] {
implicit val formats = Serialization.formats(NoTypeHints)
def toBytes(message: Message): Array[Byte] = {
write[Message](message).getBytes(Charsets.UTF_8)
}
}
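/*
 * Illustrative example (added): for Message("m-1", 3L, "hello", 1404204000000L)
 * the encoder writes the UTF-8 bytes of a JSON document along the lines of
 *
 *   {"mid":"m-1","clas":3,"text":"hello","timestamp":1404204000000}
 *
 * which the decoder reads back into the Message case class.
 */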
================================================
FILE: src/main/scala/de/kp/spark/elastic/samples/MessageApp.scala
================================================
package de.kp.spark.elastic.samples
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import java.util.UUID
import org.apache.hadoop.conf.{Configuration => HConf}
object MessageApp {
val task = "index" // prepare
def main(args:Array[String]) {
val conf = new HConf()
conf.set("es.nodes","localhost")
conf.set("es.port","9200")
conf.set("es.resource","kafka/messages")
conf.set("es.index","kafka")
conf.set("es.mapping","messages")
conf.set("es.server","http://localhost:9200")
conf.set("spark.master","local")
conf.set("spark.batch.duration","15")
conf.set("kafka.topics","publisher")
conf.set("kafka.threads","1")
conf.set("kafka.group",UUID.randomUUID().toString)
conf.set("kafka.zklist","127.0.0.1:2181")
// in milliseconds
conf.set("kafka.timeout","10000")
task match {
case "prepare" =>
val action = "prepare"
MessageEngine.execute(action,conf)
case "index" =>
val engine = new KafkaEngine("KafkaEngine",conf)
engine.run
case _ => {}
}
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/samples/MessageGenerator.scala
================================================
package de.kp.spark.elastic.samples
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import kafka.producer.{KeyedMessage,Producer,ProducerConfig}
import kafka.message.DefaultCompressionCodec
import java.lang.Thread
import java.util.{Properties, Random, UUID}
/**
* A helper to generate random messages and
* send them to Apache Kafka
*/
object MessageGenerator {
def main(args:Array[String]) {
val topic = "publisher"
/**
* This is for bootstrapping and the producer will only use it for getting metadata
* (topics, partitions and replicas). The socket connections for sending the actual
* data will be established based on the broker information returned in the metadata.
*
* The format is host1:port1,host2:port2, and the list can be a subset of brokers or
* a VIP pointing to a subset of brokers.
*/
val broker = "127.0.0.1:9092"
/**
* This parameter specifies the compression codec for all data generated by
* this producer. DefaultCompressionCodec.codec selects GZIP; to use Snappy
* instead, use SnappyCompressionCodec.codec below.
*/
val codec = DefaultCompressionCodec.codec
/**
* This parameter specifies whether the messages are sent asynchronously in a background
* thread. Valid values are false for asynchronous send and true for synchronous send.
*
* By setting the producer to async we allow batching together of requests (which is great
* for throughput) but open the possibility of a failure of the client machine dropping
* unsent data.
*/
val synchronously = true
/**
* The client id is a user-specified string sent in each request to help trace calls.
* It should logically identify the application making the request.
*/
val clientId = UUID.randomUUID().toString
/**
* The number of messages to send in one batch when using async mode.
* The producer will wait until either this number of messages are ready
* to send or queue.buffer.max.ms is reached.
*/
val batchSize = 200
/** messageSendMaxRetries
* This property will cause the producer to automatically retry a failed send request.
* This property specifies the number of retries when such failures occur. Note that
* setting a non-zero value here can lead to duplicates in the case of network errors
* that cause a message to be sent but the acknowledgement to be lost.
*/
val messageSendMaxRetries = 3
/**
* request.required.acks controls when a produce request is considered complete:
* 0) means that the producer never waits for an acknowledgement from the broker (the same behavior as 0.7).
* This option provides the lowest latency but the weakest durability guarantees (some data will be lost when a server fails).
* 1) which means that the producer gets an acknowledgement after the leader replica has received the data. This option provides
* better durability as the client waits until the server acknowledges the request as successful (only messages that were
* written to the now-dead leader but not yet replicated will be lost).
* -1) which means that the producer gets an acknowledgement after all in-sync replicas have received the data. This option
* provides the best durability, we guarantee that no messages will be lost as long as at least one in sync replica remains.
*/
val requestRequiredAcks = -1
val props = new Properties()
props.put("compression.codec", codec.toString)
props.put("producer.type", "sync")
props.put("metadata.broker.list", broker)
props.put("batch.num.messages", batchSize.toString)
props.put("message.send.max.retries", messageSendMaxRetries.toString)
props.put("require.requred.acks",requestRequiredAcks.toString)
props.put("client.id",clientId.toString)
props.put("serializer.class", "de.kp.spark.elastic.samples.MessageEncoder")
val producer = new Producer[String, Message](new ProducerConfig(props))
var i = 0
while(true) {
val text = "This is message, no=%s".format(i)
val mid = UUID.randomUUID().toString()
val timestamp = System.currentTimeMillis()
val clas = new Random().nextInt(10).toLong
val message = new Message(mid,clas,text,timestamp)
producer.send(new KeyedMessage[String, Message](topic, message))
i += 1
Thread.sleep(1000)
}
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/samples/MessageUtils.scala
================================================
package de.kp.spark.elastic.samples
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import org.apache.hadoop.conf.{Configuration => HConf}
import de.kp.spark.elastic.EsHttpClient
/**
* Please note that part of the functionality below is adapted from
* the code accompanying this blog entry:
*
* http://sujitpal.blogspot.de/2012/11/indexing-into-elasticsearch-with-akka.html
*/
object MessageEngine {
import concurrent.ExecutionContext.Implicits._
private val client = new EsHttpClient()
private val shards:Int = 1
private val replicas:Int = 1
private val es_CreateIndex:String = """
{"settings": {"index": {"number_of_shards": %s, "number_of_replicas": %s}}}""".format(shards, replicas)
private val es_CreateSchema:String = """{ "%s" : { "properties" : %s } }"""
private val schema = new MessageSchema()
def execute(action:String,conf:HConf) {
action match {
case "prepare" => prepare(conf)
case _ => {}
}
}
private def prepare(conf:HConf) {
val index = conf.get("es.index")
val server = conf.get("es.server")
/**
* Create new index
*/
val server0 = List(server, index).foldRight("")(_ + "/" + _)
client.post(server0, es_CreateIndex)
/**
* Create new schema
*/
val mapping = conf.get("es.mapping")
val server1 = List(server, index, mapping).foldRight("")(_ + "/" + _)
client.post(server1 + "_mapping", es_CreateSchema.format(mapping, schema.mappings))
}
}
object MessageUtils {
def messageToMap(message:Message):Map[String,String] = {
Map(
"mid" -> message.mid,
"text" -> message.text,
"timestamp" -> message.timestamp.toString
)
}
}
/**
* Specification of sample data structures; the classifier
* is introduced to support the CountMinSketch algorithm
*/
case class Message(
mid:String,
clas:Long,
text:String,
timestamp:Long
)
class MessageSchema {
def mappings(): String = """{
"mid": {"type": "string", "index": "not_analyzed", "store": "yes"},
"text": {"type": "string", "index": "analyzed", "store": "yes"},
"timestamp": {"type": "string", "index": "not_analyzed", "store": "yes"}
}"""
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/specs/FieldSpec.scala
================================================
package de.kp.spark.elastic.specs
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import scala.xml._
class FieldSpec(path:String) {
val root:Elem = XML.load(getClass.getClassLoader.getResource(path))
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/specs/GoalSpec.scala
================================================
package de.kp.spark.elastic.specs
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import scala.xml._
import scala.collection.mutable.{ArrayBuffer,HashMap}
import scala.util.control.Breaks._
object GoalSpec extends Serializable {
private val spec = new FieldSpec("goals.xml")
private val flows = HashMap.empty[String,Array[String]]
val FLOW_NOT_ENTERED:Int = 0
val FLOW_ENTERED:Int = 1
val FLOW_COMPLETED:Int = 2
load()
private def load() {
for (goal <- spec.root \ "goal") {
val fid = (goal \ "@id").toString
val flow = goal.text.split(",")
flows += fid -> flow
}
}
def getFlow(fid:String):Option[Array[String]] = {
flows.get(fid)
}
def getFlows():Array[(String,Array[String])] = {
flows.toArray
}
def checkFlow(goal:String,pages:List[String]):Int = {
getFlow(goal) match {
case None => 0
case Some(flow) => checkFlow(flow,pages)
}
}
/**
* A helper method to evaluate whether the pages clicked in a certain
* session match, partially match or do not match a predefined sequence
* of pages (flow)
*/
def checkFlow(flow:Array[String],pages:List[String]):Int = {
var j = 0
var flowStat = FLOW_NOT_ENTERED
var matched = false;
for (i <- 0 until flow.length) {
breakable {while (j < pages.size) {
matched = false
/*
* We expect that a certain page url has to start with the
* configured url part of the flow
*/
if (pages(j).startsWith(flow(i))) {
flowStat = (if (i == flow.length - 1) FLOW_COMPLETED else FLOW_ENTERED)
matched = true
}
j += 1
if (matched) break
}}
}
flowStat
}
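/*
 * Illustrative example (added): for the flow Array("/cart","/checkout","/confirm")
 * and the clicked pages List("/home","/cart?item=1","/checkout"), the first two
 * flow steps are matched in order and checkFlow returns FLOW_ENTERED; had
 * "/confirm" also been visited afterwards, the result would be FLOW_COMPLETED.
 */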
/**
* A helper method to evaluate whether the pages clicked in a certain
* session match, partially match or do not match the predefined sequences
* of pages (flows)
*/
def checkFlows(pages:List[String]):Array[(String,Int)] = {
val flows = getFlows
flows.map(v => (v._1, checkFlow(v._2,pages)))
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/specs/PageViewSpec.scala
================================================
package de.kp.spark.elastic.specs
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import scala.xml._
import scala.collection.mutable.HashMap
object PageViewSpec {
private val spec = new FieldSpec("pageview.xml")
private val fields = HashMap.empty[String,(String,String)]
load()
private def load() {
for (field <- spec.root \ "field") {
val _name = (field \ "@name").toString
val _type = (field \ "@type").toString
val _mapping = field.text
fields += _name -> (_mapping,_type)
}
}
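/*
 * Note (added): load() implies that pageview.xml holds entries of the form
 * <field name="..." type="...">mapping</field> below its root element; each
 * field name is mapped onto its (mapping, type) pair.
 */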
def get = fields.toMap
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/stream/EsHistogram.scala
================================================
package de.kp.spark.elastic.stream
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import scala.util.parsing.json._
import org.json4s.DefaultFormats
import org.json4s.native.Serialization.write
import org.apache.hadoop.conf.{Configuration => HConf}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.dstream.DStream
import com.twitter.algebird._
class EsHistogram(field:String,conf:HConf) extends EsStream("EsHistogram",conf) {
override def transform(stream:DStream[String]):DStream[String] = {
histogram(stream,field)
}
private def histogram(stream:DStream[String],field:String):DStream[String] = {
implicit val formats = DefaultFormats
/* Mapify stream */
val mapified = stream.map(json => {
JSON.parseFull(json) match {
case Some(map) => map.asInstanceOf[Map[String,String]]
case None => Map.empty[String,String]
}
})
/* Extract field values and compute support for each field value */
val values = mapified.map(m => m(field))
val support = values.map(v => (v, 1)).reduceByKey((a, b) => a + b)
/* The data type of the field value is a String */
var global = Map[String,Int]()
val monoid = new MapMonoid[String, Int]()
/* Merge the per-batch support counts into the global map */
support.foreachRDD(rdd => {
if (rdd.count() != 0) {
val partial = rdd.collect().toMap
global = monoid.plus(global.toMap, partial)
}
})
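/*
 * Note (added): 'global' is a plain variable that the foreachRDD above updates
 * on the driver for every batch; the transform below captures its current value
 * in the map closure, so the emitted support counts reflect whatever has been
 * accumulated up to that point.
 */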
mapified.transform(rdd => {
rdd.map(m => {
val v = m(field)
val s = global(v)
write(m ++ Map("_field" -> field, "_valu" -> v, "_supp" -> s.toString))
})
})
}
}
================================================
FILE: src/main/scala/de/kp/spark/elastic/stream/EsStream.scala
================================================
package de.kp.spark.elastic.stream
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
*
* This file is part of the Spark-ELASTIC project
* (https://github.com/skrusche63/spark-elastic).
*
* Spark-ELASTIC is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* Spark-ELASTIC is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* Spark-ELASTIC.
*
* If not, see .
*/
import scala.util.parsing.json._
import kafka.serializer.StringDecoder
import org.apache.spark.SparkContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka._
import org.apache.hadoop.conf.{Configuration => HConf}
import org.apache.hadoop.io.{MapWritable,NullWritable,Text}
import org.elasticsearch.hadoop.mr.EsOutputFormat
import de.kp.spark.elastic.SparkBase
/**
* EsStream provides base functionality for indexing transformed live streams
* from Apache Kafka with Elasticsearch; to apply a customized transformation,
* the method 'transform' must be overridden
*/
class EsStream(name:String,conf:HConf) extends SparkBase with Serializable {
/* Elasticsearch configuration */
val ec = getEsConf(conf)
/* Kafka configuration */
val (kc,topics) = getKafkaConf(conf)
def run() {
val ssc = createSSCLocal(name,conf)
/*
* The KafkaInputDStream returns a Tuple where only the second component
* holds the respective message; we therefore reduce to a DStream[String]
*/
val stream = KafkaUtils.createStream[String,String,StringDecoder,StringDecoder](ssc,kc,topics,StorageLevel.MEMORY_AND_DISK).map(_._2)
/*
* Inline transformation of the incoming stream by any function that maps
* a DStream[String] onto a DStream[String]
*/
val transformed = transform(stream)
/*
* Write transformed stream to Elasticsearch index
*/
transformed.foreachRDD(rdd => {
val messages = rdd.map(prepare)
messages.saveAsNewAPIHadoopFile("-",classOf[NullWritable],classOf[MapWritable],classOf[EsOutputFormat],ec)
})
ssc.start()
ssc.awaitTermination()
}
def transform(stream:DStream[String]) = stream
private def getEsConf(config:HConf):HConf = {
val conf = new HConf()
conf.set("es.nodes", conf.get("es.nodes"))
conf.set("es.port", conf.get("es.port"))
conf.set("es.resource", conf.get("es.resource"))
conf
}
private def getKafkaConf(config:HConf):(Map[String,String],Map[String,Int]) = {
val cfg = Map(
"group.id" -> conf.get("kafka.group"),
"zookeeper.connect" -> conf.get("kafka.zklist"),
"zookeeper.connection.timeout.ms" -> conf.get("kafka.timeout")
)
val topics = conf.get("kafka.topics").split(",").map((_,conf.get("kafka.threads").toInt)).toMap
(cfg,topics)
}
private def prepare(message:String):(Object,Object) = {
val m = JSON.parseFull(message) match {
case Some(map) => map.asInstanceOf[Map[String,String]]
case None => Map.empty[String,String]
}
val kw = NullWritable.get
val vw = new MapWritable
for ((k, v) <- m) vw.put(new Text(k), new Text(v))
(kw, vw)
}
}