package eu.shiftforward.adstax.spark

import com.github.nscala_time.time.Imports._
import com.sksamuel.elastic4s.ElasticDsl._
import com.sksamuel.elastic4s.IndexesAndTypes
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.rdd.RDD
import org.elasticsearch.search.sort.SortOrder
import org.elasticsearch.spark._

/**
 * A `SparkContext` extended with a helper for loading AdStax events out of Elasticsearch.
 *
 * @param conf the Spark configuration used to initialize the underlying context
 * @param esIndex the Elasticsearch index (or index pattern) that holds the events; defaults to `"http-*"`
 */
class AdStaxSparkContext(conf: SparkConf, esIndex: String = "http-*") extends SparkContext(conf) {
  /**
   * Fetches the AdStax events whose `meta.timestamp` lies within `[startDateTime, endDateTime]` (both bounds
   * inclusive) and whose type is one of `eventTypes`. An empty `eventTypes` places no restriction on the type.
   *
   * @param eventTypes the event types to fetch. If empty, events of every type are fetched
   * @param startDateTime the lower (inclusive) bound on the timestamp of the events to fetch
   * @param endDateTime the upper (inclusive) bound on the timestamp of the events to fetch
   *
   * @return a `RDD` of Strings, each one being the JSON representation of a fetched event.
   */
  def eventsRDD(eventTypes: Set[String], startDateTime: DateTime, endDateTime: DateTime): RDD[String] = {
    val timestampField = "meta.timestamp"

    // Inclusive time window over the event timestamp.
    val withinWindow = rangeQuery(timestampField)
      .gte(startDateTime.toString)
      .lte(endDateTime.toString)

    // Oldest events first; "date" is the type to assume where the timestamp field is unmapped.
    val oldestFirst = fieldSort(timestampField).order(SortOrder.ASC).unmappedType("date")

    // The request is only used below in its serialized form; the indices and types actually read are given by
    // the resource string passed to `esJsonRDD`, so a wildcard is used here.
    val anyIndex = IndexesAndTypes(Seq("*"), Seq())

    // NOTE(review): the time window is applied as a post filter rather than as the main query - confirm that
    // this is intended.
    val searchRequest = search(anyIndex)
      .postFilter(boolQuery.must(withinWindow))
      .sortBy(oldestFirst)

    // Resource in "<index>/<type1>,<type2>,..." form. `mkString` on an empty set yields "", so an empty
    // `eventTypes` leaves the part after the slash blank.
    val resource = s"$esIndex/${eventTypes.mkString(",")}"

    // `esJsonRDD` yields a pair RDD; only its values (the event JSON) are returned.
    this.esJsonRDD(resource, searchRequest._builder.toString).values
  }
}
