
Revert back to Spark 2.3 #399
Merged 36 commits on Sep 4, 2019
Changes from all commits (36 commits)
f6264a7: Update to Spark 2.4.3 and XGBoost 0.90 (tovbinm, May 30, 2019)
685d6e1: special double serializer fix (tovbinm, May 30, 2019)
e62772d: fix serialization (tovbinm, May 30, 2019)
69247ac: fix serialization (tovbinm, May 30, 2019)
330bf50: docs (tovbinm, May 30, 2019)
d6b0723: fixed missng value for test (wsuchy, May 30, 2019)
63b77b5: meta fix (tovbinm, May 30, 2019)
4e46e31: Merge branch 'mt/spark-2.4' of github.com:salesforce/TransmogrifAI in… (tovbinm, May 30, 2019)
5a528e1: Updated DecisionTreeNumericMapBucketizer test to deal with the change… (Jauntbox, May 31, 2019)
5f39603: Merge branch 'mt/spark-2.4' of github.com:salesforce/TransmogrifAI in… (Jauntbox, May 31, 2019)
0d1a0c0: fix params meta test (tovbinm, May 31, 2019)
0a4f906: FIxed failing xgboost test (wsuchy, May 31, 2019)
660db62: Merge branch 'mt/spark-2.4' of github.com:salesforce/TransmogrifAI in… (wsuchy, May 31, 2019)
3ecca64: ident (tovbinm, May 31, 2019)
507503a: cleanup (tovbinm, May 31, 2019)
348a392: added dataframe reader and writer extensions (tovbinm, Jun 3, 2019)
f43cb26: added const (tovbinm, Jun 3, 2019)
4455034: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jun 3, 2019)
a0978bf: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jun 10, 2019)
b27b47a: Merge branch 'master' of github.com:salesforce/TransmogrifAI into mt/… (tovbinm, Jun 21, 2019)
6535e4e: added xgboost params + update models to use public predict method (tovbinm, Jun 21, 2019)
d1d7b9a: blarg (tovbinm, Jun 21, 2019)
ac75e15: double ser test (tovbinm, Jun 21, 2019)
bf056e5: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jun 22, 2019)
ea84a02: update mleap and spark testing base (tovbinm, Jun 24, 2019)
3e65e4a: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jul 2, 2019)
0bb829d: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jul 2, 2019)
8cfa694: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jul 5, 2019)
fd440b7: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jul 8, 2019)
5936981: Merge branch 'master' into mt/spark-2.4 (tovbinm, Jul 16, 2019)
0aa80f9: Update README.md (tovbinm, Jul 16, 2019)
8c8ff88: Merge branch 'master' of github.com:salesforce/TransmogrifAI into mt/… (tovbinm, Aug 30, 2019)
cf2ea05: Revert "Update to Spark 2.4.3 + XGBoost 0.90 + MLeap 0.14 (#327)" (tovbinm, Aug 30, 2019)
17a53a5: added is empty (tovbinm, Aug 30, 2019)
1ad7b1b: Merge branch 'master' into mt/revert-spark-2.4 (tovbinm, Aug 30, 2019)
2f25962: Merge branch 'master' into mt/revert-spark-2.4 (leahmcguire, Sep 3, 2019)
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -128,7 +128,7 @@ Start by picking TransmogrifAI version to match your project dependencies from t

| TransmogrifAI Version | Spark Version | Scala Version | Java Version |
|-------------------------------------------------|:-------------:|:-------------:|:------------:|
| 0.6.1 (unreleased, master) | 2.4 | 2.11 | 1.8 |
| 0.6.1 (unreleased, master) | 2.3 | 2.11 | 1.8 |
| **0.6.0 (stable)**, 0.5.3, 0.5.2, 0.5.1, 0.5.0 | **2.3** | **2.11** | **1.8** |
| 0.4.0, 0.3.4 | 2.2 | 2.11 | 1.8 |

22 changes: 11 additions & 11 deletions build.gradle
@@ -1,16 +1,15 @@
buildscript {
repositories {
mavenCentral()
jcenter()
maven { url "https://plugins.gradle.org/m2/" }
mavenCentral()
}
dependencies {
classpath 'org.github.ngbinh.scalastyle:gradle-scalastyle-plugin_2.11:1.0.1'
classpath 'com.commercehub.gradle.plugin:gradle-avro-plugin:0.16.0'
}
}

plugins {
id 'com.commercehub.gradle.plugin.avro' version '0.8.0'
id 'org.scoverage' version '2.5.0'
id 'net.minecrell.licenser' version '0.4.1'
id 'com.github.jk1.dependency-license-report' version '0.5.0'
@@ -58,13 +57,14 @@ configure(allProjs) {
scalaVersionRevision = '12'
scalaTestVersion = '3.0.5'
scalaCheckVersion = '1.14.0'
junitVersion = '4.12'
avroVersion = '1.8.2'
sparkVersion = '2.4.3'
junitVersion = '4.11'
avroVersion = '1.7.7'
sparkVersion = '2.3.2'
sparkAvroVersion = '4.0.0'
scalaGraphVersion = '1.12.5'
scalafmtVersion = '1.5.1'
hadoopVersion = 'hadoop2'
json4sVersion = '3.5.3' // matches Spark dependency version
json4sVersion = '3.2.11' // matches Spark dependency version
jodaTimeVersion = '2.9.4'
jodaConvertVersion = '1.8.1'
algebirdVersion = '0.13.4'
@@ -75,20 +75,20 @@ configure(allProjs) {
googleLibPhoneNumberVersion = '8.8.5'
googleGeoCoderVersion = '2.82'
googleCarrierVersion = '1.72'
chillVersion = '0.9.3'
chillVersion = '0.8.4'
reflectionsVersion = '0.9.11'
collectionsVersion = '3.2.2'
optimaizeLangDetectorVersion = '0.0.1'
tikaVersion = '1.22'
sparkTestingBaseVersion = '2.4.3_0.12.0'
sparkTestingBaseVersion = '2.3.1_0.10.0'
sourceCodeVersion = '0.1.3'
pegdownVersion = '1.4.2'
commonsValidatorVersion = '1.6'
commonsIOVersion = '2.6'
scoveragePluginVersion = '1.3.1'
xgboostVersion = '0.90'
xgboostVersion = '0.81'
akkaSlf4jVersion = '2.3.11'
mleapVersion = '0.14.0'
mleapVersion = '0.13.0'
memoryFilesystemVersion = '2.1.0'
}

6 changes: 5 additions & 1 deletion cli/build.gradle
@@ -69,16 +69,20 @@ task copyTemplates(type: Copy) {
fileName.replace(".gradle.template", ".gradle")
}
expand([
databaseHostname: 'db.company.com',
version: scalaVersion,
scalaVersion: scalaVersion,
scalaVersionRevision: scalaVersionRevision,
scalaTestVersion: scalaTestVersion,
junitVersion: junitVersion,
sparkVersion: sparkVersion,
avroVersion: avroVersion,
sparkAvroVersion: sparkAvroVersion,
hadoopVersion: hadoopVersion,
collectionsVersion: collectionsVersion,
transmogrifaiVersion: version
transmogrifaiVersion: version,
buildNumber: (int)(Math.random() * 1000),
date: new Date()
])
}

@@ -138,7 +138,7 @@ case class AutomaticSchema(recordClassName: String)(dataFile: File) extends Sche
case Some(actualType) =>
val newSchema = Schema.create(actualType)
val schemaField =
new Schema.Field(field.name, newSchema, "auto-generated", orgSchemaField.defaultVal())
new Schema.Field(field.name, newSchema, "auto-generated", orgSchemaField.defaultValue)
AvroField.from(schemaField)
}
} else field
@@ -69,7 +69,7 @@ class AvroFieldTest extends FlatSpec with TestCommon with Assertions {
val allSchemas = (enum::unions)++simpleSchemas // NULL does not work

val fields = allSchemas.zipWithIndex map {
case (s, i) => new Schema.Field("x" + i, s, "Who", null: Object)
case (s, i) => new Schema.Field("x" + i, s, "Who", null)
}

val expected = List(
@@ -86,7 +86,7 @@ class AvroFieldTest extends FlatSpec with TestCommon with Assertions {

an[IllegalArgumentException] should be thrownBy {
val nullSchema = Schema.create(Schema.Type.NULL)
val nullField = new Schema.Field("xxx", null, "Nobody", null: Object)
val nullField = new Schema.Field("xxx", null, "Nobody", null)
AvroField from nullField
}

@@ -152,14 +152,14 @@ class OpLinearSVCModel
) extends OpPredictorWrapperModel[LinearSVCModel](uid = uid, operationName = operationName, sparkModel = sparkModel) {

@transient lazy private val predictRaw = reflectMethod(getSparkMlStage().get, "predictRaw")
@transient lazy private val predict: Vector => Double = getSparkMlStage().get.predict(_)
@transient lazy private val predict = reflectMethod(getSparkMlStage().get, "predict")

/**
* Function used to convert input to output
*/
override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => {
val raw = predictRaw(features.value).asInstanceOf[Vector]
val pred = predict(features.value)
val pred = predict(features.value).asInstanceOf[Double]

Prediction(rawPrediction = raw, prediction = pred)
}
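The OpLinearSVCModel hunk above replaces a direct call to Spark's now-inaccessible `predict` with a reflective lookup via `reflectMethod`, whose result is later cast with `asInstanceOf[Double]`. A minimal standalone sketch of how such a helper can work with Scala runtime reflection follows; the toy `Model` class and this `reflectMethod` body are illustrative assumptions, not TransmogrifAI's actual `ReflectionUtils` implementation.

```scala
import scala.reflect.runtime.universe._

// A toy class with a private method, standing in for a Spark model whose
// predict method is not publicly accessible.
class Model {
  private def predict(x: Double): Double = if (x > 0) 1.0 else 0.0
}

// Obtain a MethodMirror for a (possibly private) method by name, so it can
// be invoked like a function and its Any result cast by the caller.
def reflectMethod(instance: AnyRef, name: String): MethodMirror = {
  val mirror = runtimeMirror(instance.getClass.getClassLoader)
  val instanceMirror = mirror.reflect(instance)
  val methodSymbol = instanceMirror.symbol.toType.decl(TermName(name)).asMethod
  instanceMirror.reflectMethod(methodSymbol)
}

val predict = reflectMethod(new Model, "predict")
assert(predict(1.5).asInstanceOf[Double] == 1.0)
```

As in the diff, the mirror's `apply` returns `Any`, which is why the wrapped call site casts the result to `Double`.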
@@ -235,11 +235,6 @@ class OpXGBoostClassifier(uid: String = UID[OpXGBoostClassifier])
*/
def setMaxBins(value: Int): this.type = set(maxBins, value)

/**
* Maximum number of nodes to be added. Only relevant when grow_policy=lossguide is set.
*/
def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)

/**
* This is only used for approximate greedy algorithm.
* This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select
@@ -287,19 +282,8 @@ class OpXGBoostClassifier(uid: String = UID[OpXGBoostClassifier])
def setLambdaBias(value: Double): this.type = set(lambdaBias, value)

// setters for learning params

/**
* Specify the learning task and the corresponding learning objective.
* options: reg:squarederror, reg:logistic, binary:logistic, binary:logitraw, count:poisson,
* multi:softmax, multi:softprob, rank:pairwise, reg:gamma. default: reg:squarederror
*/
def setObjective(value: String): this.type = set(objective, value)

/**
* Objective type used for training. For options see [[ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams]]
*/
def setObjectiveType(value: String): this.type = set(objectiveType, value)

/**
* Specify the learning task and the corresponding learning objective.
* options: reg:linear, reg:logistic, binary:logistic, binary:logitraw, count:poisson,
@@ -326,11 +310,6 @@ class OpXGBoostClassifier(uid: String = UID[OpXGBoostClassifier])
*/
def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value)

/**
* Define the expected optimization to the evaluation metrics, true to maximize otherwise minimize it
*/
def setMaximizeEvaluationMetrics(value: Boolean): this.type = set(maximizeEvaluationMetrics, value)

/**
* Customized objective function provided by user. default: null
*/
@@ -380,18 +359,17 @@ class OpXGBoostClassificationModel

private lazy val model = getSparkMlStage().get
private lazy val booster = model.nativeBooster
private lazy val treeLimit = model.getTreeLimit
private lazy val treeLimit = model.getTreeLimit.toInt
private lazy val missing = model.getMissing

override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => {
val data = processMissingValues(Iterator(features.value.asXGB), missing)
val data = removeMissingValues(Iterator(features.value.asXGB), missing)
val dm = new DMatrix(dataIter = data)
val rawPred = booster.predict(dm, outPutMargin = true, treeLimit = treeLimit)(0).map(_.toDouble)
val rawPrediction = if (model.numClasses == 2) Array(-rawPred(0), rawPred(0)) else rawPred
val prob = booster.predict(dm, outPutMargin = false, treeLimit = treeLimit)(0).map(_.toDouble)
val probability = if (model.numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob
val prediction = probability2predictionMirror(Vectors.dense(probability)).asInstanceOf[Double]

Prediction(prediction = prediction, rawPrediction = rawPrediction, probability = probability)
}
}
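For binary problems XGBoost emits a single margin and a single probability, which the `transformFn` above expands into two-element arrays before deriving the final prediction. The expansion step can be sketched in isolation as follows; the function name `expandBinary` is hypothetical and the arrays stand in for the booster's per-row output.

```scala
// Expand XGBoost's single-output binary scores into the two-element
// rawPrediction / probability arrays expected downstream. For multiclass
// models (numClasses > 2) the arrays already have one entry per class
// and pass through unchanged.
def expandBinary(
  raw: Array[Double],
  prob: Array[Double],
  numClasses: Int
): (Array[Double], Array[Double]) = {
  val rawPrediction = if (numClasses == 2) Array(-raw(0), raw(0)) else raw
  val probability = if (numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob
  (rawPrediction, probability)
}

val (r, p) = expandBinary(Array(0.8), Array(0.75), numClasses = 2)
assert(r.sameElements(Array(-0.8, 0.8)) && p.sameElements(Array(0.25, 0.75)))
```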
@@ -34,6 +34,7 @@ import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor, OpDecisionTreeRegressorParams}

import scala.reflect.runtime.universe.TypeTag
@@ -112,4 +113,7 @@ class OpDecisionTreeRegressionModel
ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[DecisionTreeRegressionModel](
sparkModel = sparkModel, uid = uid, operationName = operationName
)
) {
@transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict")
}

@@ -34,6 +34,7 @@ import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor, OpGBTRegressorParams}

import scala.reflect.runtime.universe.TypeTag
@@ -138,4 +139,7 @@ class OpGBTRegressionModel
ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[GBTRegressionModel](
sparkModel = sparkModel, uid = uid, operationName = operationName
)
) {
@transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict")
}

@@ -34,6 +34,7 @@ import com.salesforce.op._
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel, OpLinearRegressionParams}

import scala.reflect.runtime.universe.TypeTag
@@ -179,4 +180,7 @@ class OpLinearRegressionModel
ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[LinearRegressionModel](
sparkModel = sparkModel, uid = uid, operationName = operationName
)
) {
@transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict")
}

@@ -34,6 +34,7 @@ import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
import org.apache.spark.ml.regression.{OpRandomForestRegressorParams, RandomForestRegressionModel, RandomForestRegressor}

import scala.reflect.runtime.universe.TypeTag
@@ -125,4 +126,8 @@ class OpRandomForestRegressionModel
ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[RandomForestRegressionModel](
sparkModel = sparkModel, uid = uid, operationName = operationName
)
) {
@transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict")
}


@@ -34,6 +34,7 @@ import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait}
import ml.dmlc.xgboost4j.scala.spark.{OpXGBoostRegressorParams, TrackerConf, XGBoostRegressionModel, XGBoostRegressor}

@@ -233,11 +234,6 @@ class OpXGBoostRegressor(uid: String = UID[OpXGBoostRegressor])
*/
def setMaxBins(value: Int): this.type = set(maxBins, value)

/**
* Maximum number of nodes to be added. Only relevant when grow_policy=lossguide is set.
*/
def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)

/**
* This is only used for approximate greedy algorithm.
* This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select
@@ -285,19 +281,8 @@ class OpXGBoostRegressor(uid: String = UID[OpXGBoostRegressor])
def setLambdaBias(value: Double): this.type = set(lambdaBias, value)

// setters for learning params

/**
* Specify the learning task and the corresponding learning objective.
* options: reg:squarederror, reg:logistic, binary:logistic, binary:logitraw, count:poisson,
* multi:softmax, multi:softprob, rank:pairwise, reg:gamma. default: reg:squarederror
*/
def setObjective(value: String): this.type = set(objective, value)

/**
* Objective type used for training. For options see [[ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams]]
*/
def setObjectiveType(value: String): this.type = set(objectiveType, value)

/**
* Specify the learning task and the corresponding learning objective.
* options: reg:linear, reg:logistic, binary:logistic, binary:logitraw, count:poisson,
@@ -324,11 +309,6 @@ class OpXGBoostRegressor(uid: String = UID[OpXGBoostRegressor])
*/
def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value)

/**
* Define the expected optimization to the evaluation metrics, true to maximize otherwise minimize it
*/
def setMaximizeEvaluationMetrics(value: Boolean): this.type = set(maximizeEvaluationMetrics, value)

/**
* Customized objective function provided by user. default: null
*/
@@ -361,4 +341,6 @@ class OpXGBoostRegressionModel
ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[XGBoostRegressionModel](
sparkModel = sparkModel, uid = uid, operationName = operationName
)
) {
@transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict")
}
@@ -41,6 +41,7 @@ import com.salesforce.op.stages.impl.tuning.{BestEstimator, _}
import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapperModel, SparkModelConverter}
import com.salesforce.op.utils.spark.RichMetadata._
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.spark.RichParamMap._
import com.salesforce.op.utils.stages.FitStagesUtil._
import org.apache.spark.ml.param._
@@ -52,10 +52,9 @@ abstract class OpPredictionModel[T <: PredictionModel[Vector, T]]
operationName: String
) extends OpPredictorWrapperModel[T](uid = uid, operationName = operationName, sparkModel = sparkModel) {

/**
* Predict label for the given features
*/
@transient protected lazy val predict: Vector => Double = getSparkMlStage().get.predict(_)
protected def predictMirror: MethodMirror

protected def predict(features: Vector): Double = predictMirror.apply(features).asInstanceOf[Double]

/**
* Function used to convert input to output
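The OpPredictionModel hunk above turns `predict` into a template method: the base class builds it on an abstract `predictMirror` that each concrete regression model supplies (via `reflectMethod`, as in the per-model hunks earlier). A minimal standalone sketch of the shape of that pattern, with the mirror faked by a plain function and all names hypothetical:

```scala
// Base class declares the abstract mirror; predict is implemented once on
// top of it. A real MethodMirror returns Any, hence the cast.
abstract class OpPredictionModelSketch {
  protected def predictMirror: Any => Any
  protected def predict(features: Any): Double =
    predictMirror.apply(features).asInstanceOf[Double]
  def transformFn(features: Any): Double = predict(features)
}

// A concrete model only needs to provide the mirror.
class ConstantModel extends OpPredictionModelSketch {
  protected val predictMirror: Any => Any = _ => 0.42
}

assert(new ConstantModel().transformFn("anything") == 0.42)
```

This keeps the reflection plumbing in one place per model while the shared transform logic lives in the base class.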
@@ -104,8 +104,8 @@ case object OpXGBoost {
}

/**
* Hack to access [[ml.dmlc.xgboost4j.scala.spark.XGBoost.processMissingValues]] private method
* Hack to access [[ml.dmlc.xgboost4j.scala.spark.XGBoost.removeMissingValues]] private method
*/
def processMissingValues(xgbLabelPoints: Iterator[LabeledPoint], missing: Float): Iterator[LabeledPoint] =
XGBoost.processMissingValues(xgbLabelPoints, missing)
def removeMissingValues(xgbLabelPoints: Iterator[LabeledPoint], missing: Float): Iterator[LabeledPoint] =
XGBoost.removeMissingValues(xgbLabelPoints, missing)
}
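The rename above tracks XGBoost's own private helper (`processMissingValues` in 0.90, `removeMissingValues` in 0.81), which drops feature entries equal to the configured `missing` sentinel before the points are fed into a `DMatrix`. A hypothetical standalone sketch of that filtering, using a toy `Point` in place of xgboost4j's `LabeledPoint` (names and NaN handling here are illustrative assumptions, not the library's exact code):

```scala
// Toy sparse point: parallel arrays of feature indices and values.
case class Point(indices: Array[Int], values: Array[Double])

// Drop entries whose value equals the `missing` sentinel. NaN needs a
// special case because NaN == NaN is false.
def removeMissing(points: Iterator[Point], missing: Double): Iterator[Point] =
  points.map { p =>
    val kept = p.indices.zip(p.values).filterNot { case (_, v) =>
      v == missing || (missing.isNaN && v.isNaN)
    }
    Point(kept.map(_._1), kept.map(_._2))
  }

val out = removeMissing(
  Iterator(Point(Array(0, 1, 2), Array(1.0, -999.0, 3.0))),
  missing = -999.0
).next()
assert(out.indices.sameElements(Array(0, 2)) && out.values.sameElements(Array(1.0, 3.0)))
```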