From f542df3107e6161f90a7394a36ab95932a0b3425 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 30 Nov 2016 13:21:05 -0800
Subject: [PATCH] [SPARK-18318][ML] ML, Graph 2.1 QA: API: New Scala APIs, docs

## What changes were proposed in this pull request?

API review for 2.1, except ```LSH``` related classes which are still under development.

## How was this patch tested?

Only doc changes, no new tests.

Author: Yanbo Liang

Closes #16009 from yanboliang/spark-18318.

(cherry picked from commit 60022bfd65e4637efc0eb5f4cc0112289c783147)
Signed-off-by: Joseph K. Bradley
---
 docs/ml-features.md                                    |  4 +++-
 .../spark/ml/classification/LogisticRegression.scala   |  6 +++---
 .../apache/spark/ml/classification/NaiveBayes.scala    |  2 +-
 .../org/apache/spark/ml/feature/Bucketizer.scala       |  7 ++++---
 .../org/apache/spark/ml/feature/ChiSqSelector.scala    |  2 ++
 .../apache/spark/ml/feature/QuantileDiscretizer.scala  | 11 +++++++----
 .../apache/spark/ml/optim/NormalEquationSolver.scala   |  8 ++++----
 .../spark/mllib/classification/NaiveBayes.scala        |  6 +++---
 .../apache/spark/mllib/feature/ChiSqSelector.scala     |  2 +-
 .../org/apache/spark/mllib/feature/HashingTF.scala     |  6 +++---
 10 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 45724a3716e74..9eecc1333d06f 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1158,7 +1158,9 @@ categorical features. The number of bins is set by the `numBuckets` parameter. I
 that the number of buckets used will be smaller than this value, for example, if there are too few
 distinct values of the input to create enough distinct quantiles.
 
-NaN values: Note also that QuantileDiscretizer
+NaN values:
+NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will produce
+a `Bucketizer` model for making predictions. During the transformation, `Bucketizer`
 will raise an error when it finds NaN values in the dataset, but the user can also choose to either
 keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep
 NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets
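The NaN semantics documented above are easiest to see end-to-end. A minimal sketch, assuming a DataFrame `df` with a numeric column `hour`; the column and variable names are illustrative, not part of the patch:

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("bucket")
  .setNumBuckets(4)
  .setHandleInvalid("keep")  // NaN rows land in an extra bucket instead of raising an error

val bucketizer = discretizer.fit(df)       // NaNs are dropped while computing the splits
val bucketed = bucketizer.transform(df)    // with "keep", NaNs go to bucket 4
```

With the default `handleInvalid = "error"`, the fitted `Bucketizer` fails on NaN input; `"skip"` filters those rows out instead.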
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index d3ae62e243302..5e1d6eec96a3e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -312,7 +312,6 @@ class LogisticRegression @Since("1.2.0") (
 
   private var optInitialModel: Option[LogisticRegressionModel] = None
 
-  /** @group setParam */
   private[spark] def setInitialModel(model: LogisticRegressionModel): this.type = {
     this.optInitialModel = Some(model)
     this
@@ -323,8 +322,9 @@ class LogisticRegression @Since("1.2.0") (
     train(dataset, handlePersistence)
   }
 
-  protected[spark] def train(dataset: Dataset[_], handlePersistence: Boolean):
-    LogisticRegressionModel = {
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): LogisticRegressionModel = {
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
     val instances: RDD[Instance] =
       dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index a2ac7000003d4..94ee2a2e7d9f4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DoubleType
 /**
  * Params for Naive Bayes Classifiers.
  */
-private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
+private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
 
   /**
    * The smoothing parameter.
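The reformatted `train` signature above also shows how instance weights enter training: when `weightCol` is unset or empty, a literal 1.0 is used for every row. A minimal caller-side sketch, where `training` is a hypothetical DataFrame with `label`, `features`, and `weight` columns:

```scala
import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()
  .setWeightCol("weight")  // if never set, every instance implicitly gets weight 1.0
val model = lr.fit(training)
```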
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 260159f8b7ac4..eb4d42f255345 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -84,11 +84,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
 
   /** @group getParam */
   @Since("2.1.0")
@@ -145,7 +146,7 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
   private[feature] val SKIP_INVALID: String = "skip"
   private[feature] val ERROR_INVALID: String = "error"
   private[feature] val KEEP_INVALID: String = "keep"
-  private[feature] val supportedHandleInvalid: Array[String] =
+  private[feature] val supportedHandleInvalids: Array[String] =
     Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 7cd0f159c6be7..8699929bab793 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -82,11 +82,13 @@ private[feature] trait ChiSqSelectorParams extends Params
    * Default value is 0.05.
    * @group param
    */
+  @Since("2.1.0")
   final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.",
     ParamValidators.inRange(0, 1))
   setDefault(fpr -> 0.05)
 
   /** @group getParam */
+  @Since("2.1.0")
   def getFpr: Double = $(fpr)
 
   /**
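The `fpr` param annotated above drives the false-positive-rate selection mode: features are kept only if their chi-squared p-value falls below the threshold. A minimal sketch, assuming a DataFrame `df` with `features` and a categorical `label` column (names illustrative):

```scala
import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setSelectorType("fpr")   // instead of the default "numTopFeatures"
  .setFpr(0.05)             // keep features with chi-squared p-value below 0.05
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")

val model = selector.fit(df)
val selected = model.transform(df)
```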
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index d8f33cd768dcd..b4fcfa2da47de 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -72,11 +72,12 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make QuantileDiscretizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
   setDefault(handleInvalid, Bucketizer.ERROR_INVALID)
 
   /** @group getParam */
@@ -91,8 +92,10 @@ private[feature] trait QuantileDiscretizerBase extends Params
  * possible that the number of buckets used will be smaller than this value, for example, if there
  * are too few distinct values of the input to create enough distinct quantiles.
  *
- * NaN handling: Note also that
- * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can
+ * NaN handling:
+ * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
+ * produce a `Bucketizer` model for making predictions. During the transformation,
+ * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
  * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
  * If the user chooses to keep NaN values, they will be handled specially and placed into their own
  * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3],
diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
index 96fd0d18b5ae9..dc3bcc6627339 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
@@ -34,7 +34,7 @@ import org.apache.spark.mllib.linalg.CholeskyDecomposition
  * @param objectiveHistory Option containing the objective history when an optimization program is
  *                         used to solve the normal equations. None when an analytic solver is used.
  */
-private[ml] class NormalEquationSolution(
+private[optim] class NormalEquationSolution(
     val coefficients: Array[Double],
     val aaInv: Option[Array[Double]],
     val objectiveHistory: Option[Array[Double]])
@@ -42,7 +42,7 @@ private[ml] class NormalEquationSolution(
 /**
  * Interface for classes that solve the normal equations locally.
  */
-private[ml] sealed trait NormalEquationSolver {
+private[optim] sealed trait NormalEquationSolver {
 
   /** Solve the normal equations from summary statistics. */
   def solve(
@@ -56,7 +56,7 @@ private[ml] sealed trait NormalEquationSolver {
 /**
  * A class that solves the normal equations directly, using Cholesky decomposition.
  */
-private[ml] class CholeskySolver extends NormalEquationSolver {
+private[optim] class CholeskySolver extends NormalEquationSolver {
 
   override def solve(
       bBar: Double,
@@ -75,7 +75,7 @@ private[ml] class CholeskySolver extends NormalEquationSolver {
 /**
  * A class for solving the normal equations using Quasi-Newton optimization methods.
  */
-private[ml] class QuasiNewtonSolver(
+private[optim] class QuasiNewtonSolver(
     fitIntercept: Boolean,
     maxIter: Int,
     tol: Double,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index fa46ba3ace508..9e8774732efe6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -392,13 +392,13 @@ class NaiveBayes private (
 object NaiveBayes {
 
   /** String name for multinomial model type. */
-  private[spark] val Multinomial: String = "multinomial"
+  private[classification] val Multinomial: String = "multinomial"
 
   /** String name for Bernoulli model type. */
-  private[spark] val Bernoulli: String = "bernoulli"
+  private[classification] val Bernoulli: String = "bernoulli"
 
   /* Set of modelTypes that NaiveBayes supports */
-  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)
 
   /**
   * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
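The solvers renamed above all solve the normal equations (X^T X) beta = X^T y from summary statistics; `CholeskySolver` does so analytically. A rough illustration of the underlying linear algebra using Breeze (a Spark dependency); the toy matrices are made up, and this is a conceptual sketch, not the private `optim` API itself:

```scala
import breeze.linalg.{DenseMatrix, DenseVector}

// Toy design matrix X (3 instances, 2 features incl. intercept column) and targets y.
val x = DenseMatrix((1.0, 2.0), (1.0, 3.0), (1.0, 5.0))
val y = DenseVector(2.0, 3.0, 5.0)

// Normal equations: (X^T X) beta = X^T y.
val gram = x.t * x   // plays the role of the "AtA" summary statistics
val xty = x.t * y    // plays the role of the "Atb" summary statistics
val beta = gram \ xty  // Breeze solves the linear system; gram is symmetric positive definite
```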
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 05ad2492f8c43..7ef2a95b96f2d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -266,7 +266,7 @@ private[spark] object ChiSqSelector {
   val Percentile: String = "percentile"
 
   /** String name for `fpr` selector type. */
-  private[spark] val FPR: String = "fpr"
+  val FPR: String = "fpr"
 
   /** Set of selector types that ChiSqSelector supports. */
   val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index bc26655104a9b..9abdd44a635d1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -131,9 +131,9 @@ class HashingTF(val numFeatures: Int) extends Serializable {
 
 object HashingTF {
 
-  private[spark] val Native: String = "native"
+  private[HashingTF] val Native: String = "native"
 
-  private[spark] val Murmur3: String = "murmur3"
+  private[HashingTF] val Murmur3: String = "murmur3"
 
   private val seed = 42
 
@@ -141,7 +141,7 @@ object HashingTF {
   * Calculate a hash code value for the term object using the native Scala implementation.
   * This is the default hash algorithm used in Spark 1.6 and earlier.
   */
-  private[spark] def nativeHash(term: Any): Int = term.##
+  private[HashingTF] def nativeHash(term: Any): Int = term.##
 
  /**
   * Calculate a hash code value for the term object using
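The `native` and `murmur3` names above select between `HashingTF`'s two hash functions for the hashing trick: a term is mapped to index `hash(term) mod numFeatures` and its occurrences are counted there. A minimal sketch, assuming the murmur3 default (the vector size and terms are illustrative):

```scala
import org.apache.spark.mllib.feature.HashingTF

val tf = new HashingTF(1 << 10)                     // 1024-dimensional output vectors
val i = tf.indexOf("spark")                         // hash("spark") mod numFeatures
val v = tf.transform(Seq("spark", "ml", "spark"))   // sparse term-frequency vector

// Opting back into the pre-2.0 hash function via the "native" algorithm name:
val tfNative = new HashingTF(1 << 10).setHashAlgorithm("native")
```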