From f542df3107e6161f90a7394a36ab95932a0b3425 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 30 Nov 2016 13:21:05 -0800
Subject: [PATCH] [SPARK-18318][ML] ML, Graph 2.1 QA: API: New Scala APIs, docs

## What changes were proposed in this pull request?

API review for 2.1, except ```LSH``` related classes which are still under development.

## How was this patch tested?

Only doc changes, no new tests.

Author: Yanbo Liang

Closes #16009 from yanboliang/spark-18318.

(cherry picked from commit 60022bfd65e4637efc0eb5f4cc0112289c783147)
Signed-off-by: Joseph K. Bradley
---
 docs/ml-features.md                                    |  4 +++-
 .../spark/ml/classification/LogisticRegression.scala   |  6 +++---
 .../apache/spark/ml/classification/NaiveBayes.scala    |  2 +-
 .../org/apache/spark/ml/feature/Bucketizer.scala       |  7 ++++---
 .../org/apache/spark/ml/feature/ChiSqSelector.scala    |  2 ++
 .../apache/spark/ml/feature/QuantileDiscretizer.scala  | 11 +++++++----
 .../apache/spark/ml/optim/NormalEquationSolver.scala   |  8 ++++----
 .../spark/mllib/classification/NaiveBayes.scala        |  6 +++---
 .../apache/spark/mllib/feature/ChiSqSelector.scala     |  2 +-
 .../org/apache/spark/mllib/feature/HashingTF.scala     |  6 +++---
 10 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 45724a3716e74..9eecc1333d06f 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1158,7 +1158,9 @@ categorical features. The number of bins is set by the `numBuckets` parameter. I
 that the number of buckets used will be smaller than this value, for example, if there are too few
 distinct values of the input to create enough distinct quantiles.
 
-NaN values: Note also that QuantileDiscretizer
+NaN values:
+NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will produce
+a `Bucketizer` model for making predictions. During the transformation, `Bucketizer`
 will raise an error when it finds NaN values in the dataset, but the user can also choose to either
 keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep
 NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets
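The NaN semantics documented above are easiest to see end-to-end. A minimal sketch, assuming a DataFrame `df` with a numeric column `hour`; the column and variable names are illustrative, not part of the patch:

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("bucket")
  .setNumBuckets(4)
  .setHandleInvalid("keep")  // NaN rows land in an extra bucket instead of raising an error

val bucketizer = discretizer.fit(df)       // NaNs are dropped while computing the splits
val bucketed = bucketizer.transform(df)    // with "keep", NaNs go to bucket 4
```

With the default `handleInvalid = "error"`, the fitted `Bucketizer` fails on NaN input; `"skip"` filters those rows out instead.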
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index d3ae62e243302..5e1d6eec96a3e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -312,7 +312,6 @@ class LogisticRegression @Since("1.2.0") (
 
   private var optInitialModel: Option[LogisticRegressionModel] = None
 
-  /** @group setParam */
   private[spark] def setInitialModel(model: LogisticRegressionModel): this.type = {
     this.optInitialModel = Some(model)
     this
@@ -323,8 +322,9 @@ class LogisticRegression @Since("1.2.0") (
     train(dataset, handlePersistence)
   }
 
-  protected[spark] def train(dataset: Dataset[_], handlePersistence: Boolean):
-    LogisticRegressionModel = {
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): LogisticRegressionModel = {
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
     val instances: RDD[Instance] =
       dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index a2ac7000003d4..94ee2a2e7d9f4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DoubleType
 /**
  * Params for Naive Bayes Classifiers.
  */
-private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
+private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
 
   /**
    * The smoothing parameter.
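The reformatted `train` signature above also shows how instance weights enter training: when `weightCol` is unset or empty, a literal 1.0 is used for every row. A minimal caller-side sketch, where `training` is a hypothetical DataFrame with `label`, `features`, and `weight` columns:

```scala
import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()
  .setWeightCol("weight")  // if never set, every instance implicitly gets weight 1.0
val model = lr.fit(training)
```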
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 260159f8b7ac4..eb4d42f255345 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -84,11 +84,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
 
   /** @group getParam */
   @Since("2.1.0")
@@ -145,7 +146,7 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
   private[feature] val SKIP_INVALID: String = "skip"
   private[feature] val ERROR_INVALID: String = "error"
   private[feature] val KEEP_INVALID: String = "keep"
-  private[feature] val supportedHandleInvalid: Array[String] =
+  private[feature] val supportedHandleInvalids: Array[String] =
     Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 7cd0f159c6be7..8699929bab793 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -82,11 +82,13 @@ private[feature] trait ChiSqSelectorParams extends Params
    * Default value is 0.05.
    * @group param
    */
+  @Since("2.1.0")
   final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.",
     ParamValidators.inRange(0, 1))
   setDefault(fpr -> 0.05)
 
   /** @group getParam */
+  @Since("2.1.0")
   def getFpr: Double = $(fpr)
 
   /**
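The `fpr` param annotated above drives the false-positive-rate selection mode: features are kept only if their chi-squared p-value falls below the threshold. A minimal sketch, assuming a DataFrame `df` with `features` and a categorical `label` column (names illustrative):

```scala
import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setSelectorType("fpr")   // instead of the default "numTopFeatures"
  .setFpr(0.05)             // keep features with chi-squared p-value below 0.05
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")

val model = selector.fit(df)
val selected = model.transform(df)
```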
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index d8f33cd768dcd..b4fcfa2da47de 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -72,11 +72,12 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make QuantileDiscretizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
   setDefault(handleInvalid, Bucketizer.ERROR_INVALID)
 
   /** @group getParam */
@@ -91,8 +92,10 @@ private[feature] trait QuantileDiscretizerBase extends Params
  * possible that the number of buckets used will be smaller than this value, for example, if there
  * are too few distinct values of the input to create enough distinct quantiles.
  *
- * NaN handling: Note also that
- * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can
+ * NaN handling:
+ * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
+ * produce a `Bucketizer` model for making predictions. During the transformation,
+ * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
  * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
  * If the user chooses to keep NaN values, they will be handled specially and placed into their own
  * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3],
diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
index 96fd0d18b5ae9..dc3bcc6627339 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
@@ -34,7 +34,7 @@ import org.apache.spark.mllib.linalg.CholeskyDecomposition
  * @param objectiveHistory Option containing the objective history when an optimization program is
  *                         used to solve the normal equations. None when an analytic solver is used.
  */
-private[ml] class NormalEquationSolution(
+private[optim] class NormalEquationSolution(
     val coefficients: Array[Double],
     val aaInv: Option[Array[Double]],
     val objectiveHistory: Option[Array[Double]])
@@ -42,7 +42,7 @@ private[ml] class NormalEquationSolution(
 /**
  * Interface for classes that solve the normal equations locally.
  */
-private[ml] sealed trait NormalEquationSolver {
+private[optim] sealed trait NormalEquationSolver {
 
   /** Solve the normal equations from summary statistics. */
   def solve(
@@ -56,7 +56,7 @@ private[ml] sealed trait NormalEquationSolver {
 /**
  * A class that solves the normal equations directly, using Cholesky decomposition.
  */
-private[ml] class CholeskySolver extends NormalEquationSolver {
+private[optim] class CholeskySolver extends NormalEquationSolver {
 
   override def solve(
       bBar: Double,
@@ -75,7 +75,7 @@ private[ml] class CholeskySolver extends NormalEquationSolver {
 /**
  * A class for solving the normal equations using Quasi-Newton optimization methods.
  */
-private[ml] class QuasiNewtonSolver(
+private[optim] class QuasiNewtonSolver(
     fitIntercept: Boolean,
     maxIter: Int,
     tol: Double,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index fa46ba3ace508..9e8774732efe6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -392,13 +392,13 @@ class NaiveBayes private (
 object NaiveBayes {
 
   /** String name for multinomial model type. */
-  private[spark] val Multinomial: String = "multinomial"
+  private[classification] val Multinomial: String = "multinomial"
 
   /** String name for Bernoulli model type. */
-  private[spark] val Bernoulli: String = "bernoulli"
+  private[classification] val Bernoulli: String = "bernoulli"
 
   /* Set of modelTypes that NaiveBayes supports */
-  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)
 
   /**
   * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
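The solvers renamed above all solve the normal equations (X^T X) beta = X^T y from summary statistics; `CholeskySolver` does so analytically. A rough illustration of the underlying linear algebra using Breeze (a Spark dependency); the toy matrices are made up, and this is a conceptual sketch, not the private `optim` API itself:

```scala
import breeze.linalg.{DenseMatrix, DenseVector}

// Toy design matrix X (3 instances, 2 features incl. intercept column) and targets y.
val x = DenseMatrix((1.0, 2.0), (1.0, 3.0), (1.0, 5.0))
val y = DenseVector(2.0, 3.0, 5.0)

// Normal equations: (X^T X) beta = X^T y.
val gram = x.t * x   // plays the role of the "AtA" summary statistics
val xty = x.t * y    // plays the role of the "Atb" summary statistics
val beta = gram \ xty  // Breeze solves the linear system; gram is symmetric positive definite
```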
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 05ad2492f8c43..7ef2a95b96f2d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -266,7 +266,7 @@ private[spark] object ChiSqSelector {
   val Percentile: String = "percentile"
 
   /** String name for `fpr` selector type. */
-  private[spark] val FPR: String = "fpr"
+  val FPR: String = "fpr"
 
   /** Set of selector types that ChiSqSelector supports. */
   val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index bc26655104a9b..9abdd44a635d1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -131,9 +131,9 @@ class HashingTF(val numFeatures: Int) extends Serializable {
 
 object HashingTF {
 
-  private[spark] val Native: String = "native"
+  private[HashingTF] val Native: String = "native"
 
-  private[spark] val Murmur3: String = "murmur3"
+  private[HashingTF] val Murmur3: String = "murmur3"
 
   private val seed = 42
 
@@ -141,7 +141,7 @@ object HashingTF {
   * Calculate a hash code value for the term object using the native Scala implementation.
   * This is the default hash algorithm used in Spark 1.6 and earlier.
   */
-  private[spark] def nativeHash(term: Any): Int = term.##
+  private[HashingTF] def nativeHash(term: Any): Int = term.##
 
  /**
   * Calculate a hash code value for the term object using
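The `native` and `murmur3` names above select between `HashingTF`'s two hash functions for the hashing trick: a term is mapped to index `hash(term) mod numFeatures` and its occurrences are counted there. A minimal sketch, assuming the murmur3 default (the vector size and terms are illustrative):

```scala
import org.apache.spark.mllib.feature.HashingTF

val tf = new HashingTF(1 << 10)                     // 1024-dimensional output vectors
val i = tf.indexOf("spark")                         // hash("spark") mod numFeatures
val v = tf.transform(Seq("spark", "ml", "spark"))   // sparse term-frequency vector

// Opting back into the pre-2.0 hash function via the "native" algorithm name:
val tfNative = new HashingTF(1 << 10).setHashAlgorithm("native")
```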