# [SPARK-18318][ML] ML, Graph 2.1 QA: API: New Scala APIs, docs
## What changes were proposed in this pull request?
API review for 2.1, except the ```LSH```-related classes, which are still under development.

## How was this patch tested?
Only doc changes, no new tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes apache#16009 from yanboliang/spark-18318.
yanboliang authored and uzadude committed Jan 27, 2017
1 parent 0e835f5 commit fa59136
Showing 10 changed files with 31 additions and 23 deletions.
#### docs/ml-features.md (3 additions, 1 deletion)
```diff
@@ -1188,7 +1188,9 @@ categorical features. The number of bins is set by the `numBuckets` parameter. It is possible
 that the number of buckets used will be smaller than this value, for example, if there are too few
 distinct values of the input to create enough distinct quantiles.
 
-NaN values: Note also that QuantileDiscretizer
+NaN values:
+NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will produce
+a `Bucketizer` model for making predictions. During the transformation, `Bucketizer`
 will raise an error when it finds NaN values in the dataset, but the user can also choose to either
 keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep
 NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets
```
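As a usage sketch of the fit-then-transform behavior the new paragraph documents (not part of this commit; assumes an active `SparkSession` named `spark`, and the column names and data are illustrative):

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

// Illustrative data; the last row carries a NaN value.
val df = spark.createDataFrame(
  Seq((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, Double.NaN))
).toDF("id", "hour")

val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("bucket")
  .setNumBuckets(3)
  .setHandleInvalid("keep") // NaN rows are placed in their own extra bucket

// NaN values are dropped while fitting; the result is a Bucketizer model.
val bucketizer = discretizer.fit(df)
bucketizer.transform(df).show()
```

With `handleInvalid` left at its default of `"error"`, the same `transform` call would throw on the NaN row.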
#### mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
```diff
@@ -312,7 +312,6 @@ class LogisticRegression @Since("1.2.0") (
 
   private var optInitialModel: Option[LogisticRegressionModel] = None
 
-  /** @group setParam */
   private[spark] def setInitialModel(model: LogisticRegressionModel): this.type = {
     this.optInitialModel = Some(model)
     this
@@ -323,8 +322,9 @@
     train(dataset, handlePersistence)
   }
 
-  protected[spark] def train(dataset: Dataset[_], handlePersistence: Boolean):
-    LogisticRegressionModel = {
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): LogisticRegressionModel = {
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
     val instances: RDD[Instance] =
       dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
```
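For context, the `lit(1.0)` fallback visible in this hunk is what makes instance weights optional. A hedged usage sketch (`training` is an assumed DataFrame with `label`, `features`, and `weight` columns):

```scala
import org.apache.spark.ml.classification.LogisticRegression

// When setWeightCol is called, each row's "weight" value scales its contribution
// to the loss; when it is not, train() falls back to a constant weight of 1.0.
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setWeightCol("weight")

val model = lr.fit(training)
println(model.coefficients)
```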
#### mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
```diff
@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DoubleType
 /**
  * Params for Naive Bayes Classifiers.
  */
-private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
+private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
 
   /**
    * The smoothing parameter.
```
#### mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
```diff
@@ -84,11 +84,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
 
   /** @group getParam */
   @Since("2.1.0")
@@ -145,7 +146,7 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
   private[feature] val SKIP_INVALID: String = "skip"
   private[feature] val ERROR_INVALID: String = "error"
   private[feature] val KEEP_INVALID: String = "keep"
-  private[feature] val supportedHandleInvalid: Array[String] =
+  private[feature] val supportedHandleInvalids: Array[String] =
     Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)
 
   /**
```
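The renamed `supportedHandleInvalids` backs the `handleInvalid` param shown above. A minimal sketch of the three options, assuming an active `SparkSession` named `spark` (splits and data are illustrative):

```scala
import org.apache.spark.ml.feature.Bucketizer

val bucketizer = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketedFeatures")
  .setSplits(Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity))
  .setHandleInvalid("keep") // "skip" filters the NaN row; "error" (the default) throws

val df = spark.createDataFrame(
  Seq(-999.9, -0.5, -0.3, 0.0, 0.2, Double.NaN).map(Tuple1.apply)
).toDF("features")

// With "keep", the NaN row is assigned to one extra bucket beyond the regular ones.
bucketizer.transform(df).show()
```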
#### mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
```diff
@@ -82,11 +82,13 @@ private[feature] trait ChiSqSelectorParams extends Params
    * Default value is 0.05.
    * @group param
    */
+  @Since("2.1.0")
   final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.",
     ParamValidators.inRange(0, 1))
   setDefault(fpr -> 0.05)
 
   /** @group getParam */
+  @Since("2.1.0")
   def getFpr: Double = $(fpr)
 
   /**
```
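The newly annotated `fpr` param is used together with the `fpr` selector type. A hedged sketch, assuming an active `SparkSession` named `spark` (data is illustrative):

```scala
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.ml.linalg.Vectors

val df = spark.createDataFrame(Seq(
  (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
  (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
  (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
)).toDF("id", "features", "clicked")

// With selectorType "fpr", a feature is kept only if its chi-squared test
// p-value is at most the fpr threshold (default 0.05, as set explicitly here).
val selector = new ChiSqSelector()
  .setSelectorType("fpr")
  .setFpr(0.05)
  .setFeaturesCol("features")
  .setLabelCol("clicked")
  .setOutputCol("selectedFeatures")

selector.fit(df).transform(df).show()
```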
#### mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
```diff
@@ -72,11 +72,12 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make QuantileDiscretizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
   setDefault(handleInvalid, Bucketizer.ERROR_INVALID)
 
   /** @group getParam */
@@ -91,8 +92,10 @@ private[feature] trait QuantileDiscretizerBase extends Params
  * possible that the number of buckets used will be smaller than this value, for example, if there
  * are too few distinct values of the input to create enough distinct quantiles.
  *
- * NaN handling: Note also that
- * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can
+ * NaN handling:
+ * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
+ * produce a `Bucketizer` model for making predictions. During the transformation,
+ * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
  * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
  * If the user chooses to keep NaN values, they will be handled specially and placed into their own
  * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3],
```
#### mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
```diff
@@ -34,15 +34,15 @@ import org.apache.spark.mllib.linalg.CholeskyDecomposition
  * @param objectiveHistory Option containing the objective history when an optimization program is
  *                         used to solve the normal equations. None when an analytic solver is used.
  */
-private[ml] class NormalEquationSolution(
+private[optim] class NormalEquationSolution(
     val coefficients: Array[Double],
     val aaInv: Option[Array[Double]],
     val objectiveHistory: Option[Array[Double]])
 
 /**
  * Interface for classes that solve the normal equations locally.
  */
-private[ml] sealed trait NormalEquationSolver {
+private[optim] sealed trait NormalEquationSolver {
 
   /** Solve the normal equations from summary statistics. */
   def solve(
@@ -56,7 +56,7 @@ private[ml] sealed trait NormalEquationSolver {
 /**
  * A class that solves the normal equations directly, using Cholesky decomposition.
  */
-private[ml] class CholeskySolver extends NormalEquationSolver {
+private[optim] class CholeskySolver extends NormalEquationSolver {
 
   override def solve(
       bBar: Double,
@@ -75,7 +75,7 @@ private[ml] class CholeskySolver extends NormalEquationSolver {
 /**
  * A class for solving the normal equations using Quasi-Newton optimization methods.
  */
-private[ml] class QuasiNewtonSolver(
+private[optim] class QuasiNewtonSolver(
     fitIntercept: Boolean,
     maxIter: Int,
     tol: Double,
```
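These solvers are internal (`private[optim]`), but the idea they implement is easy to show standalone. A toy sketch in Breeze (a Spark dependency), not the commit's code: solve the normal equations (A^T A) x = A^T b for a least-squares fit, which is the same system `CholeskySolver` factorizes:

```scala
import breeze.linalg.{DenseMatrix, DenseVector}

// Toy least-squares problem: fit y ~ x0 + x1 * t through three points.
val A = DenseMatrix((1.0, 1.0), (1.0, 2.0), (1.0, 3.0)) // [intercept column | t]
val b = DenseVector(1.0, 2.0, 2.0)

val ata = A.t * A // Gram matrix: the kind of summary statistic the solvers consume
val atb = A.t * b

// Breeze's \ solves the square system directly; CholeskySolver does the
// equivalent via a Cholesky factorization, while QuasiNewtonSolver iterates
// (e.g. when L1 regularization rules out an analytic solve).
val x = ata \ atb
println(x) // approx. DenseVector(0.6667, 0.5): intercept and slope
```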
#### mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
```diff
@@ -392,13 +392,13 @@ class NaiveBayes private (
 object NaiveBayes {
 
   /** String name for multinomial model type. */
-  private[spark] val Multinomial: String = "multinomial"
+  private[classification] val Multinomial: String = "multinomial"
 
   /** String name for Bernoulli model type. */
-  private[spark] val Bernoulli: String = "bernoulli"
+  private[classification] val Bernoulli: String = "bernoulli"
 
   /* Set of modelTypes that NaiveBayes supports */
-  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)
 
   /**
    * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
```
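The `Multinomial` and `Bernoulli` strings above are the values accepted by the public `modelType` argument. A brief sketch (assumes an active `SparkContext` named `sc`; data is illustrative):

```scala
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// "bernoulli" expects binary (0/1) feature values; the default "multinomial"
// accepts any nonnegative counts.
val data = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 0.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 1.0, 1.0))
))

val model = NaiveBayes.train(data, lambda = 1.0, modelType = "bernoulli")
println(model.predict(Vectors.dense(0.0, 1.0, 0.0)))
```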
#### mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
```diff
@@ -266,7 +266,7 @@ private[spark] object ChiSqSelector {
   val Percentile: String = "percentile"
 
   /** String name for `fpr` selector type. */
-  private[spark] val FPR: String = "fpr"
+  val FPR: String = "fpr"
 
   /** Set of selector types that ChiSqSelector supports. */
   val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
```
#### mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
```diff
@@ -131,17 +131,17 @@ class HashingTF(val numFeatures: Int) extends Serializable {
 
 object HashingTF {
 
-  private[spark] val Native: String = "native"
+  private[HashingTF] val Native: String = "native"
 
-  private[spark] val Murmur3: String = "murmur3"
+  private[HashingTF] val Murmur3: String = "murmur3"
 
   private val seed = 42
 
   /**
    * Calculate a hash code value for the term object using the native Scala implementation.
    * This is the default hash algorithm used in Spark 1.6 and earlier.
    */
-  private[spark] def nativeHash(term: Any): Int = term.##
+  private[HashingTF] def nativeHash(term: Any): Int = term.##
 
   /**
    * Calculate a hash code value for the term object using
```
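`Native` and `Murmur3` correspond to the values accepted by the public `setHashAlgorithm` setter. A short sketch (term strings are illustrative):

```scala
import org.apache.spark.mllib.feature.HashingTF

// Murmur3 is the default in Spark 2.x; "native" restores the Scala ##-based
// hashing that was the default through Spark 1.6.
val tf = new HashingTF(numFeatures = 1 << 18).setHashAlgorithm("native")

val doc = Seq("spark", "hashing", "tf", "spark")
println(tf.transform(doc)) // sparse term-frequency vector
```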
