diff --git a/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala b/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala
index e678eaaa37810..eefca193ec53e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala
@@ -26,11 +26,11 @@ import org.apache.spark.rdd.RDD
 class MLContext(self: SparkContext) {
   /**
    * Reads labeled data in the LIBSVM format into an RDD[LabeledPoint].
-   * The LIBSVM format is a text-based format used by LIBSVM (http://www.csie.ntu.edu.tw/~cjlin/libsvm/).
+   * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
    * Each line represents a labeled sparse feature vector using the following format:
    * {{{label index1:value1 index2:value2 ...}}}
    * where the indices are one-based and in ascending order.
-   * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]] instance,
+   * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]],
    * where the feature indices are converted to zero-based.
    *
    * @param path file or directory path in any Hadoop-supported file system URI
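Note: as a quick illustration of the format documented above, each line is a label followed by one-based index:value pairs, which the loader shifts to zero-based. A minimal standalone sketch of that parsing rule (not the patch's implementation; LibSVMLineExample is a hypothetical name):

// Minimal sketch of the LIBSVM line format described in the doc above;
// not part of this patch, LibSVMLineExample is a made-up name.
object LibSVMLineExample {
  def parse(line: String): (Double, Seq[(Int, Double)]) = {
    val tokens = line.trim.split("\\s+")
    val label = tokens.head.toDouble
    val features = tokens.tail.map { token =>
      val Array(index, value) = token.split(":")
      (index.toInt - 1, value.toDouble) // one-based on disk, zero-based in memory
    }
    (label, features.toSeq)
  }

  def main(args: Array[String]) {
    // "1.0 1:0.5 3:2.0" -> (1.0, Seq((0, 0.5), (2, 2.0)))
    println(parse("1.0 1:0.5 3:2.0"))
  }
}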
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
index a174d4e77d33e..20654284965ed 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
@@ -37,17 +37,17 @@ abstract class Gradient extends Serializable {
   def compute(data: Vector, label: Double, weights: Vector): (Vector, Double)
 
   /**
-   * Compute the gradient and loss given the features of a single data point, add the gradient to a provided vector to
-   * avoid creating new objects, and return loss.
+   * Compute the gradient and loss given the features of a single data point,
+   * add the gradient to a provided vector to avoid creating new objects, and return loss.
    *
    * @param data features for one data point
    * @param label label for this data point
    * @param weights weights/coefficients corresponding to features
-   * @param gradientAddTo gradient will be added to this vector
+   * @param cumGradient the computed gradient will be added to this vector
    *
-   * @return (gradient: Vector, loss: Double)
+   * @return loss
    */
-  def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double
+  def compute(data: Vector, label: Double, weights: Vector, cumGradient: Vector): Double
 }
 
 /**
@@ -71,13 +71,17 @@ class LogisticGradient extends Gradient {
     (Vectors.fromBreeze(gradient), loss)
   }
 
-  override def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double = {
+  override def compute(
+      data: Vector,
+      label: Double,
+      weights: Vector,
+      cumGradient: Vector): Double = {
     val brzData = data.toBreeze
     val brzWeights = weights.toBreeze
     val margin: Double = -1.0 * brzWeights.dot(brzData)
     val gradientMultiplier = (1.0 / (1.0 + math.exp(margin))) - label
 
-    brzAxpy(gradientMultiplier, brzData, gradientAddTo.toBreeze)
+    brzAxpy(gradientMultiplier, brzData, cumGradient.toBreeze)
 
     if (label > 0) {
       math.log(1 + math.exp(margin))
@@ -104,12 +108,16 @@ class LeastSquaresGradient extends Gradient {
     (Vectors.fromBreeze(gradient), loss)
   }
 
-  override def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double = {
+  override def compute(
+      data: Vector,
+      label: Double,
+      weights: Vector,
+      cumGradient: Vector): Double = {
     val brzData = data.toBreeze
     val brzWeights = weights.toBreeze
     val diff = brzWeights.dot(brzData) - label
 
-    brzAxpy(2.0 * diff, brzData, gradientAddTo.toBreeze)
+    brzAxpy(2.0 * diff, brzData, cumGradient.toBreeze)
 
     diff * diff
   }
@@ -137,7 +145,11 @@ class HingeGradient extends Gradient {
     }
   }
 
-  override def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double = {
+  override def compute(
+      data: Vector,
+      label: Double,
+      weights: Vector,
+      cumGradient: Vector): Double = {
     val brzData = data.toBreeze
     val brzWeights = weights.toBreeze
     val dotProduct = brzWeights.dot(brzData)
@@ -147,7 +159,7 @@
     val labelScaled = 2 * label - 1.0
 
     if (1.0 > labelScaled * dotProduct) {
-      brzAxpy(-labelScaled, brzData, gradientAddTo.toBreeze)
+      brzAxpy(-labelScaled, brzData, cumGradient.toBreeze)
       1.0 - labelScaled * dotProduct
     } else {
       0.0
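Note: the point of the second compute overload is visible in the brzAxpy calls: a caller can fold many per-example gradients into one reusable buffer instead of allocating a fresh vector per example. A minimal sketch of that calling pattern in plain Breeze (a hypothetical driver, not this patch's optimizer; the data and names are illustrative):

import breeze.linalg.{axpy, DenseVector}

// Hypothetical driver; only the axpy-into-a-shared-buffer idea comes from the patch.
object CumGradientExample {
  def main(args: Array[String]) {
    val examples = Seq((DenseVector(1.0, 0.0), 1.0), (DenseVector(0.0, 1.0), 0.0))
    val weights = DenseVector(0.1, -0.2)
    val cumGradient = DenseVector.zeros[Double](2) // one buffer for all examples
    var totalLoss = 0.0
    for ((x, label) <- examples) {
      // Same logistic gradient as LogisticGradient.compute above.
      val margin = -1.0 * (weights dot x)
      val multiplier = 1.0 / (1.0 + math.exp(margin)) - label
      axpy(multiplier, x, cumGradient) // adds in place, no new vector
      totalLoss += (if (label > 0) {
        math.log(1 + math.exp(margin))
      } else {
        math.log(1 + math.exp(margin)) - margin
      })
    }
    println(s"cumGradient = $cumGradient, loss = $totalLoss")
  }
}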
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala
index 4034e79ae924a..911e0c0178cab 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala
@@ -17,13 +17,13 @@
 
 package org.apache.spark.mllib.regression
 
+import breeze.linalg.{Vector => BV}
+
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.optimization._
 import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
-
-import breeze.linalg.{Vector => BV, DenseVector => BDV}
 
 /**
  * Regression model trained using Lasso.
@@ -142,7 +142,8 @@
       regParam: Double,
       miniBatchFraction: Double,
       initialWeights: Vector): LassoModel = {
-    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input, initialWeights)
+    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction)
+      .run(input, initialWeights)
   }
 
   /**
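Note: the wrapped constructor call above is the body of one LassoWithSGD.train overload. A hypothetical driver against the new Vector-based API could look like the following; the parameter values are illustrative, LassoTrainExample is a made-up name, and the argument order assumes the train(input, numIterations, stepSize, regParam, miniBatchFraction, initialWeights) overload shown partially in the hunk:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LassoWithSGD}

// Hypothetical driver; parameter values are illustrative only.
object LassoTrainExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "LassoTrainExample")
    val data = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(2.0, Vectors.dense(2.0, 0.1))))
    val initialWeights = Vectors.dense(0.0, 0.0)
    val model = LassoWithSGD.train(
      data, 100 /* numIterations */, 1.0 /* stepSize */,
      0.01 /* regParam */, 1.0 /* miniBatchFraction */, initialWeights)
    println(model.predict(Vectors.dense(3.0, 0.2)))
    sc.stop()
  }
}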
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala
index 6bc8850f8f6c6..9ed927994e795 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala
@@ -19,9 +19,9 @@ package org.apache.spark.mllib.regression
 
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.optimization._
 import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
 
 /**
  * Regression model trained using LinearRegression.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
index 22a0e8b495957..0f10e48af02e4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
@@ -17,14 +17,14 @@
 
 package org.apache.spark.mllib.regression
 
+import breeze.linalg.{Vector => BV}
+
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.optimization._
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.mllib.linalg.{Vectors, Vector}
 
-import breeze.linalg.{Vector => BV, DenseVector => BDV}
-
 /**
  * Regression model trained using RidgeRegression.
  *
@@ -72,9 +72,9 @@ class RidgeRegressionWithSGD private (
   // We don't want to penalize the intercept in RidgeRegression, so set this to false.
   super.setIntercept(false)
 
-  var yMean = 0.0
-  var xColMean: BV[Double] = _
-  var xColSd: BV[Double] = _
+  private var yMean = 0.0
+  private var xColMean: BV[Double] = _
+  private var xColSd: BV[Double] = _
 
   /**
    * Construct a RidgeRegression object with default parameters
@@ -214,8 +214,8 @@ object RidgeRegressionWithSGD {
 
   def main(args: Array[String]) {
     if (args.length != 5) {
-      println("Usage: RidgeRegression <master> <input_dir> <step_size> <regularization_parameter>" +
-        " <niters>")
+      println("Usage: RidgeRegression <master> <input_dir> <step_size> " +
+        "<regularization_parameter> <niters>")
       System.exit(1)
     }
     val sc = new SparkContext(args(0), "RidgeRegression")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 866596ded8d5c..68c56dd6a6f1d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -82,7 +82,10 @@ object MLUtils {
    * xColMean - Row vector with mean for every column (or feature) of the input data
    * xColSd - Row vector standard deviation for every column (or feature) of the input data.
    */
-  def computeStats(data: RDD[LabeledPoint], numFeatures: Int, numExamples: Long): (Double, Vector, Vector) = {
+  def computeStats(
+      data: RDD[LabeledPoint],
+      numFeatures: Int,
+      numExamples: Long): (Double, Vector, Vector) = {
     val brzData = data.map { case LabeledPoint(label, features) =>
       (label, features.toBreeze)
     }
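Note: to make the returned tuple concrete, and assuming its first element is the label mean (matching the private yMean/xColMean/xColSd fields in RidgeRegressionWithSGD above), a hypothetical call of the reformatted helper looks like this; the dataset and ComputeStatsExample name are illustrative only:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

// Hypothetical usage; the tuple is (label mean, per-column feature means,
// per-column feature standard deviations).
object ComputeStatsExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "ComputeStatsExample")
    val data = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 10.0)),
      LabeledPoint(3.0, Vectors.dense(3.0, 30.0))))
    val (yMean, xColMean, xColSd) = MLUtils.computeStats(data, 2, 2L)
    println(s"yMean = $yMean, xColMean = $xColMean, xColSd = $xColSd")
    sc.stop()
  }
}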
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala
index 02697d983588e..6762f8c479e98 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala
@@ -17,13 +17,16 @@
 
 package org.apache.spark.mllib
 
-import org.apache.spark.mllib.MLContext._
-import org.apache.spark.mllib.util.LocalSparkContext
-import org.scalatest.FunSuite
-import com.google.common.io.Files
 import java.io.File
+
+import org.scalatest.FunSuite
+
 import com.google.common.base.Charsets
+import com.google.common.io.Files
+
+import org.apache.spark.mllib.MLContext._
 import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext
 
 class MLContextSuite extends FunSuite with LocalSparkContext {
   test("libSVMFile") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index e83e3e73eedc4..516895d04222d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -21,9 +21,9 @@ import scala.util.Random
 
 import org.scalatest.FunSuite
 
+import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.LocalSparkContext
-import org.apache.spark.mllib.linalg.Vectors
 
 object NaiveBayesSuite {
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
index efffab743d5f4..2f7d30708ce17 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.regression
 
 import org.scalatest.FunSuite
 
-import org.apache.spark.mllib.util.{LinearDataGenerator, LocalSparkContext}
 import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.{LinearDataGenerator, LocalSparkContext}
 
 class LinearRegressionSuite extends FunSuite with LocalSparkContext {
 
@@ -88,7 +88,8 @@ class LinearRegressionSuite extends FunSuite with LocalSparkContext {
 
   // Test if we can correctly learn Y = 10*X1 + 10*X10000
   test("sparse linear regression without intercept") {
-    val denseRDD = sc.parallelize(LinearDataGenerator.generateLinearInput(0.0, Array(10.0, 10.0), 100, 42), 2)
+    val denseRDD = sc.parallelize(
+      LinearDataGenerator.generateLinearInput(0.0, Array(10.0, 10.0), 100, 42), 2)
     val sparseRDD = denseRDD.map { case LabeledPoint(label, v) =>
       val sv = Vectors.sparse(10000, Seq((0, v(0)), (9999, v(1))))
       LabeledPoint(label, sv)
@@ -113,9 +114,11 @@ class LinearRegressionSuite extends FunSuite with LocalSparkContext {
     val sparseValidationRDD = sc.parallelize(sparseValidationData, 2)
 
     // Test prediction on RDD.
-    validatePrediction(model.predict(sparseValidationRDD.map(_.features)).collect(), sparseValidationData)
+    validatePrediction(
+      model.predict(sparseValidationRDD.map(_.features)).collect(), sparseValidationData)
 
     // Test prediction on Array.
-    validatePrediction(sparseValidationData.map(row => model.predict(row.features)), sparseValidationData)
+    validatePrediction(
+      sparseValidationData.map(row => model.predict(row.features)), sparseValidationData)
   }
 }
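Note on the sparse construction exercised in the test above: Vectors.sparse(size, elements) takes zero-based (index, value) pairs and is interchangeable with the equivalent dense vector at the active positions. A tiny sketch (SparseVectorExample is a made-up name, and it assumes the Vector trait exposes toArray):

import org.apache.spark.mllib.linalg.Vectors

// Tiny sketch; assumes Vector.toArray is available.
object SparseVectorExample {
  def main(args: Array[String]) {
    // Only zero-based indices 0 and 9999 are nonzero, as in the test above.
    val sv = Vectors.sparse(10000, Seq((0, 10.0), (9999, 10.0)))
    val dense = sv.toArray
    println(s"${dense(0)} ${dense(1)} ${dense(9999)}") // 10.0 0.0 10.0
  }
}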